1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-05 01:28:39 +00:00

working on space optimization

This commit is contained in:
Lu Wang 2013-03-21 12:18:26 +08:00
parent b83611bd65
commit 8ef466714e
4 changed files with 147 additions and 36 deletions

View File

@ -36,7 +36,7 @@ public:
long long id; long long id;
bool use_tounicode; bool use_tounicode;
int em_size; int em_size;
int space_width; double space_width;
double ascent, descent; double ascent, descent;
bool is_type3; bool is_type3;
}; };

View File

@ -7,6 +7,7 @@
*/ */
#include <vector> #include <vector>
#include <cmath>
#include <algorithm> #include <algorithm>
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
@ -26,6 +27,7 @@ using std::ostream;
using std::cerr; using std::cerr;
using std::endl; using std::endl;
using std::find; using std::find;
using std::abs;
void HTMLRenderer::TextLineBuffer::reset(GfxState * state) void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
{ {
@ -74,14 +76,6 @@ void HTMLRenderer::TextLineBuffer::flush(void)
optimize(); optimize();
for(auto iter = states.begin(); iter != states.end(); ++iter)
iter->hash();
states.resize(states.size() + 1);
states.back().start_idx = text.size();
offsets.push_back(Offset({text.size(), 0}));
double max_ascent = 0; double max_ascent = 0;
for(auto iter = states.begin(); iter != states.end(); ++iter) for(auto iter = states.begin(); iter != states.end(); ++iter)
{ {
@ -89,6 +83,16 @@ void HTMLRenderer::TextLineBuffer::flush(void)
max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size); max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
} }
// append a dummy state for convenience
states.resize(states.size() + 1);
states.back().start_idx = text.size();
for(auto iter = states.begin(); iter != states.end(); ++iter)
iter->hash();
// append a dummy offset for convenience
offsets.push_back(Offset({text.size(), 0}));
ostream & out = renderer->f_pages.fs; ostream & out = renderer->f_pages.fs;
renderer->height_manager.install(max_ascent); renderer->height_manager.install(max_ascent);
renderer->left_manager .install(x); renderer->left_manager .install(x);
@ -153,21 +157,30 @@ void HTMLRenderer::TextLineBuffer::flush(void)
{ {
double target = cur_offset_iter->width + dx; double target = cur_offset_iter->width + dx;
auto & wm = renderer->whitespace_manager; if(equal(target, stack.back()->single_space_offset()))
wm.install(target); {
auto wid = wm.get_id(); Unicode u = ' ';
double w = wm.get_actual_value(); outputUnicodes(out, &u, 1);
dx = 0;
}
else
{
auto & wm = renderer->whitespace_manager;
wm.install(target);
auto wid = wm.get_id();
double w = wm.get_actual_value();
if(w < 0) if(w < 0)
last_text_pos_with_negative_offset = cur_text_idx; last_text_pos_with_negative_offset = cur_text_idx;
auto * p = stack.back(); auto * p = stack.back();
double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold); double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
out << "<span class=\"" << CSS::WHITESPACE_CN out << "<span class=\"" << CSS::WHITESPACE_CN
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>"; << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
dx = target - w; dx = target - w;
}
++ cur_offset_iter; ++ cur_offset_iter;
} }
@ -205,21 +218,114 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state)
state.ids[State::RISE_ID] = renderer->rise_manager.get_id(); state.ids[State::RISE_ID] = renderer->rise_manager.get_id();
state.font_info = renderer->cur_font_info; state.font_info = renderer->cur_font_info;
state.draw_font_size = renderer->font_size_manager.get_value(); state.draw_font_size = renderer->font_size_manager.get_actual_value();
state.letter_space = renderer->letter_space_manager.get_actual_value();
state.word_space = renderer->word_space_manager.get_actual_value();
} }
void HTMLRenderer::TextLineBuffer::optimize(void) void HTMLRenderer::TextLineBuffer::optimize(void)
{ {
assert(!states.empty()); assert(!states.empty());
// TODO
// set proper hash_umask // set proper hash_umask
long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID));
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
auto text_iter1 = text.begin() + (iter->start_idx);
auto next_iter = iter;
++next_iter;
auto text_iter2 = (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx));
if(find(text_iter1, text_iter2, ' ') == text_iter2)
{
// if there's no space, word_space does not matter;
iter->hash_umask |= word_space_umask;
}
}
// clean zero offsets
{
auto write_iter = offsets.begin();
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
if(!equal(iter->width, 0))
{
*write_iter = *iter;
++write_iter;
}
}
offsets.erase(write_iter, offsets.end());
}
// In some PDF files all spaces are converted into positionig shifts // In some PDF files all spaces are converted into positionig shifts
// We may try to change them to ' ' and adjusted word_spaces // We may try to change them to ' ' and adjusted word_spaces
// This can also be applied when param->space_as_offset is set // This can also be applied when param->space_as_offset is set
// for now, we cosider only the no-space scenario
if(offsets.size() > 0)
{
// Since GCC 4.4.6 is suported, I cannot use all_of + lambda here
bool all_ws_umask = true;
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
if(!(iter->hash_umask & word_space_umask))
{
all_ws_umask = false;
break;
}
}
if(all_ws_umask)
{
double avg_width = 0;
int posive_offset_count = 0;
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
if(is_positive(iter->width))
{
++posive_offset_count;
avg_width += iter->width;
}
}
avg_width /= posive_offset_count;
// now check if the width of offsets are close enough
// TODO: it might make more sense if the threshold is proportion to the font size
bool ok = true;
double accum_off = 0;
double orig_accum_off = 0;
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
orig_accum_off += iter->width;
accum_off += avg_width;
if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
{
ok = false;
break;
}
}
if(ok)
{
// ok, make all offsets equi-width
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
if(is_positive(iter->width))
iter->width = avg_width;
}
// set new word_space
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
double new_word_space = avg_width - iter->single_space_offset();
// install new word_space
// we might introduce more variance here
auto & wm = renderer->word_space_manager;
wm.install(new_word_space);
iter->ids[State::WORD_SPACE_ID] = wm.get_id();
iter->word_space = wm.get_actual_value();
iter->hash_umask &= (~word_space_umask);
}
}
}
}
} }
// this state will be converted to a child node of the node of prev_state // this state will be converted to a child node of the node of prev_state
@ -312,6 +418,11 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
return d; return d;
} }
// Horizontal offset produced by one ' ' character in this state:
// the font's space width scaled by the draw font size, plus letter_space.
// word_space is not included.
double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const
{
    const double scaled_space_width = font_info->space_width * draw_font_size;
    return letter_space + scaled_space_width;
}
// the order should be the same as in the enum // the order should be the same as in the enum
const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = { const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {
CSS::FONT_FAMILY_CN, CSS::FONT_FAMILY_CN,

View File

@ -29,6 +29,8 @@ public:
void hash(void); void hash(void);
// calculate the difference between another State // calculate the difference between another State
int diff(const State & s) const; int diff(const State & s) const;
// the offset caused by a single ' ' char
double single_space_offset(void) const;
enum { enum {
FONT_ID, FONT_ID,
@ -46,6 +48,7 @@ public:
const FontInfo * font_info; const FontInfo * font_info;
double draw_font_size; double draw_font_size;
double letter_space;
double word_space; double word_space;
size_t start_idx; // index of the first Text using this state size_t start_idx; // index of the first Text using this state

View File

@ -206,15 +206,14 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
if(!font->isCIDFont()) if(!font->isCIDFont())
{ {
if(font_8bit) font_8bit = dynamic_cast<Gfx8BitFont*>(font);
{ info.space_width = font_8bit->getWidth(' ');
info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5); }
} else
else {
{ font_cid = dynamic_cast<GfxCIDFont*>(font);
char buf[2] = {0, ' '}; char buf[2] = {0, ' '};
info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); info.space_width = font_cid->getWidth(buf, 2);
}
} }
if(get_metric_only) if(get_metric_only)
@ -241,9 +240,8 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
* for CID Truetype * for CID Truetype
* same as 8bitTrueType, except for that we have to check 65536 charcodes * same as 8bitTrueType, except for that we have to check 65536 charcodes
*/ */
if(!font->isCIDFont()) if(font_8bit)
{ {
font_8bit = dynamic_cast<Gfx8BitFont*>(font);
maxcode = 0xff; maxcode = 0xff;
if(is_truetype_suffix(suffix)) if(is_truetype_suffix(suffix))
{ {
@ -296,7 +294,6 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
} }
else else
{ {
font_cid = dynamic_cast<GfxCIDFont*>(font);
maxcode = 0xffff; maxcode = 0xffff;
if(is_truetype_suffix(suffix)) if(is_truetype_suffix(suffix))
@ -437,7 +434,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
// Might be a problem if ' ' is in the font, but not empty // Might be a problem if ' ' is in the font, but not empty
if(!has_space) if(!has_space)
{ {
ffw_add_empty_char((int32_t)' ', info.space_width); ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5));
} }
if(ctu) if(ctu)