From b83611bd65e52799b0ea0474b629b0cb29bf52d3 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 20 Mar 2013 23:46:58 +0800 Subject: [PATCH 1/4] working --- src/HTMLRenderer/HTMLRenderer.h | 2 +- src/HTMLRenderer/TextLineBuffer.cc | 59 +++++++++++++++++++++++++----- src/HTMLRenderer/TextLineBuffer.h | 7 +++- src/HTMLRenderer/font.cc | 27 ++++++++------ 4 files changed, 71 insertions(+), 24 deletions(-) diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index b079800..131d824 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -30,13 +30,13 @@ namespace pdf2htmlEX { -// we may need more info of a font in the future class FontInfo { public: long long id; bool use_tounicode; int em_size; + int space_width; double ascent, descent; bool is_type3; }; diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index 2a678f8..f8ef903 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -7,6 +7,7 @@ */ #include +#include #include "HTMLRenderer.h" #include "TextLineBuffer.h" @@ -24,6 +25,7 @@ using std::vector; using std::ostream; using std::cerr; using std::endl; +using std::find; void HTMLRenderer::TextLineBuffer::reset(GfxState * state) { @@ -50,6 +52,7 @@ void HTMLRenderer::TextLineBuffer::append_state(void) { states.resize(states.size() + 1); states.back().start_idx = text.size(); + states.back().hash_umask = 0; } set_state(states.back()); @@ -69,6 +72,8 @@ void HTMLRenderer::TextLineBuffer::flush(void) return; } + optimize(); + for(auto iter = states.begin(); iter != states.end(); ++iter) iter->hash(); @@ -81,7 +86,7 @@ void HTMLRenderer::TextLineBuffer::flush(void) for(auto iter = states.begin(); iter != states.end(); ++iter) { const auto & s = *iter; - max_ascent = max(max_ascent, s.ascent * s.draw_font_size); + max_ascent = max(max_ascent, s.font_info->ascent * s.draw_font_size); } ostream & out = renderer->f_pages.fs; @@ -157,7 +162,7 @@ void HTMLRenderer::TextLineBuffer::flush(void) last_text_pos_with_negative_offset = cur_text_idx; auto * p = stack.back(); - double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold); + double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold); out << "" << (target > (threshold - EPS) ? " " : "") << ""; @@ -199,18 +204,47 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state) state.ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id(); state.ids[State::RISE_ID] = renderer->rise_manager.get_id(); - const FontInfo * info = renderer->cur_font_info; - state.ascent = info->ascent; - state.descent = info->descent; + state.font_info = renderer->cur_font_info; state.draw_font_size = renderer->font_size_manager.get_value(); } +void HTMLRenderer::TextLineBuffer::optimize(void) +{ + assert(!states.empty()); + + // TODO + + // set proper hash_umask + + // In some PDF files all spaces are converted into positionig shifts + // We may try to change them to ' ' and adjusted word_spaces + // This can also be applied when param->space_as_offset is set + +} + +// this state will be converted to a child node of the node of prev_state +// dump the difference between previous state +// also clone corresponding states void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state) { + long long cur_mask = 0xff; bool first = true; - for(int i = 0; i < ID_COUNT; ++i) + for(int i = 0; i < ID_COUNT; ++i, cur_mask<<=8) { - if(prev_state && (prev_state->ids[i] == ids[i])) + if(hash_umask & cur_mask) // we don't care about this ID + { + if (prev_state && (!(prev_state->hash_umask & cur_mask))) // if prev_state have it set + { + // we have to inherit it + ids[i] = prev_state->ids[i]; + hash_umask &= (~cur_mask); + } + //anyway we don't have to output it + continue; + } + + // now we care about the ID + if(prev_state && (!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i])) continue; if(first) @@ -231,7 +265,7 @@ void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * pr out << ids[i]; } - if(first) + if(first) // we actually just inherit the whole prev_state { need_close = false; } @@ -264,12 +298,17 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const * it could be wrong when there are more then 256 classes, * in which case the output may not be optimal, but still 'correct' in terms of HTML */ - if(hash_value == s.hash_value) return 0; + long long common_mask = ~(hash_umask | s.hash_umask); + if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0; + long long cur_mask = 0xff; int d = 0; for(int i = 0; i < ID_COUNT; ++i) - if(ids[i] != s.ids[i]) + { + if((common_mask & cur_mask) && (ids[i] != s.ids[i])) ++ d; + cur_mask <<= 8; + } return d; } diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h index 2cc288a..7051e0b 100644 --- a/src/HTMLRenderer/TextLineBuffer.h +++ b/src/HTMLRenderer/TextLineBuffer.h @@ -44,13 +44,14 @@ public: long long ids[ID_COUNT]; - double ascent; - double descent; + const FontInfo * font_info; double draw_font_size; + double word_space; size_t start_idx; // index of the first Text using this state // for optimzation long long hash_value; + long long hash_umask; // some states may not be actually used bool need_close; static const char * const css_class_names []; // class names for each id @@ -73,6 +74,8 @@ private: // retrieve state from renderer void set_state(State & state); + void optimize(void); + HTMLRenderer * renderer; double x, y; diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc index ca4eaa0..f686871 100644 --- a/src/HTMLRenderer/font.cc +++ b/src/HTMLRenderer/font.cc @@ -204,6 +204,19 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo info.em_size = ffw_get_em_size(); + if(!font->isCIDFont()) + { + if(font_8bit) + { + info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5); + } + else + { + char buf[2] = {0, ' '}; + info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); + } + } + if(get_metric_only) { ffw_metric(&info.ascent, &info.descent); @@ -424,17 +437,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo // Might be a problem if ' ' is in the font, but not empty if(!has_space) { - int space_width; - if(font_8bit) - { - space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5); - } - else - { - char buf[2] = {0, ' '}; - space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); - } - ffw_add_empty_char((int32_t)' ', space_width); + ffw_add_empty_char((int32_t)' ', info.space_width); } if(ctu) @@ -525,6 +528,8 @@ const FontInfo * HTMLRenderer::install_font(GfxFont * font) if(font == nullptr) { + new_font_info.em_size = 0; + new_font_info.space_width = 0; new_font_info.ascent = 0; new_font_info.descent = 0; new_font_info.is_type3 = false; From 8ef466714e0d03a30bbb3efdeb71258bd7afa32f Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 21 Mar 2013 12:18:26 +0800 Subject: [PATCH 2/4] working on space optimization --- src/HTMLRenderer/HTMLRenderer.h | 2 +- src/HTMLRenderer/TextLineBuffer.cc | 155 +++++++++++++++++++++++++---- src/HTMLRenderer/TextLineBuffer.h | 3 + src/HTMLRenderer/font.cc | 23 ++--- 4 files changed, 147 insertions(+), 36 deletions(-) diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 131d824..32bfb33 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -36,7 +36,7 @@ public: long long id; bool use_tounicode; int em_size; - int space_width; + double space_width; double ascent, descent; bool is_type3; }; diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index f8ef903..f1c956f 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -7,6 +7,7 @@ */ #include +#include #include #include "HTMLRenderer.h" @@ -26,6 +27,7 @@ using std::ostream; using std::cerr; using std::endl; using std::find; +using std::abs; void HTMLRenderer::TextLineBuffer::reset(GfxState * state) { @@ -74,14 +76,6 @@ void HTMLRenderer::TextLineBuffer::flush(void) optimize(); - for(auto iter = states.begin(); iter != states.end(); ++iter) - iter->hash(); - - states.resize(states.size() + 1); - states.back().start_idx = text.size(); - - offsets.push_back(Offset({text.size(), 0})); - double max_ascent = 0; for(auto iter = states.begin(); iter != states.end(); ++iter) { @@ -89,6 +83,16 @@ void HTMLRenderer::TextLineBuffer::flush(void) max_ascent = max(max_ascent, s.font_info->ascent * s.draw_font_size); } + // append a dummy state for convenience + states.resize(states.size() + 1); + states.back().start_idx = text.size(); + + for(auto iter = states.begin(); iter != states.end(); ++iter) + iter->hash(); + + // append a dummy offset for convenience + offsets.push_back(Offset({text.size(), 0})); + ostream & out = renderer->f_pages.fs; renderer->height_manager.install(max_ascent); renderer->left_manager .install(x); @@ -153,21 +157,30 @@ void HTMLRenderer::TextLineBuffer::flush(void) { double target = cur_offset_iter->width + dx; - auto & wm = renderer->whitespace_manager; - wm.install(target); - auto wid = wm.get_id(); - double w = wm.get_actual_value(); + if(equal(target, stack.back()->single_space_offset())) + { + Unicode u = ' '; + outputUnicodes(out, &u, 1); + dx = 0; + } + else + { + auto & wm = renderer->whitespace_manager; + wm.install(target); + auto wid = wm.get_id(); + double w = wm.get_actual_value(); - if(w < 0) - last_text_pos_with_negative_offset = cur_text_idx; + if(w < 0) + last_text_pos_with_negative_offset = cur_text_idx; - auto * p = stack.back(); - double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold); + auto * p = stack.back(); + double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold); - out << "" << (target > (threshold - EPS) ? " " : "") << ""; + out << "" << (target > (threshold - EPS) ? " " : "") << ""; - dx = target - w; + dx = target - w; + } ++ cur_offset_iter; } @@ -205,21 +218,114 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state) state.ids[State::RISE_ID] = renderer->rise_manager.get_id(); state.font_info = renderer->cur_font_info; - state.draw_font_size = renderer->font_size_manager.get_value(); + state.draw_font_size = renderer->font_size_manager.get_actual_value(); + state.letter_space = renderer->letter_space_manager.get_actual_value(); + state.word_space = renderer->word_space_manager.get_actual_value(); } void HTMLRenderer::TextLineBuffer::optimize(void) { assert(!states.empty()); - // TODO - // set proper hash_umask + long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID)); + for(auto iter = states.begin(); iter != states.end(); ++iter) + { + auto text_iter1 = text.begin() + (iter->start_idx); + auto next_iter = iter; + ++next_iter; + auto text_iter2 = (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx)); + if(find(text_iter1, text_iter2, ' ') == text_iter2) + { + // if there's no space, word_space does not matter; + iter->hash_umask |= word_space_umask; + } + } + + // clean zero offsets + { + auto write_iter = offsets.begin(); + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + if(!equal(iter->width, 0)) + { + *write_iter = *iter; + ++write_iter; + } + } + offsets.erase(write_iter, offsets.end()); + } // In some PDF files all spaces are converted into positionig shifts // We may try to change them to ' ' and adjusted word_spaces // This can also be applied when param->space_as_offset is set + // for now, we cosider only the no-space scenario + if(offsets.size() > 0) + { + // Since GCC 4.4.6 is suported, I cannot use all_of + lambda here + bool all_ws_umask = true; + for(auto iter = states.begin(); iter != states.end(); ++iter) + { + if(!(iter->hash_umask & word_space_umask)) + { + all_ws_umask = false; + break; + } + } + if(all_ws_umask) + { + double avg_width = 0; + int posive_offset_count = 0; + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + if(is_positive(iter->width)) + { + ++posive_offset_count; + avg_width += iter->width; + } + } + avg_width /= posive_offset_count; + + // now check if the width of offsets are close enough + // TODO: it might make more sense if the threshold is proportion to the font size + bool ok = true; + double accum_off = 0; + double orig_accum_off = 0; + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + orig_accum_off += iter->width; + accum_off += avg_width; + if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps) + { + ok = false; + break; + } + } + if(ok) + { + // ok, make all offsets equi-width + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + if(is_positive(iter->width)) + iter->width = avg_width; + } + // set new word_space + for(auto iter = states.begin(); iter != states.end(); ++iter) + { + double new_word_space = avg_width - iter->single_space_offset(); + + // install new word_space + // we might introduce more variance here + auto & wm = renderer->word_space_manager; + wm.install(new_word_space); + iter->ids[State::WORD_SPACE_ID] = wm.get_id(); + iter->word_space = wm.get_actual_value(); + iter->hash_umask &= (~word_space_umask); + } + } + } + } } // this state will be converted to a child node of the node of prev_state @@ -312,6 +418,11 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const return d; } +double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const +{ + return letter_space + font_info->space_width * draw_font_size; +} + // the order should be the same as in the enum const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = { CSS::FONT_FAMILY_CN, diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h index 7051e0b..c289eb7 100644 --- a/src/HTMLRenderer/TextLineBuffer.h +++ b/src/HTMLRenderer/TextLineBuffer.h @@ -29,6 +29,8 @@ public: void hash(void); // calculate the difference between another State int diff(const State & s) const; + // the offset cause by a single ' ' char + double single_space_offset(void) const; enum { FONT_ID, @@ -46,6 +48,7 @@ public: const FontInfo * font_info; double draw_font_size; + double letter_space; double word_space; size_t start_idx; // index of the first Text using this state diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc index f686871..f9d15b5 100644 --- a/src/HTMLRenderer/font.cc +++ b/src/HTMLRenderer/font.cc @@ -206,15 +206,14 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo if(!font->isCIDFont()) { - if(font_8bit) - { - info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5); - } - else - { - char buf[2] = {0, ' '}; - info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); - } + font_8bit = dynamic_cast(font); + info.space_width = font_8bit->getWidth(' '); + } + else + { + font_cid = dynamic_cast(font); + char buf[2] = {0, ' '}; + info.space_width = font_cid->getWidth(buf, 2); } if(get_metric_only) @@ -241,9 +240,8 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo * for CID Truetype * same as 8bitTrueType, except for that we have to check 65536 charcodes */ - if(!font->isCIDFont()) + if(font_8bit) { - font_8bit = dynamic_cast(font); maxcode = 0xff; if(is_truetype_suffix(suffix)) { @@ -296,7 +294,6 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo } else { - font_cid = dynamic_cast(font); maxcode = 0xffff; if(is_truetype_suffix(suffix)) @@ -437,7 +434,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo // Might be a problem if ' ' is in the font, but not empty if(!has_space) { - ffw_add_empty_char((int32_t)' ', info.space_width); + ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5)); } if(ctu) From 821a65ac73619d455f9a865dff55bcc614901370 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 21 Mar 2013 12:27:07 +0800 Subject: [PATCH 3/4] fix space optimization --- src/HTMLRenderer/TextLineBuffer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index f1c956f..cc2406f 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -313,7 +313,7 @@ void HTMLRenderer::TextLineBuffer::optimize(void) // set new word_space for(auto iter = states.begin(); iter != states.end(); ++iter) { - double new_word_space = avg_width - iter->single_space_offset(); + double new_word_space = avg_width - iter->single_space_offset() + iter->word_space; // install new word_space // we might introduce more variance here @@ -420,7 +420,7 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const { - return letter_space + font_info->space_width * draw_font_size; + return word_space + letter_space + font_info->space_width * draw_font_size; } // the order should be the same as in the enum From 7ea4f054bb2c46ef110db37ab6bc8311fe56c4cb Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 21 Mar 2013 12:30:45 +0800 Subject: [PATCH 4/4] todo --- TODO | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TODO b/TODO index d279edb..e0e5aab 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,7 @@ +non-trivial space optimization +(For each state whose word_space is free, set a proper value such that it may cover most whitespaces) +(Or just set word_space according to the first positive whitespace, but need to do this before the state inherit some value) + == Future: == Too difficult/complicated to implement: