diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index b079800..131d824 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -30,13 +30,13 @@ namespace pdf2htmlEX { -// we may need more info of a font in the future class FontInfo { public: long long id; bool use_tounicode; int em_size; + int space_width; double ascent, descent; bool is_type3; }; diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index 2a678f8..f8ef903 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -7,6 +7,7 @@ */ #include +#include #include "HTMLRenderer.h" #include "TextLineBuffer.h" @@ -24,6 +25,7 @@ using std::vector; using std::ostream; using std::cerr; using std::endl; +using std::find; void HTMLRenderer::TextLineBuffer::reset(GfxState * state) { @@ -50,6 +52,7 @@ void HTMLRenderer::TextLineBuffer::append_state(void) { states.resize(states.size() + 1); states.back().start_idx = text.size(); + states.back().hash_umask = 0; } set_state(states.back()); @@ -69,6 +72,8 @@ void HTMLRenderer::TextLineBuffer::flush(void) return; } + optimize(); + for(auto iter = states.begin(); iter != states.end(); ++iter) iter->hash(); @@ -81,7 +86,7 @@ void HTMLRenderer::TextLineBuffer::flush(void) for(auto iter = states.begin(); iter != states.end(); ++iter) { const auto & s = *iter; - max_ascent = max(max_ascent, s.ascent * s.draw_font_size); + max_ascent = max(max_ascent, s.font_info->ascent * s.draw_font_size); } ostream & out = renderer->f_pages.fs; @@ -157,7 +162,7 @@ void HTMLRenderer::TextLineBuffer::flush(void) last_text_pos_with_negative_offset = cur_text_idx; auto * p = stack.back(); - double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold); + double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold); out << "" << (target > (threshold - EPS) ? 
" " : "") << ""; @@ -199,18 +204,47 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state) state.ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id(); state.ids[State::RISE_ID] = renderer->rise_manager.get_id(); - const FontInfo * info = renderer->cur_font_info; - state.ascent = info->ascent; - state.descent = info->descent; + state.font_info = renderer->cur_font_info; state.draw_font_size = renderer->font_size_manager.get_value(); } +void HTMLRenderer::TextLineBuffer::optimize(void) +{ + assert(!states.empty()); + + // TODO + + // set proper hash_umask + + // In some PDF files all spaces are converted into positioning shifts + // We may try to change them to ' ' and adjust word_spaces + // This can also be applied when param->space_as_offset is set + +} + +// this state will be converted to a child node of the node of prev_state +// dump the difference between previous state +// also clone corresponding states void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state) { + long long cur_mask = 0xff; bool first = true; - for(int i = 0; i < ID_COUNT; ++i) + for(int i = 0; i < ID_COUNT; ++i, cur_mask<<=8) { - if(prev_state && (prev_state->ids[i] == ids[i])) + if(hash_umask & cur_mask) // we don't care about this ID + { + if (prev_state && (!(prev_state->hash_umask & cur_mask))) // if prev_state has it set + { + // we have to inherit it + ids[i] = prev_state->ids[i]; + hash_umask &= (~cur_mask); + } + //anyway we don't have to output it + continue; + } + + // now we care about the ID + if(prev_state && (!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i])) continue; if(first) @@ -231,7 +265,7 @@ void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * pr out << ids[i]; } - if(first) + if(first) // we actually just inherit the whole prev_state { need_close = false; } @@ -264,12 +298,17 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const * it could be wrong 
when there are more then 256 classes, * in which case the output may not be optimal, but still 'correct' in terms of HTML */ - if(hash_value == s.hash_value) return 0; + long long common_mask = ~(hash_umask | s.hash_umask); + if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0; + long long cur_mask = 0xff; int d = 0; for(int i = 0; i < ID_COUNT; ++i) - if(ids[i] != s.ids[i]) + { + if((common_mask & cur_mask) && (ids[i] != s.ids[i])) ++ d; + cur_mask <<= 8; + } return d; } diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h index 2cc288a..7051e0b 100644 --- a/src/HTMLRenderer/TextLineBuffer.h +++ b/src/HTMLRenderer/TextLineBuffer.h @@ -44,13 +44,14 @@ public: long long ids[ID_COUNT]; - double ascent; - double descent; + const FontInfo * font_info; double draw_font_size; + double word_space; size_t start_idx; // index of the first Text using this state // for optimzation long long hash_value; + long long hash_umask; // some states may not be actually used bool need_close; static const char * const css_class_names []; // class names for each id @@ -73,6 +74,8 @@ private: // retrieve state from renderer void set_state(State & state); + void optimize(void); + HTMLRenderer * renderer; double x, y; diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc index ca4eaa0..f686871 100644 --- a/src/HTMLRenderer/font.cc +++ b/src/HTMLRenderer/font.cc @@ -204,6 +204,19 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo info.em_size = ffw_get_em_size(); + if(!font->isCIDFont()) + { + if(font_8bit) + { + info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5); + } + else + { + char buf[2] = {0, ' '}; + info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); + } + } + if(get_metric_only) { ffw_metric(&info.ascent, &info.descent); @@ -424,17 +437,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo // Might be a 
problem if ' ' is in the font, but not empty if(!has_space) { - int space_width; - if(font_8bit) - { - space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5); - } - else - { - char buf[2] = {0, ' '}; - space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); - } - ffw_add_empty_char((int32_t)' ', space_width); + ffw_add_empty_char((int32_t)' ', info.space_width); } if(ctu) @@ -525,6 +528,8 @@ const FontInfo * HTMLRenderer::install_font(GfxFont * font) if(font == nullptr) { + new_font_info.em_size = 0; + new_font_info.space_width = 0; new_font_info.ascent = 0; new_font_info.descent = 0; new_font_info.is_type3 = false;