diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 131d824..32bfb33 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -36,7 +36,7 @@ public: long long id; bool use_tounicode; int em_size; - int space_width; + double space_width; double ascent, descent; bool is_type3; }; diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index f8ef903..f1c956f 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -7,6 +7,7 @@ */ #include +#include #include #include "HTMLRenderer.h" @@ -26,6 +27,7 @@ using std::ostream; using std::cerr; using std::endl; using std::find; +using std::abs; void HTMLRenderer::TextLineBuffer::reset(GfxState * state) { @@ -74,14 +76,6 @@ void HTMLRenderer::TextLineBuffer::flush(void) optimize(); - for(auto iter = states.begin(); iter != states.end(); ++iter) - iter->hash(); - - states.resize(states.size() + 1); - states.back().start_idx = text.size(); - - offsets.push_back(Offset({text.size(), 0})); - double max_ascent = 0; for(auto iter = states.begin(); iter != states.end(); ++iter) { @@ -89,6 +83,16 @@ void HTMLRenderer::TextLineBuffer::flush(void) max_ascent = max(max_ascent, s.font_info->ascent * s.draw_font_size); } + // append a dummy state for convenience + states.resize(states.size() + 1); + states.back().start_idx = text.size(); + + for(auto iter = states.begin(); iter != states.end(); ++iter) + iter->hash(); + + // append a dummy offset for convenience + offsets.push_back(Offset({text.size(), 0})); + ostream & out = renderer->f_pages.fs; renderer->height_manager.install(max_ascent); renderer->left_manager .install(x); @@ -153,21 +157,30 @@ void HTMLRenderer::TextLineBuffer::flush(void) { double target = cur_offset_iter->width + dx; - auto & wm = renderer->whitespace_manager; - wm.install(target); - auto wid = wm.get_id(); - double w = wm.get_actual_value(); + if(equal(target, stack.back()->single_space_offset())) + { + Unicode u = ' '; + outputUnicodes(out, &u, 1); + dx = 0; + } + else + { + auto & wm = renderer->whitespace_manager; + wm.install(target); + auto wid = wm.get_id(); + double w = wm.get_actual_value(); - if(w < 0) - last_text_pos_with_negative_offset = cur_text_idx; + if(w < 0) + last_text_pos_with_negative_offset = cur_text_idx; - auto * p = stack.back(); - double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold); + auto * p = stack.back(); + double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold); - out << "" << (target > (threshold - EPS) ? " " : "") << ""; + out << "" << (target > (threshold - EPS) ? " " : "") << ""; - dx = target - w; + dx = target - w; + } ++ cur_offset_iter; } @@ -205,21 +218,114 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state) state.ids[State::RISE_ID] = renderer->rise_manager.get_id(); state.font_info = renderer->cur_font_info; - state.draw_font_size = renderer->font_size_manager.get_value(); + state.draw_font_size = renderer->font_size_manager.get_actual_value(); + state.letter_space = renderer->letter_space_manager.get_actual_value(); + state.word_space = renderer->word_space_manager.get_actual_value(); } void HTMLRenderer::TextLineBuffer::optimize(void) { assert(!states.empty()); - // TODO - // set proper hash_umask + long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID)); + for(auto iter = states.begin(); iter != states.end(); ++iter) + { + auto text_iter1 = text.begin() + (iter->start_idx); + auto next_iter = iter; + ++next_iter; + auto text_iter2 = (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx)); + if(find(text_iter1, text_iter2, ' ') == text_iter2) + { + // if there's no space, word_space does not matter; + iter->hash_umask |= word_space_umask; + } + } + + // clean zero offsets + { + auto write_iter = offsets.begin(); + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + if(!equal(iter->width, 0)) + { + *write_iter = *iter; + ++write_iter; + } + } + offsets.erase(write_iter, offsets.end()); + } // In some PDF files all spaces are converted into positionig shifts // We may try to change them to ' ' and adjusted word_spaces // This can also be applied when param->space_as_offset is set + // for now, we cosider only the no-space scenario + if(offsets.size() > 0) + { + // Since GCC 4.4.6 is suported, I cannot use all_of + lambda here + bool all_ws_umask = true; + for(auto iter = states.begin(); iter != states.end(); ++iter) + { + if(!(iter->hash_umask & word_space_umask)) + { + all_ws_umask = false; + break; + } + } + if(all_ws_umask) + { + double avg_width = 0; + int posive_offset_count = 0; + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + if(is_positive(iter->width)) + { + ++posive_offset_count; + avg_width += iter->width; + } + } + avg_width /= posive_offset_count; + + // now check if the width of offsets are close enough + // TODO: it might make more sense if the threshold is proportion to the font size + bool ok = true; + double accum_off = 0; + double orig_accum_off = 0; + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + orig_accum_off += iter->width; + accum_off += avg_width; + if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps) + { + ok = false; + break; + } + } + if(ok) + { + // ok, make all offsets equi-width + for(auto iter = offsets.begin(); iter != offsets.end(); ++iter) + { + if(is_positive(iter->width)) + iter->width = avg_width; + } + // set new word_space + for(auto iter = states.begin(); iter != states.end(); ++iter) + { + double new_word_space = avg_width - iter->single_space_offset(); + + // install new word_space + // we might introduce more variance here + auto & wm = renderer->word_space_manager; + wm.install(new_word_space); + iter->ids[State::WORD_SPACE_ID] = wm.get_id(); + iter->word_space = wm.get_actual_value(); + iter->hash_umask &= (~word_space_umask); + } + } + } + } } // this state will be converted to a child node of the node of prev_state @@ -312,6 +418,11 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const return d; } +double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const +{ + return letter_space + font_info->space_width * draw_font_size; +} + // the order should be the same as in the enum const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = { CSS::FONT_FAMILY_CN, diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h index 7051e0b..c289eb7 100644 --- a/src/HTMLRenderer/TextLineBuffer.h +++ b/src/HTMLRenderer/TextLineBuffer.h @@ -29,6 +29,8 @@ public: void hash(void); // calculate the difference between another State int diff(const State & s) const; + // the offset cause by a single ' ' char + double single_space_offset(void) const; enum { FONT_ID, @@ -46,6 +48,7 @@ public: const FontInfo * font_info; double draw_font_size; + double letter_space; double word_space; size_t start_idx; // index of the first Text using this state diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc index f686871..f9d15b5 100644 --- a/src/HTMLRenderer/font.cc +++ b/src/HTMLRenderer/font.cc @@ -206,15 +206,14 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo if(!font->isCIDFont()) { - if(font_8bit) - { - info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5); - } - else - { - char buf[2] = {0, ' '}; - info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); - } + font_8bit = dynamic_cast(font); + info.space_width = font_8bit->getWidth(' '); + } + else + { + font_cid = dynamic_cast(font); + char buf[2] = {0, ' '}; + info.space_width = font_cid->getWidth(buf, 2); } if(get_metric_only) @@ -241,9 +240,8 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo * for CID Truetype * same as 8bitTrueType, except for that we have to check 65536 charcodes */ - if(!font->isCIDFont()) + if(font_8bit) { - font_8bit = dynamic_cast(font); maxcode = 0xff; if(is_truetype_suffix(suffix)) { @@ -296,7 +294,6 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo } else { - font_cid = dynamic_cast(font); maxcode = 0xffff; if(is_truetype_suffix(suffix)) @@ -437,7 +434,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo // Might be a problem if ' ' is in the font, but not empty if(!has_space) { - ffw_add_empty_char((int32_t)' ', info.space_width); + ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5)); } if(ctu)