From e191f8127dd08936c74e103b6bb70c291ac2fb07 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 6 Sep 2012 14:37:09 +0800 Subject: [PATCH] removed 'optimize' option, use greedy method automatically --- pdf2htmlEX.1 | 3 - src/HTMLRenderer.h | 6 +- src/HTMLRenderer/LineBuffer.cc | 171 +++++++++------------------------ src/Param.h | 1 - src/pdf2htmlEX.cc | 1 - test/test.py | 2 +- 6 files changed, 50 insertions(+), 134 deletions(-) diff --git a/pdf2htmlEX.1 b/pdf2htmlEX.1 index 09885e5..6692517 100644 --- a/pdf2htmlEX.1 +++ b/pdf2htmlEX.1 @@ -89,9 +89,6 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o However often there is better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of ToUnicode map. This behavior may be changed by turning on this switch. .TP -.B --optimize <0|1> (Default: 0) -Try to optimize the output HTML file, might be slow. -.TP .B --font-suffix (Default: ".ttf"), --font-format (Default: "truetype") Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. 
.TP diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 4f33b1d..ed0cb40 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -293,7 +293,6 @@ class HTMLRenderer : public OutputDev size_t start_idx; // index of the first Text using this state // for optimzation long long hash_value; - int depth; // the depth in the state tree bool need_close; static const char * format_str; // class names for each id @@ -315,8 +314,6 @@ class HTMLRenderer : public OutputDev private: // retrieve state from renderer void set_state(State & state); - // build the state tree in order to minimize the size of output - void optimize_states(void); HTMLRenderer * renderer; @@ -327,6 +324,9 @@ class HTMLRenderer : public OutputDev std::vector offsets; std::vector text; + // for flush + std::vector stack; + } line_buf; friend class LineBuffer; diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index 44a4e8e..06ffa63 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -8,7 +8,6 @@ */ #include -#include #include "HTMLRenderer.h" #include "HTMLRenderer/namespace.h" @@ -16,7 +15,6 @@ using std::min; using std::max; using std::vector; -using std::stack; using std::function; void HTMLRenderer::LineBuffer::reset(GfxState * state) @@ -66,19 +64,8 @@ void HTMLRenderer::LineBuffer::flush(void) for(auto & s : states) s.hash(); - if((renderer->param->optimize) && (states.size() > 2)) - { - optimize_states(); - } - else - { - for(size_t i = 0; i < states.size(); ++i) - states[i].depth = i; - } - states.resize(states.size() + 1); states.back().start_idx = text.size(); - states.back().depth = 0; offsets.push_back({text.size(), 0}); @@ -100,27 +87,44 @@ void HTMLRenderer::LineBuffer::flush(void) //accumulated horizontal offset; double dx = 0; - stack stack; - stack.push(nullptr); - int last_depth = -1; + stack.clear(); + stack.push_back(nullptr); + + // whenever a negative offset appears, we should not pop out that + // otherwise the 
effect of negative margin-left would disappear + size_t last_text_pos_with_negative_offset = -1; size_t cur_text_idx = 0; while(cur_text_idx < text.size()) { if(cur_text_idx >= cur_state_iter->start_idx) { - int depth = cur_state_iter -> depth; - int cnt = last_depth + 1 - depth; - assert(cnt >= 0); - while(cnt--) + // greedy + int best_cost = State::ID_COUNT; + + // we have a nullptr at the beginning, so no need to check for rend + for(auto iter = stack.rbegin(); *iter; ++iter) { - stack.top()->end(out); - stack.pop(); - } + int cost = cur_state_iter->diff(**iter); + if(cost < best_cost) + { + while(stack.back() != *iter) + { + stack.back()->end(out); + stack.pop_back(); + } + best_cost = cost; - cur_state_iter->begin(out, stack.top()); - stack.push(&*cur_state_iter); - last_depth = depth; + if(best_cost == 0) + break; + } + + // cannot go further + if((*iter)->start_idx <= last_text_pos_with_negative_offset) + break; + } + cur_state_iter->begin(out, stack.back()); + stack.push_back(&*cur_state_iter); ++ cur_state_iter; } @@ -132,6 +136,9 @@ void HTMLRenderer::LineBuffer::flush(void) auto wid = renderer->install_whitespace(target, w); + if(w < 0) + last_text_pos_with_negative_offset = cur_text_idx; + double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold); out << format("%2%") % wid % (target > (threshold - EPS) ? 
" " : ""); @@ -146,10 +153,10 @@ void HTMLRenderer::LineBuffer::flush(void) } // we have a nullptr in the bottom - while(stack.top()) + while(stack.back()) { - stack.top()->end(out); - stack.pop(); + stack.back()->end(out); + stack.pop_back(); } out << ""; @@ -176,103 +183,8 @@ void HTMLRenderer::LineBuffer::set_state (State & state) state.draw_font_size = renderer->draw_font_size; } -class DPBufferEntry -{ -public: - int last_child; - int min_cost; -}; - -static vector flattened_dp_buffer; -static vector dp_buffer; - -void HTMLRenderer::LineBuffer::optimize_states (void) -{ - int n = states.size(); - - flattened_dp_buffer.resize(n*(n+1)/2); - dp_buffer.resize(n); - - { - int incre = n; - auto iter = dp_buffer.begin(); - DPBufferEntry * p = flattened_dp_buffer.data(); - while(incre > 0) - { - *(iter++) = p; - p += (incre--); - } - } - - // depth 0 - for(int i = 0; i < n; ++i) - flattened_dp_buffer[i].min_cost = 0; - - int last_at_this_depth = n; - for(int depth = 1; depth < n; ++depth) - { - --last_at_this_depth; - for(int i = 0; i < last_at_this_depth; ++i) - { - //determine dp_buffer[depth][i] - int best_last_child = i+1; - int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost; - // at depth, we consider [i+1, i+depth+1) as possible children of i - for(int j = 2; j <= depth; ++j) - { - int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost; - // avoid calling diff() when possible - if (cost >= best_min_cost) continue; - - cost += states[i].diff(states[i+j]); - - if(cost < best_min_cost) - { - best_last_child = i+j; - best_min_cost = cost; - } - } - - dp_buffer[depth][i] = {best_last_child, best_min_cost}; - } - } - - // now fill in the depths - // use recursion for now, until someone finds a PDF that would causes this overflow - function func = [&](int idx, int depth, int tree_depth) -> void { - states[idx].depth = tree_depth; - while(depth > 0) - { - int last_child = dp_buffer[depth][idx].last_child; - 
assert((last_child > idx) && (last_child <= idx + depth)); - func(last_child, idx + depth - last_child, tree_depth + 1); - depth = last_child - idx - 1; - } - }; - - func(0, n-1, 0); -} - void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state) { - if(prev_state && (prev_state->hash_value == hash_value)) - { - // check ids again - int i; - for(i = 0; i < ID_COUNT; ++i) - if(ids[i] != prev_state->ids[i]) - break; - - if(i == ID_COUNT) - { - need_close = false; - return; - } - } - - need_close = true; - - out << ""; + if(first) + { + need_close = false; + } + else + { + out << "\">"; + need_close = true; + } } void HTMLRenderer::LineBuffer::State::end(ostream & out) const diff --git a/src/Param.h b/src/Param.h index 13754fb..136d6d6 100644 --- a/src/Param.h +++ b/src/Param.h @@ -36,7 +36,6 @@ struct Param double space_threshold; double font_size_multiplier; int always_apply_tounicode; - int optimize; std::string font_suffix, font_format; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 3132681..4bc971f 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -84,7 +84,6 @@ po::variables_map parse_options (int argc, char **argv) ("space-threshold", po::value(¶m.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character") ("font-size-multiplier", po::value(¶m.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy") ("always-apply-tounicode", po::value(¶m.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on") - ("optimize", po::value(¶m.optimize)->default_value(0), "Optimize HTML, might be very slow") ("font-suffix", po::value(¶m.font_suffix)->default_value(".ttf"), "suffix for extracted font files") ("font-format", po::value(¶m.font_format)->default_value("truetype"), "format for extracted font files") diff --git a/test/test.py b/test/test.py index 
d3c366f..fc06f49 100755 --- a/test/test.py +++ b/test/test.py @@ -12,7 +12,7 @@ for f in os.listdir(DIR): if not f.lower().endswith('.pdf'): continue print f - os.system('pdf2htmlEX --optimize 1 -l 10 --dest-dir html "%s/%s"' % (DIR,f)) + os.system('pdf2htmlEX -l 10 --dest-dir html "%s/%s"' % (DIR,f)) ff = f[:-3]+'html' outf.write('%s
' % (ff,ff)) outf.flush();