diff --git a/TODO b/TODO index 6470b74..9058b5c 100644 --- a/TODO +++ b/TODO @@ -6,7 +6,7 @@ option to break ligatures detect duplicate base fonts when embedding -compress div/span states +consider left-shift in optimization multiple charcode mapped to a same glyph re-encoded only used glyphs diff --git a/pdf2htmlEX.1 b/pdf2htmlEX.1 index 6692517..09885e5 100644 --- a/pdf2htmlEX.1 +++ b/pdf2htmlEX.1 @@ -89,6 +89,9 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o However often there is better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of ToUnicode map. This behavior may be changed by turning on this switch. .TP +.B --optimize <0|1> (Default: 0) +Try to optimize the output HTML file, might be slow. +.TP .B --font-suffix (Default: ".ttf"), --font-format (Default: "truetype") Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. 
.TP diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index 2a8acb9..44a4e8e 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -66,14 +66,14 @@ void HTMLRenderer::LineBuffer::flush(void) for(auto & s : states) s.hash(); - if(states.size() < 3) + if((renderer->param->optimize) && (states.size() > 2)) { - for(size_t i = 0; i < states.size(); ++i) - states[i].depth = i; + optimize_states(); } else { - optimize_states(); + for(size_t i = 0; i < states.size(); ++i) + states[i].depth = i; } states.resize(states.size() + 1); @@ -203,6 +203,10 @@ void HTMLRenderer::LineBuffer::optimize_states (void) p += (incre--); } } + + // depth 0 + for(int i = 0; i < n; ++i) + flattened_dp_buffer[i].min_cost = 0; int last_at_this_depth = n; for(int depth = 1; depth < n; ++depth) @@ -240,8 +244,8 @@ void HTMLRenderer::LineBuffer::optimize_states (void) while(depth > 0) { int last_child = dp_buffer[depth][idx].last_child; - assert(last_child > idx); - func(last_child, depth - last_child, tree_depth + 1); + assert((last_child > idx) && (last_child <= idx + depth)); + func(last_child, idx + depth - last_child, tree_depth + 1); depth = last_child - idx - 1; } }; @@ -310,7 +314,7 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const /* * A quick check based on hash_value * it could be wrong when there are more then 256 classes, - * in which case the output may not be optimal, but still 'correct' + * in which case the output may not be optimal, but still 'correct' in terms of HTML */ if(hash_value == s.hash_value) return 0; diff --git a/src/Param.h b/src/Param.h index 136d6d6..13754fb 100644 --- a/src/Param.h +++ b/src/Param.h @@ -36,6 +36,7 @@ struct Param double space_threshold; double font_size_multiplier; int always_apply_tounicode; + int optimize; std::string font_suffix, font_format; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 4bc971f..3132681 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ 
-84,6 +84,7 @@ po::variables_map parse_options (int argc, char **argv) ("space-threshold", po::value(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character") ("font-size-multiplier", po::value(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy") ("always-apply-tounicode", po::value(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on") + ("optimize", po::value(&param.optimize)->default_value(0), "Optimize HTML, might be very slow") ("font-suffix", po::value(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files") ("font-format", po::value(&param.font_format)->default_value("truetype"), "format for extracted font files")