add an option 'optimize'

2024-12-22 04:50:09 +00:00 · 2012-09-05 16:19:01 +08:00 · 2012-09-05 16:19:01 +08:00 · 2d558253a5
commit 2d558253a5
parent bbe9b99b4e
5 changed files with 17 additions and 8 deletions
--- a/2
+++ b/2
@ -6,7 +6,7 @@ option to break ligatures
 detect duplicate base fonts when embedding
-compress div/span states
+consider left-shift in optimization
 multiple charcode mapped to a same glyph
 re-encoded only used glyphs
--- a/pdf2htmlEX.1
+++ b/pdf2htmlEX.1
@ -89,6 +89,9 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o
 However, there is often better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of the ToUnicode map. This behavior may be changed by turning on this switch.
 .TP
 .B --optimize <0|1> (Default: 0)
 Try to optimize the output HTML file; this might be slow.
 .TP
 .B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
 Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
 .TP
--- a/src/HTMLRenderer/LineBuffer.cc
+++ b/src/HTMLRenderer/LineBuffer.cc
@ -66,14 +66,14 @@ void HTMLRenderer::LineBuffer::flush(void)
    for(auto & s : states)
        s.hash();
-    if(states.size() < 3)
+    if((renderer->param->optimize) && (states.size() > 2))
    {
-        for(size_t i = 0; i < states.size(); ++i)
+        optimize_states();
            states[i].depth = i;
    }
    else
    {
-        optimize_states();
+        for(size_t i = 0; i < states.size(); ++i)
            states[i].depth = i;
    }
    states.resize(states.size() + 1);
@ -203,6 +203,10 @@ void HTMLRenderer::LineBuffer::optimize_states (void)
            p += (incre--);
        }
    }
    // depth 0
    for(int i = 0; i < n; ++i)
        flattened_dp_buffer[i].min_cost = 0;
    int last_at_this_depth = n;
    for(int depth = 1; depth < n; ++depth)
@ -240,8 +244,8 @@ void HTMLRenderer::LineBuffer::optimize_states (void)
        while(depth > 0)
        {
            int last_child = dp_buffer[depth][idx].last_child;
-            assert(last_child > idx);
+            assert((last_child > idx) && (last_child <= idx + depth));
-            func(last_child, depth - last_child, tree_depth + 1);
+            func(last_child, idx + depth - last_child, tree_depth + 1);
            depth = last_child - idx - 1;
        }
    };
@ -310,7 +314,7 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const
    /*
     * A quick check based on hash_value
     * it could be wrong when there are more than 256 classes, 
-     * in which case the output may not be optimal, but still 'correct'
+     * in which case the output may not be optimal, but still 'correct' in terms of HTML
     */
    if(hash_value == s.hash_value) return 0;
--- a/src/Param.h
+++ b/src/Param.h
@ -36,6 +36,7 @@ struct Param
    double space_threshold;
    double font_size_multiplier;
    int always_apply_tounicode;
    int optimize;
    std::string font_suffix, font_format;
--- a/src/pdf2htmlEX.cc
+++ b/src/pdf2htmlEX.cc
@ -84,6 +84,7 @@ po::variables_map parse_options (int argc, char **argv)
        ("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character")
        ("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
        ("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
        ("optimize", po::value<int>(&param.optimize)->default_value(0), "Optimize HTML, might be very slow")
        ("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
        ("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")