removed 'optimize' option, use greedy method automatically

2024-12-22 13:00:08 +00:00 · 2012-09-06 14:37:09 +08:00 · 2012-09-06 14:37:09 +08:00 · e191f8127d
commit e191f8127d
parent 95331d061b
6 changed files with 50 additions and 134 deletions
--- a/pdf2htmlEX.1
+++ b/pdf2htmlEX.1
@ -89,9 +89,6 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o
 However often there is better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of ToUnicode map. This behavior may be changed by turning on this switch.
 .TP
 .B --optimize <0|1> (Default: 0)
 Try to optimize the output HTML file, might be slow.
 .TP
 .B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
 Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
 .TP
--- a/src/HTMLRenderer.h
+++ b/src/HTMLRenderer.h
@ -293,7 +293,6 @@ class HTMLRenderer : public OutputDev
                size_t start_idx; // index of the first Text using this state
                // for optimization
                long long hash_value;
                int depth; // the depth in the state tree
                bool need_close;
                static const char * format_str; // class names for each id
@ -315,8 +314,6 @@ class HTMLRenderer : public OutputDev
        private:
            // retrieve state from renderer
            void set_state(State & state);
            // build the state tree in order to minimize the size of output
            void optimize_states(void);
            HTMLRenderer * renderer;
@ -327,6 +324,9 @@ class HTMLRenderer : public OutputDev
            std::vector<Offset> offsets;
            std::vector<Unicode> text;
            // for flush
            std::vector<State*> stack;
        } line_buf;
        friend class LineBuffer;
--- a/src/HTMLRenderer/LineBuffer.cc
+++ b/src/HTMLRenderer/LineBuffer.cc
@ -8,7 +8,6 @@
 */
 #include <vector>
 #include <stack>
 #include "HTMLRenderer.h"
 #include "HTMLRenderer/namespace.h"
@ -16,7 +15,6 @@
 using std::min;
 using std::max;
 using std::vector;
 using std::stack;
 using std::function;
 void HTMLRenderer::LineBuffer::reset(GfxState * state)
@ -66,19 +64,8 @@ void HTMLRenderer::LineBuffer::flush(void)
    for(auto & s : states)
        s.hash();
    if((renderer->param->optimize) && (states.size() > 2))
    {
        optimize_states();
    }
    else
    {
        for(size_t i = 0; i < states.size(); ++i)
            states[i].depth = i;
    }
    states.resize(states.size() + 1);
    states.back().start_idx = text.size();
    states.back().depth = 0;
    offsets.push_back({text.size(), 0});
@ -100,27 +87,44 @@ void HTMLRenderer::LineBuffer::flush(void)
    //accumulated horizontal offset;
    double dx = 0;
-    stack<State*> stack;
+    stack.clear();
-    stack.push(nullptr);
+    stack.push_back(nullptr);
-    int last_depth = -1;
+
    // whenever a negative offset appears, we should not pop out that <span>
    // otherwise the effect of negative margin-left would disappear
    size_t last_text_pos_with_negative_offset = -1;
    size_t cur_text_idx = 0;
    while(cur_text_idx < text.size())
    {
        if(cur_text_idx >= cur_state_iter->start_idx)
        {
-            int depth = cur_state_iter -> depth;
+            // greedy
-            int cnt = last_depth + 1 - depth;
+            int best_cost = State::ID_COUNT;
-            assert(cnt >= 0);
+            
-            while(cnt--)
+            // we have a nullptr at the beginning, so no need to check for rend
            for(auto iter = stack.rbegin(); *iter; ++iter)
            {
-                stack.top()->end(out);
+                int cost = cur_state_iter->diff(**iter);
-                stack.pop();
+                if(cost < best_cost)
-            }
+                {
                    while(stack.back() != *iter)
                    {
                        stack.back()->end(out);
                        stack.pop_back();
                    }
                    best_cost = cost;
-            cur_state_iter->begin(out, stack.top());
+                    if(best_cost == 0)
-            stack.push(&*cur_state_iter);
+                        break;
-            last_depth = depth;
+                }
                // cannot go further
                if((*iter)->start_idx <= last_text_pos_with_negative_offset)
                    break;
            }
            cur_state_iter->begin(out, stack.back());
            stack.push_back(&*cur_state_iter);
            ++ cur_state_iter;
        }
@ -132,6 +136,9 @@ void HTMLRenderer::LineBuffer::flush(void)
            auto wid = renderer->install_whitespace(target, w);
            if(w < 0)
                last_text_pos_with_negative_offset = cur_text_idx;
            double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
            out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
@ -146,10 +153,10 @@ void HTMLRenderer::LineBuffer::flush(void)
    }
    // we have a nullptr in the bottom
-    while(stack.top())
+    while(stack.back())
    {
-        stack.top()->end(out);
+        stack.back()->end(out);
-        stack.pop();
+        stack.pop_back();
    }
    out << "</div>";
@ -176,103 +183,8 @@ void HTMLRenderer::LineBuffer::set_state (State & state)
    state.draw_font_size = renderer->draw_font_size;
 }
 // One cell of the dynamic-programming table filled in by
 // HTMLRenderer::LineBuffer::optimize_states().
 // Kept as a plain aggregate so a cell can be written as {last_child, min_cost}.
 struct DPBufferEntry
 {
    // index (into the states array) of the last child chosen for this cell
    int last_child;
    // smallest accumulated cost found for this cell
    int min_cost;
 };
 // Scratch storage for optimize_states(): all DP cells in one contiguous array.
 // File-scope and only resize()d per call -- presumably so the allocation is
 // reused between lines (NOTE(review): confirm; this also makes the function
 // non-reentrant).
 static vector<DPBufferEntry> flattened_dp_buffer;
 // Row pointers into flattened_dp_buffer; dp_buffer[d] is the first cell of row d.
 static vector<DPBufferEntry*> dp_buffer;
 // Build a tree over `states` and store each state's level in states[i].depth.
 //
 // The tree is chosen by dynamic programming so that the sum of
 // State::diff() between every state and its parent is minimal; states[0] is
 // the root and pays no cost.
 //
 // Table layout: dp_buffer[d][i] describes the best subtree rooted at state i
 // that also contains the d states following it (i+1 .. i+d).
 //   - min_cost   : minimal total cost of that subtree
 //   - last_child : index of the last direct child of i in that subtree
 // Row d has n-d valid cells, so the whole table needs n*(n+1)/2 cells.
 //
 // The visible caller only invokes this when states.size() > 2; with an empty
 // `states` the final func(0, n-1, 0) would index states[0].
 void HTMLRenderer::LineBuffer::optimize_states (void)
 {
    int n = states.size();
    flattened_dp_buffer.resize(n*(n+1)/2);
    dp_buffer.resize(n);
    {
        // carve the flat array into rows of length n, n-1, ..., 1
        int incre = n;
        auto iter = dp_buffer.begin();
        DPBufferEntry * p = flattened_dp_buffer.data();
        while(incre > 0)
        {
            *(iter++) = p;
            p += (incre--);
        }
    }
    // depth 0: a state on its own has no children, hence no cost
    // (row 0 occupies the first n cells of the flat array; last_child is never
    // read for this row)
    for(int i = 0; i < n; ++i)
        flattened_dp_buffer[i].min_cost = 0;
    // number of valid cells in the row being filled (n - depth)
    int last_at_this_depth = n;
    for(int depth = 1; depth < n; ++depth)
    {
        --last_at_this_depth;
        for(int i = 0; i < last_at_this_depth; ++i)
        {
            // determine dp_buffer[depth][i]
            // baseline: i+1 is the only direct child of i and owns all the
            // remaining depth-1 states
            int best_last_child = i+1;   
            int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost;
            // at depth, we consider [i+1, i+depth+1) as possible children of i
            // candidate: i+j is the last direct child of i; the j-1 states
            // before it stay under i, the depth-j states after it go under i+j
            for(int j = 2; j <= depth; ++j)
            {
                int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost;
                // avoid calling diff() when possible
                // NOTE(review): this pruning is only exact if diff() never
                // returns a negative value -- confirm against State::diff
                if (cost >= best_min_cost) continue;
                cost += states[i].diff(states[i+j]);
                if(cost < best_min_cost)
                {
                    best_last_child = i+j;
                    best_min_cost = cost;
                }
            }
            dp_buffer[depth][i] = {best_last_child, best_min_cost};
        }
    }
    // now fill in the depths
    // use recursion for now, until someone finds a PDF that causes a stack overflow here
    // func(idx, depth, tree_depth): state idx sits at level tree_depth and its
    // subtree spans the `depth` states after it. Children are visited from the
    // last one backwards: recurse into the last child with the span it owns,
    // then shrink the span to the states that precede that child.
    function<void(int,int,int)> func = [&](int idx, int depth, int tree_depth) -> void {
        states[idx].depth = tree_depth;
        while(depth > 0)
        {
            int last_child = dp_buffer[depth][idx].last_child;
            assert((last_child > idx) && (last_child <= idx + depth));
            func(last_child, idx + depth - last_child, tree_depth + 1);
            depth = last_child - idx - 1;
        }
    };
    // the root is state 0 and its subtree covers all remaining n-1 states
    func(0, n-1, 0);
 }
 void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state)
 {
    if(prev_state && (prev_state->hash_value == hash_value))
    {
        // check ids again
        int i;
        for(i = 0; i < ID_COUNT; ++i)
            if(ids[i] != prev_state->ids[i])
                break;
        if(i == ID_COUNT)
        {
            need_close = false;
            return;
        }
    }
    need_close = true;
    out << "<span class=\"";
    bool first = true;
    for(int i = 0; i < ID_COUNT; ++i)
    {
@ -281,6 +193,7 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s
        if(first)
        { 
            out << "<span class=\"";
            first = false;
        }
        else
@ -291,7 +204,15 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s
        out << format("%1%%|2$x|") % format_str[i] % ids[i];
    }
-    out << "\">";
+    if(first)
    {
        need_close = false;
    }
    else
    {
        out << "\">";
        need_close = true;
    }
 }
 void HTMLRenderer::LineBuffer::State::end(ostream & out) const
--- a/src/Param.h
+++ b/src/Param.h
@ -36,7 +36,6 @@ struct Param
    double space_threshold;
    double font_size_multiplier;
    int always_apply_tounicode;
    int optimize;
    std::string font_suffix, font_format;
--- a/src/pdf2htmlEX.cc
+++ b/src/pdf2htmlEX.cc
@ -84,7 +84,6 @@ po::variables_map parse_options (int argc, char **argv)
        ("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character")
        ("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
        ("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
        ("optimize", po::value<int>(&param.optimize)->default_value(0), "Optimize HTML, might be very slow")
        ("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
        ("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")
--- a/test/test.py
+++ b/test/test.py
@ -12,7 +12,7 @@ for f in os.listdir(DIR):
    if not f.lower().endswith('.pdf'):
        continue
    print f
-    os.system('pdf2htmlEX --optimize 1 -l 10 --dest-dir html "%s/%s"' % (DIR,f))
+    os.system('pdf2htmlEX -l 10 --dest-dir html "%s/%s"' % (DIR,f))
    ff = f[:-3]+'html'
    outf.write('<a href="html/%s" target="pdf">%s</a><br/>' % (ff,ff))
    outf.flush();