diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 54c4766..4f33b1d 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -268,8 +268,10 @@ class HTMLRenderer : public OutputDev class State { public: - void begin(std::ostream & out) const; - static void end(std::ostream & out); + void begin(std::ostream & out, const State * prev_state); + void end(std::ostream & out) const; + void hash(void); + int diff(const State & s) const; enum { FONT_ID, @@ -283,8 +285,16 @@ class HTMLRenderer : public OutputDev }; long long ids[ID_COUNT]; + double ascent; + double descent; + double draw_font_size; + size_t start_idx; // index of the first Text using this state + // for optimzation + long long hash_value; + int depth; // the depth in the state tree + bool need_close; static const char * format_str; // class names for each id }; @@ -305,6 +315,8 @@ class HTMLRenderer : public OutputDev private: // retrieve state from renderer void set_state(State & state); + // build the state tree in order to minimize the size of output + void optimize_states(void); HTMLRenderer * renderer; @@ -328,14 +340,10 @@ class HTMLRenderer : public OutputDev std::unordered_map font_name_map; std::map font_size_map; - std::map transform_matrix_map; - std::map letter_space_map; std::map word_space_map; - std::map color_map; - std::map whitespace_map; std::map rise_map; diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index 3cc74cc..2a8acb9 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -7,13 +7,17 @@ * 2012.09.04 */ +#include +#include + #include "HTMLRenderer.h" #include "HTMLRenderer/namespace.h" using std::min; using std::max; -using std::hex; -using std::dec; +using std::vector; +using std::stack; +using std::function; void HTMLRenderer::LineBuffer::reset(GfxState * state) { @@ -59,15 +63,28 @@ void HTMLRenderer::LineBuffer::flush(void) return; } + for(auto & s : states) + s.hash(); + + if(states.size() < 3) + { + for(size_t i = 0; i < states.size(); ++i) + states[i].depth = i; + } + else + { + optimize_states(); + } + states.resize(states.size() + 1); states.back().start_idx = text.size(); + states.back().depth = 0; offsets.push_back({text.size(), 0}); - // TODO: optimize state double max_ascent = 0; for(const State & s : states) - max_ascent = max(max_ascent, s.ascent); + max_ascent = max(max_ascent, s.ascent * s.draw_font_size); // TODO: class for height ? ostream & out = renderer->html_fout; @@ -80,17 +97,30 @@ void HTMLRenderer::LineBuffer::flush(void) auto cur_state_iter = states.begin(); auto cur_offset_iter = offsets.begin(); + //accumulated horizontal offset; double dx = 0; + stack stack; + stack.push(nullptr); + int last_depth = -1; + size_t cur_text_idx = 0; while(cur_text_idx < text.size()) { if(cur_text_idx >= cur_state_iter->start_idx) { - if(cur_text_idx) - State::end(out); + int depth = cur_state_iter -> depth; + int cnt = last_depth + 1 - depth; + assert(cnt >= 0); + while(cnt--) + { + stack.top()->end(out); + stack.pop(); + } - cur_state_iter->begin(out); + cur_state_iter->begin(out, stack.top()); + stack.push(&*cur_state_iter); + last_depth = depth; ++ cur_state_iter; } @@ -102,9 +132,7 @@ void HTMLRenderer::LineBuffer::flush(void) auto wid = renderer->install_whitespace(target, w); - // TODO -// double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold); - double threshold = 0; + double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold); out << format("%2%") % wid % (target > (threshold - EPS) ? " " : ""); dx = target - w; @@ -117,7 +145,13 @@ void HTMLRenderer::LineBuffer::flush(void) cur_text_idx = next_text_idx; } - State::end(out); + // we have a nullptr in the bottom + while(stack.top()) + { + stack.top()->end(out); + stack.pop(); + } + out << ""; @@ -136,23 +170,155 @@ void HTMLRenderer::LineBuffer::set_state (State & state) state.ids[State::WORD_SPACE_ID] = renderer->cur_ws_id; state.ids[State::RISE_ID] = renderer->cur_rise_id; - state.ascent = renderer->cur_font_info->ascent * renderer->draw_font_size; + const FontInfo * info = renderer->cur_font_info; + state.ascent = info->ascent; + state.descent = info->descent; + state.draw_font_size = renderer->draw_font_size; } -void HTMLRenderer::LineBuffer::State::begin (ostream & out) const +class DPBufferEntry { +public: + int last_child; + int min_cost; +}; + +static vector flattened_dp_buffer; +static vector dp_buffer; + +void HTMLRenderer::LineBuffer::optimize_states (void) +{ + int n = states.size(); + + flattened_dp_buffer.resize(n*(n+1)/2); + dp_buffer.resize(n); + + { + int incre = n; + auto iter = dp_buffer.begin(); + DPBufferEntry * p = flattened_dp_buffer.data(); + while(incre > 0) + { + *(iter++) = p; + p += (incre--); + } + } + + int last_at_this_depth = n; + for(int depth = 1; depth < n; ++depth) + { + --last_at_this_depth; + for(int i = 0; i < last_at_this_depth; ++i) + { + //determine dp_buffer[depth][i] + int best_last_child = i+1; + int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost; + // at depth, we consider [i+1, i+depth+1) as possible children of i + for(int j = 2; j <= depth; ++j) + { + int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost; + // avoid calling diff() when possible + if (cost >= best_min_cost) continue; + + cost += states[i].diff(states[i+j]); + + if(cost < best_min_cost) + { + best_last_child = i+j; + best_min_cost = cost; + } + } + + dp_buffer[depth][i] = {best_last_child, best_min_cost}; + } + } + + // now fill in the depths + // use recursion for now, until someone finds a PDF that would causes this overflow + function func = [&](int idx, int depth, int tree_depth) -> void { + states[idx].depth = tree_depth; + while(depth > 0) + { + int last_child = dp_buffer[depth][idx].last_child; + assert(last_child > idx); + func(last_child, depth - last_child, tree_depth + 1); + depth = last_child - idx - 1; + } + }; + + func(0, n-1, 0); +} + +void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state) +{ + if(prev_state && (prev_state->hash_value == hash_value)) + { + // check ids again + int i; + for(i = 0; i < ID_COUNT; ++i) + if(ids[i] != prev_state->ids[i]) + break; + + if(i == ID_COUNT) + { + need_close = false; + return; + } + } + + need_close = true; + out << " 0) out << ' '; + if(prev_state && (prev_state->ids[i] == ids[i])) + continue; + + if(first) + { + first = false; + } + else + { + out << ' '; + } + out << format("%1%%|2$x|") % format_str[i] % ids[i]; } + out << "\">"; } -void HTMLRenderer::LineBuffer::State::end(ostream & out) +void HTMLRenderer::LineBuffer::State::end(ostream & out) const { - out << ""; + if(need_close) + out << ""; +} + +void HTMLRenderer::LineBuffer::State::hash(void) +{ + hash_value = 0; + for(int i = 0; i < ID_COUNT; ++i) + { + hash_value = (hash_value << 8) | (ids[i] & 0xff); + } +} + +int HTMLRenderer::LineBuffer::State::diff(const State & s) const +{ + /* + * A quick check based on hash_value + * it could be wrong when there are more then 256 classes, + * in which case the output may not be optimal, but still 'correct' + */ + if(hash_value == s.hash_value) return 0; + + int d = 0; + for(int i = 0; i < ID_COUNT; ++i) + if(ids[i] != s.ids[i]) + ++ d; + return d; } const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";