diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h
index 54c4766..4f33b1d 100644
--- a/src/HTMLRenderer.h
+++ b/src/HTMLRenderer.h
@@ -268,8 +268,10 @@ class HTMLRenderer : public OutputDev
class State {
public:
- void begin(std::ostream & out) const;
- static void end(std::ostream & out);
+ void begin(std::ostream & out, const State * prev_state);
+ void end(std::ostream & out) const;
+ void hash(void);
+ int diff(const State & s) const;
enum {
FONT_ID,
@@ -283,8 +285,16 @@ class HTMLRenderer : public OutputDev
};
long long ids[ID_COUNT];
+
double ascent;
+ double descent;
+ double draw_font_size;
+
size_t start_idx; // index of the first Text using this state
+ // for optimization
+ long long hash_value;
+ int depth; // the depth in the state tree
+ bool need_close;
static const char * format_str; // class names for each id
};
@@ -305,6 +315,8 @@ class HTMLRenderer : public OutputDev
private:
// retrieve state from renderer
void set_state(State & state);
+ // build the state tree in order to minimize the size of output
+ void optimize_states(void);
HTMLRenderer * renderer;
@@ -328,14 +340,10 @@ class HTMLRenderer : public OutputDev
std::unordered_map font_name_map;
std::map font_size_map;
-
std::map transform_matrix_map;
-
std::map letter_space_map;
std::map word_space_map;
-
std::map color_map;
-
std::map whitespace_map;
std::map rise_map;
diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc
index 3cc74cc..2a8acb9 100644
--- a/src/HTMLRenderer/LineBuffer.cc
+++ b/src/HTMLRenderer/LineBuffer.cc
@@ -7,13 +7,17 @@
* 2012.09.04
*/
+#include
+#include
+
#include "HTMLRenderer.h"
#include "HTMLRenderer/namespace.h"
using std::min;
using std::max;
-using std::hex;
-using std::dec;
+using std::vector;
+using std::stack;
+using std::function;
void HTMLRenderer::LineBuffer::reset(GfxState * state)
{
@@ -59,15 +63,28 @@ void HTMLRenderer::LineBuffer::flush(void)
return;
}
+ for(auto & s : states)
+ s.hash();
+
+ if(states.size() < 3)
+ {
+ for(size_t i = 0; i < states.size(); ++i)
+ states[i].depth = i;
+ }
+ else
+ {
+ optimize_states();
+ }
+
states.resize(states.size() + 1);
states.back().start_idx = text.size();
+ states.back().depth = 0;
offsets.push_back({text.size(), 0});
- // TODO: optimize state
double max_ascent = 0;
for(const State & s : states)
- max_ascent = max(max_ascent, s.ascent);
+ max_ascent = max(max_ascent, s.ascent * s.draw_font_size);
// TODO: class for height ?
ostream & out = renderer->html_fout;
@@ -80,17 +97,30 @@ void HTMLRenderer::LineBuffer::flush(void)
auto cur_state_iter = states.begin();
auto cur_offset_iter = offsets.begin();
+ //accumulated horizontal offset;
double dx = 0;
+ stack stack;
+ stack.push(nullptr);
+ int last_depth = -1;
+
size_t cur_text_idx = 0;
while(cur_text_idx < text.size())
{
if(cur_text_idx >= cur_state_iter->start_idx)
{
- if(cur_text_idx)
- State::end(out);
+ int depth = cur_state_iter -> depth;
+ int cnt = last_depth + 1 - depth;
+ assert(cnt >= 0);
+ while(cnt--)
+ {
+ stack.top()->end(out);
+ stack.pop();
+ }
- cur_state_iter->begin(out);
+ cur_state_iter->begin(out, stack.top());
+ stack.push(&*cur_state_iter);
+ last_depth = depth;
++ cur_state_iter;
}
@@ -102,9 +132,7 @@ void HTMLRenderer::LineBuffer::flush(void)
auto wid = renderer->install_whitespace(target, w);
- // TODO
-// double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold);
- double threshold = 0;
+ double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
out << format("%2%") % wid % (target > (threshold - EPS) ? " " : "");
dx = target - w;
@@ -117,7 +145,13 @@ void HTMLRenderer::LineBuffer::flush(void)
cur_text_idx = next_text_idx;
}
- State::end(out);
+ // we have a nullptr in the bottom
+ while(stack.top())
+ {
+ stack.top()->end(out);
+ stack.pop();
+ }
+
out << "";
@@ -136,23 +170,155 @@ void HTMLRenderer::LineBuffer::set_state (State & state)
state.ids[State::WORD_SPACE_ID] = renderer->cur_ws_id;
state.ids[State::RISE_ID] = renderer->cur_rise_id;
- state.ascent = renderer->cur_font_info->ascent * renderer->draw_font_size;
+ const FontInfo * info = renderer->cur_font_info;
+ state.ascent = info->ascent;
+ state.descent = info->descent;
+ state.draw_font_size = renderer->draw_font_size;
}
-void HTMLRenderer::LineBuffer::State::begin (ostream & out) const
+class DPBufferEntry // one cell of the dynamic-programming table filled by optimize_states()
{
+public:
+ int last_child; // receives best_last_child: index of the last child state picked for this cell
+ int min_cost; // receives best_min_cost: smallest accumulated State::diff() cost found for this cell
+};
+
+// Scratch buffers for optimize_states(); file-scope statics, so their storage
+// is reused from one call to the next.
+// flattened_dp_buffer stores the triangular DP table row after row,
+// dp_buffer[d] points to the first entry of row d inside it.
+static vector<DPBufferEntry> flattened_dp_buffer;
+static vector<DPBufferEntry*> dp_buffer;
+
+/*
+ * Build the state tree in order to minimize the size of output.
+ *
+ * dp_buffer[d][i] describes the sub-tree rooted at states[i] that covers
+ * states[i .. i+d]:
+ *   min_cost   - smallest sum of diff() between each state of the span and its parent
+ *   last_child - index of the last direct child of states[i] in that best tree
+ *
+ * On return, states[k].depth holds the depth of states[k] in the best tree
+ * rooted at states[0].
+ */
+void HTMLRenderer::LineBuffer::optimize_states (void)
+{
+    const int n = static_cast<int>(states.size());
+    if(n == 0)
+        return;
+
+    flattened_dp_buffer.resize(static_cast<size_t>(n) * (n + 1) / 2);
+    dp_buffer.resize(n);
+
+    // lay out the rows: row d has (n - d) entries
+    {
+        DPBufferEntry * p = flattened_dp_buffer.data();
+        int row_len = n;
+        for(auto & row : dp_buffer)
+        {
+            row = p;
+            p += (row_len--);
+        }
+    }
+
+    // The buffers still contain whatever the previous call left in them
+    // (resize() only initializes newly appended entries), so the base row
+    // has to be reset: a single state has no child and costs nothing.
+    for(int i = 0; i < n; ++i)
+        dp_buffer[0][i] = {-1, 0};
+
+    int last_at_this_depth = n;
+    for(int depth = 1; depth < n; ++depth)
+    {
+        --last_at_this_depth;
+        for(int i = 0; i < last_at_this_depth; ++i)
+        {
+            // determine dp_buffer[depth][i]
+            // start with i+1 as the only child of i
+            int best_last_child = i+1;
+            int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost;
+            // at depth, we consider [i+1, i+depth+1) as possible children of i
+            for(int j = 2; j <= depth; ++j)
+            {
+                // i keeps [i, i+j) for its earlier children, i+j roots the rest
+                int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost;
+                // avoid calling diff() when possible
+                if (cost >= best_min_cost) continue;
+
+                cost += states[i].diff(states[i+j]);
+
+                if(cost < best_min_cost)
+                {
+                    best_last_child = i+j;
+                    best_min_cost = cost;
+                }
+            }
+
+            dp_buffer[depth][i] = {best_last_child, best_min_cost};
+        }
+    }
+
+    // now fill in the depths
+    // use recursion for now, until someone finds a PDF that would cause this to overflow
+    function<void(int, int, int)> func = [&](int idx, int depth, int tree_depth) -> void {
+        states[idx].depth = tree_depth;
+        while(depth > 0)
+        {
+            int last_child = dp_buffer[depth][idx].last_child;
+            assert(last_child > idx);
+            assert(last_child <= idx + depth);
+            // the last child roots [last_child, idx+depth]; its span is
+            // relative to idx, not to the beginning of the line
+            func(last_child, idx + depth - last_child, tree_depth + 1);
+            // what is left for idx is [idx, last_child)
+            depth = last_child - idx - 1;
+        }
+    };
+
+    func(0, n-1, 0);
+}
+
+void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state)
+{
+ if(prev_state && (prev_state->hash_value == hash_value))
+ {
+ // check ids again
+ int i;
+ for(i = 0; i < ID_COUNT; ++i)
+ if(ids[i] != prev_state->ids[i])
+ break;
+
+ if(i == ID_COUNT)
+ {
+ need_close = false;
+ return;
+ }
+ }
+
+ need_close = true;
+
out << " 0) out << ' ';
+ if(prev_state && (prev_state->ids[i] == ids[i]))
+ continue;
+
+ if(first)
+ {
+ first = false;
+ }
+ else
+ {
+ out << ' ';
+ }
+
out << format("%1%%|2$x|") % format_str[i] % ids[i];
}
+
out << "\">";
}
-void HTMLRenderer::LineBuffer::State::end(ostream & out)
+void HTMLRenderer::LineBuffer::State::end(ostream & out) const
{
- out << "";
+ if(need_close)
+ out << "";
+}
+
+void HTMLRenderer::LineBuffer::State::hash(void)
+{
+    // Pack the low byte of every id into a single integer,
+    // earlier ids ending up in the more significant bytes.
+    long long packed = 0;
+    for(int k = 0; k != ID_COUNT; ++k)
+    {
+        const long long low_byte = ids[k] & 0xff;
+        packed = (packed << 8) | low_byte;
+    }
+    hash_value = packed;
+}
+
+int HTMLRenderer::LineBuffer::State::diff(const State & s) const
+{
+    /*
+     * Shortcut: equal hash values are reported as "no difference".
+     * The hash only keeps the low byte of each id, so with more than
+     * 256 classes two different states can collide here; the output
+     * may then not be optimal, but it is still 'correct'.
+     */
+    if(hash_value == s.hash_value)
+        return 0;
+
+    // otherwise count the ids that differ
+    int mismatches = 0;
+    for(int k = 0; k != ID_COUNT; ++k)
+    {
+        if(ids[k] == s.ids[k])
+            continue;
+        ++ mismatches;
+    }
+    return mismatches;
}
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";