mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 04:50:09 +00:00
optimize states in HTML
This commit is contained in:
parent
6079a05e34
commit
bbe9b99b4e
@ -268,8 +268,10 @@ class HTMLRenderer : public OutputDev
|
||||
|
||||
class State {
|
||||
public:
|
||||
void begin(std::ostream & out) const;
|
||||
static void end(std::ostream & out);
|
||||
void begin(std::ostream & out, const State * prev_state);
|
||||
void end(std::ostream & out) const;
|
||||
void hash(void);
|
||||
int diff(const State & s) const;
|
||||
|
||||
enum {
|
||||
FONT_ID,
|
||||
@ -283,8 +285,16 @@ class HTMLRenderer : public OutputDev
|
||||
};
|
||||
|
||||
long long ids[ID_COUNT];
|
||||
|
||||
double ascent;
|
||||
double descent;
|
||||
double draw_font_size;
|
||||
|
||||
size_t start_idx; // index of the first Text using this state
|
||||
// for optimization
|
||||
long long hash_value;
|
||||
int depth; // the depth in the state tree
|
||||
bool need_close;
|
||||
|
||||
static const char * format_str; // class names for each id
|
||||
};
|
||||
@ -305,6 +315,8 @@ class HTMLRenderer : public OutputDev
|
||||
private:
|
||||
// retrieve state from renderer
|
||||
void set_state(State & state);
|
||||
// build the state tree in order to minimize the size of output
|
||||
void optimize_states(void);
|
||||
|
||||
HTMLRenderer * renderer;
|
||||
|
||||
@ -328,14 +340,10 @@ class HTMLRenderer : public OutputDev
|
||||
|
||||
std::unordered_map<long long, FontInfo> font_name_map;
|
||||
std::map<double, long long> font_size_map;
|
||||
|
||||
std::map<TM, long long> transform_matrix_map;
|
||||
|
||||
std::map<double, long long> letter_space_map;
|
||||
std::map<double, long long> word_space_map;
|
||||
|
||||
std::map<GfxRGB, long long> color_map;
|
||||
|
||||
std::map<double, long long> whitespace_map;
|
||||
std::map<double, long long> rise_map;
|
||||
|
||||
|
@ -7,13 +7,17 @@
|
||||
* 2012.09.04
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <stack>
|
||||
|
||||
#include "HTMLRenderer.h"
|
||||
#include "HTMLRenderer/namespace.h"
|
||||
|
||||
using std::min;
|
||||
using std::max;
|
||||
using std::hex;
|
||||
using std::dec;
|
||||
using std::vector;
|
||||
using std::stack;
|
||||
using std::function;
|
||||
|
||||
void HTMLRenderer::LineBuffer::reset(GfxState * state)
|
||||
{
|
||||
@ -59,15 +63,28 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
return;
|
||||
}
|
||||
|
||||
for(auto & s : states)
|
||||
s.hash();
|
||||
|
||||
if(states.size() < 3)
|
||||
{
|
||||
for(size_t i = 0; i < states.size(); ++i)
|
||||
states[i].depth = i;
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_states();
|
||||
}
|
||||
|
||||
states.resize(states.size() + 1);
|
||||
states.back().start_idx = text.size();
|
||||
states.back().depth = 0;
|
||||
|
||||
offsets.push_back({text.size(), 0});
|
||||
|
||||
// TODO: optimize state
|
||||
double max_ascent = 0;
|
||||
for(const State & s : states)
|
||||
max_ascent = max(max_ascent, s.ascent);
|
||||
max_ascent = max(max_ascent, s.ascent * s.draw_font_size);
|
||||
|
||||
// TODO: class for height ?
|
||||
ostream & out = renderer->html_fout;
|
||||
@ -80,17 +97,30 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
auto cur_state_iter = states.begin();
|
||||
auto cur_offset_iter = offsets.begin();
|
||||
|
||||
//accumulated horizontal offset;
|
||||
double dx = 0;
|
||||
|
||||
stack<State*> stack;
|
||||
stack.push(nullptr);
|
||||
int last_depth = -1;
|
||||
|
||||
size_t cur_text_idx = 0;
|
||||
while(cur_text_idx < text.size())
|
||||
{
|
||||
if(cur_text_idx >= cur_state_iter->start_idx)
|
||||
{
|
||||
if(cur_text_idx)
|
||||
State::end(out);
|
||||
int depth = cur_state_iter -> depth;
|
||||
int cnt = last_depth + 1 - depth;
|
||||
assert(cnt >= 0);
|
||||
while(cnt--)
|
||||
{
|
||||
stack.top()->end(out);
|
||||
stack.pop();
|
||||
}
|
||||
|
||||
cur_state_iter->begin(out);
|
||||
cur_state_iter->begin(out, stack.top());
|
||||
stack.push(&*cur_state_iter);
|
||||
last_depth = depth;
|
||||
|
||||
++ cur_state_iter;
|
||||
}
|
||||
@ -102,9 +132,7 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
|
||||
auto wid = renderer->install_whitespace(target, w);
|
||||
|
||||
// TODO
|
||||
// double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold);
|
||||
double threshold = 0;
|
||||
double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
|
||||
out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
|
||||
|
||||
dx = target - w;
|
||||
@ -117,7 +145,13 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
cur_text_idx = next_text_idx;
|
||||
}
|
||||
|
||||
State::end(out);
|
||||
// we have a nullptr in the bottom
|
||||
while(stack.top())
|
||||
{
|
||||
stack.top()->end(out);
|
||||
stack.pop();
|
||||
}
|
||||
|
||||
out << "</div>";
|
||||
|
||||
|
||||
@ -136,23 +170,155 @@ void HTMLRenderer::LineBuffer::set_state (State & state)
|
||||
state.ids[State::WORD_SPACE_ID] = renderer->cur_ws_id;
|
||||
state.ids[State::RISE_ID] = renderer->cur_rise_id;
|
||||
|
||||
state.ascent = renderer->cur_font_info->ascent * renderer->draw_font_size;
|
||||
const FontInfo * info = renderer->cur_font_info;
|
||||
state.ascent = info->ascent;
|
||||
state.descent = info->descent;
|
||||
state.draw_font_size = renderer->draw_font_size;
|
||||
}
|
||||
|
||||
/*
 * One cell of the dynamic-programming table used by
 * HTMLRenderer::LineBuffer::optimize_states.
 *
 * The bodiless header of the old one-argument State::begin that used to sit
 * directly in front of this class has been dropped: that overload has no
 * definition any more, and the dangling declarator made the class ill-formed.
 */
struct DPBufferEntry
{
    int last_child; // index of the last direct child chosen for this cell's root
    int min_cost;   // cheapest total cost found for this cell
};

// Scratch storage for optimize_states, kept at file scope so that the
// allocation is reused from one call to the next.
// flattened_dp_buffer owns the cells; dp_buffer[d] points at the first cell of row d.
static std::vector<DPBufferEntry> flattened_dp_buffer;
static std::vector<DPBufferEntry*> dp_buffer;
|
||||
|
||||
void HTMLRenderer::LineBuffer::optimize_states (void)
|
||||
{
|
||||
int n = states.size();
|
||||
|
||||
flattened_dp_buffer.resize(n*(n+1)/2);
|
||||
dp_buffer.resize(n);
|
||||
|
||||
{
|
||||
int incre = n;
|
||||
auto iter = dp_buffer.begin();
|
||||
DPBufferEntry * p = flattened_dp_buffer.data();
|
||||
while(incre > 0)
|
||||
{
|
||||
*(iter++) = p;
|
||||
p += (incre--);
|
||||
}
|
||||
}
|
||||
|
||||
int last_at_this_depth = n;
|
||||
for(int depth = 1; depth < n; ++depth)
|
||||
{
|
||||
--last_at_this_depth;
|
||||
for(int i = 0; i < last_at_this_depth; ++i)
|
||||
{
|
||||
//determine dp_buffer[depth][i]
|
||||
int best_last_child = i+1;
|
||||
int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost;
|
||||
// at depth, we consider [i+1, i+depth+1) as possible children of i
|
||||
for(int j = 2; j <= depth; ++j)
|
||||
{
|
||||
int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost;
|
||||
// avoid calling diff() when possible
|
||||
if (cost >= best_min_cost) continue;
|
||||
|
||||
cost += states[i].diff(states[i+j]);
|
||||
|
||||
if(cost < best_min_cost)
|
||||
{
|
||||
best_last_child = i+j;
|
||||
best_min_cost = cost;
|
||||
}
|
||||
}
|
||||
|
||||
dp_buffer[depth][i] = {best_last_child, best_min_cost};
|
||||
}
|
||||
}
|
||||
|
||||
// now fill in the depths
|
||||
// use recursion for now, until someone finds a PDF that would causes this overflow
|
||||
function<void(int,int,int)> func = [&](int idx, int depth, int tree_depth) -> void {
|
||||
states[idx].depth = tree_depth;
|
||||
while(depth > 0)
|
||||
{
|
||||
int last_child = dp_buffer[depth][idx].last_child;
|
||||
assert(last_child > idx);
|
||||
func(last_child, depth - last_child, tree_depth + 1);
|
||||
depth = last_child - idx - 1;
|
||||
}
|
||||
};
|
||||
|
||||
func(0, n-1, 0);
|
||||
}
|
||||
|
||||
/*
 * Open the <span> for this state.
 *
 * prev_state is the state this one is compared against (flush passes the top
 * of its state stack), or nullptr when there is none.
 * Only the ids that differ from prev_state are written as class names.
 * When every id matches, nothing is written and need_close is cleared,
 * so that the matching end() stays silent as well.
 */
void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state)
{
    if(prev_state && (prev_state->hash_value == hash_value))
    {
        // equal hashes do not prove equal ids, so check ids again
        int i = 0;
        while((i < ID_COUNT) && (ids[i] == prev_state->ids[i]))
            ++i;

        if(i == ID_COUNT)
        {
            need_close = false;
            return;
        }
    }

    need_close = true;

    out << "<span class=\"";

    // separate class names with exactly one space; ids that are skipped must
    // not contribute a separator, otherwise the attribute gets leading or
    // doubled spaces
    bool first = true;
    for(int i = 0; i < ID_COUNT; ++i)
    {
        if(prev_state && (prev_state->ids[i] == ids[i]))
            continue;

        if(first)
            first = false;
        else
            out << ' ';

        // class name = prefix character of this id followed by the id in hex
        out << format("%1%%|2$x|") % format_str[i] % ids[i];
    }

    out << "\">";
}
|
||||
|
||||
/*
 * Close the <span> opened by begin().
 * Writes nothing when begin() decided that no <span> was needed
 * (need_close is false). Reads per-state data, hence a const member
 * function rather than a static one.
 */
void HTMLRenderer::LineBuffer::State::end(ostream & out) const
{
    if(need_close)
        out << "</span>";
}
|
||||
|
||||
void HTMLRenderer::LineBuffer::State::hash(void)
|
||||
{
|
||||
hash_value = 0;
|
||||
for(int i = 0; i < ID_COUNT; ++i)
|
||||
{
|
||||
hash_value = (hash_value << 8) | (ids[i] & 0xff);
|
||||
}
|
||||
}
|
||||
|
||||
int HTMLRenderer::LineBuffer::State::diff(const State & s) const
|
||||
{
|
||||
/*
|
||||
* A quick check based on hash_value
|
||||
* it could be wrong when there are more then 256 classes,
|
||||
* in which case the output may not be optimal, but still 'correct'
|
||||
*/
|
||||
if(hash_value == s.hash_value) return 0;
|
||||
|
||||
int d = 0;
|
||||
for(int i = 0; i < ID_COUNT; ++i)
|
||||
if(ids[i] != s.ids[i])
|
||||
++ d;
|
||||
return d;
|
||||
}
|
||||
|
||||
// Class-name prefixes, one character per entry of ids[]:
// State::begin writes format_str[i] followed by ids[i] in hex for each id it emits.
// NOTE(review): presumably f/s/c/l/w/r stand for font, size, color, letter space,
// word space and rise - confirm against the order of the id enum.
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";
|
||||
|
Loading…
Reference in New Issue
Block a user