1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-09-06 00:47:03 +00:00
pdf2htmlEX/src/HTMLRenderer/LineBuffer.cc

329 lines
8.2 KiB
C++
Raw Normal View History

2012-09-04 15:33:15 +00:00
/*
* LineBuffer.cc
*
* Generate optimized HTML for one line
*
* by WangLu
* 2012.09.04
*/
2012-09-05 07:13:21 +00:00
#include <vector>
#include <stack>
2012-09-04 15:33:15 +00:00
#include "HTMLRenderer.h"
#include "HTMLRenderer/namespace.h"
using std::min;
using std::max;
2012-09-05 07:13:21 +00:00
using std::vector;
using std::stack;
using std::function;
2012-09-04 15:33:15 +00:00
/*
 * Begin a new line at the current text position of `state`.
 *
 * The current point is passed through state->transform() and stored in x/y,
 * and the renderer's current tm id is remembered for this line.
 */
void HTMLRenderer::LineBuffer::reset(GfxState * state)
{
    const auto cur_x = state->getCurX();
    const auto cur_y = state->getCurY();
    state->transform(cur_x, cur_y, &x, &y);

    tm_id = renderer->cur_tm_id;
}
void HTMLRenderer::LineBuffer::append_unicodes(const Unicode * u, int l)
{
text.insert(text.end(), u, u+l);
}
/*
 * Record a horizontal offset of `width` at the current end of the text.
 *
 * Consecutive offsets at the same text position are merged into one entry
 * by summing their widths.
 */
void HTMLRenderer::LineBuffer::append_offset(double width)
{
    const bool extends_last = (!offsets.empty()) && (offsets.back().start_idx == text.size());
    if(!extends_last)
    {
        offsets.push_back({text.size(), width});
        return;
    }

    offsets.back().width += width;
}
/*
 * Snapshot the renderer's current style at the current end of the text.
 *
 * If the most recent state already starts at this text position, it is
 * overwritten instead of adding a new entry, so no two states share a start index.
 */
void HTMLRenderer::LineBuffer::append_state(void)
{
    const bool reuse_last = (!states.empty()) && (states.back().start_idx == text.size());
    if(!reuse_last)
    {
        states.resize(states.size() + 1);
        states.back().start_idx = text.size();
    }

    set_state(states.back());
}
void HTMLRenderer::LineBuffer::flush(void)
{
/*
* Each Line is an independent absolute positioined block
* so even we have a few states or offsets, we may omit them
*/
if(text.empty()) return;
if(states.empty() || (states[0].start_idx != 0))
{
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
return;
}
2012-09-05 07:13:21 +00:00
for(auto & s : states)
s.hash();
2012-09-05 08:19:01 +00:00
if((renderer->param->optimize) && (states.size() > 2))
2012-09-05 07:13:21 +00:00
{
2012-09-05 08:19:01 +00:00
optimize_states();
2012-09-05 07:13:21 +00:00
}
else
{
2012-09-05 08:19:01 +00:00
for(size_t i = 0; i < states.size(); ++i)
states[i].depth = i;
2012-09-05 07:13:21 +00:00
}
2012-09-04 15:33:15 +00:00
states.resize(states.size() + 1);
states.back().start_idx = text.size();
2012-09-05 07:13:21 +00:00
states.back().depth = 0;
2012-09-04 15:33:15 +00:00
offsets.push_back({text.size(), 0});
double max_ascent = 0;
for(const State & s : states)
2012-09-05 07:13:21 +00:00
max_ascent = max(max_ascent, s.ascent * s.draw_font_size);
2012-09-04 15:33:15 +00:00
// TODO: class for height ?
ostream & out = renderer->html_fout;
out << format("<div style=\"left:%1%px;bottom:%2%px;height:%3%px;\" class=\"l t%|4$x|\">")
% x % y
% max_ascent
% tm_id
;
auto cur_state_iter = states.begin();
auto cur_offset_iter = offsets.begin();
2012-09-05 07:13:21 +00:00
//accumulated horizontal offset;
2012-09-04 15:33:15 +00:00
double dx = 0;
2012-09-05 07:13:21 +00:00
stack<State*> stack;
stack.push(nullptr);
int last_depth = -1;
2012-09-04 15:33:15 +00:00
size_t cur_text_idx = 0;
while(cur_text_idx < text.size())
{
if(cur_text_idx >= cur_state_iter->start_idx)
{
2012-09-05 07:13:21 +00:00
int depth = cur_state_iter -> depth;
int cnt = last_depth + 1 - depth;
assert(cnt >= 0);
while(cnt--)
{
stack.top()->end(out);
stack.pop();
}
cur_state_iter->begin(out, stack.top());
stack.push(&*cur_state_iter);
last_depth = depth;
2012-09-04 15:33:15 +00:00
++ cur_state_iter;
}
if(cur_text_idx >= cur_offset_iter->start_idx)
{
double target = cur_offset_iter->width + dx;
double w;
auto wid = renderer->install_whitespace(target, w);
2012-09-05 07:13:21 +00:00
double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
2012-09-04 15:33:15 +00:00
out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
dx = target - w;
++ cur_offset_iter;
}
size_t next_text_idx = min(cur_state_iter->start_idx, cur_offset_iter->start_idx);
outputUnicodes(out, text.data() + cur_text_idx, next_text_idx - cur_text_idx);
cur_text_idx = next_text_idx;
}
2012-09-05 07:13:21 +00:00
// we have a nullptr in the bottom
while(stack.top())
{
stack.top()->end(out);
stack.pop();
}
2012-09-04 15:33:15 +00:00
out << "</div>";
states.clear();
offsets.clear();
text.clear();
}
/*
 * Copy the renderer's current style into `state`:
 * the six class ids plus the font metrics and the draw font size.
 * start_idx and depth are left untouched.
 */
void HTMLRenderer::LineBuffer::set_state (State & state)
{
    const FontInfo * info = renderer->cur_font_info;

    // font metrics
    state.ascent = info->ascent;
    state.descent = info->descent;
    state.draw_font_size = renderer->draw_font_size;

    // class ids
    state.ids[State::FONT_ID] = info->id;
    state.ids[State::FONT_SIZE_ID] = renderer->cur_fs_id;
    state.ids[State::COLOR_ID] = renderer->cur_color_id;
    state.ids[State::LETTER_SPACE_ID] = renderer->cur_ls_id;
    state.ids[State::WORD_SPACE_ID] = renderer->cur_ws_id;
    state.ids[State::RISE_ID] = renderer->cur_rise_id;
}
2012-09-05 07:13:21 +00:00
/*
 * One cell of the dynamic-programming table used by optimize_states()
 */
struct DPBufferEntry
{
    // index of the last (right-most) child chosen for this subtree
    int last_child;
    // cheapest total cost found for this subtree
    int min_cost;
};
// Backing storage for the triangular DP table of optimize_states(); resized on every call
static vector<DPBufferEntry> flattened_dp_buffer;
// dp_buffer[d] points at the first entry of row d inside flattened_dp_buffer
static vector<DPBufferEntry*> dp_buffer;
/*
 * Assign State::depth to every buffered state so that the states form a tree
 * (state 0 is the root, children follow their parent in text order) with the
 * smallest total of diff(parent, child) over all parent/child pairs.
 *
 * Dynamic programming over intervals:
 *   dp_buffer[d][i] describes the best tree rooted at state i that covers the
 *   states [i, i+d]. Row d therefore has n-d entries, and all rows are stored
 *   back to back in flattened_dp_buffer.
 *
 * Does nothing when there are no states.
 */
void HTMLRenderer::LineBuffer::optimize_states (void)
{
    const int n = states.size();

    // nothing to arrange; this also keeps func(0, ...) below away from states[0] of an empty vector
    if(n == 0) return;

    // computed in size_t so that n*(n+1) cannot overflow int
    flattened_dp_buffer.resize(static_cast<size_t>(n) * (static_cast<size_t>(n) + 1) / 2);
    dp_buffer.resize(n);

    // set up the row pointers: row d starts right after row d-1 and holds n-d entries
    {
        int incre = n;
        auto iter = dp_buffer.begin();
        DPBufferEntry * p = flattened_dp_buffer.data();
        while(incre > 0)
        {
            *(iter++) = p;
            p += (incre--);
        }
    }

    // depth 0: a single state without children costs nothing
    // (row 0 occupies the first n entries of the flattened buffer)
    for(int i = 0; i < n; ++i)
        flattened_dp_buffer[i].min_cost = 0;

    int last_at_this_depth = n;
    for(int depth = 1; depth < n; ++depth)
    {
        --last_at_this_depth;
        for(int i = 0; i < last_at_this_depth; ++i)
        {
            // determine dp_buffer[depth][i]

            // start with i+1 being the only child of i
            int best_last_child = i+1;
            int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost;

            // at depth, we consider [i+1, i+depth+1) as possible children of i
            // with i+j as the last child, i covers [i, i+j-1] and i+j covers [i+j, i+depth]
            for(int j = 2; j <= depth; ++j)
            {
                int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost;

                // avoid calling diff() when possible
                if (cost >= best_min_cost) continue;

                cost += states[i].diff(states[i+j]);
                if(cost < best_min_cost)
                {
                    best_last_child = i+j;
                    best_min_cost = cost;
                }
            }

            dp_buffer[depth][i] = {best_last_child, best_min_cost};
        }
    }

    // now fill in the depths
    // use recursion for now, until someone finds a PDF that would cause this to overflow
    function<void(int,int,int)> func = [&](int idx, int depth, int tree_depth) -> void {
        states[idx].depth = tree_depth;
        while(depth > 0)
        {
            int last_child = dp_buffer[depth][idx].last_child;
            assert((last_child > idx) && (last_child <= idx + depth));

            // the last child covers [last_child, idx+depth] ...
            func(last_child, idx + depth - last_child, tree_depth + 1);

            // ... and idx itself is left with [idx, last_child-1]
            depth = last_child - idx - 1;
        }
    };

    func(0, n-1, 0);
}
/*
 * Open the <span> for this state.
 *
 * prev_state is the enclosing state, or nullptr if there is none.
 * Only the ids that differ from prev_state are written as classes.
 * If nothing differs, no tag is written at all and need_close is cleared,
 * so that end() stays silent as well.
 */
void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state)
{
    // the hash is only a quick filter, equal hashes still need an id-by-id comparison
    if(prev_state && (prev_state->hash_value == hash_value))
    {
        bool identical = true;
        for(int k = 0; identical && (k < ID_COUNT); ++k)
            identical = (ids[k] == prev_state->ids[k]);

        if(identical)
        {
            need_close = false;
            return;
        }
    }

    need_close = true;

    out << "<span class=\"";

    bool wrote_any = false;
    for(int k = 0; k < ID_COUNT; ++k)
    {
        const bool inherited = prev_state && (prev_state->ids[k] == ids[k]);
        if(inherited)
            continue;

        // class names are separated by a single space
        if(wrote_any)
            out << ' ';
        wrote_any = true;

        out << format("%1%%|2$x|") % format_str[k] % ids[k];
    }

    out << "\">";
}
2012-09-05 07:13:21 +00:00
/*
 * Close the <span> opened by begin(), if begin() actually wrote one.
 */
void HTMLRenderer::LineBuffer::State::end(ostream & out) const
{
    if(!need_close)
        return;

    out << "</span>";
}
/*
 * Recompute hash_value from ids:
 * the lowest 8 bits of each id are packed one after another, ids[0] first.
 */
void HTMLRenderer::LineBuffer::State::hash(void)
{
    hash_value = 0;
    for(int k = 0; k < ID_COUNT; ++k)
    {
        hash_value <<= 8;
        hash_value |= (ids[k] & 0xff);
    }
}
int HTMLRenderer::LineBuffer::State::diff(const State & s) const
2012-09-04 15:33:15 +00:00
{
2012-09-05 07:13:21 +00:00
/*
* A quick check based on hash_value
* it could be wrong when there are more then 256 classes,
2012-09-05 08:19:01 +00:00
* in which case the output may not be optimal, but still 'correct' in terms of HTML
2012-09-05 07:13:21 +00:00
*/
if(hash_value == s.hash_value) return 0;
int d = 0;
for(int i = 0; i < ID_COUNT; ++i)
if(ids[i] != s.ids[i])
++ d;
return d;
2012-09-04 15:33:15 +00:00
}
// One class-name prefix character per id slot: State::begin writes format_str[i] followed by ids[i] in hex
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";