1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-09-06 00:47:03 +00:00
pdf2htmlEX/src/HTMLRenderer/TextLineBuffer.cc

480 lines
15 KiB
C++
Raw Normal View History

2012-09-04 15:33:15 +00:00
/*
2012-12-11 12:48:01 +00:00
* TextLineBuffer.cc
2012-09-04 15:33:15 +00:00
*
* Generate optimized HTML for one line of text
*
2013-02-05 06:36:36 +00:00
* Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
2012-09-04 15:33:15 +00:00
*/
2012-09-05 07:13:21 +00:00
#include <vector>
2013-03-21 04:18:26 +00:00
#include <cmath>
2013-03-20 15:46:58 +00:00
#include <algorithm>
2012-09-05 07:13:21 +00:00
2012-09-04 15:33:15 +00:00
#include "HTMLRenderer.h"
2012-12-11 12:48:01 +00:00
#include "TextLineBuffer.h"
2012-11-29 09:28:05 +00:00
#include "util/namespace.h"
2012-11-29 09:45:26 +00:00
#include "util/unicode.h"
2012-11-29 10:28:07 +00:00
#include "util/math.h"
2013-02-27 18:11:34 +00:00
#include "util/css_const.h"
2013-02-15 05:07:00 +00:00
#include "util/encoding.h"
2012-09-04 15:33:15 +00:00
2012-09-12 15:26:14 +00:00
namespace pdf2htmlEX {
2012-09-04 15:33:15 +00:00
using std::min;
using std::max;
2012-09-05 07:13:21 +00:00
using std::vector;
2012-09-10 17:53:33 +00:00
using std::ostream;
2012-11-29 10:28:07 +00:00
using std::cerr;
using std::endl;
2013-03-20 15:46:58 +00:00
using std::find;
2013-03-21 04:18:26 +00:00
using std::abs;
2012-09-04 15:33:15 +00:00
2013-03-30 14:37:20 +00:00
// Remember where this line starts: the current point of the graphics state
// (getCurX/getCurY), mapped through state->transform() into (x, y), together
// with the id currently reported by the renderer's transform matrix manager.
void HTMLRenderer::TextLineBuffer::set_pos(GfxState * state)
{
    const auto cur_x = state->getCurX();
    const auto cur_y = state->getCurY();
    state->transform(cur_x, cur_y, &x, &y);

    tm_id = renderer->transform_matrix_manager.get_id();
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l)
2012-09-04 15:33:15 +00:00
{
text.insert(text.end(), u, u+l);
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::append_offset(double width)
2012-09-04 15:33:15 +00:00
{
if((!offsets.empty()) && (offsets.back().start_idx == text.size()))
offsets.back().width += width;
else
2012-09-09 06:48:10 +00:00
offsets.push_back(Offset({text.size(), width}));
2012-09-04 15:33:15 +00:00
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::append_state(void)
2012-09-04 15:33:15 +00:00
{
if(states.empty() || (states.back().start_idx != text.size()))
{
states.resize(states.size() + 1);
states.back().start_idx = text.size();
2013-03-20 15:46:58 +00:00
states.back().hash_umask = 0;
2012-09-04 15:33:15 +00:00
}
set_state(states.back());
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::flush(void)
2012-09-04 15:33:15 +00:00
{
/*
2012-12-11 12:52:36 +00:00
* Each Line is an independent absolute positioned block
2012-09-04 15:33:15 +00:00
* so even we have a few states or offsets, we may omit them
*/
if(text.empty()) return;
2013-04-03 06:04:39 +00:00
while((!states.empty()) && (states.back().start_idx >= text.size()))
states.pop_back();
2012-09-04 15:33:15 +00:00
if(states.empty() || (states[0].start_idx != 0))
{
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
return;
}
2013-03-20 15:46:58 +00:00
optimize();
2012-09-04 15:33:15 +00:00
double max_ascent = 0;
2012-09-09 06:48:10 +00:00
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
const auto & s = *iter;
2013-03-20 15:46:58 +00:00
max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
2012-09-09 06:48:10 +00:00
}
2012-09-04 15:33:15 +00:00
2013-03-21 04:18:26 +00:00
for(auto iter = states.begin(); iter != states.end(); ++iter)
iter->hash();
2013-01-28 10:46:44 +00:00
ostream & out = renderer->f_pages.fs;
2013-04-03 06:04:39 +00:00
{
long long hid = renderer->height_manager.install(max_ascent);
long long lid = renderer->left_manager .install(x);
long long bid = renderer->bottom_manager.install(y);
out << "<div class=\"" << CSS::LINE_CN
<< " " << CSS::TRANSFORM_MATRIX_CN << tm_id
<< " " << CSS::LEFT_CN << lid
<< " " << CSS::HEIGHT_CN << hid
<< " " << CSS::BOTTOM_CN << bid
<< "\">";
}
2012-09-04 15:33:15 +00:00
stack.clear();
stack.push_back(nullptr);
2013-04-03 06:04:39 +00:00
//accumulated horizontal offset;
double dx = 0;
// whenever a negative offset appears, we should not pop out that <span>
// otherwise the effect of negative margin-left would disappear
2012-09-06 11:05:49 +00:00
size_t last_text_pos_with_negative_offset = 0;
2012-09-04 15:33:15 +00:00
size_t cur_text_idx = 0;
2013-04-03 06:04:39 +00:00
auto cur_offset_iter = offsets.begin();
for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++;
state_iter1 != states.end();
++state_iter1, ++state_iter2)
2012-09-04 15:33:15 +00:00
{
2013-04-03 06:04:39 +00:00
// export current state, find a closest parent
{
// greedy
int best_cost = State::ID_COUNT;
// we have a nullptr at the beginning, so no need to check for rend
for(auto iter = stack.rbegin(); *iter; ++iter)
2012-09-05 07:13:21 +00:00
{
2013-04-03 06:04:39 +00:00
int cost = state_iter1->diff(**iter);
if(cost < best_cost)
{
while(stack.back() != *iter)
{
stack.back()->end(out);
stack.pop_back();
}
best_cost = cost;
if(best_cost == 0)
break;
}
2012-09-05 07:13:21 +00:00
// cannot go further
if((*iter)->start_idx <= last_text_pos_with_negative_offset)
break;
}
2013-04-03 06:04:39 +00:00
state_iter1->begin(out, stack.back());
stack.push_back(&*state_iter1);
2012-09-04 15:33:15 +00:00
}
2013-04-03 06:04:39 +00:00
size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
2012-09-04 15:33:15 +00:00
2013-04-03 06:10:11 +00:00
// dump all text and offsets before next state
2013-04-03 06:04:39 +00:00
while(true)
{
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx <= cur_text_idx))
2013-03-21 04:18:26 +00:00
{
2013-04-03 06:04:39 +00:00
if(cur_offset_iter->start_idx > text_idx2)
break;
2013-04-03 06:10:11 +00:00
// next is offset
2013-04-03 06:04:39 +00:00
double target = cur_offset_iter->width + dx;
double actual_offset = 0;
if(abs(target) <= renderer->param->h_eps)
2013-03-24 13:42:51 +00:00
{
2013-04-03 06:04:39 +00:00
actual_offset = 0;
2013-03-24 13:42:51 +00:00
}
2013-04-03 06:04:39 +00:00
else
2013-03-24 13:42:51 +00:00
{
2013-04-03 06:04:39 +00:00
bool done = false;
if(!(state_iter1->hash_umask & State::umask_by_id(State::WORD_SPACE_ID)))
{
double space_off = state_iter1->single_space_offset();
if(abs(target - space_off) <= renderer->param->h_eps)
{
Unicode u = ' ';
outputUnicodes(out, &u, 1);
actual_offset = space_off;
done = true;
}
}
2012-09-04 15:33:15 +00:00
2013-04-03 06:04:39 +00:00
if(!done)
2013-03-24 13:42:51 +00:00
{
2013-04-03 06:04:39 +00:00
long long wid = renderer->whitespace_manager.install(target, &actual_offset);
if(!equal(actual_offset, 0))
{
if(is_positive(-actual_offset))
last_text_pos_with_negative_offset = cur_text_idx;
2013-04-03 06:04:39 +00:00
double threshold = state_iter1->em_size() * (renderer->param->space_threshold);
2012-09-07 00:39:21 +00:00
2013-04-03 06:04:39 +00:00
out << "<span class=\"" << CSS::WHITESPACE_CN
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
}
2013-03-24 13:42:51 +00:00
}
}
2013-04-03 06:04:39 +00:00
dx = target - actual_offset;
++ cur_offset_iter;
2013-03-21 04:18:26 +00:00
}
2013-04-03 06:04:39 +00:00
else
{
if(cur_text_idx >= text_idx2)
break;
2013-04-03 06:10:11 +00:00
// next is text
2013-04-03 06:04:39 +00:00
size_t next_text_idx = text_idx2;
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
next_text_idx = cur_offset_iter->start_idx;
outputUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
cur_text_idx = next_text_idx;
}
}
2012-09-04 15:33:15 +00:00
}
2012-09-05 07:13:21 +00:00
// we have a nullptr in the bottom
while(stack.back())
2012-09-05 07:13:21 +00:00
{
stack.back()->end(out);
stack.pop_back();
2012-09-05 07:13:21 +00:00
}
2012-09-04 15:33:15 +00:00
out << "</div>";
states.clear();
offsets.clear();
text.clear();
}
2012-12-11 12:48:01 +00:00
// Fill `state` from the renderer's current text style: one id per style
// attribute, plus the font info and the actual numeric values that are
// needed later for width computations.
void HTMLRenderer::TextLineBuffer::set_state (State & state)
{
    auto & r = *renderer;
    auto & ids = state.ids;

    // class ids, one per style attribute
    ids[State::FONT_ID]         = r.cur_font_info->id;
    ids[State::FONT_SIZE_ID]    = r.font_size_manager.get_id();
    ids[State::FILL_COLOR_ID]   = r.fill_color_manager.get_id();
    ids[State::STROKE_COLOR_ID] = r.stroke_color_manager.get_id();
    ids[State::LETTER_SPACE_ID] = r.letter_space_manager.get_id();
    ids[State::WORD_SPACE_ID]   = r.word_space_manager.get_id();
    ids[State::RISE_ID]         = r.rise_manager.get_id();

    // values behind some of the ids
    state.font_info      = r.cur_font_info;
    state.draw_font_size = r.font_size_manager.get_actual_value();
    state.letter_space   = r.letter_space_manager.get_actual_value();
    state.word_space     = r.word_space_manager.get_actual_value();
}
2013-04-03 06:04:39 +00:00
void HTMLRenderer::TextLineBuffer::optimize()
2013-03-20 15:46:58 +00:00
{
if(!(renderer->param->optimize_text))
return;
2013-03-20 15:46:58 +00:00
assert(!states.empty());
2013-04-03 06:04:39 +00:00
// for optimization, we need accurate values
auto & ws_manager = renderer->word_space_manager;
double old_ws_eps = ws_manager.get_eps();
ws_manager.set_eps(EPS);
2013-04-03 06:17:27 +00:00
auto offset_iter1 = offsets.begin();
2013-03-30 17:00:04 +00:00
std::map<double, int> width_map;
2013-03-20 15:46:58 +00:00
// set proper hash_umask
2013-03-25 04:23:29 +00:00
long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID);
2013-03-30 17:00:04 +00:00
for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++;
state_iter1 != states.end();
++state_iter1, ++state_iter2)
2013-03-21 04:18:26 +00:00
{
2013-03-30 17:00:04 +00:00
size_t text_idx1 = state_iter1->start_idx;
size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
2013-03-20 15:46:58 +00:00
2013-03-30 17:00:04 +00:00
// get the text segment covered by current state (*state_iter1)
auto text_iter1 = text.begin() + text_idx1;
auto text_iter2 = text.begin() + text_idx2;
2013-04-03 06:17:27 +00:00
while((offset_iter1 != offsets.end()) && (offset_iter1->start_idx <= text_idx1))
++ offset_iter1;
auto offset_iter2 = offset_iter1;
for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { }
// In some PDF files all letter spaces are implemented as position shifts between each letter
// try to simplify it with a proper letter space
2013-03-30 17:00:04 +00:00
// In some PDF files all spaces are converted into positionig shift
// We may try to change (some of) them to ' ' and adjust word_space accordingly
// This can also be applied when param->space_as_offset is set
// for now, we cosider only the no-space scenario
if(find(text_iter1, text_iter2, ' ') != text_iter2)
continue;
// if there is not any space, we may change the value of word_space arbitrarily
// collect widths
width_map.clear();
2013-03-30 17:21:14 +00:00
double threshold = (state_iter1->em_size()) * (renderer->param->space_threshold);
2013-04-03 06:17:27 +00:00
for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter)
2013-03-21 04:18:26 +00:00
{
2013-04-03 06:17:27 +00:00
double target = off_iter->width;
2013-03-30 17:21:14 +00:00
// we don't want to add spaces for tiny gaps, or even negative shifts
if(target < threshold - EPS)
continue;
2013-03-30 17:00:04 +00:00
auto iter = width_map.lower_bound(target-EPS);
if((iter != width_map.end()) && (abs(iter->first - target) <= EPS))
2013-03-21 04:18:26 +00:00
{
2013-03-30 17:00:04 +00:00
++ iter->second;
2013-03-21 04:18:26 +00:00
}
2013-03-30 17:00:04 +00:00
else
2013-03-21 04:18:26 +00:00
{
2013-03-30 17:00:04 +00:00
width_map.insert(iter, std::make_pair(target, 1));
2013-03-21 04:18:26 +00:00
}
2013-03-30 17:00:04 +00:00
}
if(width_map.empty())
{
// if there is no offset at all
// we just free word_space
state_iter1->hash_umask |= word_space_umask;
continue;
}
2013-03-21 04:18:26 +00:00
2013-03-30 17:00:04 +00:00
// set word_space for the most frequently used offset
double most_used_width = 0;
int max_count = 0;
for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
{
if(iter->second > max_count)
2013-03-21 04:18:26 +00:00
{
2013-03-30 17:00:04 +00:00
max_count = iter->second;
most_used_width = iter->first;
2013-03-21 04:18:26 +00:00
}
}
2013-03-30 17:00:04 +00:00
state_iter1->word_space = 0;
double new_word_space = most_used_width - state_iter1->single_space_offset();
// install new word_space
2013-04-03 06:04:39 +00:00
state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space));
2013-03-30 17:00:04 +00:00
// mark that the word_space is not free
state_iter1->hash_umask &= (~word_space_umask);
2013-04-03 06:17:27 +00:00
offset_iter1 = offset_iter2;
2013-03-30 17:00:04 +00:00
}
2013-04-03 06:04:39 +00:00
// restore old eps
ws_manager.set_eps(old_ws_eps);
2013-03-20 15:46:58 +00:00
}
// this state will be converted to a child node of the node of prev_state
// dump the difference between previous state
// also clone corresponding states
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
2012-09-05 07:13:21 +00:00
{
2013-03-20 15:46:58 +00:00
long long cur_mask = 0xff;
2012-09-05 07:13:21 +00:00
bool first = true;
2013-03-20 15:46:58 +00:00
for(int i = 0; i < ID_COUNT; ++i, cur_mask<<=8)
2012-09-04 15:33:15 +00:00
{
2013-03-20 15:46:58 +00:00
if(hash_umask & cur_mask) // we don't care about this ID
{
if (prev_state && (!(prev_state->hash_umask & cur_mask))) // if prev_state have it set
{
// we have to inherit it
ids[i] = prev_state->ids[i];
hash_umask &= (~cur_mask);
2013-03-25 04:23:29 +00:00
//copy the corresponding value
//TODO: this is so ugly
switch(i)
{
case FONT_SIZE_ID:
draw_font_size = prev_state->draw_font_size;
break;
case LETTER_SPACE_ID:
letter_space = prev_state->letter_space;
break;
case WORD_SPACE_ID:
word_space = prev_state->word_space;
break;
default:
break;
}
2013-03-20 15:46:58 +00:00
}
//anyway we don't have to output it
continue;
}
// now we care about the ID
if(prev_state && (!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))
2012-09-05 07:13:21 +00:00
continue;
if(first)
{
out << "<span class=\"";
2012-09-05 07:13:21 +00:00
first = false;
}
else
{
out << ' ';
}
2013-01-31 22:21:57 +00:00
// out should have hex set
2013-02-05 13:56:19 +00:00
out << css_class_names[i];
2013-01-31 22:21:57 +00:00
if (ids[i] == -1)
2013-02-05 13:56:19 +00:00
out << CSS::INVALID_ID;
2013-01-31 22:21:57 +00:00
else
2013-02-05 13:56:19 +00:00
out << ids[i];
2012-09-04 15:33:15 +00:00
}
2012-09-05 07:13:21 +00:00
2013-03-20 15:46:58 +00:00
if(first) // we actually just inherit the whole prev_state
{
need_close = false;
}
else
{
out << "\">";
need_close = true;
}
2012-09-04 15:33:15 +00:00
}
2012-12-11 12:48:01 +00:00
// Close the <span> opened by begin(); nothing is written when begin() did
// not open one (need_close is false).
void HTMLRenderer::TextLineBuffer::State::end(ostream & out) const
{
    if(!need_close)
        return;

    out << "</span>";
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::State::hash(void)
2012-09-05 07:13:21 +00:00
{
hash_value = 0;
for(int i = 0; i < ID_COUNT; ++i)
{
hash_value = (hash_value << 8) | (ids[i] & 0xff);
}
}
2012-12-11 12:48:01 +00:00
int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
2012-09-04 15:33:15 +00:00
{
2012-09-05 07:13:21 +00:00
/*
* A quick check based on hash_value
* it could be wrong when there are more then 256 classes,
2012-09-05 08:19:01 +00:00
* in which case the output may not be optimal, but still 'correct' in terms of HTML
2012-09-05 07:13:21 +00:00
*/
2013-03-20 15:46:58 +00:00
long long common_mask = ~(hash_umask | s.hash_umask);
if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0;
2012-09-05 07:13:21 +00:00
2013-03-20 15:46:58 +00:00
long long cur_mask = 0xff;
2012-09-05 07:13:21 +00:00
int d = 0;
for(int i = 0; i < ID_COUNT; ++i)
2013-03-20 15:46:58 +00:00
{
if((common_mask & cur_mask) && (ids[i] != s.ids[i]))
2012-09-05 07:13:21 +00:00
++ d;
2013-03-20 15:46:58 +00:00
cur_mask <<= 8;
}
2012-09-05 07:13:21 +00:00
return d;
2012-09-04 15:33:15 +00:00
}
2013-03-21 04:18:26 +00:00
double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const
{
2013-03-21 04:27:07 +00:00
return word_space + letter_space + font_info->space_width * draw_font_size;
2013-03-21 04:18:26 +00:00
}
2013-03-30 17:21:14 +00:00
// Size used as the unit for space thresholds: the font's ascent-to-descent
// extent scaled by the drawing font size.
double HTMLRenderer::TextLineBuffer::State::em_size(void) const
{
    const auto extent = font_info->ascent - font_info->descent;
    return draw_font_size * extent;
}
2013-03-25 04:23:29 +00:00
// Mask selecting the byte that belongs to the given id: 0xff shifted left by
// 8*id bits.
long long HTMLRenderer::TextLineBuffer::State::umask_by_id(int id)
{
    const long long byte_mask = 0xff;
    const int shift = 8 * id;
    return (byte_mask << shift);
}
2013-02-05 10:19:25 +00:00
// the order should be the same as in the enum
const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {
2013-02-28 07:59:14 +00:00
CSS::FONT_FAMILY_CN,
2013-02-05 10:19:25 +00:00
CSS::FONT_SIZE_CN,
CSS::FILL_COLOR_CN,
CSS::STROKE_COLOR_CN,
CSS::LETTER_SPACE_CN,
CSS::WORD_SPACE_CN,
CSS::RISE_CN
};
2012-09-12 15:26:14 +00:00
} //namespace pdf2htmlEX