1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-09-06 00:47:03 +00:00
pdf2htmlEX/src/HTMLRenderer/TextLineBuffer.cc

280 lines
7.3 KiB
C++
Raw Normal View History

2012-09-04 15:33:15 +00:00
/*
2012-12-11 12:48:01 +00:00
* TextLineBuffer.cc
2012-09-04 15:33:15 +00:00
*
* Generate optimized HTML for one line
*
2013-02-05 06:36:36 +00:00
* Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
2012-09-04 15:33:15 +00:00
*/
2012-09-05 07:13:21 +00:00
#include <vector>
2012-09-04 15:33:15 +00:00
#include "HTMLRenderer.h"
2012-12-11 12:48:01 +00:00
#include "TextLineBuffer.h"
2012-11-29 09:28:05 +00:00
#include "util/namespace.h"
2012-11-29 09:45:26 +00:00
#include "util/unicode.h"
2012-11-29 10:28:07 +00:00
#include "util/math.h"
2012-09-04 15:33:15 +00:00
2012-09-12 15:26:14 +00:00
namespace pdf2htmlEX {
2012-09-04 15:33:15 +00:00
using std::min;
using std::max;
2012-09-05 07:13:21 +00:00
using std::vector;
2012-09-10 17:53:33 +00:00
using std::ostream;
2012-11-29 10:28:07 +00:00
using std::cerr;
using std::endl;
2012-09-04 15:33:15 +00:00
2012-12-11 12:48:01 +00:00
/*
 * Record the starting position and transform-matrix id for the line
 * that is about to be buffered.
 *
 * state: graphics state whose current point is pushed through
 *        state->transform() and stored in x / y
 */
void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
{
    // id emitted by flush() as the " t<id>" class of the line
    tm_id = renderer->cur_ttm_id;

    const auto cur_x = state->getCurX();
    const auto cur_y = state->getCurY();
    state->transform(cur_x, cur_y, &x, &y);
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l)
2012-09-04 15:33:15 +00:00
{
text.insert(text.end(), u, u+l);
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::append_offset(double width)
2012-09-04 15:33:15 +00:00
{
if((!offsets.empty()) && (offsets.back().start_idx == text.size()))
offsets.back().width += width;
else
2012-09-09 06:48:10 +00:00
offsets.push_back(Offset({text.size(), width}));
2012-09-04 15:33:15 +00:00
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::append_state(void)
2012-09-04 15:33:15 +00:00
{
if(states.empty() || (states.back().start_idx != text.size()))
{
states.resize(states.size() + 1);
states.back().start_idx = text.size();
}
set_state(states.back());
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::flush(void)
2012-09-04 15:33:15 +00:00
{
/*
2012-12-11 12:52:36 +00:00
* Each Line is an independent absolute positioned block
2012-09-04 15:33:15 +00:00
* so even we have a few states or offsets, we may omit them
*/
if(text.empty()) return;
if(states.empty() || (states[0].start_idx != 0))
{
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
return;
}
2012-09-09 06:48:10 +00:00
for(auto iter = states.begin(); iter != states.end(); ++iter)
iter->hash();
2012-09-05 07:13:21 +00:00
2012-09-04 15:33:15 +00:00
states.resize(states.size() + 1);
states.back().start_idx = text.size();
2012-09-09 06:48:10 +00:00
offsets.push_back(Offset({text.size(), 0}));
2012-09-04 15:33:15 +00:00
double max_ascent = 0;
2012-09-09 06:48:10 +00:00
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
const auto & s = *iter;
2012-10-02 18:19:40 +00:00
max_ascent = max<double>(max_ascent, s.ascent * s.draw_font_size);
2012-09-09 06:48:10 +00:00
}
2012-09-04 15:33:15 +00:00
2013-01-28 10:46:44 +00:00
ostream & out = renderer->f_pages.fs;
2013-02-05 06:51:00 +00:00
renderer->height_manager.install(max_ascent);
2013-02-05 07:05:36 +00:00
renderer->left_manager.install(x);
2013-02-05 06:51:00 +00:00
2013-01-25 08:39:20 +00:00
out << "<div style=\""
<< "bottom:" << round(y) << "px;"
2012-09-16 07:53:41 +00:00
<< "\""
2013-01-25 08:39:20 +00:00
<< " class=\"l"
<< " t" << tm_id
2013-02-05 07:05:36 +00:00
<< " L" << renderer->left_manager.get_id()
2013-02-05 06:51:00 +00:00
<< " h" << renderer->height_manager.get_id()
2012-09-16 07:53:41 +00:00
<< "\">";
2012-09-04 15:33:15 +00:00
auto cur_state_iter = states.begin();
auto cur_offset_iter = offsets.begin();
2012-09-05 07:13:21 +00:00
//accumulated horizontal offset;
2012-09-04 15:33:15 +00:00
double dx = 0;
stack.clear();
stack.push_back(nullptr);
// whenever a negative offset appears, we should not pop out that <span>
// otherwise the effect of negative margin-left would disappear
2012-09-06 11:05:49 +00:00
size_t last_text_pos_with_negative_offset = 0;
2012-09-05 07:13:21 +00:00
2012-09-04 15:33:15 +00:00
size_t cur_text_idx = 0;
while(cur_text_idx < text.size())
{
if(cur_text_idx >= cur_state_iter->start_idx)
{
// greedy
int best_cost = State::ID_COUNT;
// we have a nullptr at the beginning, so no need to check for rend
for(auto iter = stack.rbegin(); *iter; ++iter)
2012-09-05 07:13:21 +00:00
{
int cost = cur_state_iter->diff(**iter);
if(cost < best_cost)
{
while(stack.back() != *iter)
{
stack.back()->end(out);
stack.pop_back();
}
best_cost = cost;
if(best_cost == 0)
break;
}
2012-09-05 07:13:21 +00:00
// cannot go further
if((*iter)->start_idx <= last_text_pos_with_negative_offset)
break;
}
cur_state_iter->begin(out, stack.back());
stack.push_back(&*cur_state_iter);
2012-09-04 15:33:15 +00:00
++ cur_state_iter;
}
if(cur_text_idx >= cur_offset_iter->start_idx)
{
double target = cur_offset_iter->width + dx;
2013-02-05 06:55:44 +00:00
auto & wm = renderer->whitespace_manager;
wm.install(target);
auto wid = wm.get_id();
double w = wm.get_actual_value();
2012-09-04 15:33:15 +00:00
if(w < 0)
last_text_pos_with_negative_offset = cur_text_idx;
2012-09-07 00:39:21 +00:00
auto * p = stack.back();
double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold);
2012-09-07 16:38:41 +00:00
out << "<span class=\"_ _" << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
2012-09-04 15:33:15 +00:00
dx = target - w;
++ cur_offset_iter;
}
2012-10-02 18:19:40 +00:00
size_t next_text_idx = min<size_t>(cur_state_iter->start_idx, cur_offset_iter->start_idx);
2012-09-10 19:01:02 +00:00
outputUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
2012-09-04 15:33:15 +00:00
cur_text_idx = next_text_idx;
}
2012-09-05 07:13:21 +00:00
// we have a nullptr in the bottom
while(stack.back())
2012-09-05 07:13:21 +00:00
{
stack.back()->end(out);
stack.pop_back();
2012-09-05 07:13:21 +00:00
}
2012-09-04 15:33:15 +00:00
out << "</div>";
states.clear();
offsets.clear();
text.clear();
}
2012-12-11 12:48:01 +00:00
/*
 * Fill `state` from the renderer's current text style: the class ids of
 * every tracked property plus the font metrics of the current font.
 * state.start_idx is left untouched.
 */
void HTMLRenderer::TextLineBuffer::set_state (State & state)
{
    const FontInfo * info = renderer->cur_font_info;

    // metrics of the current font and its drawing size
    state.ascent = info->ascent;
    state.descent = info->descent;
    state.draw_font_size = renderer->font_size_manager.get_value();

    // one id per tracked style property
    auto & ids = state.ids;
    ids[State::FONT_ID] = info->id;
    ids[State::FONT_SIZE_ID] = renderer->font_size_manager.get_id();
    ids[State::FILL_COLOR_ID] = renderer->cur_fill_color_id;
    ids[State::STROKE_COLOR_ID] = renderer->cur_stroke_color_id;
    ids[State::LETTER_SPACE_ID] = renderer->letter_space_manager.get_id();
    ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id();
    ids[State::RISE_ID] = renderer->rise_manager.get_id();
}
2012-12-11 12:48:01 +00:00
/*
 * Open a <span> carrying one class per id that differs from prev_state.
 *
 * out:        stream receiving the markup (it should already be in hex mode)
 * prev_state: enclosing state, or nullptr to emit every id
 *
 * When no id differs nothing is written. need_close records whether a tag
 * was opened, so that end() knows whether to close it.
 */
void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
{
    bool opened = false;
    for(int i = 0; i < ID_COUNT; ++i)
    {
        const bool inherited = (prev_state != nullptr) && (prev_state->ids[i] == ids[i]);
        if(inherited)
            continue;

        // the first class opens the tag, later ones are separated by a space
        if(opened)
            out << ' ';
        else
            out << "<span class=\"";
        opened = true;

        // class name = prefix character + id; an id of -1 means transparent
        out << format_str[i];
        if(ids[i] == -1)
            out << "t";
        else
            out << ids[i];
    }

    if(opened)
        out << "\">";
    need_close = opened;
}
2012-12-11 12:48:01 +00:00
/*
 * Close the <span> opened by begin(), if begin() actually opened one.
 */
void HTMLRenderer::TextLineBuffer::State::end(ostream & out) const
{
    if(!need_close)
        return;

    out << "</span>";
}
2012-12-11 12:48:01 +00:00
void HTMLRenderer::TextLineBuffer::State::hash(void)
2012-09-05 07:13:21 +00:00
{
hash_value = 0;
for(int i = 0; i < ID_COUNT; ++i)
{
hash_value = (hash_value << 8) | (ids[i] & 0xff);
}
}
2012-12-11 12:48:01 +00:00
int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
2012-09-04 15:33:15 +00:00
{
2012-09-05 07:13:21 +00:00
/*
* A quick check based on hash_value
* it could be wrong when there are more then 256 classes,
2012-09-05 08:19:01 +00:00
* in which case the output may not be optimal, but still 'correct' in terms of HTML
2012-09-05 07:13:21 +00:00
*/
if(hash_value == s.hash_value) return 0;
int d = 0;
for(int i = 0; i < ID_COUNT; ++i)
if(ids[i] != s.ids[i])
++ d;
return d;
2012-09-04 15:33:15 +00:00
}
2013-01-31 22:21:57 +00:00
// One class-name prefix character per id slot: State::begin() writes format_str[i] in front of ids[i].
// NOTE(review): presumably ordered font, size, fill color, stroke color, letter space, word space, rise -- confirm against the State id enum.
const char * HTMLRenderer::TextLineBuffer::State::format_str = "fscClwr";
2012-09-12 15:26:14 +00:00
} //namespace pdf2htmlEX