mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
first implementation of LineBuffer
This commit is contained in:
parent
c32b0c4009
commit
6079a05e34
@ -46,6 +46,7 @@ add_executable(pdf2htmlEX
|
||||
src/HTMLRenderer/text.cc
|
||||
src/HTMLRenderer/image.cc
|
||||
src/HTMLRenderer/namespace.h
|
||||
src/HTMLRenderer/LineBuffer.cc
|
||||
src/ff/ff.h
|
||||
src/ff/ff.c
|
||||
src/BackgroundRenderer.h
|
||||
|
@ -132,7 +132,7 @@ class HTMLRenderer : public OutputDev
|
||||
////////////////////////////////////////////////////
|
||||
// manage styles
|
||||
////////////////////////////////////////////////////
|
||||
FontInfo install_font(GfxFont * font);
|
||||
const FontInfo * install_font(GfxFont * font);
|
||||
void install_embedded_font(GfxFont * font, FontInfo & info);
|
||||
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info);
|
||||
void install_external_font (GfxFont * font, FontInfo & info);
|
||||
@ -193,15 +193,13 @@ class HTMLRenderer : public OutputDev
|
||||
////////////////////////////////////////////////////
|
||||
// states
|
||||
////////////////////////////////////////////////////
|
||||
//line status
|
||||
//indicating the status for current line & next line
|
||||
//see comments: meaning for current line || meaning for next line
|
||||
enum class LineStatus
|
||||
bool line_opened;
|
||||
enum class NewLineState
|
||||
{
|
||||
NONE, // no line is opened (last <div> is closed) || stay with the same style
|
||||
SPAN, // there's a pending opening <span> (within a pending opening <div>) || open a new <span> if possible, otherwise a new <div>
|
||||
DIV // there's a pending opening <div> (but no <span>) || has to open a new <div>
|
||||
} line_status, new_line_status;
|
||||
NONE, // stay with the same style
|
||||
SPAN, // open a new <span> if possible, otherwise a new <div>
|
||||
DIV // has to open a new <div>
|
||||
} new_line_state;
|
||||
|
||||
// The order is according to the appearance in check_state_change
|
||||
// any state changed
|
||||
@ -211,7 +209,7 @@ class HTMLRenderer : public OutputDev
|
||||
bool text_pos_changed;
|
||||
|
||||
// font & size
|
||||
FontInfo cur_font_info;
|
||||
const FontInfo * cur_font_info;
|
||||
double cur_font_size;
|
||||
long long cur_fs_id;
|
||||
bool font_changed;
|
||||
@ -264,11 +262,61 @@ class HTMLRenderer : public OutputDev
|
||||
double draw_tx, draw_ty;
|
||||
|
||||
// some metrics have to be determined after all elements in the lines have been seen
|
||||
// TODO: add a class for these
|
||||
double line_x, line_y;
|
||||
long long line_tm_id;
|
||||
double line_ascent, line_height;
|
||||
std::stringstream line_buf;
|
||||
class LineBuffer {
|
||||
public:
|
||||
LineBuffer (HTMLRenderer * renderer) : renderer(renderer) { }
|
||||
|
||||
class State {
|
||||
public:
|
||||
void begin(std::ostream & out) const;
|
||||
static void end(std::ostream & out);
|
||||
|
||||
enum {
|
||||
FONT_ID,
|
||||
FONT_SIZE_ID,
|
||||
COLOR_ID,
|
||||
LETTER_SPACE_ID,
|
||||
WORD_SPACE_ID,
|
||||
RISE_ID,
|
||||
|
||||
ID_COUNT
|
||||
};
|
||||
|
||||
long long ids[ID_COUNT];
|
||||
double ascent;
|
||||
size_t start_idx; // index of the first Text using this state
|
||||
|
||||
static const char * format_str; // class names for each id
|
||||
};
|
||||
|
||||
|
||||
class Offset {
|
||||
public:
|
||||
size_t start_idx; // should put this idx before text[start_idx];
|
||||
double width;
|
||||
};
|
||||
|
||||
void reset(GfxState * state);
|
||||
void append_unicodes(const Unicode * u, int l);
|
||||
void append_offset(double width);
|
||||
void append_state(void);
|
||||
void flush(void);
|
||||
|
||||
private:
|
||||
// retrieve state from renderer
|
||||
void set_state(State & state);
|
||||
|
||||
HTMLRenderer * renderer;
|
||||
|
||||
double x, y;
|
||||
long long tm_id;
|
||||
|
||||
std::vector<State> states;
|
||||
std::vector<Offset> offsets;
|
||||
std::vector<Unicode> text;
|
||||
|
||||
} line_buf;
|
||||
friend class LineBuffer;
|
||||
|
||||
// for font reencoding
|
||||
int32_t * cur_mapping;
|
||||
|
158
src/HTMLRenderer/LineBuffer.cc
Normal file
158
src/HTMLRenderer/LineBuffer.cc
Normal file
@ -0,0 +1,158 @@
|
||||
/*
|
||||
* LineBuffer.cc
|
||||
*
|
||||
* Generate and optimize HTML for one line
|
||||
*
|
||||
* by WangLu
|
||||
* 2012.09.04
|
||||
*/
|
||||
|
||||
#include "HTMLRenderer.h"
|
||||
#include "HTMLRenderer/namespace.h"
|
||||
|
||||
using std::min;
|
||||
using std::max;
|
||||
using std::hex;
|
||||
using std::dec;
|
||||
|
||||
/*
 * Begin a new line: remember the renderer's current transform matrix id and
 * store in (x, y) the result of passing the state's current position through
 * state->transform().
 */
void HTMLRenderer::LineBuffer::reset(GfxState * state)
{
    tm_id = renderer->cur_tm_id;

    const double cur_x = state->getCurX();
    const double cur_y = state->getCurY();
    state->transform(cur_x, cur_y, &x, &y);
}
|
||||
|
||||
void HTMLRenderer::LineBuffer::append_unicodes(const Unicode * u, int l)
|
||||
{
|
||||
text.insert(text.end(), u, u+l);
|
||||
}
|
||||
|
||||
void HTMLRenderer::LineBuffer::append_offset(double width)
|
||||
{
|
||||
if((!offsets.empty()) && (offsets.back().start_idx == text.size()))
|
||||
offsets.back().width += width;
|
||||
else
|
||||
offsets.push_back({text.size(), width});
|
||||
}
|
||||
|
||||
void HTMLRenderer::LineBuffer::append_state(void)
|
||||
{
|
||||
if(states.empty() || (states.back().start_idx != text.size()))
|
||||
{
|
||||
states.resize(states.size() + 1);
|
||||
states.back().start_idx = text.size();
|
||||
}
|
||||
|
||||
set_state(states.back());
|
||||
}
|
||||
|
||||
void HTMLRenderer::LineBuffer::flush(void)
|
||||
{
|
||||
/*
|
||||
* Each Line is an independent absolute positioined block
|
||||
* so even we have a few states or offsets, we may omit them
|
||||
*/
|
||||
if(text.empty()) return;
|
||||
|
||||
if(states.empty() || (states[0].start_idx != 0))
|
||||
{
|
||||
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
states.resize(states.size() + 1);
|
||||
states.back().start_idx = text.size();
|
||||
|
||||
offsets.push_back({text.size(), 0});
|
||||
|
||||
// TODO: optimize state
|
||||
double max_ascent = 0;
|
||||
for(const State & s : states)
|
||||
max_ascent = max(max_ascent, s.ascent);
|
||||
|
||||
// TODO: class for height ?
|
||||
ostream & out = renderer->html_fout;
|
||||
out << format("<div style=\"left:%1%px;bottom:%2%px;height:%3%px;\" class=\"l t%|4$x|\">")
|
||||
% x % y
|
||||
% max_ascent
|
||||
% tm_id
|
||||
;
|
||||
|
||||
auto cur_state_iter = states.begin();
|
||||
auto cur_offset_iter = offsets.begin();
|
||||
|
||||
double dx = 0;
|
||||
|
||||
size_t cur_text_idx = 0;
|
||||
while(cur_text_idx < text.size())
|
||||
{
|
||||
if(cur_text_idx >= cur_state_iter->start_idx)
|
||||
{
|
||||
if(cur_text_idx)
|
||||
State::end(out);
|
||||
|
||||
cur_state_iter->begin(out);
|
||||
|
||||
++ cur_state_iter;
|
||||
}
|
||||
|
||||
if(cur_text_idx >= cur_offset_iter->start_idx)
|
||||
{
|
||||
double target = cur_offset_iter->width + dx;
|
||||
double w;
|
||||
|
||||
auto wid = renderer->install_whitespace(target, w);
|
||||
|
||||
// TODO
|
||||
// double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold);
|
||||
double threshold = 0;
|
||||
out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
|
||||
|
||||
dx = target - w;
|
||||
|
||||
++ cur_offset_iter;
|
||||
}
|
||||
|
||||
size_t next_text_idx = min(cur_state_iter->start_idx, cur_offset_iter->start_idx);
|
||||
outputUnicodes(out, text.data() + cur_text_idx, next_text_idx - cur_text_idx);
|
||||
cur_text_idx = next_text_idx;
|
||||
}
|
||||
|
||||
State::end(out);
|
||||
out << "</div>";
|
||||
|
||||
|
||||
states.clear();
|
||||
offsets.clear();
|
||||
text.clear();
|
||||
|
||||
}
|
||||
|
||||
/*
 * Copy the renderer's current style ids into state, and set state.ascent to
 * the current font's ascent multiplied by the renderer's draw_font_size.
 */
void HTMLRenderer::LineBuffer::set_state (State & state)
{
    const HTMLRenderer & r = *renderer;
    long long * const ids = state.ids;

    ids[State::FONT_ID]         = r.cur_font_info->id;
    ids[State::FONT_SIZE_ID]    = r.cur_fs_id;
    ids[State::COLOR_ID]        = r.cur_color_id;
    ids[State::LETTER_SPACE_ID] = r.cur_ls_id;
    ids[State::WORD_SPACE_ID]   = r.cur_ws_id;
    ids[State::RISE_ID]         = r.cur_rise_id;

    state.ascent = r.cur_font_info->ascent * r.draw_font_size;
}
|
||||
|
||||
/*
 * Write the opening <span> for this state. Its class attribute lists one
 * space-separated name per id: the id's letter from format_str followed by
 * the id value in hexadecimal.
 */
void HTMLRenderer::LineBuffer::State::begin (ostream & out) const
{
    out << "<span class=\"";

    int i = 0;
    while(i < ID_COUNT)
    {
        // separator goes between names only, never before the first
        if(i != 0)
            out << ' ';

        out << format("%1%%|2$x|") % format_str[i] % ids[i];
        ++i;
    }

    out << "\">";
}
|
||||
|
||||
/*
 * Write the closing tag matching the <span> opened by begin().
 */
void HTMLRenderer::LineBuffer::State::end(ostream & out)
{
    const char * const closing_tag = "</span>";
    out << closing_tag;
}
|
||||
|
||||
// One class-name letter per entry of the State id enum, in enum order:
// f = FONT_ID, s = FONT_SIZE_ID, c = COLOR_ID, l = LETTER_SPACE_ID, w = WORD_SPACE_ID, r = RISE_ID.
// State::begin() indexes this string with the same index it uses for ids[].
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";
|
@ -25,7 +25,8 @@ static void dummy(void *, ErrorCategory, int pos, char *)
|
||||
}
|
||||
|
||||
HTMLRenderer::HTMLRenderer(const Param * param)
|
||||
:line_status(LineStatus::NONE)
|
||||
:line_opened(false)
|
||||
,line_buf(this)
|
||||
,image_count(0)
|
||||
,param(param)
|
||||
,dest_dir(param->dest_dir)
|
||||
@ -147,7 +148,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||
this->pageWidth = state->getPageWidth();
|
||||
this->pageHeight = state->getPageHeight();
|
||||
|
||||
assert(line_status == LineStatus::NONE);
|
||||
assert((!line_opened) && "Open line in startPage detected!");
|
||||
|
||||
html_fout << format("<div id=\"p%|1$x|\" class=\"p\" style=\"width:%2%px;height:%3%px;") % pageNum % pageWidth % pageHeight;
|
||||
|
||||
|
@ -21,7 +21,7 @@
|
||||
|
||||
using std::abs;
|
||||
|
||||
FontInfo HTMLRenderer::install_font(GfxFont * font)
|
||||
const FontInfo * HTMLRenderer::install_font(GfxFont * font)
|
||||
{
|
||||
assert(sizeof(long long) == 2*sizeof(int));
|
||||
|
||||
@ -29,7 +29,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
|
||||
|
||||
auto iter = font_name_map.find(fn_id);
|
||||
if(iter != font_name_map.end())
|
||||
return iter->second;
|
||||
return &(iter->second);
|
||||
|
||||
long long new_fn_id = font_name_map.size();
|
||||
|
||||
@ -38,7 +38,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
|
||||
if(font == nullptr)
|
||||
{
|
||||
export_remote_default_font(new_fn_id);
|
||||
return cur_info_iter->second;
|
||||
return &(cur_info_iter->second);
|
||||
}
|
||||
|
||||
cur_info_iter->second.ascent = font->getAscent();
|
||||
@ -52,12 +52,12 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
|
||||
if(font->getType() == fontType3) {
|
||||
cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl;
|
||||
export_remote_default_font(new_fn_id);
|
||||
return cur_info_iter->second;
|
||||
return &(cur_info_iter->second);
|
||||
}
|
||||
if(font->getWMode()) {
|
||||
cerr << "Writing mode is unsupported and will be rendered as Image" << endl;
|
||||
export_remote_default_font(new_fn_id);
|
||||
return cur_info_iter->second;
|
||||
return &(cur_info_iter->second);
|
||||
}
|
||||
|
||||
auto * font_loc = font->locateFont(xref, gTrue);
|
||||
@ -86,7 +86,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
|
||||
export_remote_default_font(new_fn_id);
|
||||
}
|
||||
|
||||
return cur_info_iter->second;
|
||||
return &(cur_info_iter->second);
|
||||
}
|
||||
|
||||
void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info)
|
||||
|
@ -72,13 +72,10 @@ void HTMLRenderer::updateFillColor(GfxState * state)
|
||||
}
|
||||
void HTMLRenderer::check_state_change(GfxState * state)
|
||||
{
|
||||
//TODO:
|
||||
// close <span> but not <div>, to use the first style of the line
|
||||
|
||||
// DEPENDENCY WARNING
|
||||
// don't adjust the order of state checking
|
||||
|
||||
new_line_status = LineStatus::NONE;
|
||||
new_line_state = NewLineState::NONE;
|
||||
|
||||
bool need_recheck_position = false;
|
||||
bool need_rescale_font = false;
|
||||
@ -94,11 +91,11 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
// font name & size
|
||||
if(all_changed || font_changed)
|
||||
{
|
||||
FontInfo new_font_info = install_font(state->getFont());
|
||||
const FontInfo * new_font_info = install_font(state->getFont());
|
||||
|
||||
if(!(new_font_info.id == cur_font_info.id))
|
||||
if(!(new_font_info->id == cur_font_info->id))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::SPAN);
|
||||
new_line_state = max(new_line_state, NewLineState::SPAN);
|
||||
cur_font_info = new_font_info;
|
||||
}
|
||||
|
||||
@ -168,13 +165,13 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
|
||||
if(!(_equal(new_draw_font_size, draw_font_size)))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::SPAN);
|
||||
new_line_state = max(new_line_state, NewLineState::SPAN);
|
||||
draw_font_size = new_draw_font_size;
|
||||
cur_fs_id = install_font_size(draw_font_size);
|
||||
}
|
||||
if(!(_tm_equal(new_draw_ctm, draw_ctm, 4)))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::DIV);
|
||||
new_line_state = max(new_line_state, NewLineState::DIV);
|
||||
memcpy(draw_ctm, new_draw_ctm, sizeof(draw_ctm));
|
||||
cur_tm_id = install_transform_matrix(draw_ctm);
|
||||
}
|
||||
@ -236,7 +233,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
|
||||
if(!merged)
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::DIV);
|
||||
new_line_state = max(new_line_state, NewLineState::DIV);
|
||||
}
|
||||
}
|
||||
|
||||
@ -247,7 +244,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
double new_letter_space = state->getCharSpace();
|
||||
if(!_equal(cur_letter_space, new_letter_space))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::SPAN);
|
||||
new_line_state = max(new_line_state, NewLineState::SPAN);
|
||||
cur_letter_space = new_letter_space;
|
||||
cur_ls_id = install_letter_space(cur_letter_space * draw_scale);
|
||||
}
|
||||
@ -260,7 +257,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
double new_word_space = state->getWordSpace();
|
||||
if(!_equal(cur_word_space, new_word_space))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::SPAN);
|
||||
new_line_state = max(new_line_state, NewLineState::SPAN);
|
||||
cur_word_space = new_word_space;
|
||||
cur_ws_id = install_word_space(cur_word_space * draw_scale);
|
||||
}
|
||||
@ -273,7 +270,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
state->getFillRGB(&new_color);
|
||||
if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b)))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::SPAN);
|
||||
new_line_state = max(new_line_state, NewLineState::SPAN);
|
||||
cur_color = new_color;
|
||||
cur_color_id = install_color(&new_color);
|
||||
}
|
||||
@ -286,7 +283,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
double new_rise = state->getRise();
|
||||
if(!_equal(cur_rise, new_rise))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::SPAN);
|
||||
new_line_state = max(new_line_state, NewLineState::SPAN);
|
||||
cur_rise = new_rise;
|
||||
cur_rise_id = install_rise(new_rise * draw_scale);
|
||||
}
|
||||
@ -314,31 +311,22 @@ void HTMLRenderer::reset_state_change()
|
||||
}
|
||||
void HTMLRenderer::prepare_line(GfxState * state)
|
||||
{
|
||||
// close old tags when necessary
|
||||
if((line_status == LineStatus::NONE) || (new_line_status == LineStatus::NONE))
|
||||
if(!line_opened)
|
||||
{
|
||||
//pass
|
||||
new_line_state = NewLineState::DIV;
|
||||
}
|
||||
else if(new_line_status == LineStatus::DIV)
|
||||
|
||||
if(new_line_state == NewLineState::DIV)
|
||||
{
|
||||
close_line();
|
||||
|
||||
line_buf.reset(state);
|
||||
|
||||
//resync position
|
||||
draw_ty = cur_ty;
|
||||
draw_tx = cur_tx;
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(new_line_status == LineStatus::SPAN);
|
||||
if(line_status == LineStatus::SPAN)
|
||||
html_fout << "</span>";
|
||||
else
|
||||
assert(line_status == LineStatus::DIV);
|
||||
// don't change line_status
|
||||
}
|
||||
|
||||
if(line_status == LineStatus::NONE)
|
||||
{
|
||||
new_line_status = LineStatus::DIV;
|
||||
}
|
||||
|
||||
if(new_line_status != LineStatus::DIV)
|
||||
{
|
||||
// align horizontal position
|
||||
// try to merge with the last line if possible
|
||||
@ -349,67 +337,24 @@ void HTMLRenderer::prepare_line(GfxState * state)
|
||||
}
|
||||
else
|
||||
{
|
||||
// don't close a pending span here, keep the styling
|
||||
double w;
|
||||
auto wid = install_whitespace(target, w);
|
||||
double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold);
|
||||
line_buf << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
|
||||
draw_tx += w / draw_scale;
|
||||
line_buf.append_offset(target);
|
||||
draw_tx += target / draw_scale;
|
||||
}
|
||||
}
|
||||
|
||||
if(new_line_status != LineStatus::NONE)
|
||||
if(new_line_state != NewLineState::NONE)
|
||||
{
|
||||
// have to open a new tag
|
||||
if (new_line_status == LineStatus::DIV)
|
||||
{
|
||||
state->transform(state->getCurX(), state->getCurY(), &line_x, &line_y);
|
||||
line_tm_id = cur_tm_id;
|
||||
line_ascent = cur_font_info.ascent * draw_font_size;
|
||||
line_height = (cur_font_info.ascent - cur_font_info.descent) * draw_font_size;
|
||||
|
||||
//resync position
|
||||
draw_ty = cur_ty;
|
||||
draw_tx = cur_tx;
|
||||
}
|
||||
else if(new_line_status == LineStatus::SPAN)
|
||||
{
|
||||
// pass
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false && "Bad value of new_line_status");
|
||||
}
|
||||
|
||||
line_buf << format("<span class=\"f%|1$x| s%|2$x| c%|3$x| l%|4$x| w%|5$x| r%|6$x|\">")
|
||||
% cur_font_info.id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
|
||||
line_ascent = max(line_ascent, cur_font_info.ascent * draw_font_size);
|
||||
line_height = max(line_height, (cur_font_info.ascent - cur_font_info.descent) * draw_font_size);
|
||||
|
||||
line_status = LineStatus::SPAN;
|
||||
line_buf.append_state();
|
||||
}
|
||||
|
||||
line_opened = true;
|
||||
}
|
||||
|
||||
void HTMLRenderer::close_line()
|
||||
{
|
||||
if(line_status == LineStatus::NONE)
|
||||
return;
|
||||
|
||||
// TODO class for height
|
||||
html_fout << format("<div style=\"left:%1%px;bottom:%2%px;height:%4%px;\" class=\"l t%|3$x|\">")
|
||||
% line_x
|
||||
% line_y
|
||||
% line_tm_id
|
||||
% line_ascent
|
||||
;
|
||||
html_fout << line_buf.rdbuf();
|
||||
line_buf.str("");
|
||||
|
||||
if(line_status == LineStatus::SPAN)
|
||||
html_fout << "</span>";
|
||||
else
|
||||
assert(line_status == LineStatus::DIV);
|
||||
|
||||
html_fout << "</div>";
|
||||
line_status = LineStatus::NONE;
|
||||
|
||||
if(line_opened)
|
||||
{
|
||||
line_opened = false;
|
||||
line_buf.flush();
|
||||
}
|
||||
}
|
||||
|
@ -407,8 +407,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
++nSpaces;
|
||||
}
|
||||
|
||||
Unicode uu = (cur_font_info.use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
|
||||
outputUnicodes(line_buf, &uu, 1);
|
||||
Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
|
||||
line_buf.append_unicodes(&uu, 1);
|
||||
|
||||
dx += dx1;
|
||||
dy += dy1;
|
||||
|
Loading…
Reference in New Issue
Block a user