1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

first implementation of LineBuffer

This commit is contained in:
Lu Wang 2012-09-04 23:33:15 +08:00
parent c32b0c4009
commit 6079a05e34
8 changed files with 279 additions and 126 deletions

View File

@ -46,6 +46,7 @@ add_executable(pdf2htmlEX
src/HTMLRenderer/text.cc
src/HTMLRenderer/image.cc
src/HTMLRenderer/namespace.h
src/HTMLRenderer/LineBuffer.cc
src/ff/ff.h
src/ff/ff.c
src/BackgroundRenderer.h

View File

@ -132,7 +132,7 @@ class HTMLRenderer : public OutputDev
////////////////////////////////////////////////////
// manage styles
////////////////////////////////////////////////////
FontInfo install_font(GfxFont * font);
const FontInfo * install_font(GfxFont * font);
void install_embedded_font(GfxFont * font, FontInfo & info);
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info);
void install_external_font (GfxFont * font, FontInfo & info);
@ -193,15 +193,13 @@ class HTMLRenderer : public OutputDev
////////////////////////////////////////////////////
// states
////////////////////////////////////////////////////
//line status
//indicating the status for current line & next line
//see comments: meaning for current line || meaning for next line
enum class LineStatus
bool line_opened;
enum class NewLineState
{
NONE, // no line is opened (last <div> is closed) || stay with the same style
SPAN, // there's a pending opening <span> (within a pending opening <div>) || open a new <span> if possible, otherwise a new <div>
DIV // there's a pending opening <div> (but no <span>) || has to open a new <div>
} line_status, new_line_status;
NONE, // stay with the same style
SPAN, // open a new <span> if possible, otherwise a new <div>
DIV // has to open a new <div>
} new_line_state;
// The order is according to the appearance in check_state_change
// any state changed
@ -211,7 +209,7 @@ class HTMLRenderer : public OutputDev
bool text_pos_changed;
// font & size
FontInfo cur_font_info;
const FontInfo * cur_font_info;
double cur_font_size;
long long cur_fs_id;
bool font_changed;
@ -264,11 +262,61 @@ class HTMLRenderer : public OutputDev
double draw_tx, draw_ty;
// some metrics have to be determined after all elements in the lines have been seen
// TODO: add a class for these
double line_x, line_y;
long long line_tm_id;
double line_ascent, line_height;
std::stringstream line_buf;
// Buffers the content of one output line (text, style states and horizontal
// offsets) so that the HTML for the whole line can be generated at once by flush().
class LineBuffer {
public:
// Keeps a back pointer to the owning renderer; the buffer reads the renderer's
// current ids and writes to its HTML output stream.
LineBuffer (HTMLRenderer * renderer) : renderer(renderer) { }
// Snapshot of the style ids in effect from text[start_idx] onwards.
class State {
public:
// Writes the opening <span> tag whose class list is built from ids[] and format_str.
void begin(std::ostream & out) const;
// Writes the matching closing </span> tag.
static void end(std::ostream & out);
// Indices into ids[]; the order must stay in sync with the characters of format_str.
enum {
FONT_ID,
FONT_SIZE_ID,
COLOR_ID,
LETTER_SPACE_ID,
WORD_SPACE_ID,
RISE_ID,
ID_COUNT
};
// One id per entry of the enum above.
long long ids[ID_COUNT];
// Font ascent multiplied by the draw font size; flush() uses the maximum over the line as its height.
double ascent;
size_t start_idx; // index of the first Text using this state
static const char * format_str; // class names for each id
};
// A horizontal offset (whitespace) to be emitted inside the line.
// NOTE: Offset objects are brace-initialised positionally as {start_idx, width}; keep the member order.
class Offset {
public:
size_t start_idx; // should put this idx before text[start_idx];
// Requested width of the offset; consecutive offsets at the same index are summed.
double width;
};
// Records the position and transform matrix id for a new line.
void reset(GfxState * state);
// Appends l code points starting at u to the buffered text.
void append_unicodes(const Unicode * u, int l);
// Registers a horizontal offset in front of the next appended character.
void append_offset(double width);
// Registers the renderer's current style as applying from the next appended character.
void append_state(void);
// Writes the buffered line to the renderer's HTML output and empties the buffer.
void flush(void);
private:
// retrieve state from renderer
void set_state(State & state);
// Owning renderer (not owned by this object).
HTMLRenderer * renderer;
// Line position, emitted by flush() as the "left" / "bottom" CSS values.
double x, y;
// Transform matrix id, emitted by flush() as the "t<id>" class of the line.
long long tm_id;
// States, offsets and code points collected for the current line.
std::vector<State> states;
std::vector<Offset> offsets;
std::vector<Unicode> text;
} line_buf;
friend class LineBuffer;
// for font reencoding
int32_t * cur_mapping;

View File

@ -0,0 +1,158 @@
/*
* LineBuffer.cc
*
* Generate optimized HTML for one line
*
* by WangLu
* 2012.09.04
*/
#include "HTMLRenderer.h"
#include "HTMLRenderer/namespace.h"
using std::min;
using std::max;
using std::hex;
using std::dec;
// Start a new line: remember the transform matrix id currently active in the
// renderer and the current text position mapped through state->transform().
// Both are emitted later by flush() in the line's opening <div>.
void HTMLRenderer::LineBuffer::reset(GfxState * state)
{
    tm_id = renderer->cur_tm_id;

    const double cur_x = state->getCurX();
    const double cur_y = state->getCurY();
    state->transform(cur_x, cur_y, &x, &y);
}
void HTMLRenderer::LineBuffer::append_unicodes(const Unicode * u, int l)
{
text.insert(text.end(), u, u+l);
}
// Record a horizontal offset of the given width in front of the next character
// that will be appended. Offsets landing on the same text index are accumulated
// into a single entry instead of being stored separately.
void HTMLRenderer::LineBuffer::append_offset(double width)
{
    const size_t insert_at = text.size();
    const bool merge_with_last = (!offsets.empty()) && (offsets.back().start_idx == insert_at);

    if(!merge_with_last)
    {
        offsets.push_back({insert_at, width});
        return;
    }

    offsets.back().width += width;
}
// Record the renderer's current style as applying from the next appended
// character onwards. If the most recent state starts at that same index (no
// text was added since), it is overwritten rather than duplicated.
void HTMLRenderer::LineBuffer::append_state(void)
{
    const size_t first_text_idx = text.size();
    const bool reuse_last = (!states.empty()) && (states.back().start_idx == first_text_idx);

    if(!reuse_last)
    {
        // value-initialised new entry, same as growing the vector by one
        states.emplace_back();
        states.back().start_idx = first_text_idx;
    }

    set_state(states.back());
}
void HTMLRenderer::LineBuffer::flush(void)
{
/*
* Each Line is an independent absolute positioined block
* so even we have a few states or offsets, we may omit them
*/
if(text.empty()) return;
if(states.empty() || (states[0].start_idx != 0))
{
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
return;
}
states.resize(states.size() + 1);
states.back().start_idx = text.size();
offsets.push_back({text.size(), 0});
// TODO: optimize state
double max_ascent = 0;
for(const State & s : states)
max_ascent = max(max_ascent, s.ascent);
// TODO: class for height ?
ostream & out = renderer->html_fout;
out << format("<div style=\"left:%1%px;bottom:%2%px;height:%3%px;\" class=\"l t%|4$x|\">")
% x % y
% max_ascent
% tm_id
;
auto cur_state_iter = states.begin();
auto cur_offset_iter = offsets.begin();
double dx = 0;
size_t cur_text_idx = 0;
while(cur_text_idx < text.size())
{
if(cur_text_idx >= cur_state_iter->start_idx)
{
if(cur_text_idx)
State::end(out);
cur_state_iter->begin(out);
++ cur_state_iter;
}
if(cur_text_idx >= cur_offset_iter->start_idx)
{
double target = cur_offset_iter->width + dx;
double w;
auto wid = renderer->install_whitespace(target, w);
// TODO
// double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold);
double threshold = 0;
out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
dx = target - w;
++ cur_offset_iter;
}
size_t next_text_idx = min(cur_state_iter->start_idx, cur_offset_iter->start_idx);
outputUnicodes(out, text.data() + cur_text_idx, next_text_idx - cur_text_idx);
cur_text_idx = next_text_idx;
}
State::end(out);
out << "</div>";
states.clear();
offsets.clear();
text.clear();
}
// Fill `state` with the ids currently held by the renderer (font, font size,
// color, letter space, word space, rise) and with the ascent of the current
// font multiplied by the current draw font size. start_idx is left untouched.
void HTMLRenderer::LineBuffer::set_state (State & state)
{
    const auto & font = *(renderer->cur_font_info);
    long long * const ids = state.ids;

    ids[State::FONT_ID]         = font.id;
    ids[State::FONT_SIZE_ID]    = renderer->cur_fs_id;
    ids[State::COLOR_ID]        = renderer->cur_color_id;
    ids[State::LETTER_SPACE_ID] = renderer->cur_ls_id;
    ids[State::WORD_SPACE_ID]   = renderer->cur_ws_id;
    ids[State::RISE_ID]         = renderer->cur_rise_id;

    state.ascent = font.ascent * renderer->draw_font_size;
}
// Write the opening <span> tag for this state. Its class attribute lists one
// class per id, separated by single spaces; each class name is the id's
// character from format_str followed by the id in hexadecimal.
void HTMLRenderer::LineBuffer::State::begin (ostream & out) const
{
    out << "<span class=\"";

    for(int slot = 0; slot != ID_COUNT; ++slot)
    {
        const char prefix = format_str[slot];
        const long long id = ids[slot];

        if(slot != 0)
            out << ' ';

        out << format("%1%%|2$x|") % prefix % id;
    }

    out << "\">";
}
// Write the closing tag that matches the <span> opened by begin().
void HTMLRenderer::LineBuffer::State::end(ostream & out)
{
    const char * const closing_tag = "</span>";
    out << closing_tag;
}
// One character per entry of the State id enum, in enum order (font, font size,
// color, letter space, word space, rise); State::begin() uses the i-th character
// as the prefix of the class name generated for ids[i].
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";

View File

@ -16,7 +16,7 @@
using boost::algorithm::ifind_first;
void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font)
void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font)
{
allcss_fout << format("@font-face{font-family:f%|1$x|;src:url(") % info.id;
@ -45,12 +45,12 @@ static string general_font_family(GfxFont * font)
}
// TODO: this function is called when some font is unable to process, may use the name there as a hint
void HTMLRenderer::export_remote_default_font(long long fn_id)
void HTMLRenderer::export_remote_default_font(long long fn_id)
{
allcss_fout << format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id << endl;
}
void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
{
allcss_fout << format(".f%|1$x|{") % info.id;
allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
@ -68,12 +68,12 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons
allcss_fout << "}" << endl;
}
void HTMLRenderer::export_font_size (long long fs_id, double font_size)
void HTMLRenderer::export_font_size (long long fs_id, double font_size)
{
allcss_fout << format(".s%|1$x|{font-size:%2%px;}") % fs_id % font_size << endl;
}
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
{
allcss_fout << format(".t%|1$x|{") % tm_id;
@ -101,24 +101,24 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
allcss_fout << "}" << endl;
}
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
{
allcss_fout << format(".l%|1$x|{letter-spacing:%2%px;}") % ls_id % letter_space << endl;
}
void HTMLRenderer::export_word_space (long long ws_id, double word_space)
void HTMLRenderer::export_word_space (long long ws_id, double word_space)
{
allcss_fout << format(".w%|1$x|{word-spacing:%2%px;}") % ws_id % word_space << endl;
}
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
{
allcss_fout << format(".c%|1$x|{color:rgb(%2%,%3%,%4%);}")
% color_id % (int)colToByte(rgb->r) % (int)colToByte(rgb->g) % (int)colToByte(rgb->b)
<< endl;
}
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
{
if(ws_width > 0)
allcss_fout << format("._%|1$x|{display:inline-block;width:%2%px;}") % ws_id % ws_width << endl;
@ -126,7 +126,7 @@ void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
allcss_fout << format("._%|1$x|{display:inline;margin-left:%2%px;}") % ws_id % ws_width << endl;
}
void HTMLRenderer::export_rise (long long rise_id, double rise)
void HTMLRenderer::export_rise (long long rise_id, double rise)
{
allcss_fout << format(".r%|1$x|{top:%2%px;}") % rise_id % (-rise) << endl;
}

View File

@ -25,7 +25,8 @@ static void dummy(void *, ErrorCategory, int pos, char *)
}
HTMLRenderer::HTMLRenderer(const Param * param)
:line_status(LineStatus::NONE)
:line_opened(false)
,line_buf(this)
,image_count(0)
,param(param)
,dest_dir(param->dest_dir)
@ -147,7 +148,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
this->pageWidth = state->getPageWidth();
this->pageHeight = state->getPageHeight();
assert(line_status == LineStatus::NONE);
assert((!line_opened) && "Open line in startPage detected!");
html_fout << format("<div id=\"p%|1$x|\" class=\"p\" style=\"width:%2%px;height:%3%px;") % pageNum % pageWidth % pageHeight;

View File

@ -21,7 +21,7 @@
using std::abs;
FontInfo HTMLRenderer::install_font(GfxFont * font)
const FontInfo * HTMLRenderer::install_font(GfxFont * font)
{
assert(sizeof(long long) == 2*sizeof(int));
@ -29,7 +29,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
auto iter = font_name_map.find(fn_id);
if(iter != font_name_map.end())
return iter->second;
return &(iter->second);
long long new_fn_id = font_name_map.size();
@ -38,7 +38,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
if(font == nullptr)
{
export_remote_default_font(new_fn_id);
return cur_info_iter->second;
return &(cur_info_iter->second);
}
cur_info_iter->second.ascent = font->getAscent();
@ -52,12 +52,12 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
if(font->getType() == fontType3) {
cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl;
export_remote_default_font(new_fn_id);
return cur_info_iter->second;
return &(cur_info_iter->second);
}
if(font->getWMode()) {
cerr << "Writing mode is unsupported and will be rendered as Image" << endl;
export_remote_default_font(new_fn_id);
return cur_info_iter->second;
return &(cur_info_iter->second);
}
auto * font_loc = font->locateFont(xref, gTrue);
@ -86,7 +86,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
export_remote_default_font(new_fn_id);
}
return cur_info_iter->second;
return &(cur_info_iter->second);
}
void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info)

View File

@ -72,13 +72,10 @@ void HTMLRenderer::updateFillColor(GfxState * state)
}
void HTMLRenderer::check_state_change(GfxState * state)
{
//TODO:
// close <span> but not <div>, to use the first style of the line
// DEPENDENCY WARNING
// don't adjust the order of state checking
new_line_status = LineStatus::NONE;
new_line_state = NewLineState::NONE;
bool need_recheck_position = false;
bool need_rescale_font = false;
@ -94,11 +91,11 @@ void HTMLRenderer::check_state_change(GfxState * state)
// font name & size
if(all_changed || font_changed)
{
FontInfo new_font_info = install_font(state->getFont());
const FontInfo * new_font_info = install_font(state->getFont());
if(!(new_font_info.id == cur_font_info.id))
if(!(new_font_info->id == cur_font_info->id))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
new_line_state = max(new_line_state, NewLineState::SPAN);
cur_font_info = new_font_info;
}
@ -168,13 +165,13 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(!(_equal(new_draw_font_size, draw_font_size)))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
new_line_state = max(new_line_state, NewLineState::SPAN);
draw_font_size = new_draw_font_size;
cur_fs_id = install_font_size(draw_font_size);
}
if(!(_tm_equal(new_draw_ctm, draw_ctm, 4)))
{
new_line_status = max(new_line_status, LineStatus::DIV);
new_line_state = max(new_line_state, NewLineState::DIV);
memcpy(draw_ctm, new_draw_ctm, sizeof(draw_ctm));
cur_tm_id = install_transform_matrix(draw_ctm);
}
@ -236,7 +233,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(!merged)
{
new_line_status = max(new_line_status, LineStatus::DIV);
new_line_state = max(new_line_state, NewLineState::DIV);
}
}
@ -247,7 +244,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
double new_letter_space = state->getCharSpace();
if(!_equal(cur_letter_space, new_letter_space))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
new_line_state = max(new_line_state, NewLineState::SPAN);
cur_letter_space = new_letter_space;
cur_ls_id = install_letter_space(cur_letter_space * draw_scale);
}
@ -260,7 +257,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
double new_word_space = state->getWordSpace();
if(!_equal(cur_word_space, new_word_space))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
new_line_state = max(new_line_state, NewLineState::SPAN);
cur_word_space = new_word_space;
cur_ws_id = install_word_space(cur_word_space * draw_scale);
}
@ -273,7 +270,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
state->getFillRGB(&new_color);
if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b)))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
new_line_state = max(new_line_state, NewLineState::SPAN);
cur_color = new_color;
cur_color_id = install_color(&new_color);
}
@ -286,7 +283,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
double new_rise = state->getRise();
if(!_equal(cur_rise, new_rise))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
new_line_state = max(new_line_state, NewLineState::SPAN);
cur_rise = new_rise;
cur_rise_id = install_rise(new_rise * draw_scale);
}
@ -314,31 +311,22 @@ void HTMLRenderer::reset_state_change()
}
void HTMLRenderer::prepare_line(GfxState * state)
{
// close old tags when necessary
if((line_status == LineStatus::NONE) || (new_line_status == LineStatus::NONE))
if(!line_opened)
{
//pass
}
else if(new_line_status == LineStatus::DIV)
{
close_line();
}
else
{
assert(new_line_status == LineStatus::SPAN);
if(line_status == LineStatus::SPAN)
html_fout << "</span>";
else
assert(line_status == LineStatus::DIV);
// don't change line_status
}
if(line_status == LineStatus::NONE)
{
new_line_status = LineStatus::DIV;
new_line_state = NewLineState::DIV;
}
if(new_line_status != LineStatus::DIV)
if(new_line_state == NewLineState::DIV)
{
close_line();
line_buf.reset(state);
//resync position
draw_ty = cur_ty;
draw_tx = cur_tx;
}
else
{
// align horizontal position
// try to merge with the last line if possible
@ -349,67 +337,24 @@ void HTMLRenderer::prepare_line(GfxState * state)
}
else
{
// don't close a pending span here, keep the styling
double w;
auto wid = install_whitespace(target, w);
double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold);
line_buf << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
draw_tx += w / draw_scale;
line_buf.append_offset(target);
draw_tx += target / draw_scale;
}
}
if(new_line_status != LineStatus::NONE)
if(new_line_state != NewLineState::NONE)
{
// have to open a new tag
if (new_line_status == LineStatus::DIV)
{
state->transform(state->getCurX(), state->getCurY(), &line_x, &line_y);
line_tm_id = cur_tm_id;
line_ascent = cur_font_info.ascent * draw_font_size;
line_height = (cur_font_info.ascent - cur_font_info.descent) * draw_font_size;
//resync position
draw_ty = cur_ty;
draw_tx = cur_tx;
}
else if(new_line_status == LineStatus::SPAN)
{
// pass
}
else
{
assert(false && "Bad value of new_line_status");
}
line_buf << format("<span class=\"f%|1$x| s%|2$x| c%|3$x| l%|4$x| w%|5$x| r%|6$x|\">")
% cur_font_info.id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
line_ascent = max(line_ascent, cur_font_info.ascent * draw_font_size);
line_height = max(line_height, (cur_font_info.ascent - cur_font_info.descent) * draw_font_size);
line_status = LineStatus::SPAN;
line_buf.append_state();
}
line_opened = true;
}
void HTMLRenderer::close_line()
{
if(line_status == LineStatus::NONE)
return;
// TODO class for height
html_fout << format("<div style=\"left:%1%px;bottom:%2%px;height:%4%px;\" class=\"l t%|3$x|\">")
% line_x
% line_y
% line_tm_id
% line_ascent
;
html_fout << line_buf.rdbuf();
line_buf.str("");
if(line_status == LineStatus::SPAN)
html_fout << "</span>";
else
assert(line_status == LineStatus::DIV);
html_fout << "</div>";
line_status = LineStatus::NONE;
if(line_opened)
{
line_opened = false;
line_buf.flush();
}
}

View File

@ -407,8 +407,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
++nSpaces;
}
Unicode uu = (cur_font_info.use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
outputUnicodes(line_buf, &uu, 1);
Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
line_buf.append_unicodes(&uu, 1);
dx += dx1;
dy += dy1;