1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

refactor line/page stuffs, preparing to support clip

This commit is contained in:
Lu Wang 2013-04-07 23:50:23 +08:00
parent fb7423dccd
commit 1da9be54d9
10 changed files with 134 additions and 138 deletions

View File

@ -57,7 +57,6 @@ public:
// will be reduced to a series of other drawing operations.
virtual GBool useShadedFills(int type) { return (type == 2) ? gTrue: gFalse; }
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
virtual GBool interpretType3Chars() { return gFalse; }
@ -111,6 +110,10 @@ public:
/*
* Rendering
*/
virtual void clip(GfxState * state);
virtual void eoClip(GfxState * state);
virtual void clipToStrokePath(GfxState * state);
virtual void drawString(GfxState * state, GooString * s);
@ -186,7 +189,6 @@ protected:
// prepare the line context, (close old tags, open new tags)
// make sure the current HTML style consistent with PDF
void prepare_text_line(GfxState * state);
void close_text_line();
////////////////////////////////////////////////////
// CSS drawing
@ -261,6 +263,7 @@ protected:
bool word_space_changed;
bool letter_space_changed;
bool stroke_color_changed;
bool clip_changed;
////////////////////////////////////////////////////
// HTML states
@ -279,19 +282,26 @@ protected:
// also keep in mind that they are not the final positions, as they will be transformed by the CTM (also true for cur_tx/ty)
double draw_tx, draw_ty;
// managers store values actually used in HTML (i.e. scaled)
AllStateManager all_manager;
enum NewLineState
{
NLS_NONE, // stay with the same style
NLS_SPAN, // open a new <span> if possible, otherwise a new <div>
NLS_DIV // has to open a new <div>
} new_line_state;
////////////////////////////////////////////////////
// styles & resources
////////////////////////////////////////////////////
// managers store values actually used in HTML (i.e. scaled)
std::unordered_map<long long, FontInfo> font_info_map;
AllStateManager all_manager;
HTMLTextState cur_text_state;
HTMLLineState cur_line_state;
HTMLTextPage html_text_page;
// How much of the HTML text context must be (re)opened before the next text is emitted.
// The enumerators are ordered by severity: requests are combined with
// max<NewLineState>(...) and compared with '<', so do not reorder them.
enum NewLineState
{
NLS_NONE, // nothing to do: no new state is appended to the current line
NLS_NEWSTATE, // text state changed (font, size, spacing, color, vertical align): append a new state to the current line
NLS_NEWLINE, // line state changed or the text cannot be merged into the current line: open a new line
NLS_NEWCLIP // NOTE(review): presumably requested when the clip changes; not referenced in the code visible here -- confirm
} new_line_state;
// for font reencoding
int32_t * cur_mapping;
char ** cur_mapping2;
@ -303,13 +313,6 @@ protected:
// for string formatting
StringFormatter str_fmt;
////////////////////////////////////////////////////
// styles & resources
////////////////////////////////////////////////////
HTMLState cur_html_state;
std::unordered_map<long long, FontInfo> font_info_map;
struct {
std::ofstream fs;
std::string path;

View File

@ -347,8 +347,6 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
const GfxRGB * line_color, const GfxRGB * fill_color,
void (*style_function)(void *, ostream &), void * style_function_data)
{
close_text_line();
double new_tm[6];
memcpy(new_tm, tm, sizeof(new_tm));

View File

@ -207,8 +207,6 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
}
void HTMLRenderer::endPage() {
close_text_line();
// dump all text
html_text_page.dump_text(f_pages.fs);
html_text_page.dump_css(f_css.fs);

View File

@ -86,6 +86,18 @@ void HTMLRenderer::updateStrokeColor(GfxState * state)
{
stroke_color_changed = true;
}
// Clip callback of the renderer (declared virtual in the class).
// It does not inspect `state`; it only raises clip_changed, which
// reset_state_change() clears again.
void HTMLRenderer::clip(GfxState * state)
{
clip_changed = true;
}
// Clip callback of the renderer (declared virtual in the class);
// NOTE(review): presumably the even-odd-rule variant of clip() -- confirm against the output device interface.
// It does not inspect `state`; it only raises clip_changed, which
// reset_state_change() clears again.
void HTMLRenderer::eoClip(GfxState * state)
{
clip_changed = true;
}
// Clip callback of the renderer (declared virtual in the class);
// NOTE(review): presumably invoked when clipping to a stroked path -- confirm against the output device interface.
// It does not inspect `state`; it only raises clip_changed, which
// reset_state_change() clears again.
void HTMLRenderer::clipToStrokePath(GfxState * state)
{
clip_changed = true;
}
void HTMLRenderer::reset_state()
{
draw_text_scale = 1.0;
@ -95,16 +107,17 @@ void HTMLRenderer::reset_state()
memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm));
// reset html_state
cur_html_state.font_info = install_font(nullptr);
cur_html_state.font_size = 0;
cur_html_state.fill_color.transparent = true;
cur_html_state.stroke_color.transparent = true;
cur_html_state.letter_space = 0;
cur_html_state.word_space = 0;
cur_html_state.vertical_align = 0;
cur_html_state.x = 0;
cur_html_state.y = 0;
memcpy(cur_html_state.transform_matrix, ID_MATRIX, sizeof(cur_html_state.transform_matrix));
cur_text_state.font_info = install_font(nullptr);
cur_text_state.font_size = 0;
cur_text_state.fill_color.transparent = true;
cur_text_state.stroke_color.transparent = true;
cur_text_state.letter_space = 0;
cur_text_state.word_space = 0;
cur_text_state.vertical_align = 0;
cur_line_state.x = 0;
cur_line_state.y = 0;
memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
cur_tx = cur_ty = 0;
draw_tx = draw_ty = 0;
@ -129,6 +142,8 @@ void HTMLRenderer::reset_state_change()
fill_color_changed = false;
stroke_color_changed = false;
clip_changed = false;
}
void HTMLRenderer::check_state_change(GfxState * state)
{
@ -142,7 +157,8 @@ void HTMLRenderer::check_state_change(GfxState * state)
bool draw_text_scale_changed = false;
// save current info for later use
HTMLState old_html_state = cur_html_state;
auto old_text_state = cur_text_state;
auto old_line_state = cur_line_state;
double old_tm[6];
memcpy(old_tm, cur_text_tm, sizeof(old_tm));
double old_draw_text_scale = draw_text_scale;
@ -159,20 +175,20 @@ void HTMLRenderer::check_state_change(GfxState * state)
{
const FontInfo * new_font_info = install_font(state->getFont());
if(!(new_font_info->id == cur_html_state.font_info->id))
if(!(new_font_info->id == cur_text_state.font_info->id))
{
// The width of the type 3 font text, if shown, is likely to be wrong
// So we will create separate (absolute positioned) blocks for them, such that it won't affect other text
// TODO: consider the font matrix and estimate the metrics
if(new_font_info->is_type3 || cur_html_state.font_info->is_type3)
if(new_font_info->is_type3 || cur_text_state.font_info->is_type3)
{
new_line_state = max<NewLineState>(new_line_state, NLS_DIV);
new_line_state = max<NewLineState>(new_line_state, NLS_NEWLINE);
}
else
{
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
new_line_state = max<NewLineState>(new_line_state, NLS_NEWSTATE);
}
cur_html_state.font_info = new_font_info;
cur_text_state.font_info = new_font_info;
}
double new_font_size = state->getFontSize();
@ -252,23 +268,23 @@ void HTMLRenderer::check_state_change(GfxState * state)
draw_text_scale = new_draw_text_scale;
}
if(!equal(new_draw_font_size, cur_html_state.font_size))
if(!equal(new_draw_font_size, cur_text_state.font_size))
{
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_html_state.font_size = new_draw_font_size;
new_line_state = max<NewLineState>(new_line_state, NLS_NEWSTATE);
cur_text_state.font_size = new_draw_font_size;
}
if(!tm_equal(new_draw_text_tm, cur_html_state.transform_matrix, 4))
if(!tm_equal(new_draw_text_tm, cur_line_state.transform_matrix, 4))
{
new_line_state = max<NewLineState>(new_line_state, NLS_DIV);
memcpy(cur_html_state.transform_matrix, new_draw_text_tm, sizeof(cur_html_state.transform_matrix));
new_line_state = max<NewLineState>(new_line_state, NLS_NEWLINE);
memcpy(cur_line_state.transform_matrix, new_draw_text_tm, sizeof(cur_line_state.transform_matrix));
}
}
// see if the new line is compatible with the current line with proper position shift
// don't bother doing the heavy job when (new_line_state == NLS_DIV)
// don't bother doing the heavy job when (new_line_state == NLS_NEWLINE)
// depends: text position & transformation
if(need_recheck_position && (new_line_state < NLS_DIV))
if(need_recheck_position && (new_line_state < NLS_NEWLINE))
{
// TM[4] and/or TM[5] have been changed
// To find an offset (dx,dy), which would cancel the effect
@ -285,7 +301,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
bool merged = false;
double dx = 0;
double dy = 0;
if(tm_equal(old_html_state.transform_matrix, cur_html_state.transform_matrix, 4))
if(tm_equal(old_line_state.transform_matrix, cur_line_state.transform_matrix, 4))
{
double det = old_tm[0] * old_tm[3] - old_tm[1] * old_tm[2];
if(!equal(det, 0))
@ -316,12 +332,12 @@ void HTMLRenderer::check_state_change(GfxState * state)
// otherwise we merge the lines only when
// - text is not shifted to the left too much
// - text is not moved too high or too low
if((dx * old_draw_text_scale) >= -param.space_threshold * old_html_state.em_size() - EPS)
if((dx * old_draw_text_scale) >= -param.space_threshold * old_text_state.em_size() - EPS)
{
double oldymin = old_html_state.font_info->descent * old_html_state.font_size;
double oldymax = old_html_state.font_info->ascent * old_html_state.font_size;
double ymin = dy * old_draw_text_scale + cur_html_state.font_info->descent * cur_html_state.font_size;
double ymax = dy * old_draw_text_scale + cur_html_state.font_info->ascent * cur_html_state.font_size;
double oldymin = old_text_state.font_info->descent * old_text_state.font_size;
double oldymax = old_text_state.font_info->ascent * old_text_state.font_size;
double ymin = dy * old_draw_text_scale + cur_text_state.font_info->descent * cur_text_state.font_size;
double ymax = dy * old_draw_text_scale + cur_text_state.font_info->ascent * cur_text_state.font_size;
if((ymin <= oldymax + EPS) && (ymax >= oldymin - EPS))
{
merged = true;
@ -335,22 +351,22 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(merged)
{
html_text_page.append_offset(dx * old_draw_text_scale);
html_text_page.get_cur_line()->append_offset(dx * old_draw_text_scale);
if(equal(dy, 0))
{
cur_html_state.vertical_align = 0;
cur_text_state.vertical_align = 0;
}
else
{
cur_html_state.vertical_align = (dy * old_draw_text_scale);
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_text_state.vertical_align = (dy * old_draw_text_scale);
new_line_state = max<NewLineState>(new_line_state, NLS_NEWSTATE);
}
draw_tx = cur_tx;
draw_ty = cur_ty;
}
else
{
new_line_state = max<NewLineState>(new_line_state, NLS_DIV);
new_line_state = max<NewLineState>(new_line_state, NLS_NEWLINE);
}
}
@ -359,10 +375,10 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(all_changed || letter_space_changed || draw_text_scale_changed)
{
double new_letter_space = state->getCharSpace() * draw_text_scale;
if(!equal(new_letter_space, cur_html_state.letter_space))
if(!equal(new_letter_space, cur_text_state.letter_space))
{
cur_html_state.letter_space = new_letter_space;
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_text_state.letter_space = new_letter_space;
new_line_state = max<NewLineState>(new_line_state, NLS_NEWSTATE);
}
}
@ -371,10 +387,10 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(all_changed || word_space_changed || draw_text_scale_changed)
{
double new_word_space = state->getWordSpace() * draw_text_scale;
if(!equal(new_word_space, cur_html_state.word_space))
if(!equal(new_word_space, cur_text_state.word_space))
{
cur_html_state.word_space = new_word_space;
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_text_state.word_space = new_word_space;
new_line_state = max<NewLineState>(new_line_state, NLS_NEWSTATE);
}
}
@ -396,10 +412,10 @@ void HTMLRenderer::check_state_change(GfxState * state)
{
new_fill_color.transparent = true;
}
if(!(new_fill_color == cur_html_state.fill_color))
if(!(new_fill_color == cur_text_state.fill_color))
{
cur_html_state.fill_color = new_fill_color;
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_text_state.fill_color = new_fill_color;
new_line_state = max<NewLineState>(new_line_state, NLS_NEWSTATE);
}
}
@ -422,10 +438,10 @@ void HTMLRenderer::check_state_change(GfxState * state)
{
new_stroke_color.transparent = true;
}
if(!(new_stroke_color == cur_html_state.stroke_color))
if(!(new_stroke_color == cur_text_state.stroke_color))
{
cur_html_state.stroke_color = new_stroke_color;
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_text_state.stroke_color = new_stroke_color;
new_line_state = max<NewLineState>(new_line_state, NLS_NEWSTATE);
}
}
@ -434,13 +450,16 @@ void HTMLRenderer::check_state_change(GfxState * state)
void HTMLRenderer::prepare_text_line(GfxState * state)
{
if(new_line_state == NLS_DIV)
{
close_text_line();
if(!(html_text_page.get_cur_line()))
new_line_state = NLS_NEWLINE;
if(new_line_state == NLS_NEWLINE)
{
// update position such that they will be recorded by text_line_buf
state->transform(state->getCurX(), state->getCurY(), &cur_html_state.x, &cur_html_state.y);
cur_html_state.vertical_align = 0;
state->transform(state->getCurX(), state->getCurY(), &cur_line_state.x, &cur_line_state.y);
html_text_page.open_new_line(cur_line_state);
cur_text_state.vertical_align = 0;
//resync position
draw_ty = cur_ty;
@ -453,20 +472,15 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
double target = (cur_tx - draw_tx) * draw_text_scale;
if(!equal(target, 0))
{
html_text_page.append_offset(target);
html_text_page.get_cur_line()->append_offset(target);
draw_tx += target / draw_text_scale;
}
}
if(new_line_state != NLS_NONE)
{
html_text_page.append_state(cur_html_state);
html_text_page.get_cur_line()->append_state(cur_text_state);
}
}
void HTMLRenderer::close_text_line()
{
html_text_page.open_new_line();
}
} //namespace pdf2htmlEX

View File

@ -90,19 +90,19 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
if(is_space && (param.space_as_offset))
{
// ignore horiz_scaling, as it's merged in CTM
html_text_page.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
}
else
{
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
{
html_text_page.append_unicodes(u, uLen);
html_text_page.get_cur_line()->append_unicodes(u, uLen);
// TODO: the decomposed characters may not have the same width as the original ligature; this needs to be fixed.
}
else
{
Unicode uu;
if(cur_html_state.font_info->use_tounicode)
if(cur_text_state.font_info->use_tounicode)
{
uu = check_unicode(u, uLen, code, font);
}
@ -110,14 +110,14 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
uu = unicode_from_font(code, font);
}
html_text_page.append_unicodes(&uu, 1);
html_text_page.get_cur_line()->append_unicodes(&uu, 1);
/*
* In PDF, word_space is appended if (n == 1 and *p == ' ')
* but in HTML, word_space is appended if (uu == ' ')
*/
int space_count = (is_space ? 1 : 0) - (uu == ' ' ? 1 : 0);
if(space_count != 0)
html_text_page.append_offset(cur_word_space * draw_text_scale * space_count);
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
}
}

View File

@ -19,7 +19,7 @@ struct FontInfo
bool is_type3;
};
struct HTMLState
struct HTMLTextState
{
const FontInfo * font_info;
double font_size;
@ -31,9 +31,6 @@ struct HTMLState
// relative to the previous state
double vertical_align;
double x,y;
double transform_matrix[4];
// the offset caused by a single ' ' char
double single_space_offset(void) const {
return word_space + letter_space + font_info->space_width * font_size;
@ -44,6 +41,17 @@ struct HTMLState
}
};
// Per-line placement data, kept separately from the per-span text state.
// A copy is stored in every HTMLTextLine and used when the line's <div> is written.
struct HTMLLineState
{
    // position of the line; filled from the transformed current point
    // and installed as the line's left / bottom values
    double x;
    double y;

    // the four matrix entries compared and installed for the line
    // (the translation part is not stored here)
    double transform_matrix[4];
};
// Axis-aligned bounds given as min/max pairs.
// NOTE(review): presumably the clip rectangle for upcoming clip support;
// nothing in the visible code reads or writes it yet -- confirm.
struct HTMLClipState
{
    double xmin;
    double xmax;
    double ymin;
    double ymax;
};
} // namespace pdf2htmlEX
#endif //HTMLSTATE_H__

View File

@ -25,8 +25,10 @@ using std::endl;
using std::find;
using std::abs;
HTMLTextLine::HTMLTextLine (const Param & param, AllStateManager & all_manager)
: param(param), all_manager(all_manager)
// Builds an empty text line.
// `param` and `all_manager` initialise the members of the same name;
// `line_state` is copied into the line so it is available when the line is dumped.
// The initializers are listed in the order param, all_manager, line_state.
HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager)
    : param(param),
      all_manager(all_manager),
      line_state(line_state)
{
}
void HTMLTextLine::append_unicodes(const Unicode * u, int l)
@ -47,7 +49,7 @@ void HTMLTextLine::append_offset(double width)
offsets.emplace_back(text.size(), width);
}
void HTMLTextLine::append_state(const HTMLState & html_state)
void HTMLTextLine::append_state(const HTMLTextState & text_state)
{
if(states.empty() || (states.back().start_idx != text.size()))
{
@ -56,7 +58,7 @@ void HTMLTextLine::append_state(const HTMLState & html_state)
states.back().hash_umask = 0;
}
(HTMLState&)(states.back()) = html_state;
(HTMLTextState&)(states.back()) = text_state;
}
void HTMLTextLine::dump_text(ostream & out)
@ -78,10 +80,10 @@ void HTMLTextLine::dump_text(ostream & out)
{
// open <div> for the current text line
out << "<div class=\"" << CSS::LINE_CN
<< " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(states[0].transform_matrix)
<< " " << CSS::LEFT_CN << all_manager.left.install(states[0].x)
<< " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(line_state.transform_matrix)
<< " " << CSS::LEFT_CN << all_manager.left.install(line_state.x)
<< " " << CSS::HEIGHT_CN << all_manager.height.install(ascent)
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(states[0].y)
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y)
;
// it will be closed by the first state
}

View File

@ -27,9 +27,9 @@ namespace pdf2htmlEX {
class HTMLTextLine
{
public:
HTMLTextLine (const Param & param, AllStateManager & all_manager);
HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager);
struct State : public HTMLState {
struct State : public HTMLTextState {
// before output
void begin(std::ostream & out, const State * prev_state);
// after output
@ -75,7 +75,7 @@ public:
void append_unicodes(const Unicode * u, int l);
void append_offset(double width);
void append_state(const HTMLState & html_state);
void append_state(const HTMLTextState & text_state);
void dump_text(std::ostream & out);
bool text_empty(void) const { return text.empty(); }
@ -85,13 +85,13 @@ public:
* Optimize and calculate necessary values
*/
void prepare(void);
private:
void optimize(void);
const Param & param;
AllStateManager & all_manager;
HTMLLineState line_state;
double ascent, descent;
std::vector<State> states;

View File

@ -16,7 +16,7 @@ using std::unique_ptr;
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
: param(param)
, all_manager(all_manager)
, last_line(nullptr)
, cur_line(nullptr)
{ }
void HTMLTextPage::dump_text(ostream & out)
@ -29,27 +29,6 @@ void HTMLTextPage::dump_text(ostream & out)
(*iter)->dump_text(out);
}
void HTMLTextPage::append_unicodes(const Unicode * u, int l)
{
if(!last_line)
open_new_line();
last_line->append_unicodes(u, l);
}
void HTMLTextPage::append_offset(double offset)
{
if(!last_line)
open_new_line();
last_line->append_offset(offset);
}
void HTMLTextPage::append_state(const HTMLState & state)
{
if(!last_line)
open_new_line();
last_line->append_state(state);
}
void HTMLTextPage::dump_css(ostream & out)
{
//TODO
@ -58,21 +37,17 @@ void HTMLTextPage::dump_css(ostream & out)
void HTMLTextPage::clear(void)
{
text_lines.clear();
last_line = nullptr;
cur_line = nullptr;
}
void HTMLTextPage::open_new_line(void)
void HTMLTextPage::open_new_line(const HTMLLineState & line_state)
{
if(last_line && (last_line->text_empty()))
if((!text_lines.empty()) && (text_lines.back()->text_empty()))
{
// state and offsets might be nonempty
last_line->clear();
}
else
{
text_lines.emplace_back(new HTMLTextLine(param, all_manager));
last_line = text_lines.back().get();
text_lines.pop_back();
}
text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager));
cur_line = text_lines.back().get();
}
void HTMLTextPage::optimize(void)

View File

@ -28,22 +28,20 @@ class HTMLTextPage
public:
HTMLTextPage (const Param & param, AllStateManager & all_manager);
void append_unicodes(const Unicode * u, int l);
void append_offset(double offset);
void append_state(const HTMLState & state);
HTMLTextLine * get_cur_line(void) const { return cur_line; }
void dump_text(std::ostream & out);
void dump_css(std::ostream & out);
void clear(void);
void open_new_line(void);
void open_new_line(const HTMLLineState & line_state);
private:
void optimize(void);
const Param & param;
AllStateManager & all_manager;
HTMLTextLine * last_line;
HTMLTextLine * cur_line;
std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
};