diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 390f4c3..85d9eb0 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -57,7 +57,6 @@ public: // will be reduced to a series of other drawing operations. virtual GBool useShadedFills(int type) { return (type == 2) ? gTrue: gFalse; } - // Does this device use beginType3Char/endType3Char? Otherwise, // text in Type 3 fonts will be drawn with drawChar/drawString. virtual GBool interpretType3Chars() { return gFalse; } @@ -111,6 +110,10 @@ public: /* * Rendering */ + + virtual void clip(GfxState * state); + virtual void eoClip(GfxState * state); + virtual void clipToStrokePath(GfxState * state); virtual void drawString(GfxState * state, GooString * s); @@ -186,7 +189,6 @@ protected: // prepare the line context, (close old tags, open new tags) // make sure the current HTML style consistent with PDF void prepare_text_line(GfxState * state); - void close_text_line(); //////////////////////////////////////////////////// // CSS drawing @@ -261,6 +263,7 @@ protected: bool word_space_changed; bool letter_space_changed; bool stroke_color_changed; + bool clip_changed; //////////////////////////////////////////////////// // HTML states @@ -279,19 +282,26 @@ protected: // also keep in mind that they are not the final position, as they will be transform by CTM (also true for cur_tx/ty) double draw_tx, draw_ty; - // managers store values actually used in HTML (i.e. scaled) - AllStateManager all_manager; - enum NewLineState - { - NLS_NONE, // stay with the same style - NLS_SPAN, // open a new if possible, otherwise a new
- NLS_DIV // has to open a new
- } new_line_state; - + //////////////////////////////////////////////////// + // styles & resources + //////////////////////////////////////////////////// + // managers store values actually used in HTML (i.e. scaled) + std::unordered_map font_info_map; + AllStateManager all_manager; + HTMLTextState cur_text_state; + HTMLLineState cur_line_state; HTMLTextPage html_text_page; + enum NewLineState + { + NLS_NONE, + NLS_NEWSTATE, + NLS_NEWLINE, + NLS_NEWCLIP + } new_line_state; + // for font reencoding int32_t * cur_mapping; char ** cur_mapping2; @@ -303,13 +313,6 @@ protected: // for string formatting StringFormatter str_fmt; - //////////////////////////////////////////////////// - // styles & resources - //////////////////////////////////////////////////// - - HTMLState cur_html_state; - std::unordered_map font_info_map; - struct { std::ofstream fs; std::string path; diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index 46b1866..9926916 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -347,8 +347,6 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co const GfxRGB * line_color, const GfxRGB * fill_color, void (*style_function)(void *, ostream &), void * style_function_data) { - close_text_line(); - double new_tm[6]; memcpy(new_tm, tm, sizeof(new_tm)); diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 865f7ae..a9c3e6f 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -207,8 +207,6 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) } void HTMLRenderer::endPage() { - close_text_line(); - // dump all text html_text_page.dump_text(f_pages.fs); html_text_page.dump_css(f_css.fs); diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index f9371ef..9fe888a 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -86,6 +86,18 @@ void HTMLRenderer::updateStrokeColor(GfxState * state) { stroke_color_changed = true; } +void HTMLRenderer::clip(GfxState * state) +{ + clip_changed = true; +} +void HTMLRenderer::eoClip(GfxState * state) +{ + clip_changed = true; +} +void HTMLRenderer::clipToStrokePath(GfxState * state) +{ + clip_changed = true; +} void HTMLRenderer::reset_state() { draw_text_scale = 1.0; @@ -95,16 +107,17 @@ void HTMLRenderer::reset_state() memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm)); // reset html_state - cur_html_state.font_info = install_font(nullptr); - cur_html_state.font_size = 0; - cur_html_state.fill_color.transparent = true; - cur_html_state.stroke_color.transparent = true; - cur_html_state.letter_space = 0; - cur_html_state.word_space = 0; - cur_html_state.vertical_align = 0; - cur_html_state.x = 0; - cur_html_state.y = 0; - memcpy(cur_html_state.transform_matrix, ID_MATRIX, sizeof(cur_html_state.transform_matrix)); + cur_text_state.font_info = install_font(nullptr); + cur_text_state.font_size = 0; + cur_text_state.fill_color.transparent = true; + cur_text_state.stroke_color.transparent = true; + cur_text_state.letter_space = 0; + cur_text_state.word_space = 0; + cur_text_state.vertical_align = 0; + + cur_line_state.x = 0; + cur_line_state.y = 0; + memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); cur_tx = cur_ty = 0; draw_tx = draw_ty = 0; @@ -129,6 +142,8 @@ void HTMLRenderer::reset_state_change() fill_color_changed = false; stroke_color_changed = false; + + clip_changed = false; } void HTMLRenderer::check_state_change(GfxState * state) { @@ -142,7 +157,8 @@ void HTMLRenderer::check_state_change(GfxState * state) bool draw_text_scale_changed = false; // save current info for later use - HTMLState old_html_state = cur_html_state; + auto old_text_state = cur_text_state; + auto old_line_state = cur_line_state; double old_tm[6]; memcpy(old_tm, cur_text_tm, sizeof(old_tm)); double old_draw_text_scale = draw_text_scale; @@ -159,20 +175,20 @@ void HTMLRenderer::check_state_change(GfxState * state) { const FontInfo * new_font_info = install_font(state->getFont()); - if(!(new_font_info->id == cur_html_state.font_info->id)) + if(!(new_font_info->id == cur_text_state.font_info->id)) { // The width of the type 3 font text, if shown, is likely to be wrong // So we will create separate (absolute positioned) blocks for them, such that it won't affect other text // TODO: consider the font matrix and estimate the metrics - if(new_font_info->is_type3 || cur_html_state.font_info->is_type3) + if(new_font_info->is_type3 || cur_text_state.font_info->is_type3) { - new_line_state = max(new_line_state, NLS_DIV); + new_line_state = max(new_line_state, NLS_NEWLINE); } else { - new_line_state = max(new_line_state, NLS_SPAN); + new_line_state = max(new_line_state, NLS_NEWSTATE); } - cur_html_state.font_info = new_font_info; + cur_text_state.font_info = new_font_info; } double new_font_size = state->getFontSize(); @@ -252,23 +268,23 @@ void HTMLRenderer::check_state_change(GfxState * state) draw_text_scale = new_draw_text_scale; } - if(!equal(new_draw_font_size, cur_html_state.font_size)) + if(!equal(new_draw_font_size, cur_text_state.font_size)) { - new_line_state = max(new_line_state, NLS_SPAN); - cur_html_state.font_size = new_draw_font_size; + new_line_state = max(new_line_state, NLS_NEWSTATE); + cur_text_state.font_size = new_draw_font_size; } - if(!tm_equal(new_draw_text_tm, cur_html_state.transform_matrix, 4)) + if(!tm_equal(new_draw_text_tm, cur_line_state.transform_matrix, 4)) { - new_line_state = max(new_line_state, NLS_DIV); - memcpy(cur_html_state.transform_matrix, new_draw_text_tm, sizeof(cur_html_state.transform_matrix)); + new_line_state = max(new_line_state, NLS_NEWLINE); + memcpy(cur_line_state.transform_matrix, new_draw_text_tm, sizeof(cur_line_state.transform_matrix)); } } // see if the new line is compatible with the current line with proper position shift - // don't bother doing the heavy job when (new_line_state == NLS_DIV) + // don't bother doing the heavy job when (new_line_state == NLS_NEWLINE) // depends: text position & transformation - if(need_recheck_position && (new_line_state < NLS_DIV)) + if(need_recheck_position && (new_line_state < NLS_NEWLINE)) { // TM[4] and/or TM[5] have been changed // To find an offset (dx,dy), which would cancel the effect @@ -285,7 +301,7 @@ void HTMLRenderer::check_state_change(GfxState * state) bool merged = false; double dx = 0; double dy = 0; - if(tm_equal(old_html_state.transform_matrix, cur_html_state.transform_matrix, 4)) + if(tm_equal(old_line_state.transform_matrix, cur_line_state.transform_matrix, 4)) { double det = old_tm[0] * old_tm[3] - old_tm[1] * old_tm[2]; if(!equal(det, 0)) @@ -316,12 +332,12 @@ void HTMLRenderer::check_state_change(GfxState * state) // otherwise we merge the lines only when // - text are not shifted to the left too much // - text are not moved too high or too low - if((dx * old_draw_text_scale) >= -param.space_threshold * old_html_state.em_size() - EPS) + if((dx * old_draw_text_scale) >= -param.space_threshold * old_text_state.em_size() - EPS) { - double oldymin = old_html_state.font_info->descent * old_html_state.font_size; - double oldymax = old_html_state.font_info->ascent * old_html_state.font_size; - double ymin = dy * old_draw_text_scale + cur_html_state.font_info->descent * cur_html_state.font_size; - double ymax = dy * old_draw_text_scale + cur_html_state.font_info->ascent * cur_html_state.font_size; + double oldymin = old_text_state.font_info->descent * old_text_state.font_size; + double oldymax = old_text_state.font_info->ascent * old_text_state.font_size; + double ymin = dy * old_draw_text_scale + cur_text_state.font_info->descent * cur_text_state.font_size; + double ymax = dy * old_draw_text_scale + cur_text_state.font_info->ascent * cur_text_state.font_size; if((ymin <= oldymax + EPS) && (ymax >= oldymin - EPS)) { merged = true; @@ -335,22 +351,22 @@ void HTMLRenderer::check_state_change(GfxState * state) if(merged) { - html_text_page.append_offset(dx * old_draw_text_scale); + html_text_page.get_cur_line()->append_offset(dx * old_draw_text_scale); if(equal(dy, 0)) { - cur_html_state.vertical_align = 0; + cur_text_state.vertical_align = 0; } else { - cur_html_state.vertical_align = (dy * old_draw_text_scale); - new_line_state = max(new_line_state, NLS_SPAN); + cur_text_state.vertical_align = (dy * old_draw_text_scale); + new_line_state = max(new_line_state, NLS_NEWSTATE); } draw_tx = cur_tx; draw_ty = cur_ty; } else { - new_line_state = max(new_line_state, NLS_DIV); + new_line_state = max(new_line_state, NLS_NEWLINE); } } @@ -359,10 +375,10 @@ void HTMLRenderer::check_state_change(GfxState * state) if(all_changed || letter_space_changed || draw_text_scale_changed) { double new_letter_space = state->getCharSpace() * draw_text_scale; - if(!equal(new_letter_space, cur_html_state.letter_space)) + if(!equal(new_letter_space, cur_text_state.letter_space)) { - cur_html_state.letter_space = new_letter_space; - new_line_state = max(new_line_state, NLS_SPAN); + cur_text_state.letter_space = new_letter_space; + new_line_state = max(new_line_state, NLS_NEWSTATE); } } @@ -371,10 +387,10 @@ void HTMLRenderer::check_state_change(GfxState * state) if(all_changed || word_space_changed || draw_text_scale_changed) { double new_word_space = state->getWordSpace() * draw_text_scale; - if(!equal(new_word_space, cur_html_state.word_space)) + if(!equal(new_word_space, cur_text_state.word_space)) { - cur_html_state.word_space = new_word_space; - new_line_state = max(new_line_state, NLS_SPAN); + cur_text_state.word_space = new_word_space; + new_line_state = max(new_line_state, NLS_NEWSTATE); } } @@ -396,10 +412,10 @@ void HTMLRenderer::check_state_change(GfxState * state) { new_fill_color.transparent = true; } - if(!(new_fill_color == cur_html_state.fill_color)) + if(!(new_fill_color == cur_text_state.fill_color)) { - cur_html_state.fill_color = new_fill_color; - new_line_state = max(new_line_state, NLS_SPAN); + cur_text_state.fill_color = new_fill_color; + new_line_state = max(new_line_state, NLS_NEWSTATE); } } @@ -422,10 +438,10 @@ void HTMLRenderer::check_state_change(GfxState * state) { new_stroke_color.transparent = true; } - if(!(new_stroke_color == cur_html_state.stroke_color)) + if(!(new_stroke_color == cur_text_state.stroke_color)) { - cur_html_state.stroke_color = new_stroke_color; - new_line_state = max(new_line_state, NLS_SPAN); + cur_text_state.stroke_color = new_stroke_color; + new_line_state = max(new_line_state, NLS_NEWSTATE); } } @@ -434,13 +450,16 @@ void HTMLRenderer::check_state_change(GfxState * state) void HTMLRenderer::prepare_text_line(GfxState * state) { - if(new_line_state == NLS_DIV) - { - close_text_line(); + if(!(html_text_page.get_cur_line())) + new_line_state = NLS_NEWLINE; + if(new_line_state == NLS_NEWLINE) + { // update position such that they will be recorded by text_line_buf - state->transform(state->getCurX(), state->getCurY(), &cur_html_state.x, &cur_html_state.y); - cur_html_state.vertical_align = 0; + state->transform(state->getCurX(), state->getCurY(), &cur_line_state.x, &cur_line_state.y); + html_text_page.open_new_line(cur_line_state); + + cur_text_state.vertical_align = 0; //resync position draw_ty = cur_ty; @@ -453,20 +472,15 @@ void HTMLRenderer::prepare_text_line(GfxState * state) double target = (cur_tx - draw_tx) * draw_text_scale; if(!equal(target, 0)) { - html_text_page.append_offset(target); + html_text_page.get_cur_line()->append_offset(target); draw_tx += target / draw_text_scale; } } if(new_line_state != NLS_NONE) { - html_text_page.append_state(cur_html_state); + html_text_page.get_cur_line()->append_state(cur_text_state); } } -void HTMLRenderer::close_text_line() -{ - html_text_page.open_new_line(); -} - } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 8a0411a..aa1d022 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -90,19 +90,19 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param.space_as_offset)) { // ignore horiz_scaling, as it's merged in CTM - html_text_page.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); + html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } else { if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { - html_text_page.append_unicodes(u, uLen); + html_text_page.get_cur_line()->append_unicodes(u, uLen); // TODO: decomposed characters may be not with the same width as the original ligature, need to fix it. } else { Unicode uu; - if(cur_html_state.font_info->use_tounicode) + if(cur_text_state.font_info->use_tounicode) { uu = check_unicode(u, uLen, code, font); } @@ -110,14 +110,14 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { uu = unicode_from_font(code, font); } - html_text_page.append_unicodes(&uu, 1); + html_text_page.get_cur_line()->append_unicodes(&uu, 1); /* * In PDF, word_space is appended if (n == 1 and *p = ' ') * but in HTML, word_space is appended if (uu == ' ') */ int space_count = (is_space ? 1 : 0) - (uu == ' ' ? 1 : 0); if(space_count != 0) - html_text_page.append_offset(cur_word_space * draw_text_scale * space_count); + html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count); } } diff --git a/src/HTMLState.h b/src/HTMLState.h index 6bbe440..f5e07e0 100644 --- a/src/HTMLState.h +++ b/src/HTMLState.h @@ -19,7 +19,7 @@ struct FontInfo bool is_type3; }; -struct HTMLState +struct HTMLTextState { const FontInfo * font_info; double font_size; @@ -31,9 +31,6 @@ struct HTMLState // relative to the previous state double vertical_align; - double x,y; - double transform_matrix[4]; - // the offset cause by a single ' ' char double single_space_offset(void) const { return word_space + letter_space + font_info->space_width * font_size; @@ -44,6 +41,17 @@ struct HTMLState } }; +struct HTMLLineState +{ + double x,y; + double transform_matrix[4]; +}; + +struct HTMLClipState +{ + double xmin, xmax, ymin, ymax; +}; + } // namespace pdf2htmlEX #endif //HTMLSTATE_H__ diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index 85c7b0e..c9b9536 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -25,8 +25,10 @@ using std::endl; using std::find; using std::abs; -HTMLTextLine::HTMLTextLine (const Param & param, AllStateManager & all_manager) - : param(param), all_manager(all_manager) +HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager) + :param(param) + ,all_manager(all_manager) + ,line_state(line_state) { } void HTMLTextLine::append_unicodes(const Unicode * u, int l) @@ -47,7 +49,7 @@ void HTMLTextLine::append_offset(double width) offsets.emplace_back(text.size(), width); } -void HTMLTextLine::append_state(const HTMLState & html_state) +void HTMLTextLine::append_state(const HTMLTextState & text_state) { if(states.empty() || (states.back().start_idx != text.size())) { @@ -56,7 +58,7 @@ void HTMLTextLine::append_state(const HTMLState & html_state) states.back().hash_umask = 0; } - (HTMLState&)(states.back()) = html_state; + (HTMLTextState&)(states.back()) = text_state; } void HTMLTextLine::dump_text(ostream & out) @@ -78,10 +80,10 @@ void HTMLTextLine::dump_text(ostream & out) { // open
for the current text line out << "
states; diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc index 0ac8071..8780e5e 100644 --- a/src/HTMLTextPage.cc +++ b/src/HTMLTextPage.cc @@ -16,7 +16,7 @@ using std::unique_ptr; HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager) : param(param) , all_manager(all_manager) - , last_line(nullptr) + , cur_line(nullptr) { } void HTMLTextPage::dump_text(ostream & out) @@ -29,27 +29,6 @@ void HTMLTextPage::dump_text(ostream & out) (*iter)->dump_text(out); } -void HTMLTextPage::append_unicodes(const Unicode * u, int l) -{ - if(!last_line) - open_new_line(); - last_line->append_unicodes(u, l); -} - -void HTMLTextPage::append_offset(double offset) -{ - if(!last_line) - open_new_line(); - last_line->append_offset(offset); -} - -void HTMLTextPage::append_state(const HTMLState & state) -{ - if(!last_line) - open_new_line(); - last_line->append_state(state); -} - void HTMLTextPage::dump_css(ostream & out) { //TODO @@ -58,21 +37,17 @@ void HTMLTextPage::dump_css(ostream & out) void HTMLTextPage::clear(void) { text_lines.clear(); - last_line = nullptr; + cur_line = nullptr; } -void HTMLTextPage::open_new_line(void) +void HTMLTextPage::open_new_line(const HTMLLineState & line_state) { - if(last_line && (last_line->text_empty())) + if((!text_lines.empty()) && (text_lines.back()->text_empty())) { - // state and offsets might be nonempty - last_line->clear(); - } - else - { - text_lines.emplace_back(new HTMLTextLine(param, all_manager)); - last_line = text_lines.back().get(); + text_lines.pop_back(); } + text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager)); + cur_line = text_lines.back().get(); } void HTMLTextPage::optimize(void) diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h index dcd2559..5e6683e 100644 --- a/src/HTMLTextPage.h +++ b/src/HTMLTextPage.h @@ -28,22 +28,20 @@ class HTMLTextPage public: HTMLTextPage (const Param & param, AllStateManager & all_manager); - void append_unicodes(const Unicode * u, int l); - void append_offset(double offset); - void append_state(const HTMLState & state); + HTMLTextLine * get_cur_line(void) const { return cur_line; } void dump_text(std::ostream & out); void dump_css(std::ostream & out); void clear(void); - void open_new_line(void); + void open_new_line(const HTMLLineState & line_state); private: void optimize(void); const Param & param; AllStateManager & all_manager; - HTMLTextLine * last_line; + HTMLTextLine * cur_line; std::vector> text_lines; };