diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 289f053..23c3af3 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -29,6 +29,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) // unscaled double cur_letter_space = state->getCharSpace(); double cur_word_space = state->getWordSpace(); + double cur_horiz_scaling = state->getHorizScaling(); + // Writing mode fonts and Type 3 fonts are rendered as images // I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly @@ -89,15 +91,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param.space_as_offset)) { - // ignore horiz_scaling, as it's merged in CTM + // ignore horiz_scaling, as it has been merged into CTM html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } else { if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { - html_text_page.get_cur_line()->append_unicodes(u, uLen); - // TODO: decomposed characters may be not with the same width as the original ligature, need to fix it. + // TODO: why multiply cur_horiz_scaling here? + html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling); } else { @@ -110,7 +112,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { uu = unicode_from_font(code, font); } - html_text_page.get_cur_line()->append_unicodes(&uu, 1); + // TODO: why multiply cur_horiz_scaling here? 
+ html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling); /* * In PDF, word_space is appended if (n == 1 and *p = ' ') * but in HTML, word_space is appended if (uu == ' ') @@ -131,12 +134,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) len -= n; } - double hs = state->getHorizScaling(); - // horiz_scaling is merged into ctm now, // so the coordinate system is ugly - // TODO: why multiply hs here - dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs; + // TODO: why multiply cur_horiz_scaling here + dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling; dy *= cur_font_size; diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index 4759e15..51cd2d8 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -31,11 +31,13 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para ,line_state(line_state) ,clip_x1(0) ,clip_y1(0) + ,width(0) { } -void HTMLTextLine::append_unicodes(const Unicode * u, int l) +void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) { text.insert(text.end(), u, u+l); + this->width += width; } void HTMLTextLine::append_offset(double width) @@ -49,6 +51,7 @@ void HTMLTextLine::append_offset(double width) offsets.back().width += width; else offsets.emplace_back(text.size(), width); + this->width += width; } void HTMLTextLine::append_state(const HTMLTextState & text_state) @@ -188,28 +191,17 @@ void HTMLTextLine::dump_text(ostream & out) // finally, just dump it if(!done) { - if(param.optimize_text < 3) + long long wid = all_manager.whitespace.install(target, &actual_offset); + + if(!equal(actual_offset, 0)) { - long long wid = all_manager.whitespace.install(target, &actual_offset); + if(is_positive(-actual_offset)) + last_text_pos_with_negative_offset = cur_text_idx; - if(!equal(actual_offset, 0)) - { - if(is_positive(-actual_offset)) - 
last_text_pos_with_negative_offset = cur_text_idx; - - double threshold = state_iter1->em_size() * (param.space_threshold); - - out << "" << (target > (threshold - EPS) ? " " : "") << ""; - } - } - else - { - // aggressive optimization double threshold = state_iter1->em_size() * (param.space_threshold); - if(target > threshold) - out << ' '; - actual_offset = target; + + out << "" << (target > (threshold - EPS) ? " " : "") << ""; } } } @@ -255,9 +247,6 @@ void HTMLTextLine::clip(const HTMLClipState & clip_state) void HTMLTextLine::prepare(void) { - if(param.optimize_text) - optimize(); - // max_ascent determines the height of the div double accum_vertical_align = 0; // accumulated ascent = 0; @@ -285,11 +274,22 @@ void HTMLTextLine::prepare(void) } +void HTMLTextLine::optimize(std::vector & lines) +{ + if(param.optimize_text == 3) + { + optimize_aggressive(lines); + } + else + { + optimize_normal(lines); + } +} /* * Adjust letter space and word space in order to reduce the number of HTML elements * May also unmask word space */ -void HTMLTextLine::optimize() +void HTMLTextLine::optimize_normal(std::vector & lines) { // remove unuseful states in the end while((!states.empty()) && (states.back().start_idx >= text.size())) @@ -465,6 +465,32 @@ void HTMLTextLine::optimize() // apply optimization std::swap(offsets, new_offsets); + + lines.push_back(this); +} + +// for optimize-text == 3 +void HTMLTextLine::optimize_aggressive(std::vector & lines) +{ + HTMLTextLine *cur_line = this; + while(true) + { + lines.push_back(cur_line); + + // break the line if there is a large (positive or negative) shift + // letter space / word space are not taken into consideration (yet) + + + // TODO: line splitting is not implemented yet. NOTE(review): as written this while(true) has no break/return and cur_line is never advanced, so the loop never exits and keeps pushing the same pointer + } + + /* + // aggressive optimization + if(target > state_iter1->em_size() * (param.space_threshold) - EPS) + out << ' '; + dx = 0; + lines.push_back(this); + */ } // this state will be converted to a child node of the node of prev_state diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h 
index c974c0f..782b491 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -73,7 +73,7 @@ public: double width; }; - void append_unicodes(const Unicode * u, int l); + void append_unicodes(const Unicode * u, int l, double width); void append_offset(double width); void append_state(const HTMLTextState & text_state); void dump_text(std::ostream & out); @@ -87,8 +87,10 @@ public: * Optimize and calculate necessary values */ void prepare(void); + void optimize(std::vector &); private: - void optimize(void); + void optimize_normal(std::vector &); + void optimize_aggressive(std::vector &); const Param & param; AllStateManager & all_manager; @@ -96,6 +98,7 @@ private: HTMLLineState line_state; double ascent, descent; double clip_x1, clip_y1; + double width; std::vector states; std::vector offsets; diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc index 0f23d8b..4bc23d3 100644 --- a/src/HTMLTextPage.cc +++ b/src/HTMLTextPage.cc @@ -12,7 +12,6 @@ namespace pdf2htmlEX { using std::ostream; -using std::unique_ptr; HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager) : param(param) @@ -22,8 +21,24 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager) , page_height(0) { } +HTMLTextPage::~HTMLTextPage() +{ + for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) + { + delete (*iter); + } +} + void HTMLTextPage::dump_text(ostream & out) { + if(param.optimize_text) + { + // text lines may be split during optimization, collect them + std::vector new_text_lines; + for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) + (*iter)->optimize(new_text_lines); + std::swap(text_lines, new_text_lines); + } for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) (*iter)->prepare(); if(param.optimize_text) @@ -98,7 +113,7 @@ void HTMLTextPage::open_new_line(const HTMLLineState & line_state) // do not reused the last text_line even if it's empty // because the clip states may point to 
the next index text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager)); - cur_line = text_lines.back().get(); + cur_line = text_lines.back(); } void HTMLTextPage::set_page_size(double width, double height) diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h index ec01e24..7bffec4 100644 --- a/src/HTMLTextPage.h +++ b/src/HTMLTextPage.h @@ -7,7 +7,6 @@ #define HTMLTEXTPAGE_H__ #include -#include #include #include "Param.h" @@ -26,6 +25,7 @@ class HTMLTextPage { public: HTMLTextPage (const Param & param, AllStateManager & all_manager); + ~HTMLTextPage(); HTMLTextLine * get_cur_line(void) const { return cur_line; } @@ -47,7 +47,7 @@ private: HTMLTextLine * cur_line; double page_width, page_height; - std::vector> text_lines; + std::vector text_lines; struct Clip { HTMLClipState clip_state;