From 39e171a73760c5120a7bd4c30d4fd4b06974b0b1 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Thu, 26 Jun 2014 12:39:35 +0800 Subject: [PATCH] Improve covered text handling: 1. take care of chars that correspond to 0 or more than one unicode code point; 2. merge sibling invisible spans; 3. improve interfaces of HTMLLineState and HTMLRenderer; --- .../CairoBackgroundRenderer.cc | 2 +- .../SplashBackgroundRenderer.cc | 2 +- src/HTMLRenderer/HTMLRenderer.h | 9 ++- src/HTMLRenderer/state.cc | 5 +- src/HTMLRenderer/text.cc | 15 ++++- src/HTMLState.h | 7 ++- src/HTMLTextLine.cc | 61 ++++++++++++++----- src/HTMLTextLine.h | 28 ++++++++- 8 files changed, 101 insertions(+), 28 deletions(-) diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc index 19d5795..1ff8622 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -66,7 +66,7 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, // If a char is treated as image, it is not subject to cover test // (see HTMLRenderer::drawString), so don't increase drawn_char_count. else if (param.process_covered_text) { - if (html_renderer->get_chars_covered()[drawn_char_count]) + if (html_renderer->is_char_covered(drawn_char_count)) CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); drawn_char_count++; } diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc index 8b70e4c..4089da7 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -91,7 +91,7 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, // If a char is treated as image, it is not subject to cover test // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
else if (param.process_covered_text) { - if (html_renderer->get_chars_covered()[drawn_char_count]) + if (html_renderer->is_char_covered(drawn_char_count)) SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); drawn_char_count++; } diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 9d47095..3ec57c1 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -150,7 +150,14 @@ public: bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); } bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); } - const std::vector & get_chars_covered() { return covered_text_handler.get_chars_covered(); } + /* + * Covered text handling. + */ + // Whether a char (actually a glyph) is covered by non-chars. The index is in drawing order within the current page. + // Does not fail on out-of-bound conditions, but returns false. + bool is_char_covered(int index); + // Number of chars (glyphs) drawn so far in the current page. 
+ int get_char_count() { return (int)covered_text_handler.get_chars_covered().size(); } protected: //////////////////////////////////////////////////// diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 9278e4e..c46ed4c 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -123,8 +123,7 @@ void HTMLRenderer::reset_state() cur_line_state.y = 0; memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); - if (param.process_covered_text) - cur_line_state.chars_covered = &covered_text_handler.get_chars_covered(); + cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);}; cur_clip_state.xmin = 0; cur_clip_state.xmax = 0; @@ -510,7 +509,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state) state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); if (param.process_covered_text) - cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size(); + cur_line_state.first_char_index = get_char_count(); html_text_page.open_new_line(cur_line_state); cur_text_state.vertical_align = 0; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 8a2fae3..2ec3877 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) while (len > 0) { auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); - HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%d\n", u[0])); + HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0])); if(!(equal(ox, 0) && equal(oy, 0))) { @@ -101,6 +101,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param.space_as_offset)) { + html_text_page.get_cur_line()->append_padding_char(); // ignore horiz_scaling, as it has been merged into CTM 
html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } @@ -150,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) draw_ty += dy; } +bool HTMLRenderer::is_char_covered(int index) +{ + auto covered = covered_text_handler.get_chars_covered(); + if (index < 0 || index >= (int)covered.size()) + { + std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: " + << index << ", size: " << covered.size() < + #include "Color.h" namespace pdf2htmlEX { @@ -64,9 +66,10 @@ struct HTMLLineState double transform_matrix[4]; // The page-cope char index(in drawing order) of the first char in this line. int first_char_index; - const std::vector * chars_covered; + // A function to determine whether a char is covered at a given index. + std::function is_char_covered; - HTMLLineState(): first_char_index(-1), chars_covered(nullptr) { } + HTMLLineState(): first_char_index(-1) { } }; struct HTMLClipState diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index ba32209..1304e31 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) { - text.insert(text.end(), u, u+l); + if (l == 1) + text.push_back(min(u[0], (unsigned)INT_MAX)); + else + { + text.push_back(- decomposed_text.size() - 1); + decomposed_text.emplace_back(); + decomposed_text.back().assign(u, u + l); + } this->width += width; } @@ -69,30 +76,54 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state) last_state.font_size *= last_state.font_info->font_size_scale; } -void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen) +void HTMLTextLine::dump_char(std::ostream & out, int pos) { - if (!line_state.chars_covered) + int c = text[pos]; + if (c > 0) { - writeUnicodes(out, u, uLen); + Unicode u = c; + 
writeUnicodes(out, &u, 1); + } + else if (c < 0) + { + auto dt = decomposed_text[- c - 1]; + writeUnicodes(out, &dt.front(), dt.size()); + } +} + +void HTMLTextLine::dump_chars(ostream & out, int begin, int len) +{ + if (line_state.first_char_index < 0) + { + for (int i = 0; i < len; i++) + dump_char(out, begin + i); return; } - //TODO merge sibling invisiable spans - int start = this->line_state.first_char_index + dumped_char_count; - for(int i = 0; i < uLen; i++) + bool invisible_group_open = false; + for(int i = 0; i < len; i++) { - if (!(*line_state.chars_covered)[start + i]) //visible + if (!line_state.is_char_covered(line_state.first_char_index + begin + i)) //visible { - writeUnicodes(out, u + i, 1); + if (invisible_group_open) + { + invisible_group_open = false; + out << ""; + } + dump_char(out, begin + i); } else { - out << ""; - writeUnicodes(out, u + i, 1); - out << ""; + if (!invisible_group_open) + { + out << ""; + invisible_group_open = true; + } + dump_char(out, begin + i); } } - dumped_char_count += uLen; + if (invisible_group_open) + out << ""; } void HTMLTextLine::dump_text(ostream & out) @@ -110,8 +141,6 @@ void HTMLTextLine::dump_text(ostream & out) return; } - dumped_char_count = 0; - // Start Output { // open
for the current text line @@ -244,7 +273,7 @@ void HTMLTextLine::dump_text(ostream & out) size_t next_text_idx = text_idx2; if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx) next_text_idx = cur_offset_iter->start_idx; - dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); + dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx); cur_text_idx = next_text_idx; } } diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h index 8fa814f..c8a3c8b 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -73,7 +73,16 @@ public: double width; }; + /** + * Append a drawn char (glyph)'s unicode. l != 1 means this glyph corresponds to + * zero or multiple code points. + */ void append_unicodes(const Unicode * u, int l, double width); + /** + * Append a special padding char with 0 width, in order to keep char index consistent. + * The padding char is ignored during output. + */ + void append_padding_char() { text.push_back(0); } void append_offset(double width); void append_state(const HTMLTextState & text_state); void dump_text(std::ostream & out); @@ -91,7 +100,13 @@ public: private: void optimize_normal(std::vector &); void optimize_aggressive(std::vector &); - void dump_chars(std::ostream & out, const Unicode * u, int uLen); + + /** + * Dump chars' unicode to output stream. + * begin/pos is the index in 'text'. + */ + void dump_chars(std::ostream & out, int begin, int len); + void dump_char(std::ostream & out, int pos); const Param & param; AllStateManager & all_manager; @@ -103,9 +118,16 @@ private: std::vector states; std::vector offsets; - std::vector text; - int dumped_char_count; + /** + * Drawn chars (glyphs) in this line are stored in 'text'. 
For each element c in 'text': + * - If c > 0, it is the Unicode code point corresponding to the glyph; + * - If c == 0, it is a padding char, and is ignored during output (TODO some bad PDFs utilize 0?); + * - If c < 0, this glyph corresponds to zero or more than one Unicode code point, + * which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'. + */ + std::vector text; + std::vector > decomposed_text; }; } // namespace pdf2htmlEX