Improve covered text handling: 1. take care of chars that correspond to zero or more than one Unicode code point;

2. merge sibling invisible spans; 3. improve the interfaces of HTMLLineState and HTMLRenderer;
2024-12-22 04:50:09 +00:00 · 2014-06-26 12:39:35 +08:00 · 2014-06-26 12:39:35 +08:00 · 39e171a737
commit 39e171a737
parent 65e82028bb
8 changed files with 101 additions and 28 deletions
--- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc
+++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc
@ -66,7 +66,7 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
    // If a char is treated as image, it is not subject to cover test
    // (see HTMLRenderer::drawString), so don't increase drawn_char_count.
    else if (param.process_covered_text) {
-        if (html_renderer->get_chars_covered()[drawn_char_count])
+        if (html_renderer->is_char_covered(drawn_char_count))
            CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
        drawn_char_count++;
    }
--- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc
+++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc
@ -91,7 +91,7 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
    // If a char is treated as image, it is not subject to cover test
    // (see HTMLRenderer::drawString), so don't increase drawn_char_count.
    else if (param.process_covered_text) {
-        if (html_renderer->get_chars_covered()[drawn_char_count])
+        if (html_renderer->is_char_covered(drawn_char_count))
            SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
        drawn_char_count++;
    }
--- a/src/HTMLRenderer/HTMLRenderer.h
+++ b/src/HTMLRenderer/HTMLRenderer.h
@ -150,7 +150,14 @@ public:
    bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); }
    bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); }

-    const std::vector<bool> & get_chars_covered() { return covered_text_handler.get_chars_covered(); }
+    /*
+     * Covered text handling.
+     */
+    // Whether a char (actually a glyph) is covered by non-char objects. The index is in drawing order within the current page.
+    // Does not fail on an out-of-bounds index; returns false instead.
+    bool is_char_covered(int index);
+    // Currently drawn char (glyph) count in current page.
+    int get_char_count() { return (int)covered_text_handler.get_chars_covered().size(); }

 protected:
    ////////////////////////////////////////////////////
--- a/src/HTMLRenderer/state.cc
+++ b/src/HTMLRenderer/state.cc
@ -123,8 +123,7 @@ void HTMLRenderer::reset_state()
    cur_line_state.y = 0;
    memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));

-    if (param.process_covered_text)
-        cur_line_state.chars_covered = &covered_text_handler.get_chars_covered();
+    cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);};

    cur_clip_state.xmin = 0;
    cur_clip_state.xmax = 0;
@ -510,7 +509,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
        state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y);
        state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y);
        if (param.process_covered_text)
-            cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size();
+            cur_line_state.first_char_index = get_char_count();
        html_text_page.open_new_line(cur_line_state);

        cur_text_state.vertical_align = 0;
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
    while (len > 0) 
    {
        auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
-        HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%d\n", u[0]));
+        HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));

        if(!(equal(ox, 0) && equal(oy, 0)))
        {
@ -101,6 +101,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
        
        if(is_space && (param.space_as_offset))
        {
+            html_text_page.get_cur_line()->append_padding_char();
            // ignore horiz_scaling, as it has been merged into CTM
            html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
        }
@ -150,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
    draw_ty += dy;
 }

+// Whether the glyph at `index` (drawing order in the current page) is covered; warns and returns false when out of range.
+bool HTMLRenderer::is_char_covered(int index)
+{
+    // Bind by reference: plain `auto` would copy the whole vector on every per-glyph query.
+    const auto & covered = covered_text_handler.get_chars_covered();
+    if (index >= 0 && index < (int)covered.size())
+        return covered[index];
+    std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
+            << index << ", size: " << covered.size() <<endl;
+    return false;
+}
+
 } // namespace pdf2htmlEX
--- a/src/HTMLState.h
+++ b/src/HTMLState.h
@ -5,6 +5,8 @@
 #ifndef HTMLSTATE_H__
 #define HTMLSTATE_H__

+#include <functional>
+
 #include "Color.h"

 namespace pdf2htmlEX {
@ -64,9 +66,10 @@ struct HTMLLineState
    double transform_matrix[4];
    // The page-scope char index (in drawing order) of the first char in this line.
    int first_char_index;
-    const std::vector<bool> * chars_covered;
+    // A function to determine whether a char is covered at a given index.
+    std::function<bool(int)> is_char_covered;

-    HTMLLineState(): first_char_index(-1), chars_covered(nullptr) { }
+    HTMLLineState(): first_char_index(-1) { }
 };

 struct HTMLClipState
--- a/src/HTMLTextLine.cc
+++ b/src/HTMLTextLine.cc
@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para

 void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
 {
-    text.insert(text.end(), u, u+l);
+    if (l == 1)
+        text.push_back(min(u[0], (unsigned)INT_MAX));
+    else
+    {
+        text.push_back(- decomposed_text.size() - 1);
+        decomposed_text.emplace_back();
+        decomposed_text.back().assign(u, u + l);
+    }
    this->width += width;
 }

@ -69,30 +76,54 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state)
    last_state.font_size *= last_state.font_info->font_size_scale;
 }

-void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen)
+void HTMLTextLine::dump_char(std::ostream & out, int pos)
 {
-    if (!line_state.chars_covered)
+    int c = text[pos];
+    if (c > 0)
    {
-        writeUnicodes(out, u, uLen);
+        Unicode u = c;
+        writeUnicodes(out, &u, 1);
+    }
+    else if (c < 0)
+    {
+        auto dt = decomposed_text[- c - 1];
+        writeUnicodes(out, &dt.front(), dt.size());
+    }
+}
+
+void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
+{
+    if (line_state.first_char_index < 0)
+    {
+        for (int i = 0; i < len; i++)
+            dump_char(out, begin + i);
        return;
    }

-    //TODO merge sibling invisiable spans
-    int start = this->line_state.first_char_index + dumped_char_count;
-    for(int i = 0; i < uLen; i++)
+    bool invisible_group_open = false;
+    for(int i = 0; i < len; i++)
    {
-        if (!(*line_state.chars_covered)[start + i]) //visible
+        if (!line_state.is_char_covered(line_state.first_char_index + begin + i)) //visible
        {
-            writeUnicodes(out, u + i, 1);
+            if (invisible_group_open)
+            {
+                invisible_group_open = false;
+                out << "</span>";
+            }
+            dump_char(out, begin + i);
        }
        else
        {
-            out << "<span style=\"color:transparent\">";
-            writeUnicodes(out, u + i, 1);
-            out << "</span>";
+            if (!invisible_group_open)
+            {
+                out << "<span style=\"color:transparent\">";
+                invisible_group_open = true;
+            }
+            dump_char(out, begin + i);
        }
    }
-    dumped_char_count += uLen;
+    if (invisible_group_open)
+        out << "</span>";
 }

 void HTMLTextLine::dump_text(ostream & out)
@ -110,8 +141,6 @@ void HTMLTextLine::dump_text(ostream & out)
        return;
    }

-    dumped_char_count = 0;
-
    // Start Output
    {
        // open <div> for the current text line
@ -244,7 +273,7 @@ void HTMLTextLine::dump_text(ostream & out)
                size_t next_text_idx = text_idx2;
                if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
                    next_text_idx = cur_offset_iter->start_idx;
-                dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
+                dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
                cur_text_idx = next_text_idx;
            }
        }
--- a/src/HTMLTextLine.h
+++ b/src/HTMLTextLine.h
@ -73,7 +73,16 @@ public:
        double width;
    };

+    /**
+     * Append a drawn char (glyph)'s unicode. l > 1 means this glyph corresponds to
+     * multiple code points.
+     */
    void append_unicodes(const Unicode * u, int l, double width);
+    /**
+     * Append a special padding char with 0 width, in order to keep char index consistent.
+     * The padding char is ignored during output.
+     */
+    void append_padding_char() { text.push_back(0); }
    void append_offset(double width);
    void append_state(const HTMLTextState & text_state);
    void dump_text(std::ostream & out);
@ -91,7 +100,13 @@ public:
 private:
    void optimize_normal(std::vector<HTMLTextLine*> &);
    void optimize_aggressive(std::vector<HTMLTextLine*> &);
-    void dump_chars(std::ostream & out, const Unicode * u, int uLen);
+
+    /**
+     * Dump chars' unicode to output stream.
+     * begin/pos is the index in 'text'.
+     */
+    void dump_chars(std::ostream & out, int begin, int len);
+    void dump_char(std::ostream & out, int pos);

    const Param & param;
    AllStateManager & all_manager;
@ -103,9 +118,16 @@ private:

    std::vector<State> states;
    std::vector<Offset> offsets;
-    std::vector<Unicode> text;

-    int dumped_char_count;
+    /**
+     * Drawn chars (glyphs) in this line are stored in 'text'. For each element c in 'text':
+     * - If c > 0, it is the unicode code point corresponding to the glyph;
+     * - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
+     * - If c < 0, this glyph does not correspond to exactly one unicode code point; its code points
+     *   are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
+     */
+    std::vector<int> text;
+    std::vector<std::vector<Unicode> > decomposed_text;
 };

 } // namespace pdf2htmlEX