workaround for NBSP

2024-12-22 13:00:08 +00:00 · 2013-05-02 14:32:17 +08:00 · 2013-05-02 14:32:17 +08:00 · cf5507f997
commit cf5507f997
parent 06772b3b9f
3 changed files with 21 additions and 5 deletions
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@ -113,9 +113,9 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
                html_text_page.get_cur_line()->append_unicodes(&uu, 1);
                /*
                 * In PDF, word_space is appended if (n == 1 and *p = ' ')
-                 * but in HTML, word_space is appended if (uu == ' ' || 0xa0)
+                 * but in HTML, word_space is appended if (uu == ' ')
                 */
-                int space_count = (is_space ? 1 : 0) - ((uu == ' ' || uu == 0xa0) ? 1 : 0);
+                int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
                if(space_count != 0)
                {
                    html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
--- a/src/HTMLTextLine.cc
+++ b/src/HTMLTextLine.cc
@ -403,8 +403,7 @@ void HTMLTextLine::optimize()
        // get the text segment covered by current state (*state_iter1)
        const auto text_iter1 = text.begin() + text_idx1;
        const auto text_iter2 = text.begin() + text_idx2;
-        const static std::vector<Unicode> space_chars = { 0x20, 0xa0 };
-        if(find_first_of(text_iter1, text_iter2, space_chars.begin(), space_chars.end()) == text_iter2)
+        if(find(text_iter1, text_iter2, ' ') == text_iter2)
        {
            // if there is not any space, we may change the value of word_space arbitrarily
            // note that we may only change word space, no offset will be affected
--- a/src/util/unicode.cc
+++ b/src/util/unicode.cc
@ -17,8 +17,17 @@ using std::cerr;
 using std::endl;
 using std::ostream;

+/* Test whether a Unicode character is legal in HTML */
 bool isLegalUnicode(Unicode u)
 {
+    /*
+     * These characters are interpreted as white-space in HTML,
+     * so `word-spacing` may be applied to them
+     * and the browser may not use the actual glyphs in the font.
+     * So mark them as illegal.
+     *
+     * The drawback is that the correct value cannot be copied out this way.
+     */
    /*
    if((u == 9) || (u == 10) || (u == 13))
        return true;
@ -27,7 +36,15 @@ bool isLegalUnicode(Unicode u)
    if(u <= 31) 
        return false;

-    if((u >= 127) && (u <= 159))
+    /*
+     * 160, or 0xa0, is NBSP, which is legal in HTML.
+     * But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP.
+     * Again, `word-spacing` is applied.
+     * So mark it as illegal.
+     *
+     * As above, the drawback is that this character can no longer be copied out.
+     */
+    if((u >= 127) && (u <= 160))
        return false;

    if((u >= 0xd800) && (u <= 0xdfff))