From 66231996fdedafc395e938e4f0625ea5358f2ac6 Mon Sep 17 00:00:00 2001
From: Duan Yao <duanyao@ustc.edu>
Date: Wed, 18 Mar 2015 18:22:15 +0800
Subject: [PATCH] Improve checking of illegal unicode in HTML, matching Webkit
 and Firefox's implementation.

---
 src/HTMLRenderer/text.cc |  4 +--
 src/util/unicode.cc      | 77 ++--------------------------------------
 src/util/unicode.h       | 55 +++++++++++++++++++++++++---
 3 files changed, 55 insertions(+), 81 deletions(-)

diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc
index 887ab3a..e58a17a 100644
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@@ -19,7 +19,7 @@
 
 namespace pdf2htmlEX {
 
-using std::all_of;
+using std::none_of;
 using std::cerr;
 using std::endl;
 
@@ -107,7 +107,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
         }
         else
         {
-            if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
+            if((param.decompose_ligature) && (uLen > 1) && none_of(u, u+uLen, is_illegal_unicode))
             {
                 html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
             }
diff --git a/src/util/unicode.cc b/src/util/unicode.cc
index 8e278a3..4a2a034 100644
--- a/src/util/unicode.cc
+++ b/src/util/unicode.cc
@@ -18,79 +18,6 @@ using std::cerr;
 using std::endl;
 using std::ostream;
 
-/* 
- * Test legal for HTML 
- * 
- * A legal unicode character should be accepted by browsers, and displayed correctly.
- * Many unicode codes have special meaning which will be 'interpreted' by the browser, those should be filtered since they are not interpreted in PDF
- * This function is not complete, just to be improved.
- */
-bool isLegalUnicode(Unicode u)
-{
-    const Unicode max_small_unicode = 1024;
-    static bool valid_small_unicode[max_small_unicode];
-    static bool valid_small_unicode_init = false;
-    if(!valid_small_unicode_init)
-    {
-        valid_small_unicode_init = true;
-        Unicode uu = 0;
-
-        /*
-         * 9, 10 and 13 are interpreted as white-spaces in HTML
-         * `word-spacing` may be applied on them
-         * and the browser may not use the actual glyphs in the font
-         * So mark them as illegal
-         *
-         * The problem is that the correct value can not be copied out in this way
-         */
-        while(uu <= 31)
-            valid_small_unicode[uu++] = false;
-
-        /*
-         * 127-159 are not invalid
-         * 160, or 0xa0 is NBSP, which is legal in HTML
-         * But some browser will use the glyph for ' ' in the font, it there is one, instead of the glyphs for NBSP
-         * Again, `word-spacing` is applied.
-         * So mark it as illegal
-         *
-         * And the same problem as above, this character can no longer be copied out
-         */
-        while(uu < 127)
-            valid_small_unicode[uu++] = true;
-        while(uu <= 160)
-            valid_small_unicode[uu++] = false;
-        
-        /*
-         * 173, or 0xad, the soft hyphen
-         * which can be ignored by the browser in the middle of a line
-         */
-        while(uu < 173)
-            valid_small_unicode[uu++] = true;
-        while(uu <= 173)
-            valid_small_unicode[uu++] = false;
-
-
-        while(uu < max_small_unicode)
-            valid_small_unicode[uu++] = true;
-    }
-
-    if(u < max_small_unicode)
-        return valid_small_unicode[u];
-
-
-    // * U+200B: zero width space
-    if(u == 0x200b) return false;
-
-    // * U+2029: paragraph separator
-    if(u == 0x2029) return false;
-
-    // Reserved code for utf-16
-    if((u >= 0xd800) && (u <= 0xdfff))
-        return false;
-
-    return true;
-}
-
 Unicode map_to_private(CharCode code)
 {
     Unicode private_mapping = (Unicode)(code + 0xE000);
@@ -118,7 +45,7 @@ Unicode unicode_from_font (CharCode code, GfxFont * font)
         if(cname)
         {
             Unicode ou = globalParams->mapNameToUnicodeText(cname);
-            if(isLegalUnicode(ou))
+            if(!is_illegal_unicode(ou))
                 return ou;
         }
     }
@@ -133,7 +60,7 @@ Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
 
     if(len == 1)
     {
-        if(isLegalUnicode(*u))
+        if(!is_illegal_unicode(*u))
             return *u;
     }
 
diff --git a/src/util/unicode.h b/src/util/unicode.h
index 8a04195..2100695 100644
--- a/src/util/unicode.h
+++ b/src/util/unicode.h
@@ -13,11 +13,58 @@
 
 namespace pdf2htmlEX {
 
-/*
- * Check if the unicode is valid for HTML
- * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
+/**
+ * Check whether a unicode character is illegal for the output HTML.
+ * Unlike PDF readers, browsers have special treatment for such characters (normally treated as
+ * zero-width space), regardless of metrics and glyphs provided by fonts. So these characters
+ * should be mapped to the unicode private use area to "cheat" browsers, at the cost of losing actual
+ * unicode values in the HTML.
+ *
+ * The following chart shows characters treated as illegal in HTML by webkit, mozilla, and pdf2htmlEX (p2h).
+ * pdf2htmlEX's illegal character set is the union of webkit's and mozilla's, plus illegal unicode
+ * characters. "[" and ")" surrounding ranges denote "inclusive" and "exclusive", respectively.
+ *
+ *         00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space)
+ * webkit:   [--------------------------------)        [------------------)       [-]
+ * moz:      [--------------------------------)        [---------]                          [-]
+ * p2h:      [--------------------------------)        [------------------]       [-]       [-]         [-]
+ *
+ *         200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RLO)--2066(LRI)--2069(PDI)
+ * webkit:   [-----------------------------------------------]                                 [----------]
+ * moz:      [-]                                  [----------]         [-]         [-]         [----------]         [------------]
+ * p2h:      [-----------------------------------------------]         [-]         [-]         [----------]         [------------]
+ *
+ *         D800(surrogate)--DFFF(surrogate)--FEFF(ZWNBSP)--FFFC(ORC)--FFFE(non-char)--FFFF(non-char)
+ * webkit:                                      [-]           [-]
+ * moz:
+ * p2h:         [------------------]            [-]           [-]          [-----------------]
+ *
+ * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified,
+ * \n and \r can break line, \t can shift text, so they are considered illegal.
+ *
+ * Resources (retrieved on 2015-03-16)
+ * * webkit
+ *   * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 )
+ *   * treatAsZeroWidthSpace( https://github.com/WebKit/webkit/blob/17bbff7400393e9389b40cc84ce005f7cc954680/Source/WebCore/platform/graphics/FontCascade.h#L272 )
+ * * mozilla
+ *   * IsInvalidChar( http://mxr.mozilla.org/mozilla-central/source/gfx/thebes/gfxTextRun.cpp#1973 )
+ *   * IsBidiControl( http://mxr.mozilla.org/mozilla-central/source/intl/unicharutil/util/nsBidiUtils.h#114 )
+ * * Character encodings in HTML ( http://en.wikipedia.org/wiki/Character_encodings_in_HTML#HTML_character_references )
+ * * CSS Text Spec ( http://dev.w3.org/csswg/css-text/ )
+ * * unicode table ( http://unicode-table.com )
+ *
+ * TODO Web specs? IE?
+ *
  */
-bool isLegalUnicode(Unicode u);
+inline bool is_illegal_unicode(Unicode c)
+{
+    return (c < 0x20) || (c >= 0x7F && c <= 0xA0) || (c == 0xAD)              // NUL up to (not incl.) SP; DEL..NBSP; SHY
+            || (c == 0x061C) || (c == 0x1361)                                 // ALM; Ethiopic word space
+            || (c >= 0x200B && c <= 0x200F) || (c == 0x2028) || (c == 0x2029) // ZWSP..RLM; LSEP; PSEP
+            || (c >= 0x202A && c <= 0x202E) || (c >= 0x2066 && c <= 0x2069)   // LRE..RLO; LRI..PDI
+            || (c >= 0xD800 && c <= 0xDFFF) || (c == 0xFEFF) || (c == 0xFFFC) // surrogates; ZWNBSP; ORC
+            || (c == 0xFFFE) || (c == 0xFFFF);                                // non-characters
+}
 
 Unicode map_to_private(CharCode code);