From 66231996fdedafc395e938e4f0625ea5358f2ac6 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Wed, 18 Mar 2015 18:22:15 +0800 Subject: [PATCH] Improve checking of illegal unicode in HTML, matching Webkit and Firefox's implementation. --- src/HTMLRenderer/text.cc | 4 +-- src/util/unicode.cc | 77 ++-------------------------------------- src/util/unicode.h | 55 +++++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 81 deletions(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 887ab3a..e58a17a 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -19,7 +19,7 @@ namespace pdf2htmlEX { -using std::all_of; +using std::none_of; using std::cerr; using std::endl; @@ -107,7 +107,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) } else { - if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) + if((param.decompose_ligature) && (uLen > 1) && none_of(u, u+uLen, is_illegal_unicode)) { html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx); } diff --git a/src/util/unicode.cc b/src/util/unicode.cc index 8e278a3..4a2a034 100644 --- a/src/util/unicode.cc +++ b/src/util/unicode.cc @@ -18,79 +18,6 @@ using std::cerr; using std::endl; using std::ostream; -/* - * Test legal for HTML - * - * A legal unicode character should be accepted by browsers, and displayed correctly. - * Many unicode codes have special meaning which will be 'interpreted' by the browser, those should be filtered since they are not interpreted in PDF - * This function is not complete, just to be improved. 
- */ -bool isLegalUnicode(Unicode u) -{ - const Unicode max_small_unicode = 1024; - static bool valid_small_unicode[max_small_unicode]; - static bool valid_small_unicode_init = false; - if(!valid_small_unicode_init) - { - valid_small_unicode_init = true; - Unicode uu = 0; - - /* - * 9, 10 and 13 are interpreted as white-spaces in HTML - * `word-spacing` may be applied on them - * and the browser may not use the actual glyphs in the font - * So mark them as illegal - * - * The problem is that the correct value can not be copied out in this way - */ - while(uu <= 31) - valid_small_unicode[uu++] = false; - - /* - * 127-159 are not invalid - * 160, or 0xa0 is NBSP, which is legal in HTML - * But some browser will use the glyph for ' ' in the font, it there is one, instead of the glyphs for NBSP - * Again, `word-spacing` is applied. - * So mark it as illegal - * - * And the same problem as above, this character can no longer be copied out - */ - while(uu < 127) - valid_small_unicode[uu++] = true; - while(uu <= 160) - valid_small_unicode[uu++] = false; - - /* - * 173, or 0xad, the soft hyphen - * which can be ignored by the browser in the middle of a line - */ - while(uu < 173) - valid_small_unicode[uu++] = true; - while(uu <= 173) - valid_small_unicode[uu++] = false; - - - while(uu < max_small_unicode) - valid_small_unicode[uu++] = true; - } - - if(u < max_small_unicode) - return valid_small_unicode[u]; - - - // * U+200B: zero width space - if(u == 0x200b) return false; - - // * U+2029: paragraph separator - if(u == 0x2029) return false; - - // Reserved code for utf-16 - if((u >= 0xd800) && (u <= 0xdfff)) - return false; - - return true; -} - Unicode map_to_private(CharCode code) { Unicode private_mapping = (Unicode)(code + 0xE000); @@ -118,7 +45,7 @@ Unicode unicode_from_font (CharCode code, GfxFont * font) if(cname) { Unicode ou = globalParams->mapNameToUnicodeText(cname); - if(isLegalUnicode(ou)) + if(!is_illegal_unicode(ou)) return ou; } } @@ -133,7 +60,7 @@ Unicode 
check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) if(len == 1) { - if(isLegalUnicode(*u)) + if(!is_illegal_unicode(*u)) return *u; } diff --git a/src/util/unicode.h b/src/util/unicode.h index 8a04195..2100695 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -13,11 +13,58 @@ namespace pdf2htmlEX { -/* - * Check if the unicode is valid for HTML - * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering +/** + * Check whether a unicode character is illegal for the output HTML. + * Unlike PDF readers, browsers have special treatment for such characters (normally treated as + * zero-width space), regardless of metrics and glyphs provided by fonts. So these characters + * should be mapped to unicode private area to "cheat" browsers, at the cost of losing actual + * unicode values in the HTML. + * + * The following chart shows illegal characters in HTML by webkit, mozilla, and pdf2htmlEX (p2h). + * pdf2htmlEX's illegal character set is the union of webkit's and mozilla's, plus illegal unicode + * characters. "[" and ")" surrounding ranges denote "inclusive" and "exclusive", respectively.
+ * + * 00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space) + * webkit: [--------------------------------) [------------------) [-] + * moz: [--------------------------------) [---------] [-] + * p2h: [--------------------------------) [------------------] [-] [-] [-] + * + * 200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RLO)--2066(LRI)--2069(PDI) + * webkit: [-----------------------------------------------] [----------] + * moz: [-] [----------] [-] [-] [----------] [------------] + * p2h: [-----------------------------------------------] [-] [-] [----------] [------------] + * + * D800(surrogate)--DFFF(surrogate)--FEFF(ZWNBSP)--FFFC(ORC)--FFFE(non-char)--FFFF(non-char) + * webkit: [-] [-] + * moz: + * p2h: [------------------] [-] [-] [-----------------] + * + * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified, + * \n and \r can break lines, \t can shift text, so they are considered illegal. + * + * Resources (retrieved on 2015-03-16) + * * webkit + * * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 ) + * * treatAsZeroWidthSpace( https://github.com/WebKit/webkit/blob/17bbff7400393e9389b40cc84ce005f7cc954680/Source/WebCore/platform/graphics/FontCascade.h#L272 ) + * * mozilla + * * IsInvalidChar( http://mxr.mozilla.org/mozilla-central/source/gfx/thebes/gfxTextRun.cpp#1973 ) + * * IsBidiControl( http://mxr.mozilla.org/mozilla-central/source/intl/unicharutil/util/nsBidiUtils.h#114 ) + * * Character encodings in HTML ( http://en.wikipedia.org/wiki/Character_encodings_in_HTML#HTML_character_references ) + * * CSS Text Spec ( http://dev.w3.org/csswg/css-text/ ) + * * unicode table ( http://unicode-table.com ) + * + * TODO Web specs? IE?
+ *
  */
-bool isLegalUnicode(Unicode u);
+inline bool is_illegal_unicode(Unicode c)
+{
+    if(c <= 0xA0)   return (c < 0x20) || (c >= 0x7F);                          // below SP, or DEL through NBSP
+    if(c <  0x200B) return (c == 0xAD) || (c == 0x061C) || (c == 0x1361);      // SHY, ALM, Ethiopic word space
+    if(c <= 0x2069) return (c <= 0x200F) || (c >= 0x2028 && c <= 0x202E) || (c >= 0x2066); // ZWSP..RLM, LSEP..RLO, LRI..PDI
+    if(c <  0xD800) return false;                                              // no illegal code points in this gap
+    if(c <= 0xDFFF) return true;                                               // UTF-16 surrogates
+    return (c == 0xFEFF) || (c == 0xFFFC) || (c == 0xFFFE) || (c == 0xFFFF);   // ZWNBSP, ORC, non-characters
+}
 
 Unicode map_to_private(CharCode code);