mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
Improve checking of illegal unicode in HTML, matching Webkit and Firefox's implementation.
This commit is contained in:
parent
5a66ac5a55
commit
66231996fd
@ -19,7 +19,7 @@
|
|||||||
|
|
||||||
namespace pdf2htmlEX {
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::all_of;
|
using std::none_of;
|
||||||
using std::cerr;
|
using std::cerr;
|
||||||
using std::endl;
|
using std::endl;
|
||||||
|
|
||||||
@ -107,7 +107,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
|
if((param.decompose_ligature) && (uLen > 1) && none_of(u, u+uLen, is_illegal_unicode))
|
||||||
{
|
{
|
||||||
html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
|
html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
|
||||||
}
|
}
|
||||||
|
@ -18,79 +18,6 @@ using std::cerr;
|
|||||||
using std::endl;
|
using std::endl;
|
||||||
using std::ostream;
|
using std::ostream;
|
||||||
|
|
||||||
/*
|
|
||||||
* Test legal for HTML
|
|
||||||
*
|
|
||||||
* A legal unicode character should be accepted by browsers, and displayed correctly.
|
|
||||||
* Many unicode codes have special meaning which will be 'interpreted' by the browser, those should be filtered since they are not interpreted in PDF
|
|
||||||
* This function is not complete, just to be improved.
|
|
||||||
*/
|
|
||||||
bool isLegalUnicode(Unicode u)
|
|
||||||
{
|
|
||||||
const Unicode max_small_unicode = 1024;
|
|
||||||
static bool valid_small_unicode[max_small_unicode];
|
|
||||||
static bool valid_small_unicode_init = false;
|
|
||||||
if(!valid_small_unicode_init)
|
|
||||||
{
|
|
||||||
valid_small_unicode_init = true;
|
|
||||||
Unicode uu = 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 9, 10 and 13 are interpreted as white-spaces in HTML
|
|
||||||
* `word-spacing` may be applied on them
|
|
||||||
* and the browser may not use the actual glyphs in the font
|
|
||||||
* So mark them as illegal
|
|
||||||
*
|
|
||||||
* The problem is that the correct value can not be copied out in this way
|
|
||||||
*/
|
|
||||||
while(uu <= 31)
|
|
||||||
valid_small_unicode[uu++] = false;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 127-159 are not valid
|
|
||||||
* 160, or 0xa0 is NBSP, which is legal in HTML
|
|
||||||
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
|
|
||||||
* Again, `word-spacing` is applied.
|
|
||||||
* So mark it as illegal
|
|
||||||
*
|
|
||||||
* And the same problem as above, this character can no longer be copied out
|
|
||||||
*/
|
|
||||||
while(uu < 127)
|
|
||||||
valid_small_unicode[uu++] = true;
|
|
||||||
while(uu <= 160)
|
|
||||||
valid_small_unicode[uu++] = false;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 173, or 0xad, the soft hyphen
|
|
||||||
* which can be ignored by the browser in the middle of a line
|
|
||||||
*/
|
|
||||||
while(uu < 173)
|
|
||||||
valid_small_unicode[uu++] = true;
|
|
||||||
while(uu <= 173)
|
|
||||||
valid_small_unicode[uu++] = false;
|
|
||||||
|
|
||||||
|
|
||||||
while(uu < max_small_unicode)
|
|
||||||
valid_small_unicode[uu++] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(u < max_small_unicode)
|
|
||||||
return valid_small_unicode[u];
|
|
||||||
|
|
||||||
|
|
||||||
// * U+200B: zero width space
|
|
||||||
if(u == 0x200b) return false;
|
|
||||||
|
|
||||||
// * U+2029: paragraph separator
|
|
||||||
if(u == 0x2029) return false;
|
|
||||||
|
|
||||||
// Reserved code for utf-16
|
|
||||||
if((u >= 0xd800) && (u <= 0xdfff))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
Unicode map_to_private(CharCode code)
|
Unicode map_to_private(CharCode code)
|
||||||
{
|
{
|
||||||
Unicode private_mapping = (Unicode)(code + 0xE000);
|
Unicode private_mapping = (Unicode)(code + 0xE000);
|
||||||
@ -118,7 +45,7 @@ Unicode unicode_from_font (CharCode code, GfxFont * font)
|
|||||||
if(cname)
|
if(cname)
|
||||||
{
|
{
|
||||||
Unicode ou = globalParams->mapNameToUnicodeText(cname);
|
Unicode ou = globalParams->mapNameToUnicodeText(cname);
|
||||||
if(isLegalUnicode(ou))
|
if(!is_illegal_unicode(ou))
|
||||||
return ou;
|
return ou;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -133,7 +60,7 @@ Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
|
|||||||
|
|
||||||
if(len == 1)
|
if(len == 1)
|
||||||
{
|
{
|
||||||
if(isLegalUnicode(*u))
|
if(!is_illegal_unicode(*u))
|
||||||
return *u;
|
return *u;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,11 +13,58 @@
|
|||||||
|
|
||||||
namespace pdf2htmlEX {
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* Check if the unicode is valid for HTML
|
* Check whether a unicode character is illegal for the output HTML.
|
||||||
* http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
|
* Unlike PDF readers, browsers have special treatment for such characters (normally treated as
|
||||||
|
* zero-width space), regardless of metrics and glyphs provided by fonts. So these characters
|
||||||
|
* should be mapped to unicode private area to "cheat" browsers, at the cost of losing actual
|
||||||
|
* unicode values in the HTML.
|
||||||
|
*
|
||||||
|
* The following chart shows illegal characters in HTML by webkit, mozilla, and pdf2htmlEX (p2h).
|
||||||
|
* pdf2htmlEX's illegal character set is the union of webkit's and mozilla's, plus illegal unicode
|
||||||
|
* characters. "[" and ")" surrounding ranges denote "inclusive" and "exclusive", respectively.
|
||||||
|
*
|
||||||
|
* 00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space)
|
||||||
|
* webkit: [--------------------------------) [------------------) [-]
|
||||||
|
* moz: [--------------------------------) [---------] [-]
|
||||||
|
* p2h: [--------------------------------) [------------------] [-] [-] [-]
|
||||||
|
*
|
||||||
|
* 200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RLO)--2066(LRI)--2069(PDI)
|
||||||
|
* webkit: [-----------------------------------------------] [----------]
|
||||||
|
* moz: [-] [----------] [-] [-] [----------] [------------]
|
||||||
|
* p2h: [-----------------------------------------------] [-] [-] [----------] [------------]
|
||||||
|
*
|
||||||
|
* D800(surrogate)--DFFF(surrogate)--FEFF(ZWNBSP)--FFFC(ORC)--FFFE(non-char)--FFFF(non-char)
|
||||||
|
* webkit: [-] [-]
|
||||||
|
* moz:
|
||||||
|
* p2h: [------------------] [-] [-] [-----------------]
|
||||||
|
*
|
||||||
|
* Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified,
|
||||||
|
* \n and \r can break line, \t can shift text, so they are considered illegal.
|
||||||
|
*
|
||||||
|
* Resources (retrieved at 2015-03-16)
|
||||||
|
* * webkit
|
||||||
|
* * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 )
|
||||||
|
* * treatAsZeroWidthSpace( https://github.com/WebKit/webkit/blob/17bbff7400393e9389b40cc84ce005f7cc954680/Source/WebCore/platform/graphics/FontCascade.h#L272 )
|
||||||
|
* * mozilla
|
||||||
|
* * IsInvalidChar( http://mxr.mozilla.org/mozilla-central/source/gfx/thebes/gfxTextRun.cpp#1973 )
|
||||||
|
* * IsBidiControl( http://mxr.mozilla.org/mozilla-central/source/intl/unicharutil/util/nsBidiUtils.h#114 )
|
||||||
|
* * Character encodings in HTML ( http://en.wikipedia.org/wiki/Character_encodings_in_HTML#HTML_character_references )
|
||||||
|
* * CSS Text Spec ( http://dev.w3.org/csswg/css-text/ )
|
||||||
|
* * unicode table ( http://unicode-table.com )
|
||||||
|
*
|
||||||
|
* TODO Web specs? IE?
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
bool isLegalUnicode(Unicode u);
|
inline bool is_illegal_unicode(Unicode c)
{
    // Everything below U+0020 (SP): NUL, \t, \n, \r and the other low control codes.
    if(c < 0x20)
        return true;

    // U+007F (DEL) through U+00A0 (NBSP), both ends inclusive.
    if(c >= 0x7F && c <= 0xA0)
        return true;

    // Inclusive ranges: U+200B (ZWSP)..U+200F (RLM), U+202A (LRE)..U+202E,
    // and U+2066 (LRI)..U+2069 (PDI).
    const bool in_format_range = (c >= 0x200B && c <= 0x200F)
                              || (c >= 0x202A && c <= 0x202E)
                              || (c >= 0x2066 && c <= 0x2069);
    if(in_format_range)
        return true;

    // UTF-16 surrogate code points, U+D800..U+DFFF.
    if(c >= 0xD800 && c <= 0xDFFF)
        return true;

    // Remaining isolated code points: SHY, ALM, Ethiopic word space, LSEP, PSEP,
    // ZWNBSP, ORC and the two non-characters U+FFFE / U+FFFF.
    return (c == 0xAD)
        || (c == 0x061C)
        || (c == 0x1361)
        || (c == 0x2028)
        || (c == 0x2029)
        || (c == 0xFEFF)
        || (c == 0xFFFC)
        || (c == 0xFFFE)
        || (c == 0xFFFF);
}
|
||||||
|
|
||||||
Unicode map_to_private(CharCode code);
|
Unicode map_to_private(CharCode code);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user