diff --git a/ChangeLog b/ChangeLog index aa453f0..fc961be 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,12 +1,19 @@ Developing v0.9 -* Lazy loading of entire pages +* Lazy loading of pages +* Show font names in debug messages * Licensed changed - Additional terms for usage in online services - Remove GPLv2 +* Bug fixes: + - --optimize-text + - Always use Unicode encoding for fonts + - space width + - disable ligature in Firefox * New options: --embed --embed-*** + --override-fstype * Deprecated/Removed options: --single-html --remove-unused-glyph diff --git a/src/util/unicode.cc b/src/util/unicode.cc index e7c96ee..d373de2 100644 --- a/src/util/unicode.cc +++ b/src/util/unicode.cc @@ -21,44 +21,64 @@ using std::ostream; * Test legal for HTML * * A legal unicode character should be accepted by browsers, and displayed correctly. + * Many unicode codes have special meanings which will be 'interpreted' by the browser; those should be filtered since they are not interpreted in PDF * This function is not complete, just to be improved. */ bool isLegalUnicode(Unicode u) { + const Unicode max_small_unicode = 1024; + static bool valid_small_unicode[max_small_unicode]; + static bool valid_small_unicode_init = false; + if(!valid_small_unicode_init) + { + valid_small_unicode_init = true; + Unicode uu = 0; + + /* + * 9, 10 and 13 are interpreted as white-spaces in HTML + * `word-spacing` may be applied on them + * and the browser may not use the actual glyphs in the font + * So mark them as illegal + * + * The problem is that the correct value can not be copied out in this way + */ + while(uu <= 31) + valid_small_unicode[uu++] = false; + + /* + * 127-159 are not valid + * 160, or 0xa0 is NBSP, which is legal in HTML + * But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP + * Again, `word-spacing` is applied. 
+ * So mark it as illegal + * + * And the same problem as above, this character can no longer be copied out + */ + while(uu < 127) + valid_small_unicode[uu++] = true; + while(uu <= 160) + valid_small_unicode[uu++] = false; + + /* + * 173, or 0xad, the soft hyphen + * which can be ignored by the browser in the middle of a line + */ + while(uu < 173) + valid_small_unicode[uu++] = true; + while(uu <= 173) + valid_small_unicode[uu++] = false; + + + while(uu < max_small_unicode) + valid_small_unicode[uu++] = true; + } + + if(u < max_small_unicode) + return valid_small_unicode[u]; + /* - * These characters are interpreted as white-spaces in HTML - * `word-spacing` may be applied on them - * and the browser may not use the actualy glyphs in the font - * So mark them as illegal - * - * The problem is that the correct value can not be copied out in this way + * Reserved code for utf-16 */ - /* - if((u == 9) || (u == 10) || (u == 13)) - return true; - */ - - if(u <= 31) - return false; - - /* - * 160, or 0xa0 is NBSP, which is legal in HTML - * But some browser will use the glyph for ' ' in the font, it there is one, instead of the glyphs for NBSP - * Again, `word-spacing` is applied. - * So mark it as illegal - * - * And the same problem as above, this character can no longer be copied out - */ - if((u >= 127) && (u <= 160)) - return false; - - /* - * 173, or 0xad, is the soft hyphen - * which can be ignored by the browser in the middle of a line - */ - if(u == 173) - return false; - if((u >= 0xd800) && (u <= 0xdfff)) return false;