changelog; new function for checking valid unicode values

2024-12-22 13:00:08 +00:00 · 2013-07-10 11:40:11 +08:00 · 2013-07-10 11:40:11 +08:00 · 4ce7fa3400
commit 4ce7fa3400
parent 0044e9b17c
2 changed files with 60 additions and 33 deletions
--- a/9
+++ b/9
@ -1,12 +1,19 @@
 Developing v0.9
-* Lazy loading of entire pages
+* Lazy loading of pages
 * Show font names in debug messages
 * License changed
 - Additional terms for usage in online services
 - Remove GPLv2
 * Bug fixes:
 - --optimize-text
 - Always use Unicode encoding for fonts
 - space width
 - disable ligature in Firefox
 * New options:
 --embed
 --embed-***
 --override-fstype
 * Deprecated/Removed options:
 --single-html
 --remove-unused-glyph
--- a/src/util/unicode.cc
+++ b/src/util/unicode.cc
@ -21,44 +21,64 @@ using std::ostream;
 * Test legal for HTML 
 * 
 * A legal unicode character should be accepted by browsers, and displayed correctly.
 * Many unicode codes have special meaning which will be 'interpreted' by the browser, those should be filtered since they are not interpreted in PDF
 * This function is not complete and remains to be improved.
 */
 bool isLegalUnicode(Unicode u)
 {
    const Unicode max_small_unicode = 1024;
    static bool valid_small_unicode[max_small_unicode];
    static bool valid_small_unicode_init = false;
    if(!valid_small_unicode_init)
    {
        valid_small_unicode_init = true;
        Unicode uu = 0;
        /*
         * 9, 10 and 13 are interpreted as white-spaces in HTML
         * `word-spacing` may be applied on them
         * and the browser may not use the actual glyphs in the font
         * So mark them as illegal
         *
         * The problem is that the correct value can not be copied out in this way
         */
        while(uu <= 31)
            valid_small_unicode[uu++] = false;
        /*
         * 127-159 are not valid
         * 160, or 0xa0 is NBSP, which is legal in HTML
         * But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
         * Again, `word-spacing` is applied.
         * So mark it as illegal
         *
         * And the same problem as above, this character can no longer be copied out
         */
        while(uu < 127)
            valid_small_unicode[uu++] = true;
        while(uu <= 160)
            valid_small_unicode[uu++] = false;
        /*
         * 173, or 0xad, the soft hyphen
         * which can be ignored by the browser in the middle of a line
         */
        while(uu < 173)
            valid_small_unicode[uu++] = true;
        while(uu <= 173)
            valid_small_unicode[uu++] = false;
        while(uu < max_small_unicode)
            valid_small_unicode[uu++] = true;
    }
    if(u < max_small_unicode)
        return valid_small_unicode[u];
    /*
-     * These characters are interpreted as white-spaces in HTML
+     * Reserved code for utf-16
     * `word-spacing` may be applied on them
     * and the browser may not use the actual glyphs in the font
     * So mark them as illegal
     *
     * The problem is that the correct value can not be copied out in this way
     */
    /*
    if((u == 9) || (u == 10) || (u == 13))
        return true;
        */
    if(u <= 31) 
        return false;
    /*
     * 160, or 0xa0 is NBSP, which is legal in HTML
     * But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
     * Again, `word-spacing` is applied.
     * So mark it as illegal
     *
     * And the same problem as above, this character can no longer be copied out
     */
    if((u >= 127) && (u <= 160))
        return false;
    /*
     * 173, or 0xad, is the soft hyphen
     * which can be ignored by the browser in the middle of a line
     */
    if(u == 173)
        return false;
    if((u >= 0xd800) && (u <= 0xdfff))
        return false;