changelog; new function for checking valid unicode values

2024-12-22 04:50:09 +00:00 · 2013-07-10 11:40:11 +08:00 · 2013-07-10 11:40:11 +08:00 · 4ce7fa3400
commit 4ce7fa3400
parent 0044e9b17c
2 changed files with 60 additions and 33 deletions
--- a/9
+++ b/9
@ -1,12 +1,19 @@
 Developing v0.9

-* Lazy loading of entire pages
+* Lazy loading of pages
+* Show font names in debug messages
 * Licensed changed
 - Additional terms for usage in online services
 - Remove GPLv2
+* Bug fixes:
+ - --optimize-text
+ - Always use Unicode encoding for fonts
+ - space width
+ - disable ligature in Firefox
 * New options:
 --embed
 --embed-***
+ --override-fstype
 * Deprecated/Removed options:
 --single-html
 --remove-unused-glyph
--- a/src/util/unicode.cc
+++ b/src/util/unicode.cc
@ -21,44 +21,64 @@ using std::ostream;
 * Test legal for HTML 
 * 
 * A legal unicode character should be accepted by browsers, and displayed correctly.
+ * Many Unicode code points have special meanings that are 'interpreted' by the browser; those should be filtered out, since they are not interpreted that way in PDF
 * This function is not complete, just to be improved.
 */
 bool isLegalUnicode(Unicode u)
 {
+    const Unicode max_small_unicode = 1024;
+    static bool valid_small_unicode[max_small_unicode];
+    static bool valid_small_unicode_init = false;
+    if(!valid_small_unicode_init)
+    {
+        valid_small_unicode_init = true;
+        Unicode uu = 0;
+
+        /*
+         * 9, 10 and 13 are interpreted as white-spaces in HTML
+         * `word-spacing` may be applied on them
+         * and the browser may not use the actual glyphs in the font
+         * So mark them as illegal
+         *
+         * The problem is that the correct value can not be copied out in this way
+         */
+        while(uu <= 31)
+            valid_small_unicode[uu++] = false;
+
+        /*
+         * 127-159 are not valid
+         * 160, or 0xa0 is NBSP, which is legal in HTML
+         * But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
+         * Again, `word-spacing` is applied.
+         * So mark it as illegal
+         *
+         * And the same problem as above, this character can no longer be copied out
+         */
+        while(uu < 127)
+            valid_small_unicode[uu++] = true;
+        while(uu <= 160)
+            valid_small_unicode[uu++] = false;
+        
+        /*
+         * 173, or 0xad, the soft hyphen
+         * which can be ignored by the browser in the middle of a line
+         */
+        while(uu < 173)
+            valid_small_unicode[uu++] = true;
+        while(uu <= 173)
+            valid_small_unicode[uu++] = false;
+
+
+        while(uu < max_small_unicode)
+            valid_small_unicode[uu++] = true;
+    }
+
+    if(u < max_small_unicode)
+        return valid_small_unicode[u];
+
    /*
-     * These characters are interpreted as white-spaces in HTML
-     * `word-spacing` may be applied on them
-     * and the browser may not use the actualy glyphs in the font
-     * So mark them as illegal
-     *
-     * The problem is that the correct value can not be copied out in this way
+     * Reserved code for utf-16
     */
-    /*
-    if((u == 9) || (u == 10) || (u == 13))
-        return true;
-        */
-
-    if(u <= 31) 
-        return false;
-
-    /*
-     * 160, or 0xa0 is NBSP, which is legal in HTML
-     * But some browser will use the glyph for ' ' in the font, it there is one, instead of the glyphs for NBSP
-     * Again, `word-spacing` is applied.
-     * So mark it as illegal
-     *
-     * And the same problem as above, this character can no longer be copied out
-     */
-    if((u >= 127) && (u <= 160))
-        return false;
-
-    /*
-     * 173, or 0xad, is the soft hyphen
-     * which can be ignored by the browser in the middle of a line
-     */
-    if(u == 173)
-        return false;
-
    if((u >= 0xd800) && (u <= 0xdfff))
        return false;