changelog; new function for checking valid unicode values

2024-12-22 13:00:08 +00:00 · 2013-07-10 11:40:11 +08:00 · 2013-07-10 11:40:11 +08:00 · 4ce7fa3400
commit 4ce7fa3400
parent 0044e9b17c
2 changed files with 60 additions and 33 deletions
--- a/9
+++ b/9
@ -1,12 +1,19 @@
 Developing v0.9

-* Lazy loading of entire pages
+* Lazy loading of pages
+* Show font names in debug messages
 * License changed
 - Additional terms for usage in online services
 - Remove GPLv2
+* Bug fixes:
+ - --optimize-text
+ - Always use Unicode encoding for fonts
+ - space width
+ - disable ligature in Firefox
 * New options:
 --embed
 --embed-***
+ --override-fstype
 * Deprecated/Removed options:
 --single-html
 --remove-unused-glyph
--- a/src/util/unicode.cc
+++ b/src/util/unicode.cc
@ -21,27 +21,32 @@ using std::ostream;
 * Test legal for HTML 
 * 
 * A legal unicode character should be accepted by browsers, and displayed correctly.
+ * Many Unicode code points have special meanings that are 'interpreted' by the browser; those should be filtered out, since they are not interpreted in PDF.
 * This function is not complete and is still to be improved.
 */
 bool isLegalUnicode(Unicode u)
 {
+    const Unicode max_small_unicode = 1024;
+    static bool valid_small_unicode[max_small_unicode];
+    static bool valid_small_unicode_init = false;
+    if(!valid_small_unicode_init)
+    {
+        valid_small_unicode_init = true;
+        Unicode uu = 0;
+
        /*
-     * These characters are interpreted as white-spaces in HTML
+         * 9, 10 and 13 are interpreted as white-spaces in HTML
         * `word-spacing` may be applied on them
         * and the browser may not use the actual glyphs in the font
         * So mark them as illegal
         *
         * The problem is that the correct value can not be copied out in this way
         */
-    /*
-    if((u == 9) || (u == 10) || (u == 13))
-        return true;
-        */
-
-    if(u <= 31) 
-        return false;
+        while(uu <= 31)
+            valid_small_unicode[uu++] = false;

        /*
+         * 127-159 are not valid
         * 160, or 0xa0 is NBSP, which is legal in HTML
         * But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
         * Again, `word-spacing` is applied.
@ -49,16 +54,31 @@ bool isLegalUnicode(Unicode u)
         *
         * And the same problem as above, this character can no longer be copied out
         */
-    if((u >= 127) && (u <= 160))
-        return false;
+        while(uu < 127)
+            valid_small_unicode[uu++] = true;
+        while(uu <= 160)
+            valid_small_unicode[uu++] = false;
        
        /*
-     * 173, or 0xad, is the soft hyphen
+         * 173, or 0xad, the soft hyphen
         * which can be ignored by the browser in the middle of a line
         */
-    if(u == 173)
-        return false;
+        while(uu < 173)
+            valid_small_unicode[uu++] = true;
+        while(uu <= 173)
+            valid_small_unicode[uu++] = false;

+
+        while(uu < max_small_unicode)
+            valid_small_unicode[uu++] = true;
+    }
+
+    if(u < max_small_unicode)
+        return valid_small_unicode[u];
+
+    /*
+     * Reserved code points for UTF-16 surrogates
+     */
    if((u >= 0xd800) && (u <= 0xdfff))
        return false;