1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

changelog; new function for checking valid unicode values

This commit is contained in:
Lu Wang 2013-07-10 11:40:11 +08:00
parent 0044e9b17c
commit 4ce7fa3400
2 changed files with 60 additions and 33 deletions

View File

@ -1,12 +1,19 @@
Developing v0.9 Developing v0.9
* Lazy loading of entire pages * Lazy loading of pages
* Show font names in debug messages
* License changed * License changed
- Additional terms for usage in online services - Additional terms for usage in online services
- Remove GPLv2 - Remove GPLv2
* Bug fixes:
- --optimize-text
- Always use Unicode encoding for fonts
- space width
- disable ligature in Firefox
* New options: * New options:
--embed --embed
--embed-*** --embed-***
--override-fstype
* Deprecated/Removed options: * Deprecated/Removed options:
--single-html --single-html
--remove-unused-glyph --remove-unused-glyph

View File

@ -21,44 +21,64 @@ using std::ostream;
* Test legal for HTML * Test legal for HTML
* *
* A legal unicode character should be accepted by browsers, and displayed correctly. * A legal unicode character should be accepted by browsers, and displayed correctly.
* Many unicode codes have special meanings which will be 'interpreted' by the browser; those should be filtered out, since they are not interpreted in PDF
* This function is not complete, just to be improved. * This function is not complete, just to be improved.
*/ */
bool isLegalUnicode(Unicode u) bool isLegalUnicode(Unicode u)
{ {
const Unicode max_small_unicode = 1024;
static bool valid_small_unicode[max_small_unicode];
static bool valid_small_unicode_init = false;
if(!valid_small_unicode_init)
{
valid_small_unicode_init = true;
Unicode uu = 0;
/*
* 9, 10 and 13 are interpreted as white-spaces in HTML
* `word-spacing` may be applied on them
* and the browser may not use the actual glyphs in the font
* So mark them as illegal
*
* The problem is that the correct value can not be copied out in this way
*/
while(uu <= 31)
valid_small_unicode[uu++] = false;
/*
* 127-159 are control characters, which are not valid
* 160, or 0xa0 is NBSP, which is legal in HTML
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
* Again, `word-spacing` is applied.
* So mark it as illegal
*
* And the same problem as above, this character can no longer be copied out
*/
while(uu < 127)
valid_small_unicode[uu++] = true;
while(uu <= 160)
valid_small_unicode[uu++] = false;
/*
* 173, or 0xad, the soft hyphen
* which can be ignored by the browser in the middle of a line
*/
while(uu < 173)
valid_small_unicode[uu++] = true;
while(uu <= 173)
valid_small_unicode[uu++] = false;
while(uu < max_small_unicode)
valid_small_unicode[uu++] = true;
}
if(u < max_small_unicode)
return valid_small_unicode[u];
/* /*
* These characters are interpreted as white-spaces in HTML * Reserved code for utf-16
* `word-spacing` may be applied on them
* and the browser may not use the actual glyphs in the font
* So mark them as illegal
*
* The problem is that the correct value can not be copied out in this way
*/ */
/*
if((u == 9) || (u == 10) || (u == 13))
return true;
*/
if(u <= 31)
return false;
/*
* 160, or 0xa0 is NBSP, which is legal in HTML
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
* Again, `word-spacing` is applied.
* So mark it as illegal
*
* And the same problem as above, this character can no longer be copied out
*/
if((u >= 127) && (u <= 160))
return false;
/*
* 173, or 0xad, is the soft hyphen
* which can be ignored by the browser in the middle of a line
*/
if(u == 173)
return false;
if((u >= 0xd800) && (u <= 0xdfff)) if((u >= 0xd800) && (u <= 0xdfff))
return false; return false;