mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
changelog; new function for checking valid unicode values
This commit is contained in:
parent
0044e9b17c
commit
4ce7fa3400
@ -1,12 +1,19 @@
|
||||
Developing v0.9
|
||||
|
||||
* Lazy loading of entire pages
|
||||
* Lazy loading of pages
|
||||
* Show font names in debug messages
|
||||
* License changed
|
||||
- Additional terms for usage in online services
|
||||
- Remove GPLv2
|
||||
* Bug fixes:
|
||||
- --optimize-text
|
||||
- Always use Unicode encoding for fonts
|
||||
- space width
|
||||
- disable ligature in Firefox
|
||||
* New options:
|
||||
--embed
|
||||
--embed-***
|
||||
--override-fstype
|
||||
* Deprecated/Removed options:
|
||||
--single-html
|
||||
--remove-unused-glyph
|
||||
|
@ -21,27 +21,32 @@ using std::ostream;
|
||||
* Test legal for HTML
|
||||
*
|
||||
* A legal unicode character should be accepted by browsers, and displayed correctly.
|
||||
* Many Unicode code points have special meanings that are 'interpreted' by the browser; those should be filtered out, since they are not interpreted in PDF
|
||||
* This function is not complete and is yet to be improved.
|
||||
*/
|
||||
bool isLegalUnicode(Unicode u)
|
||||
{
|
||||
const Unicode max_small_unicode = 1024;
|
||||
static bool valid_small_unicode[max_small_unicode];
|
||||
static bool valid_small_unicode_init = false;
|
||||
if(!valid_small_unicode_init)
|
||||
{
|
||||
valid_small_unicode_init = true;
|
||||
Unicode uu = 0;
|
||||
|
||||
/*
|
||||
* These characters are interpreted as white-spaces in HTML
|
||||
* 9, 10 and 13 are interpreted as white-spaces in HTML
|
||||
* `word-spacing` may be applied on them
|
||||
* and the browser may not use the actual glyphs in the font
|
||||
* So mark them as illegal
|
||||
*
|
||||
* The problem is that the correct value can not be copied out in this way
|
||||
*/
|
||||
/*
|
||||
if((u == 9) || (u == 10) || (u == 13))
|
||||
return true;
|
||||
*/
|
||||
|
||||
if(u <= 31)
|
||||
return false;
|
||||
while(uu <= 31)
|
||||
valid_small_unicode[uu++] = false;
|
||||
|
||||
/*
|
||||
* 127-159 are not valid
|
||||
* 160, or 0xa0 is NBSP, which is legal in HTML
|
||||
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
|
||||
* Again, `word-spacing` is applied.
|
||||
@ -49,16 +54,31 @@ bool isLegalUnicode(Unicode u)
|
||||
*
|
||||
* And the same problem as above, this character can no longer be copied out
|
||||
*/
|
||||
if((u >= 127) && (u <= 160))
|
||||
return false;
|
||||
while(uu < 127)
|
||||
valid_small_unicode[uu++] = true;
|
||||
while(uu <= 160)
|
||||
valid_small_unicode[uu++] = false;
|
||||
|
||||
/*
|
||||
* 173, or 0xad, is the soft hyphen
|
||||
* 173, or 0xad, the soft hyphen
|
||||
* which can be ignored by the browser in the middle of a line
|
||||
*/
|
||||
if(u == 173)
|
||||
return false;
|
||||
while(uu < 173)
|
||||
valid_small_unicode[uu++] = true;
|
||||
while(uu <= 173)
|
||||
valid_small_unicode[uu++] = false;
|
||||
|
||||
|
||||
while(uu < max_small_unicode)
|
||||
valid_small_unicode[uu++] = true;
|
||||
}
|
||||
|
||||
if(u < max_small_unicode)
|
||||
return valid_small_unicode[u];
|
||||
|
||||
/*
|
||||
* Reserved code for utf-16
|
||||
*/
|
||||
if((u >= 0xd800) && (u <= 0xdfff))
|
||||
return false;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user