mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
changelog; new function for checking valid unicode values
This commit is contained in:
parent
0044e9b17c
commit
4ce7fa3400
@ -1,12 +1,19 @@
|
|||||||
Developing v0.9
|
Developing v0.9
|
||||||
|
|
||||||
* Lazy loading of entire pages
|
* Lazy loading of pages
|
||||||
|
* Show font names in debug messages
|
||||||
* License changed
|
* License changed
|
||||||
- Additional terms for usage in online services
|
- Additional terms for usage in online services
|
||||||
- Remove GPLv2
|
- Remove GPLv2
|
||||||
|
* Bug fixes:
|
||||||
|
- --optimize-text
|
||||||
|
- Always use Unicode encoding for fonts
|
||||||
|
- space width
|
||||||
|
- disable ligature in Firefox
|
||||||
* New options:
|
* New options:
|
||||||
--embed
|
--embed
|
||||||
--embed-***
|
--embed-***
|
||||||
|
--override-fstype
|
||||||
* Deprecated/Removed options:
|
* Deprecated/Removed options:
|
||||||
--single-html
|
--single-html
|
||||||
--remove-unused-glyph
|
--remove-unused-glyph
|
||||||
|
@ -21,44 +21,64 @@ using std::ostream;
|
|||||||
* Test legal for HTML
|
* Test legal for HTML
|
||||||
*
|
*
|
||||||
* A legal unicode character should be accepted by browsers, and displayed correctly.
|
* A legal unicode character should be accepted by browsers, and displayed correctly.
|
||||||
|
* Many Unicode code points have special meanings that are 'interpreted' by the browser; those should be filtered out since they are not interpreted in PDF
|
||||||
* This function is not complete, just to be improved.
|
* This function is not complete, just to be improved.
|
||||||
*/
|
*/
|
||||||
bool isLegalUnicode(Unicode u)
|
bool isLegalUnicode(Unicode u)
|
||||||
{
|
{
|
||||||
|
const Unicode max_small_unicode = 1024;
|
||||||
|
static bool valid_small_unicode[max_small_unicode];
|
||||||
|
static bool valid_small_unicode_init = false;
|
||||||
|
if(!valid_small_unicode_init)
|
||||||
|
{
|
||||||
|
valid_small_unicode_init = true;
|
||||||
|
Unicode uu = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 9, 10 and 13 are interpreted as white-spaces in HTML
|
||||||
|
* `word-spacing` may be applied on them
|
||||||
|
* and the browser may not use the actual glyphs in the font
|
||||||
|
* So mark them as illegal
|
||||||
|
*
|
||||||
|
* The problem is that the correct value can not be copied out in this way
|
||||||
|
*/
|
||||||
|
while(uu <= 31)
|
||||||
|
valid_small_unicode[uu++] = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 127-159 are not valid
|
||||||
|
* 160, or 0xa0 is NBSP, which is legal in HTML
|
||||||
|
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
|
||||||
|
* Again, `word-spacing` is applied.
|
||||||
|
* So mark it as illegal
|
||||||
|
*
|
||||||
|
* And the same problem as above, this character can no longer be copied out
|
||||||
|
*/
|
||||||
|
while(uu < 127)
|
||||||
|
valid_small_unicode[uu++] = true;
|
||||||
|
while(uu <= 160)
|
||||||
|
valid_small_unicode[uu++] = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 173, or 0xad, the soft hyphen
|
||||||
|
* which can be ignored by the browser in the middle of a line
|
||||||
|
*/
|
||||||
|
while(uu < 173)
|
||||||
|
valid_small_unicode[uu++] = true;
|
||||||
|
while(uu <= 173)
|
||||||
|
valid_small_unicode[uu++] = false;
|
||||||
|
|
||||||
|
|
||||||
|
while(uu < max_small_unicode)
|
||||||
|
valid_small_unicode[uu++] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(u < max_small_unicode)
|
||||||
|
return valid_small_unicode[u];
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* These characters are interpreted as white-spaces in HTML
|
* Reserved code for utf-16
|
||||||
* `word-spacing` may be applied on them
|
|
||||||
* and the browser may not use the actual glyphs in the font
|
|
||||||
* So mark them as illegal
|
|
||||||
*
|
|
||||||
* The problem is that the correct value can not be copied out in this way
|
|
||||||
*/
|
*/
|
||||||
/*
|
|
||||||
if((u == 9) || (u == 10) || (u == 13))
|
|
||||||
return true;
|
|
||||||
*/
|
|
||||||
|
|
||||||
if(u <= 31)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 160, or 0xa0 is NBSP, which is legal in HTML
|
|
||||||
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
|
|
||||||
* Again, `word-spacing` is applied.
|
|
||||||
* So mark it as illegal
|
|
||||||
*
|
|
||||||
* And the same problem as above, this character can no longer be copied out
|
|
||||||
*/
|
|
||||||
if((u >= 127) && (u <= 160))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 173, or 0xad, is the soft hyphen
|
|
||||||
* which can be ignored by the browser in the middle of a line
|
|
||||||
*/
|
|
||||||
if(u == 173)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if((u >= 0xd800) && (u <= 0xdfff))
|
if((u >= 0xd800) && (u <= 0xdfff))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user