mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
workaround for NBSP
This commit is contained in:
parent
06772b3b9f
commit
cf5507f997
@ -113,9 +113,9 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
html_text_page.get_cur_line()->append_unicodes(&uu, 1);
|
||||
/*
|
||||
* In PDF, word_space is appended if (n == 1 and *p == ' ')
|
||||
* but in HTML, word_space is appended if (uu == ' ' || 0xa0)
|
||||
* but in HTML, word_space is appended if (uu == ' ')
|
||||
*/
|
||||
int space_count = (is_space ? 1 : 0) - ((uu == ' ' || uu == 0xa0) ? 1 : 0);
|
||||
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
|
||||
if(space_count != 0)
|
||||
{
|
||||
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
|
||||
|
@ -403,8 +403,7 @@ void HTMLTextLine::optimize()
|
||||
// get the text segment covered by current state (*state_iter1)
|
||||
const auto text_iter1 = text.begin() + text_idx1;
|
||||
const auto text_iter2 = text.begin() + text_idx2;
|
||||
const static std::vector<Unicode> space_chars = { 0x20, 0xa0 };
|
||||
if(find_first_of(text_iter1, text_iter2, space_chars.begin(), space_chars.end()) == text_iter2)
|
||||
if(find(text_iter1, text_iter2, ' ') == text_iter2)
|
||||
{
|
||||
// if there is not any space, we may change the value of word_space arbitrarily
|
||||
// note that we may only change word space, no offset will be affected
|
||||
|
@ -17,8 +17,17 @@ using std::cerr;
|
||||
using std::endl;
|
||||
using std::ostream;
|
||||
|
||||
/* Test legal for HTML */
|
||||
bool isLegalUnicode(Unicode u)
|
||||
{
|
||||
/*
|
||||
* These characters are interpreted as white-spaces in HTML
|
||||
* `word-spacing` may be applied on them
|
||||
* and the browser may not use the actual glyphs in the font
|
||||
* So mark them as illegal
|
||||
*
|
||||
* The problem is that the correct value can not be copied out in this way
|
||||
*/
|
||||
/*
|
||||
if((u == 9) || (u == 10) || (u == 13))
|
||||
return true;
|
||||
@ -27,7 +36,15 @@ bool isLegalUnicode(Unicode u)
|
||||
if(u <= 31)
|
||||
return false;
|
||||
|
||||
if((u >= 127) && (u <= 159))
|
||||
/*
|
||||
* 160, or 0xa0 is NBSP, which is legal in HTML
|
||||
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
|
||||
* Again, `word-spacing` is applied.
|
||||
* So mark it as illegal
|
||||
*
|
||||
* And the same problem as above, this character can no longer be copied out
|
||||
*/
|
||||
if((u >= 127) && (u <= 160))
|
||||
return false;
|
||||
|
||||
if((u >= 0xd800) && (u <= 0xdfff))
|
||||
|
Loading…
Reference in New Issue
Block a user