1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

workaround for NBSP

This commit is contained in:
Lu Wang 2013-05-02 14:32:17 +08:00
parent 06772b3b9f
commit cf5507f997
3 changed files with 21 additions and 5 deletions

View File

@ -113,9 +113,9 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
html_text_page.get_cur_line()->append_unicodes(&uu, 1);
/*
* In PDF, word_space is appended if (n == 1 and *p == ' ')
* but in HTML, word_space is appended if (uu == ' ' || 0xa0)
* but in HTML, word_space is appended if (uu == ' ')
*/
int space_count = (is_space ? 1 : 0) - ((uu == ' ' || uu == 0xa0) ? 1 : 0);
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
if(space_count != 0)
{
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);

View File

@ -403,8 +403,7 @@ void HTMLTextLine::optimize()
// get the text segment covered by current state (*state_iter1)
const auto text_iter1 = text.begin() + text_idx1;
const auto text_iter2 = text.begin() + text_idx2;
const static std::vector<Unicode> space_chars = { 0x20, 0xa0 };
if(find_first_of(text_iter1, text_iter2, space_chars.begin(), space_chars.end()) == text_iter2)
if(find(text_iter1, text_iter2, ' ') == text_iter2)
{
// if there is not any space, we may change the value of word_space arbitrarily
// note that we may only change word space, no offset will be affected

View File

@ -17,8 +17,17 @@ using std::cerr;
using std::endl;
using std::ostream;
/* Test legal for HTML */
bool isLegalUnicode(Unicode u)
{
/*
* These characters are interpreted as white-spaces in HTML
* `word-spacing` may be applied on them
* and the browser may not use the actual glyphs in the font
* So mark them as illegal
*
* The problem is that the correct value can not be copied out in this way
*/
/*
if((u == 9) || (u == 10) || (u == 13))
return true;
@ -27,7 +36,15 @@ bool isLegalUnicode(Unicode u)
if(u <= 31)
return false;
if((u >= 127) && (u <= 159))
/*
* 160, or 0xa0 is NBSP, which is legal in HTML
* But some browsers will use the glyph for ' ' in the font, if there is one, instead of the glyph for NBSP
* Again, `word-spacing` is applied.
* So mark it as illegal
*
* And the same problem as above, this character can no longer be copied out
*/
if((u >= 127) && (u <= 160))
return false;
if((u >= 0xd800) && (u <= 0xdfff))