From 8d8a78f7941402f40c8765cc9de3d435701b554d Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 7 Aug 2012 15:03:06 +0800 Subject: [PATCH 1/4] moved to drawString --- src/HTMLRenderer.cc | 402 +++++++++++++++----------------------------- src/HTMLRenderer.h | 83 +++------ 2 files changed, 165 insertions(+), 320 deletions(-) diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index fdef4ff..e2ba118 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -45,58 +45,8 @@ * */ -TextString::TextString(GfxState *state) - :unicodes() - ,x(state->getCurX()) - ,y(state->getCurY()) - ,width(0),height(0) - ,state(state) -{ - state->transform(x,y,&x,&y); -} - -TextString::~TextString() -{ - delete state; - state = nullptr; -} - -void TextString::addChars(GfxState *state, double x, double y, - double dx, double dy, CharCode code, int nbytes) -{ - if(nbytes > 0) - { - CharCode mask = (0xffLL) << (8*(nbytes-1)); - while(nbytes > 0) - { - unicodes.push_back((Unicode)((code & mask) >> (8 * (nbytes-1)))); - --nbytes; - mask >>= 8; - } - } - - width += dx; - height += dy; -} - -void TextString::addUnicodes(GfxState *state, double x, double y, - double dx, double dy, Unicode * u, int uLen) -{ - /* - if (0 < u && u != 9 && u < 32) // skip non-printable not-tab character - return; - */ - - for(int i = 0; i < uLen; ++i) - unicodes.push_back(u[i]); - - width += dx; - height += dy; -} - - HTMLRenderer::HTMLRenderer(const Param * param) - :cur_string(nullptr), cur_line(nullptr) + :line_opened(false) ,html_fout(param->output_filename.c_str(), ofstream::binary), allcss_fout("all.css") ,param(param) { @@ -166,8 +116,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) this->pageWidth = state->getPageWidth(); this->pageHeight = state->getPageHeight(); - assert(cur_line == nullptr); - assert(cur_string == nullptr); + assert(!line_opened); html_fout << boost::format("
readable) html_fout << endl; } -bool HTMLRenderer::at_same_line(const TextString * ts1, const TextString * ts2) const -{ - // TODO, this is not accurate, with transforms - if(!(std::abs(ts1->getY() - ts2->getY()) < param->v_eps)) - return false; - - GfxState * s1 = ts1->getState(); - GfxState * s2 = ts2->getState(); - - // TODO, track this instead of check here - if(!(_equal(s1->getCharSpace(), s2->getCharSpace()) - && _equal(s1->getWordSpace(), s2->getWordSpace()) - && _equal(s1->getHorizScaling(), s2->getHorizScaling()))) - return false; - - return true; -} - void HTMLRenderer::close_cur_line() { - if(cur_line != nullptr) + if(line_opened) { html_fout << "
"; if(param->readable) html_fout << endl; - delete cur_line; - cur_line = nullptr; cur_line_x_offset = 0; + line_opened = false; } } -void HTMLRenderer::outputTextString(TextString * str) +void HTMLRenderer::outputUnicodes(const Unicode * u, int uLen) { - for (auto u : str->getUnicodes()) + for(int i = 0; i < uLen; ++i) { - switch(u) + switch(u[i]) { case '&': html_fout << "&"; break; - case '\"': - html_fout << """; + case '\"': + html_fout << """; break; case '\'': html_fout << "'"; @@ -252,189 +183,125 @@ void HTMLRenderer::outputTextString(TextString * str) default: { char buf[4]; - auto n = mapUTF8(u, buf, 4); - if(n > 0) - html_fout.write(buf, n); + auto n = mapUTF8(u[i], buf, 4); + html_fout.write(buf, n); } } } } -void HTMLRenderer::updateAll(GfxState *state) -{ - font_changed = true; - text_mat_changed = true; - ctm_changed = true; - pos_changed = true; - color_changed = true; -} - -void HTMLRenderer::updateFont(GfxState *state) -{ - font_changed = true; -} - -void HTMLRenderer::updateTextMat(GfxState * state) -{ - text_mat_changed = true; -} - -void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) -{ - ctm_changed = true; -} - -void HTMLRenderer::updateTextPos(GfxState * state) -{ - pos_changed = true; -} - -void HTMLRenderer::updateFillColor(GfxState * state) -{ - color_changed = true; -} - -void HTMLRenderer::beginString(GfxState *state, GooString *s) { - check_state_change(state); - - // TODO: remove this - GfxState * new_state = state->copy(gTrue); - - cur_string = new TextString(new_state); -} - -void HTMLRenderer::endString(GfxState *state) { - if (cur_string->getSize() == 0) { - delete cur_string ; - cur_string = nullptr; - return; - } - - // try to merge with last line - if(cur_line != nullptr) - { - if(at_same_line(cur_line, cur_string)) - { - // TODO: this is not correct - double x1 = cur_line->getState()->getLineX() + cur_line->getWidth(); - double x2 = cur_string->getState()->getLineX(); - double target = (x2-x1-cur_line_x_offset) * draw_scale; - - if(target > -param->h_eps) - { - if(target > param->h_eps) - { - double w; - auto wid = install_whitespace(target, w); - cur_line_x_offset = w-target; - html_fout << boost::format(" ") % wid; - } - else - { - cur_line_x_offset = -target; - } - - outputTextString(cur_string); - - delete cur_line; - cur_line = cur_string; - cur_string = nullptr; - return; - } - } - } - - close_cur_line(); - - GfxState * cur_state = cur_string -> getState(); - - // open a new line - // classes - html_fout << "
getY() + cur_state->getFont()->getDescent() * draw_font_size) << "px;" - << "top:" << (pageHeight - cur_string->getY() - cur_state->getFont()->getAscent() * draw_font_size) << "px;" - << "left:" << cur_string->getX() << "px;" - ; - - // letter & word spacing - if(_is_positive(cur_state->getCharSpace())) - html_fout << "letter-spacing:" << cur_state->getCharSpace() << "px;"; - if(_is_positive(cur_state->getWordSpace())) - html_fout << "word-spacing:" << cur_state->getWordSpace() << "px;"; - - //debug - //real pos & hori_scale - { - html_fout << "\""; - double x,y; - cur_state->transform(cur_state->getCurX(), cur_state->getCurY(), &x, &y); - html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-scale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%") - %x%y%(cur_state->getHorizScaling())%draw_scale%cur_state->getLineX()%cur_state->getLineY(); - } - - html_fout << "\">"; - - outputTextString(cur_string); - - cur_line = cur_string; - cur_string = nullptr; - cur_line_x_offset = 0; -} - -void HTMLRenderer::drawChar(GfxState *state, double x, double y, - double dx, double dy, - double originX, double originY, - CharCode code, int nBytes, Unicode *u, int uLen) -{ - // if it is hidden, then return - if ((state->getRender() & 3) == 3) - return ; - - if(uLen > 0) - cur_string->addUnicodes(state, x, y, dx, dy, u, uLen); - else - { - cur_string->addChars(state, x, y, dx, dy, code, nBytes); - } -} - -// TODO void HTMLRenderer::drawString(GfxState * state, GooString * s) { - check_state_change(state); + if(s->getLength() == 0) + return; auto font = state->getFont(); - if(font->getWMode()) + if((font == nullptr) || (font->getWMode())) { //TODO return; } - // from poppler - double dx = 0; - double dy = 0; - double dx2, dy2; - double ox, oy; + // see if the line has to be closed due to state change + check_state_change(state); + + // if the line is still open, try to merge with it + if(line_opened) + { + double target = (state->getLineX() - cur_tx - cur_line_x_offset) * draw_scale; + if(target > -param->h_eps) + { + if(target > param->h_eps) + { + double w; + auto wid = install_whitespace(target, w); + cur_line_x_offset = w-target; + html_fout << boost::format(" ") % wid; + } + else + { + cur_line_x_offset = -target; + } + } + else + { + // can we shift left using simple tags? + close_cur_line(); + } + } + + if(!line_opened) + { + // have to open a new line + + // classes + html_fout << "
transform(state->getCurX(), state->getCurY(), &x, &y); + // TODO: recheck descent/ascent + html_fout << "\" style=\"" + << "bottom:" << (y + state->getFont()->getDescent() * draw_font_size) << "px;" + << "top:" << (pageHeight - y - state->getFont()->getAscent() * draw_font_size) << "px;" + << "left:" << x << "px;" + ; + } + + // TODO: tracking + // letter & word spacing + if(_is_positive(state->getCharSpace())) + html_fout << "letter-spacing:" << state->getCharSpace() << "px;"; + if(_is_positive(state->getWordSpace())) + html_fout << "word-spacing:" << state->getWordSpace() << "px;"; + + //debug + //real pos & hori_scale + { + html_fout << "\""; + double x,y; + state->transform(state->getCurX(), state->getCurY(), &x, &y); + html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-scale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%") + %x%y%(state->getHorizScaling())%draw_scale%state->getLineX()%state->getLineY(); + } + + html_fout << "\">"; + + line_opened = true; + } + + + // Now ready to output + // get the unicodes char *p = s->getCString(); int len = s->getLength(); + double dx,dy,dx1,dy1; + double ox, oy; + int nChars = 0; int nSpaces = 0; int uLen; CharCode code; Unicode *u = nullptr; - while (len > 0) { - auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx2, &dy2, &ox, &oy); - dx += dx2; - dy += dy2; + auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); + + if(!(_equal(ox, 0) && _equal(oy, 0))) + { + std::cerr << "TODO: non-zero orgins" << std::endl; + } + + outputUnicodes(u, uLen); + + dx += dx1; + dy += dy1; + if (n == 1 && *p == ' ') { ++nSpaces; } @@ -443,12 +310,11 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) len -= n; } - dx = dx * state->getFontSize() - + nChars * state->getCharSpace() - + nSpaces * state->getWordSpace(); - dx *= state->getHorizScaling(); - dy *= state->getFontSize(); - + cur_tx += (dx * state->getFontSize() + + nChars * state->getCharSpace() + + nSpaces * state->getWordSpace()) * state->getHorizScaling(); + + cur_ty += (dy * state->getFontSize()); } // The font installation code is stolen from PSOutputDev.cc in poppler @@ -923,35 +789,41 @@ void HTMLRenderer::export_color(long long color_id, const GfxRGB * rgb) void HTMLRenderer::check_state_change(GfxState * state) { - if(pos_changed) + bool close_line = false; + + if(all_changed || pos_changed) { - if(!_equal(state->getLineY(), cur_ty)) + double tx = state->getLineX(); + double ty = state->getLineY(); + // TODO: consider draw_scale, while it's actually draw_scale_x + if(!(std::abs(ty - cur_ty) < param->v_eps)) { - close_cur_line(); - cur_ty = state->getLineY(); + close_line = true; + cur_ty = ty; + cur_tx = tx; } } - if(color_changed) + if(all_changed || color_changed) { GfxRGB new_color; state->getFillRGB(&new_color); if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b))) { - close_cur_line(); + close_line = true; cur_color = new_color; cur_color_id = install_color(&new_color); } } bool need_rescale_font = false; - if(font_changed) + if(all_changed || font_changed) { long long new_fn_id = install_font(state->getFont()); if(!(new_fn_id == cur_fn_id)) { - close_cur_line(); + close_line = true; cur_fn_id = new_fn_id; } @@ -966,7 +838,7 @@ void HTMLRenderer::check_state_change(GfxState * state) // Rise, HorizScale etc double new_ctm[6]; memcpy(new_ctm, draw_ctm, sizeof(new_ctm)); - if(text_mat_changed || ctm_changed) + if(all_changed || text_mat_changed || ctm_changed) { double * m1 = state->getCTM(); double * m2 = state->getTextMat(); @@ -976,9 +848,7 @@ void HTMLRenderer::check_state_change(GfxState * state) new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3]; new_ctm[4] = new_ctm[5] = 0; - // TODO: this is not correct - // what to check? - if(!_tm_equal(new_ctm, draw_ctm, 4)) { } + if(!_tm_equal(new_ctm, cur_ctm, 4)) { } { need_rescale_font = true; } @@ -999,28 +869,36 @@ void HTMLRenderer::check_state_change(GfxState * state) draw_scale = 1.0; } - bool flag = false; if(!(_equal(new_draw_font_size, draw_font_size))) { draw_font_size = new_draw_font_size; cur_fs_id = install_font_size(draw_font_size); - flag = true; + close_line = true; } if(!(_tm_equal(new_ctm, draw_ctm))) { memcpy(draw_ctm, new_ctm, sizeof(draw_ctm)); cur_tm_id = install_transform_matrix(draw_ctm); - flag = true; + close_line = true; } - if(flag) - close_cur_line(); } + // TODO: track these + /* + if(!(_equal(s1->getCharSpace(), s2->getCharSpace()) && _equal(s1->getWordSpace(), s2->getWordSpace()) + && _equal(s1->getHorizScaling(), s2->getHorizScaling()))) + return false; + */ + reset_state_track(); + + if(close_line) + close_cur_line(); } void HTMLRenderer::reset_state_track() { + all_changed = false; pos_changed = false; ctm_changed = false; text_mat_changed = false; diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 523d24e..3142fd5 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -43,39 +43,6 @@ inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6) return true; } -class TextString -{ - public: - TextString(GfxState *state); - ~TextString(); - - void addChars(GfxState * state, double x, double y, - double dx, double dy, - CharCode code, int nbytes); - void addUnicodes(GfxState *state, double x, double y, - double dx, double dy, - Unicode *u, int uLen); - double getX() const {return x;} - double getY() const {return y;} - double getWidth() const {return width;} - double getHeight() const {return height;} - - GfxState * getState() const { return state; } - - const std::vector & getUnicodes() const { return unicodes; } - size_t getSize() const { return unicodes.size(); } - - - private: - std::vector unicodes; - double x, y; - double width, height; - - // TODO: - // remove this, track state change in the converter - GfxState * state; -}; - class HTMLRenderer : public OutputDev { public: @@ -91,7 +58,7 @@ class HTMLRenderer : public OutputDev virtual GBool upsideDown() { return gFalse; } // Does this device use drawChar() or drawString()? - virtual GBool useDrawChar() { return gTrue; } + virtual GBool useDrawChar() { return gFalse; } // Does this device use beginType3Char/endType3Char? Otherwise, // text in Type 3 fonts will be drawn with drawChar/drawString. @@ -121,29 +88,24 @@ class HTMLRenderer : public OutputDev virtual void endPage(); //----- update state - virtual void updateAll(GfxState * state); - virtual void updateFont(GfxState * state); - virtual void updateTextMat(GfxState * state); - virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); - virtual void updateTextPos(GfxState * state); - - virtual void updateFillColor(GfxState * state); + /* + * To optmize false alarms + * We just mark as changed, and recheck if they have been changed when we are about to output a new string + */ + virtual void updateAll(GfxState * state) { all_changed = true; } + virtual void updateFont(GfxState * state) { font_changed = true; } + virtual void updateTextMat(GfxState * state) { text_mat_changed = true; } + virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) { ctm_changed = true; } + virtual void updateTextPos(GfxState * state) { pos_changed = true; } + virtual void updateFillColor(GfxState * state) { color_changed = true; } //----- text drawing - virtual void beginString(GfxState *state, GooString *s); - virtual void endString(GfxState *state); - virtual void drawChar(GfxState *state, double x, double y, - double dx, double dy, - double originX, double originY, - CharCode code, int nBytes, Unicode *u, int uLen); - virtual void drawString(GfxState * state, GooString * s); private: - bool at_same_line(const TextString * ts1, const TextString * ts2) const; - void close_cur_line(); - void outputTextString(TextString * str); + + void outputUnicodes(const Unicode * u, int uLen); // return the mapped font name long long install_font(GfxFont * font); @@ -184,17 +146,15 @@ class HTMLRenderer : public OutputDev double pageHeight ; - // state maintained when processing pdf + // state tracking when processing pdf void check_state_change(GfxState * state); void reset_state_track(); + bool all_changed; - // the string being processed - TextString * cur_string; - // the last word of current line - // if it's not nullptr, there's an open
waiting for new strings in the same line - TextString * cur_line; + // if we have a pending opened line + bool line_opened; // (actual x) - (supposed x) double cur_line_x_offset; @@ -210,6 +170,10 @@ class HTMLRenderer : public OutputDev long long cur_tm_id; bool ctm_changed; bool text_mat_changed; + + // this is CTM * TextMAT in PDF, not only CTM + // [4] and [5] are ignored, we'll calculate the position of the origin separately + double cur_ctm[6]; // unscaled long long cur_color_id; GfxRGB cur_color; @@ -218,9 +182,12 @@ class HTMLRenderer : public OutputDev // optmize for web // we try to render the final font size directly // to reduce the effect of ctm as much as possible + + // draw_ctm is cur_ctem scaled by 1/draw_scale, + // so everything redenered should be scaled by draw_scale double draw_ctm[6]; double draw_font_size; - double draw_scale; + double draw_scale; ofstream html_fout, allcss_fout; From 9d7046e925082f449a75f7c3ae82da5e3ef814ee Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 7 Aug 2012 16:44:49 +0800 Subject: [PATCH 2/4] fix horizontal space padding --- src/BackgroundRenderer.cc | 2 +- src/HTMLRenderer.cc | 42 +++++++++++++++++++++------------------ src/HTMLRenderer.h | 8 ++++---- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/BackgroundRenderer.cc b/src/BackgroundRenderer.cc index 5941eee..2913947 100644 --- a/src/BackgroundRenderer.cc +++ b/src/BackgroundRenderer.cc @@ -14,7 +14,7 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y, CharCode code, int nBytes, Unicode *u, int uLen) { auto font = state->getFont(); -// if((font->getType() == fontType3) || (font->getWMode())) + if((font->getType() == fontType3) || (font->getWMode())) { SplashOutputDev::drawChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); } diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index e2ba118..76047a5 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -126,8 +126,8 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) if(param->readable) html_fout << endl; cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0; - cur_line_x_offset = 0; cur_tx = cur_ty = 0; + cur_line_x_offset = 0; cur_font_size = 0; memcpy(cur_ctm, id_matrix, sizeof(cur_ctm)); @@ -154,7 +154,6 @@ void HTMLRenderer::close_cur_line() html_fout << "
"; if(param->readable) html_fout << endl; - cur_line_x_offset = 0; line_opened = false; } } @@ -198,7 +197,6 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) auto font = state->getFont(); if((font == nullptr) || (font->getWMode())) { - //TODO return; } @@ -208,26 +206,24 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) // if the line is still open, try to merge with it if(line_opened) { - double target = (state->getLineX() - cur_tx - cur_line_x_offset) * draw_scale; - + double target = -cur_line_x_offset * draw_scale; if(target > -param->h_eps) { if(target > param->h_eps) { double w; auto wid = install_whitespace(target, w); - cur_line_x_offset = w-target; + cur_tx += w / draw_scale; + cur_line_x_offset = (w - target) / draw_scale; html_fout << boost::format(" ") % wid; } - else - { - cur_line_x_offset = -target; - } } else { // can we shift left using simple tags? close_cur_line(); + cur_tx = state->getLineX(); + cur_line_x_offset = 0; } } @@ -791,7 +787,7 @@ void HTMLRenderer::check_state_change(GfxState * state) { bool close_line = false; - if(all_changed || pos_changed) + if(all_changed || line_pos_changed) { double tx = state->getLineX(); double ty = state->getLineY(); @@ -801,9 +797,16 @@ void HTMLRenderer::check_state_change(GfxState * state) close_line = true; cur_ty = ty; cur_tx = tx; + cur_line_x_offset = 0; + } + else + { + // LineY remains unchanged + cur_line_x_offset += cur_tx - tx; } } + // TODO, we may use nested span if only color has been changed if(all_changed || color_changed) { GfxRGB new_color; @@ -836,10 +839,9 @@ void HTMLRenderer::check_state_change(GfxState * state) // TODO // Rise, HorizScale etc - double new_ctm[6]; - memcpy(new_ctm, draw_ctm, sizeof(new_ctm)); if(all_changed || text_mat_changed || ctm_changed) { + double new_ctm[6]; double * m1 = state->getCTM(); double * m2 = state->getTextMat(); new_ctm[0] = m1[0] * m2[0] + m1[2] * m2[1]; @@ -848,21 +850,23 @@ void HTMLRenderer::check_state_change(GfxState * state) new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3]; new_ctm[4] = new_ctm[5] = 0; - if(!_tm_equal(new_ctm, cur_ctm, 4)) { } + if(!_tm_equal(new_ctm, cur_ctm)) { need_rescale_font = true; + memcpy(cur_ctm, new_ctm, sizeof(cur_ctm)); } } if(need_rescale_font) { - draw_scale = std::sqrt(new_ctm[2] * new_ctm[2] + new_ctm[3] * new_ctm[3]); + draw_scale = std::sqrt(cur_ctm[2] * cur_ctm[2] + cur_ctm[3] * cur_ctm[3]); double new_draw_font_size = cur_font_size; + if(_is_positive(draw_scale)) { new_draw_font_size *= draw_scale; for(int i = 0; i < 4; ++i) - new_ctm[i] /= draw_scale; + cur_ctm[i] /= draw_scale; } else { @@ -875,9 +879,9 @@ void HTMLRenderer::check_state_change(GfxState * state) cur_fs_id = install_font_size(draw_font_size); close_line = true; } - if(!(_tm_equal(new_ctm, draw_ctm))) + if(!(_tm_equal(cur_ctm, draw_ctm))) { - memcpy(draw_ctm, new_ctm, sizeof(draw_ctm)); + memcpy(draw_ctm, cur_ctm, sizeof(draw_ctm)); cur_tm_id = install_transform_matrix(draw_ctm); close_line = true; } @@ -899,7 +903,7 @@ void HTMLRenderer::check_state_change(GfxState * state) void HTMLRenderer::reset_state_track() { all_changed = false; - pos_changed = false; + line_pos_changed = false; ctm_changed = false; text_mat_changed = false; font_changed = false; diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 3142fd5..637e77d 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -96,7 +96,8 @@ class HTMLRenderer : public OutputDev virtual void updateFont(GfxState * state) { font_changed = true; } virtual void updateTextMat(GfxState * state) { text_mat_changed = true; } virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) { ctm_changed = true; } - virtual void updateTextPos(GfxState * state) { pos_changed = true; } + virtual void updateTextPos(GfxState * state) { line_pos_changed = true; } + virtual void updateTextShift(GfxState * state, double shift) { cur_line_x_offset += shift * 0.001 * state->getFontSize() * state->getHorizScaling(); } virtual void updateFillColor(GfxState * state) { color_changed = true; } //----- text drawing @@ -155,12 +156,11 @@ class HTMLRenderer : public OutputDev // if we have a pending opened line bool line_opened; - // (actual x) - (supposed x) - double cur_line_x_offset; // current position double cur_tx, cur_ty; // in text coords - bool pos_changed; + double cur_line_x_offset; // in text coords, our position - real position + bool line_pos_changed; long long cur_fn_id; double cur_font_size; From 272cf9fa025bd73fcc527d057aac809f2b8c747c Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 7 Aug 2012 17:59:24 +0800 Subject: [PATCH 3/4] working on x offset --- bin/pdf2htmlEX | 2 -- src/BackgroundRenderer.cc | 2 +- src/HTMLRenderer.cc | 16 +++++++++++----- src/HTMLRenderer.h | 6 +++++- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/bin/pdf2htmlEX b/bin/pdf2htmlEX index 96ba05e..eec482c 100755 --- a/bin/pdf2htmlEX +++ b/bin/pdf2htmlEX @@ -12,7 +12,6 @@ echo -n "Converting fonts: " for f in *.ttf; do if [ -f $f ]; then - cp $f $f.old fontforge -script "${SCRIPT_DIR}/convert.pe" $f 2>/dev/null echo -n "." fi @@ -20,7 +19,6 @@ done for f in *.pfa; do if [ -f $f ]; then - cp $f $f.old fontforge -script "${SCRIPT_DIR}/convert.pe" $f 2>/dev/null rm $f echo -n "." diff --git a/src/BackgroundRenderer.cc b/src/BackgroundRenderer.cc index 2913947..1044a63 100644 --- a/src/BackgroundRenderer.cc +++ b/src/BackgroundRenderer.cc @@ -14,7 +14,7 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y, CharCode code, int nBytes, Unicode *u, int uLen) { auto font = state->getFont(); - if((font->getType() == fontType3) || (font->getWMode())) + if((font->getType() == fontType3) || (font->getWMode()) || (uLen == 0)) { SplashOutputDev::drawChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); } diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index 76047a5..80f4a60 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -206,6 +206,9 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) // if the line is still open, try to merge with it if(line_opened) { + //debug + html_fout << ""; + double target = -cur_line_x_offset * draw_scale; if(target > -param->h_eps) { @@ -213,17 +216,17 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { double w; auto wid = install_whitespace(target, w); - cur_tx += w / draw_scale; cur_line_x_offset = (w - target) / draw_scale; html_fout << boost::format(" ") % wid; } } else { + //debug + html_fout << ""; + // can we shift left using simple tags? close_cur_line(); - cur_tx = state->getLineX(); - cur_line_x_offset = 0; } } @@ -270,6 +273,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) html_fout << "\">"; line_opened = true; + + cur_line_x_offset = 0; } @@ -691,7 +696,7 @@ void HTMLRenderer::export_remote_font(long long fn_id, const string & suffix, Gf // TODO: this function is called when some font is unable to process, may use the name there as a hint void HTMLRenderer::export_remote_default_font(long long fn_id) { - allcss_fout << boost::format(".f%|1$x|{font-family:sans-serif;color:transparent;}")%fn_id; + allcss_fout << boost::format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id; if(param->readable) allcss_fout << endl; } @@ -802,7 +807,8 @@ void HTMLRenderer::check_state_change(GfxState * state) else { // LineY remains unchanged - cur_line_x_offset += cur_tx - tx; + cur_line_x_offset = cur_tx - tx; + cur_tx = tx; } } diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 637e77d..a154979 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -97,7 +97,11 @@ class HTMLRenderer : public OutputDev virtual void updateTextMat(GfxState * state) { text_mat_changed = true; } virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) { ctm_changed = true; } virtual void updateTextPos(GfxState * state) { line_pos_changed = true; } - virtual void updateTextShift(GfxState * state, double shift) { cur_line_x_offset += shift * 0.001 * state->getFontSize() * state->getHorizScaling(); } + virtual void updateTextShift(GfxState * state, double shift) { + double off = shift * 0.001 * state->getFontSize() * state->getHorizScaling(); + cur_line_x_offset += off; + cur_tx -= off; + } virtual void updateFillColor(GfxState * state) { color_changed = true; } //----- text drawing From 37231c95506dcc3200eed85b9cc9961b228912ae Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 7 Aug 2012 19:39:47 +0800 Subject: [PATCH 4/4] text position bug fixed --- src/HTMLRenderer.cc | 102 ++++++++++++++++++++++++++++++-------------- src/HTMLRenderer.h | 30 ++++++------- 2 files changed, 83 insertions(+), 49 deletions(-) diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index 80f4a60..3915ce6 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -127,13 +127,13 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0; cur_tx = cur_ty = 0; - cur_line_x_offset = 0; cur_font_size = 0; memcpy(cur_ctm, id_matrix, sizeof(cur_ctm)); memcpy(draw_ctm, id_matrix, sizeof(draw_ctm)); draw_font_size = 0; draw_scale = 1.0; + draw_tx = draw_ty = 0; cur_color.r = cur_color.g = cur_color.b = 0; @@ -189,6 +189,45 @@ void HTMLRenderer::outputUnicodes(const Unicode * u, int uLen) } } +void HTMLRenderer::updateAll(GfxState * state) +{ + all_changed = true; + updateTextPos(state); +} + +void HTMLRenderer::updateFont(GfxState * state) +{ + font_changed = true; +} + +void HTMLRenderer::updateTextMat(GfxState * state) +{ + text_mat_changed = true; +} + +void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) +{ + ctm_changed = true; +} + +void HTMLRenderer::updateTextPos(GfxState * state) +{ + text_pos_changed = true; + cur_tx = state->getLineX(); + cur_ty = state->getLineY(); +} + +void HTMLRenderer::updateTextShift(GfxState * state, double shift) +{ + text_pos_changed = true; + cur_tx -= shift * 0.001 * state->getFontSize() * state->getHorizScaling(); +} + +void HTMLRenderer::updateFillColor(GfxState * state) +{ + color_changed = true; +} + void HTMLRenderer::drawString(GfxState * state, GooString * s) { if(s->getLength() == 0) @@ -206,26 +245,20 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) // if the line is still open, try to merge with it if(line_opened) { - //debug - html_fout << ""; - - double target = -cur_line_x_offset * draw_scale; + double target = (cur_tx - draw_tx) * draw_scale; if(target > -param->h_eps) { if(target > param->h_eps) { double w; auto wid = install_whitespace(target, w); - cur_line_x_offset = (w - target) / draw_scale; html_fout << boost::format(" ") % wid; + draw_tx += w / draw_scale; } } else { - //debug - html_fout << ""; - - // can we shift left using simple tags? + // or can we shift left using simple tags? close_cur_line(); } } @@ -266,7 +299,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) html_fout << "\""; double x,y; state->transform(state->getCurX(), state->getCurY(), &x, &y); - html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-scale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%") + html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-drawscale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%") %x%y%(state->getHorizScaling())%draw_scale%state->getLineX()%state->getLineY(); } @@ -274,7 +307,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) line_opened = true; - cur_line_x_offset = 0; + draw_tx = cur_tx; } @@ -282,20 +315,25 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) // get the unicodes char *p = s->getCString(); int len = s->getLength(); - double dx,dy,dx1,dy1; + + double dx = 0; + double dy = 0; + double dx1,dy1; double ox, oy; int nChars = 0; int nSpaces = 0; int uLen; + CharCode code; Unicode *u = nullptr; + while (len > 0) { auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); if(!(_equal(ox, 0) && _equal(oy, 0))) { - std::cerr << "TODO: non-zero orgins" << std::endl; + std::cerr << "TODO: non-zero origins" << std::endl; } outputUnicodes(u, uLen); @@ -303,19 +341,27 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) dx += dx1; dy += dy1; - if (n == 1 && *p == ' ') { + if (n == 1 && *p == ' ') + { ++nSpaces; } + ++nChars; p += n; len -= n; } - cur_tx += (dx * state->getFontSize() + dx = (dx * state->getFontSize() + nChars * state->getCharSpace() + nSpaces * state->getWordSpace()) * state->getHorizScaling(); - cur_ty += (dy * state->getFontSize()); + dy *= state->getFontSize(); + + cur_tx += dx; + cur_ty += dy; + + draw_tx += dx; + draw_ty += dy; } // The font installation code is stolen from PSOutputDev.cc in poppler @@ -792,23 +838,13 @@ void HTMLRenderer::check_state_change(GfxState * state) { bool close_line = false; - if(all_changed || line_pos_changed) + if(all_changed || text_pos_changed) { - double tx = state->getLineX(); - double ty = state->getLineY(); - // TODO: consider draw_scale, while it's actually draw_scale_x - if(!(std::abs(ty - cur_ty) < param->v_eps)) + if(!(std::abs(cur_ty - draw_ty) * draw_scale < param->v_eps)) { close_line = true; - cur_ty = ty; - cur_tx = tx; - cur_line_x_offset = 0; - } - else - { - // LineY remains unchanged - cur_line_x_offset = cur_tx - tx; - cur_tx = tx; + draw_ty = cur_ty; + draw_tx = cur_tx; } } @@ -866,8 +902,8 @@ void HTMLRenderer::check_state_change(GfxState * state) if(need_rescale_font) { draw_scale = std::sqrt(cur_ctm[2] * cur_ctm[2] + cur_ctm[3] * cur_ctm[3]); + double new_draw_font_size = cur_font_size; - if(_is_positive(draw_scale)) { new_draw_font_size *= draw_scale; @@ -909,7 +945,7 @@ void HTMLRenderer::check_state_change(GfxState * state) void HTMLRenderer::reset_state_track() { all_changed = false; - line_pos_changed = false; + text_pos_changed = false; ctm_changed = false; text_mat_changed = false; font_changed = false; diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index a154979..7574be0 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -92,17 +92,13 @@ class HTMLRenderer : public OutputDev * To optmize false alarms * We just mark as changed, and recheck if they have been changed when we are about to output a new string */ - virtual void updateAll(GfxState * state) { all_changed = true; } - virtual void updateFont(GfxState * state) { font_changed = true; } - virtual void updateTextMat(GfxState * state) { text_mat_changed = true; } - virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) { ctm_changed = true; } - virtual void updateTextPos(GfxState * state) { line_pos_changed = true; } - virtual void updateTextShift(GfxState * state, double shift) { - double off = shift * 0.001 * state->getFontSize() * state->getHorizScaling(); - cur_line_x_offset += off; - cur_tx -= off; - } - virtual void updateFillColor(GfxState * state) { color_changed = true; } + virtual void updateAll(GfxState * state); + virtual void updateFont(GfxState * state); + virtual void updateTextMat(GfxState * state); + virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); + virtual void updateTextPos(GfxState * state); + virtual void updateTextShift(GfxState * state, double shift); + virtual void updateFillColor(GfxState * state); //----- text drawing virtual void drawString(GfxState * state, GooString * s); @@ -162,9 +158,8 @@ class HTMLRenderer : public OutputDev bool line_opened; // current position - double cur_tx, cur_ty; // in text coords - double cur_line_x_offset; // in text coords, our position - real position - bool line_pos_changed; + double cur_tx, cur_ty; // real text position, in text coords + bool text_pos_changed; long long cur_fn_id; double cur_font_size; @@ -187,12 +182,15 @@ class HTMLRenderer : public OutputDev // we try to render the final font size directly // to reduce the effect of ctm as much as possible - // draw_ctm is cur_ctem scaled by 1/draw_scale, - // so everything redenered should be scaled by draw_scale + // draw_ctm is cur_ctm scaled by 1/draw_scale, + // so everything redenered should be multiplied by draw_scale double draw_ctm[6]; double draw_font_size; double draw_scale; + // the position of next char, in text coords + double draw_tx, draw_ty; + ofstream html_fout, allcss_fout; class FontInfo{