From c7ddf60bacc996c9c1f7ed33a321de4038928e05 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Sun, 5 Aug 2012 19:39:37 +0800 Subject: [PATCH] Improve rendering accuracy --- bin/pfa2otf.pe | 2 +- src/HTMLRenderer.cc | 220 ++++++++++++++++++++++++++++---------------- src/HTMLRenderer.h | 39 +++++--- src/pdftohtmlEX.cc | 1 - 4 files changed, 169 insertions(+), 93 deletions(-) diff --git a/bin/pfa2otf.pe b/bin/pfa2otf.pe index 0c9bf9f..d75e874 100644 --- a/bin/pfa2otf.pe +++ b/bin/pfa2otf.pe @@ -1,2 +1,2 @@ Open($1); -Generate($1:r+".otf"); +Generate($1:r+".woff"); diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index 59602e1..3cabd94 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -37,6 +37,7 @@ * p - Page * t - Transform * l - Line + * w - White space * * * Reusable CSS classes @@ -61,14 +62,14 @@ const char * HTML_HEAD = "\n\ overflow:auto;\ background-color:grey;\ }\ -#pdf-main .p {\ +#pdf-main > .p {\ position:relative;\ margin:13px auto;\ background-color:white;\ overflow:hidden;\ display:none;\ }\ -.p .t {\ +.p > .t {\ position:absolute;\ top:0;\ left:0;\ @@ -80,7 +81,7 @@ const char * HTML_HEAD = "\n\ -webkit-transform-origin:0% 100%;\ -o-transform-origin:0% 100%;\ }\ -.t .l {\ +.t > .l {\ position:absolute; \ white-space:pre;\ transform-origin:0% 100%;\ @@ -89,7 +90,7 @@ const char * HTML_HEAD = "\n\ -webkit-transform-origin:0% 100%;\ -o-transform-origin:0% 100%;\ }\ -.l > span{\ +.l > .w{\ display:inline-block;\ }\ ::selection{\ @@ -131,7 +132,8 @@ TextString::TextString(GfxState *state) ,x(state->getCurX()), y(state->getCurY()) ,width(0),height(0) ,state(state) -{ } +{ +} TextString::~TextString() { @@ -161,8 +163,6 @@ void TextString::addChar(GfxState *state, double x, double y, HTMLRenderer::HTMLRenderer(const Param * param) :cur_string(nullptr), cur_line(nullptr) - ,cur_line_x_offset(0) - ,cur_fs_id(0), cur_fn_id(0) ,html_fout(param->output_filename.c_str(), ofstream::binary), allcss_fout("all.css") ,param(param) { @@ -172,10 +172,6 @@ 
HTMLRenderer::HTMLRenderer(const Param * param) html_fout << HTML_HEAD; if(param->readable) html_fout << endl; - - for(int i = 0; i < 6; ++i) - ctm[i] = text_mat[i] = 0.0; - ctm[0] = text_mat[0] = ctm[3] = text_mat[3] = 1.0; } HTMLRenderer::~HTMLRenderer() @@ -197,23 +193,35 @@ void HTMLRenderer::process(PDFDoc *doc) void HTMLRenderer::startPage(int pageNum, GfxState *state) { this->pageNum = pageNum; - this->pageWidth=static_cast(state->getPageWidth()); - this->pageHeight=static_cast(state->getPageHeight()); + this->pageWidth = state->getPageWidth(); + this->pageHeight = state->getPageHeight(); assert(cur_line == nullptr); + assert(cur_string == nullptr); html_fout << boost::format("
"; if(param->readable) html_fout << endl; + cur_x = cur_y = 0; + cur_fn_id = cur_fs_id = 0; + cur_line_x_offset = 0; + + for(int i = 0; i < 6; ++i) + cur_ctm[i] = cur_text_mat[i] = 0.0; + cur_ctm[0] = cur_text_mat[0] = cur_ctm[3] = cur_text_mat[3] = 1.0; + + pos_changed = false; + ctm_changed = false; + text_mat_changed = false; + font_changed = false; + // default CTM - html_fout << "
"; + html_fout << boost::format("
") % install_transform_matrix(cur_ctm); if(param->readable) html_fout << endl; } @@ -227,13 +235,6 @@ void HTMLRenderer::endPage() { if(param->readable) html_fout << endl; } -void HTMLRenderer::convert_transform_matrix(double * tm) -{ - tm[1] = -tm[1]; - tm[2] = -tm[2]; - tm[5] = -tm[5]; -} - bool HTMLRenderer::at_same_line(const TextString * ts1, const TextString * ts2) const { if(!(std::abs(ts1->getY() - ts2->getY()) < param->v_eps)) @@ -301,54 +302,47 @@ void HTMLRenderer::outputTextString(TextString * str) } } -void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) +void HTMLRenderer::updateAll(GfxState *state) { - double new_ctm[6]; - memcpy(new_ctm, state->getCTM(), sizeof(new_ctm)); - convert_transform_matrix(new_ctm); - - if(!_tm_equal(ctm, new_ctm)) - { - close_cur_line(); - memcpy(ctm, new_ctm, sizeof(ctm)); - - // close old CTM div and create a new one - html_fout << "
"; - if(param->readable) html_fout << endl; - html_fout << boost::format("
") % install_transform_matrix(ctm); - if(param->readable) html_fout << endl; - } + font_changed = true; + text_mat_changed = true; + ctm_changed = true; + pos_changed = true; } -void HTMLRenderer::updateFont(GfxState *state) { - long long new_fn_id = install_font(state->getFont()); - long long new_fs_id = install_font_size(state->getFontSize()); - if(!((new_fn_id == cur_fn_id) && (new_fs_id == cur_fs_id))) - { - close_cur_line(); - cur_fn_id = new_fn_id; - cur_fs_id = new_fs_id; - } +void HTMLRenderer::updateFont(GfxState *state) +{ + font_changed = true; } void HTMLRenderer::updateTextMat(GfxState * state) { - double new_text_mat[6]; - memcpy(new_text_mat, state->getTextMat(), sizeof(new_text_mat)); - convert_transform_matrix(new_text_mat); + text_mat_changed = true; +} - if(!_tm_equal(text_mat, new_text_mat)) - { - close_cur_line(); - memcpy(text_mat, new_text_mat, sizeof(text_mat)); +void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) +{ + ctm_changed = true; +} - //debug - //TODO: why - text_mat[4] = text_mat[5] = 0.0; - } +void HTMLRenderer::updateTextPos(GfxState * state) +{ + pos_changed = true; +} + +void HTMLRenderer::saveTextPos(GfxState * state) +{ + cout << "save" << endl; +} + +void HTMLRenderer::restoreTextPos(GfxState * state) +{ + cout << "restore" << endl; } void HTMLRenderer::beginString(GfxState *state, GooString *s) { + check_state_change(state); + // TODO: remove this GfxState * new_state = state->copy(gTrue); @@ -377,7 +371,7 @@ void HTMLRenderer::endString(GfxState *state) { double w; auto wid = install_whitespace(target, w); cur_line_x_offset = w-target; - html_fout << boost::format(" ") % wid; + html_fout << boost::format(" ") % wid; } else { @@ -396,21 +390,23 @@ void HTMLRenderer::endString(GfxState *state) { close_cur_line(); + GfxState * cur_state = cur_string -> getState(); + // TODO: optimize text matrix search/install - html_fout << boost::format("
getY()) << "px;" + html_fout << boost::format("
getY() << "px;" << "left:" << cur_string->getX() << "px;" -// << "height:" << cur_string->getHeight() << "px;" + << "line-height:" << (cur_state->getFont()->getAscent() * cur_state->getFontSize()) << "px;" ; // letter & word spacing - GfxState * cur_state = cur_string -> getState(); if(_is_positive(cur_state->getCharSpace())) html_fout << "letter-spacing:" << cur_state->getCharSpace() << "px;"; if(_is_positive(cur_state->getWordSpace())) html_fout << "word-spacing:" << cur_state->getWordSpace() << "px;"; - //debug + //debug + //real pos { html_fout << "\""; double x,y; @@ -425,10 +421,6 @@ void HTMLRenderer::endString(GfxState *state) { cur_line = cur_string; cur_string = nullptr; cur_line_x_offset = 0; - - // HERE - //debug -// close_cur_line(); } void HTMLRenderer::drawChar(GfxState *state, double x, double y, @@ -720,7 +712,7 @@ void HTMLRenderer::install_embedded_type1_font (Ref * id, long long fn_id) tmpf.write(CTM, strlen(CTM)); } - export_remote_font(fn_id, "otf"); + export_remote_font(fn_id, "woff"); err: str_obj.streamClose(); @@ -898,21 +890,91 @@ void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) void HTMLRenderer::export_transform_matrix (long long tm_id, double * tm) { - // TODO: recognize common matices allcss_fout << boost::format(".t%|1$x|{") % tm_id; - for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) + + // TODO: recognize common matrices + static const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + if(_tm_equal(tm, id_matrix)) { - allcss_fout << prefix << "transform:matrix("; - for(int i = 0; i < 4; ++i) - allcss_fout << tm[i] << ','; - if(prefix == "-moz-") - allcss_fout << boost::format("%1%px,%2%px);") % tm[4] % tm[5]; - else - allcss_fout << boost::format("%1%,%2%);") % tm[4] % tm[5]; + // no need to output anything + } + else + { + for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) + { + // PDF uses a different coordinate system from the Web + allcss_fout << prefix << 
"transform:matrix(" << tm[0] << ',' + << -tm[1] << ',' + << -tm[2] << ',' + << tm[3] << ','; + + if(prefix == "-moz-") + allcss_fout << boost::format("%1%px,%2%px);") % tm[4] % -tm[5]; + else + allcss_fout << boost::format("%1%,%2%);") % tm[4] % -tm[5]; + } } allcss_fout << "}"; if(param->readable) allcss_fout << endl; } +void HTMLRenderer::check_state_change(GfxState * state) +{ + if(pos_changed) + { + if(!(_equal(state->getCurX(), cur_x) && _equal(state->getCurY(), cur_y))) + { + close_cur_line(); + cur_x = state->getCurX(); + cur_y = state->getCurY(); + } + pos_changed = false; + } + + if(font_changed) + { + long long new_fn_id = install_font(state->getFont()); + long long new_fs_id = install_font_size(state->getFontSize()); + cur_font_size = state->getFontSize(); + if(!((new_fn_id == cur_fn_id) && (new_fs_id == cur_fs_id))) + { + close_cur_line(); + cur_fn_id = new_fn_id; + cur_fs_id = new_fs_id; + } + font_changed = false; + } + if(text_mat_changed) + { + if(!_tm_equal(cur_text_mat, state->getTextMat(), 4)) + { + close_cur_line(); + memcpy(cur_text_mat, state->getTextMat(), sizeof(cur_text_mat)); + + // we've already shifted the text to the correct position + // so later in CSS we need to ignore these offsets + cur_text_mat[4] = cur_text_mat[5] = 0.0; + } + text_mat_changed = false; + } + + if(ctm_changed) + { + if(!_tm_equal(cur_ctm, state->getCTM())) + { + close_cur_line(); + memcpy(cur_ctm, state->getCTM(), sizeof(cur_ctm)); + + // close old CTM div and create a new one + html_fout << "
"; + if(param->readable) html_fout << endl; + html_fout << boost::format("
") % install_transform_matrix(cur_ctm); + if(param->readable) html_fout << endl; + } + ctm_changed = false; + } +} + diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 18201c8..662bdc7 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -35,9 +35,9 @@ using namespace std; static const double EPS = 1e-6; inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; } inline bool _is_positive(double x) { return x > EPS; } -inline bool _tm_equal(const double * tm1, const double * tm2) +inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6) { - for(int i = 0; i < 6; ++i) + for(int i = 0; i < size; ++i) if(!_equal(tm1[i], tm2[i])) return false; return true; @@ -55,7 +55,6 @@ class TextString Unicode u); double getX() const {return x;} double getY() const {return y;} - double getWidth() const {return width;} double getHeight() const {return height;} @@ -100,7 +99,6 @@ class HTMLRenderer : public OutputDev virtual GBool needNonText() { return gFalse; } //----- initialization and control - virtual GBool checkPageSlice(Page *page, double hDPI, double vDPI, int rotate, GBool useMediaBox, GBool crop, int sliceX, int sliceY, int sliceW, int sliceH, @@ -121,9 +119,13 @@ class HTMLRenderer : public OutputDev virtual void endPage(); //----- update state - virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); + virtual void updateAll(GfxState * state); virtual void updateFont(GfxState * state); virtual void updateTextMat(GfxState * state); + virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); + virtual void updateTextPos(GfxState * state); + virtual void saveTextPos(GfxState * state); + virtual void restoreTextPos(GfxState * state); //----- text drawing virtual void beginString(GfxState *state, GooString *s); @@ -138,9 +140,6 @@ class HTMLRenderer : public OutputDev private: bool at_same_line(const TextString * ts1, 
const TextString * ts2) const; - // CSS use a different coordinate system from PDF - void convert_transform_matrix(double * tm); - void close_cur_line(); void outputTextString(TextString * str); @@ -176,11 +175,18 @@ class HTMLRenderer : public OutputDev // page info int pageNum ; - int pageWidth ; - int pageHeight ; + double pageWidth ; + double pageHeight ; // state maintained when processing pdf + + void check_state_change(GfxState * state); + + // current position + double cur_x, cur_y; + bool pos_changed; + // the string being processed TextString * cur_string; // the last word of current line @@ -188,8 +194,17 @@ class HTMLRenderer : public OutputDev TextString * cur_line; // (actual x) - (supposed x) double cur_line_x_offset; - double ctm[6], text_mat[6]; - long long cur_fs_id, cur_fn_id; + + double cur_ctm[6]; + bool ctm_changed; + + double cur_text_mat[6]; + bool text_mat_changed; + + long long cur_fn_id; + double cur_font_size; + long long cur_fs_id; + bool font_changed; ofstream html_fout, allcss_fout; diff --git a/src/pdftohtmlEX.cc b/src/pdftohtmlEX.cc index 74c7110..eee6899 100644 --- a/src/pdftohtmlEX.cc +++ b/src/pdftohtmlEX.cc @@ -124,7 +124,6 @@ po::variables_map parse_options (int argc, char **argv) ("hdpi", po::value(¶m.h_dpi)->default_value(72.0), "horizontal DPI") ("vdpi", po::value(¶m.v_dpi)->default_value(72.0), "vertical DPI") ("heps", po::value(¶m.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)") - ("heps", po::value(¶m.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)") ("veps", po::value(¶m.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)") ("readable", po::value(¶m.readable)->default_value(0), "make the ouptut human readable") ;