1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

Improve rendering accuracy

This commit is contained in:
Lu Wang 2012-08-05 19:39:37 +08:00
parent ab345ac753
commit c7ddf60bac
4 changed files with 169 additions and 93 deletions

View File

@ -1,2 +1,2 @@
Open($1);
Generate($1:r+".otf");
Generate($1:r+".woff");

View File

@ -37,6 +37,7 @@
* p - Page
* t - Transform
* l - Line
* w - White space
*
*
* Reusable CSS classes
@ -61,14 +62,14 @@ const char * HTML_HEAD = "<!DOCTYPE html>\n\
overflow:auto;\
background-color:grey;\
}\
#pdf-main .p {\
#pdf-main > .p {\
position:relative;\
margin:13px auto;\
background-color:white;\
overflow:hidden;\
display:none;\
}\
.p .t {\
.p > .t {\
position:absolute;\
top:0;\
left:0;\
@ -80,7 +81,7 @@ const char * HTML_HEAD = "<!DOCTYPE html>\n\
-webkit-transform-origin:0% 100%;\
-o-transform-origin:0% 100%;\
}\
.t .l {\
.t > .l {\
position:absolute; \
white-space:pre;\
transform-origin:0% 100%;\
@ -89,7 +90,7 @@ const char * HTML_HEAD = "<!DOCTYPE html>\n\
-webkit-transform-origin:0% 100%;\
-o-transform-origin:0% 100%;\
}\
.l > span{\
.l > .w{\
display:inline-block;\
}\
::selection{\
@ -131,7 +132,8 @@ TextString::TextString(GfxState *state)
,x(state->getCurX()), y(state->getCurY())
,width(0),height(0)
,state(state)
{ }
{
}
TextString::~TextString()
{
@ -161,8 +163,6 @@ void TextString::addChar(GfxState *state, double x, double y,
HTMLRenderer::HTMLRenderer(const Param * param)
:cur_string(nullptr), cur_line(nullptr)
,cur_line_x_offset(0)
,cur_fs_id(0), cur_fn_id(0)
,html_fout(param->output_filename.c_str(), ofstream::binary), allcss_fout("all.css")
,param(param)
{
@ -172,10 +172,6 @@ HTMLRenderer::HTMLRenderer(const Param * param)
html_fout << HTML_HEAD;
if(param->readable) html_fout << endl;
for(int i = 0; i < 6; ++i)
ctm[i] = text_mat[i] = 0.0;
ctm[0] = text_mat[0] = ctm[3] = text_mat[3] = 1.0;
}
HTMLRenderer::~HTMLRenderer()
@ -197,23 +193,35 @@ void HTMLRenderer::process(PDFDoc *doc)
// Begins output for one page: stores the page number and the page size taken
// from `state`, writes the opening page <div> to html_fout, resets the cached
// position / font ids / matrices and the *_changed flags, and opens the
// transform <div> that the page's content is written into.
// NOTE(review): some statements below appear twice in slightly different
// forms (page size with and without the int cast, two "t" <div> writes) —
// this looks like pre- and post-change lines of a diff shown together;
// confirm which variant is intended.
void HTMLRenderer::startPage(int pageNum, GfxState *state)
{
this->pageNum = pageNum;
this->pageWidth=static_cast<int>(state->getPageWidth());
this->pageHeight=static_cast<int>(state->getPageHeight());
this->pageWidth = state->getPageWidth();
this->pageHeight = state->getPageHeight();
// no line or string may be left open when a new page starts
assert(cur_line == nullptr);
assert(cur_string == nullptr);
// page container; %1% = width, %2% = height, %3% = page number
html_fout << boost::format("<div id=\"page-%3%\" class=\"p\" style=\"width:%1%px;height:%2%px;") % pageWidth % pageHeight % pageNum;
#if 0
// TODO:background
html_fout << boost::format("background-image:url(p%3%.png);background-position:0 0;background-size:%1%px %2%px;background-repeat:no-repeat;") % pageWidth % pageHeight % pageNum;
#endif
html_fout << "\">";
if(param->readable) html_fout << endl;
// reset the cached state for the new page
cur_x = cur_y = 0;
cur_fn_id = cur_fs_id = 0;
cur_line_x_offset = 0;
// both cached matrices start as {1, 0, 0, 1, 0, 0}
for(int i = 0; i < 6; ++i)
cur_ctm[i] = cur_text_mat[i] = 0.0;
cur_ctm[0] = cur_text_mat[0] = cur_ctm[3] = cur_text_mat[3] = 1.0;
pos_changed = false;
ctm_changed = false;
text_mat_changed = false;
font_changed = false;
// default CTM
html_fout << "<div class=\"t\">";
html_fout << boost::format("<div class=\"t t%|1$x|\">") % install_transform_matrix(cur_ctm);
if(param->readable) html_fout << endl;
}
@ -227,13 +235,6 @@ void HTMLRenderer::endPage() {
if(param->readable) html_fout << endl;
}
// Flips the sign of entries 1, 2 and 5 of `tm` in place; every other entry
// is left untouched. `tm` must therefore hold at least six elements.
void HTMLRenderer::convert_transform_matrix(double * tm)
{
    static const int negated[] = {1, 2, 5};
    for(int idx : negated)
        tm[idx] = -tm[idx];
}
bool HTMLRenderer::at_same_line(const TextString * ts1, const TextString * ts2) const
{
if(!(std::abs(ts1->getY() - ts2->getY()) < param->v_eps))
@ -301,54 +302,47 @@ void HTMLRenderer::outputTextString(TextString * str)
}
}
void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
void HTMLRenderer::updateAll(GfxState *state)
{
double new_ctm[6];
memcpy(new_ctm, state->getCTM(), sizeof(new_ctm));
convert_transform_matrix(new_ctm);
if(!_tm_equal(ctm, new_ctm))
{
close_cur_line();
memcpy(ctm, new_ctm, sizeof(ctm));
// close old CTM div and create a new one
html_fout << "</div>";
if(param->readable) html_fout << endl;
html_fout << boost::format("<div class=\"t t%|1$x|\">") % install_transform_matrix(ctm);
if(param->readable) html_fout << endl;
}
font_changed = true;
text_mat_changed = true;
ctm_changed = true;
pos_changed = true;
}
void HTMLRenderer::updateFont(GfxState *state) {
long long new_fn_id = install_font(state->getFont());
long long new_fs_id = install_font_size(state->getFontSize());
if(!((new_fn_id == cur_fn_id) && (new_fs_id == cur_fs_id)))
{
close_cur_line();
cur_fn_id = new_fn_id;
cur_fs_id = new_fs_id;
}
void HTMLRenderer::updateFont(GfxState *state)
{
font_changed = true;
}
void HTMLRenderer::updateTextMat(GfxState * state)
{
double new_text_mat[6];
memcpy(new_text_mat, state->getTextMat(), sizeof(new_text_mat));
convert_transform_matrix(new_text_mat);
text_mat_changed = true;
}
if(!_tm_equal(text_mat, new_text_mat))
{
close_cur_line();
memcpy(text_mat, new_text_mat, sizeof(text_mat));
// Notification that the CTM may have changed. Nothing is read from the
// arguments here; the call only raises ctm_changed so that the comparison
// against the cached matrix happens later in check_state_change().
void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
{
    this->ctm_changed = true;
}
//debug
//TODO: why
text_mat[4] = text_mat[5] = 0.0;
}
// Notification that the text position may have changed. `state` is not
// inspected here; the call only raises pos_changed so that the position is
// re-examined later in check_state_change().
void HTMLRenderer::updateTextPos(GfxState * state)
{
    this->pos_changed = true;
}
// Hook for a text-position save. Currently it only prints the trace line
// "save" to standard output; `state` is unused.
void HTMLRenderer::saveTextPos(GfxState * state)
{
    std::cout << "save" << std::endl;
}
// Hook for a text-position restore. Currently it only prints the trace line
// "restore" to standard output; `state` is unused.
void HTMLRenderer::restoreTextPos(GfxState * state)
{
    std::cout << "restore" << std::endl;
}
void HTMLRenderer::beginString(GfxState *state, GooString *s) {
check_state_change(state);
// TODO: remove this
GfxState * new_state = state->copy(gTrue);
@ -377,7 +371,7 @@ void HTMLRenderer::endString(GfxState *state) {
double w;
auto wid = install_whitespace(target, w);
cur_line_x_offset = w-target;
html_fout << boost::format("<span class=\"w%|1$x|\"> </span>") % wid;
html_fout << boost::format("<span class=\"w w%|1$x|\"> </span>") % wid;
}
else
{
@ -396,21 +390,23 @@ void HTMLRenderer::endString(GfxState *state) {
close_cur_line();
GfxState * cur_state = cur_string -> getState();
// TODO: optimize text matrix search/install
html_fout << boost::format("<div class=\"l f%|1$x| s%|2$x| t%|3$x|\" style=\"") % cur_fn_id % cur_fs_id % install_transform_matrix(text_mat)
<< "top:" << (pageHeight - cur_string->getY()) << "px;"
html_fout << boost::format("<div class=\"l f%|1$x| s%|2$x| t%|3$x|\" style=\"") % cur_fn_id % cur_fs_id % install_transform_matrix(cur_text_mat)
<< "bottom:" << cur_string->getY() << "px;"
<< "left:" << cur_string->getX() << "px;"
// << "height:" << cur_string->getHeight() << "px;"
<< "line-height:" << (cur_state->getFont()->getAscent() * cur_state->getFontSize()) << "px;"
;
// letter & word spacing
GfxState * cur_state = cur_string -> getState();
if(_is_positive(cur_state->getCharSpace()))
html_fout << "letter-spacing:" << cur_state->getCharSpace() << "px;";
if(_is_positive(cur_state->getWordSpace()))
html_fout << "word-spacing:" << cur_state->getWordSpace() << "px;";
//debug
//real pos
{
html_fout << "\"";
double x,y;
@ -425,10 +421,6 @@ void HTMLRenderer::endString(GfxState *state) {
cur_line = cur_string;
cur_string = nullptr;
cur_line_x_offset = 0;
// HERE
//debug
// close_cur_line();
}
void HTMLRenderer::drawChar(GfxState *state, double x, double y,
@ -720,7 +712,7 @@ void HTMLRenderer::install_embedded_type1_font (Ref * id, long long fn_id)
tmpf.write(CTM, strlen(CTM));
}
export_remote_font(fn_id, "otf");
export_remote_font(fn_id, "woff");
err:
str_obj.streamClose();
@ -898,21 +890,91 @@ void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
// Writes the CSS rule for transform class `.t<tm_id>` (id printed in hex) to
// allcss_fout. An identity matrix yields an empty rule body; any other
// matrix is written as `transform:matrix(...)` once per vendor prefix in the
// list below, with the "-moz-" form carrying `px` units on the last two
// values.
// NOTE(review): several statements below appear twice in slightly different
// forms (with and without sign flips on tm[1], tm[2], tm[5]) — this looks
// like pre- and post-change lines of a diff shown together; confirm which
// variant is intended.
void HTMLRenderer::export_transform_matrix (long long tm_id, double * tm)
{
// TODO: recognize common matrices
allcss_fout << boost::format(".t%|1$x|{") % tm_id;
// TODO: recognize common matrices
static const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
if(_tm_equal(tm, id_matrix))
{
// no need to output anything
}
else
{
for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"})
{
allcss_fout << prefix << "transform:matrix(";
for(int i = 0; i < 4; ++i)
allcss_fout << tm[i] << ',';
// PDF uses a different coordinate system from the Web
allcss_fout << prefix << "transform:matrix("
<< tm[0] << ','
<< -tm[1] << ','
<< -tm[2] << ','
<< tm[3] << ',';
// the "-moz-" variant is written with px units on the translation pair
if(prefix == "-moz-")
allcss_fout << boost::format("%1%px,%2%px);") % tm[4] % tm[5];
allcss_fout << boost::format("%1%px,%2%px);") % tm[4] % -tm[5];
else
allcss_fout << boost::format("%1%,%2%);") % tm[4] % tm[5];
allcss_fout << boost::format("%1%,%2%);") % tm[4] % -tm[5];
}
}
allcss_fout << "}";
if(param->readable) allcss_fout << endl;
}
// Reconciles the renderer's cached state with `state`.
//
// For each category whose *_changed flag is raised (position, font, text
// matrix, CTM) the value in `state` is compared with the cached copy. When
// it really differs, the current line is closed and the cache is refreshed;
// a CTM change additionally closes the open transform <div> and opens a new
// one. Each flag that was examined is cleared afterwards.
void HTMLRenderer::check_state_change(GfxState * state)
{
    // ---- text position ----
    if(pos_changed)
    {
        if(!_equal(state->getCurX(), cur_x) || !_equal(state->getCurY(), cur_y))
        {
            close_cur_line();
            cur_x = state->getCurX();
            cur_y = state->getCurY();
        }
        pos_changed = false;
    }

    // ---- font and font size ----
    if(font_changed)
    {
        const long long fn_id = install_font(state->getFont());
        const long long fs_id = install_font_size(state->getFontSize());
        cur_font_size = state->getFontSize();

        if((fn_id != cur_fn_id) || (fs_id != cur_fs_id))
        {
            close_cur_line();
            cur_fn_id = fn_id;
            cur_fs_id = fs_id;
        }
        font_changed = false;
    }

    // ---- text matrix (only the first four entries are compared) ----
    if(text_mat_changed)
    {
        const double * incoming_tm = state->getTextMat();
        if(!_tm_equal(cur_text_mat, incoming_tm, 4))
        {
            close_cur_line();
            memcpy(cur_text_mat, incoming_tm, sizeof(cur_text_mat));

            // the text has already been shifted to the correct position,
            // so the offsets must be ignored later in the CSS
            cur_text_mat[4] = 0.0;
            cur_text_mat[5] = 0.0;
        }
        text_mat_changed = false;
    }

    // ---- CTM ----
    if(ctm_changed)
    {
        const double * incoming_ctm = state->getCTM();
        if(!_tm_equal(cur_ctm, incoming_ctm))
        {
            close_cur_line();
            memcpy(cur_ctm, incoming_ctm, sizeof(cur_ctm));

            // end the <div> of the previous CTM and start one for the new CTM
            html_fout << "</div>";
            if(param->readable) html_fout << endl;
            html_fout << boost::format("<div class=\"t t%|1$x|\">") % install_transform_matrix(cur_ctm);
            if(param->readable) html_fout << endl;
        }
        ctm_changed = false;
    }
}

View File

@ -35,9 +35,9 @@ using namespace std;
// Absolute tolerance used by the floating-point helpers below.
static const double EPS = 1e-6;

// True when x and y differ by strictly less than EPS.
inline bool _equal(double x, double y)
{
    const double diff = std::abs(x - y);
    return diff < EPS;
}

// True when x is strictly greater than EPS.
inline bool _is_positive(double x)
{
    return EPS < x;
}
inline bool _tm_equal(const double * tm1, const double * tm2)
inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6)
{
for(int i = 0; i < 6; ++i)
for(int i = 0; i < size; ++i)
if(!_equal(tm1[i], tm2[i]))
return false;
return true;
@ -55,7 +55,6 @@ class TextString
Unicode u);
double getX() const {return x;}
double getY() const {return y;}
double getWidth() const {return width;}
double getHeight() const {return height;}
@ -100,7 +99,6 @@ class HTMLRenderer : public OutputDev
virtual GBool needNonText() { return gFalse; }
//----- initialization and control
virtual GBool checkPageSlice(Page *page, double hDPI, double vDPI,
int rotate, GBool useMediaBox, GBool crop,
int sliceX, int sliceY, int sliceW, int sliceH,
@ -121,9 +119,13 @@ class HTMLRenderer : public OutputDev
virtual void endPage();
//----- update state
virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
virtual void updateAll(GfxState * state);
virtual void updateFont(GfxState * state);
virtual void updateTextMat(GfxState * state);
virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
virtual void updateTextPos(GfxState * state);
virtual void saveTextPos(GfxState * state);
virtual void restoreTextPos(GfxState * state);
//----- text drawing
virtual void beginString(GfxState *state, GooString *s);
@ -138,9 +140,6 @@ class HTMLRenderer : public OutputDev
private:
bool at_same_line(const TextString * ts1, const TextString * ts2) const;
// CSS use a different coordinate system from PDF
void convert_transform_matrix(double * tm);
void close_cur_line();
void outputTextString(TextString * str);
@ -176,11 +175,18 @@ class HTMLRenderer : public OutputDev
// page info
int pageNum ;
int pageWidth ;
int pageHeight ;
double pageWidth ;
double pageHeight ;
// state maintained when processing pdf
void check_state_change(GfxState * state);
// current position
double cur_x, cur_y;
bool pos_changed;
// the string being processed
TextString * cur_string;
// the last word of current line
@ -188,8 +194,17 @@ class HTMLRenderer : public OutputDev
TextString * cur_line;
// (actual x) - (supposed x)
double cur_line_x_offset;
double ctm[6], text_mat[6];
long long cur_fs_id, cur_fn_id;
double cur_ctm[6];
bool ctm_changed;
double cur_text_mat[6];
bool text_mat_changed;
long long cur_fn_id;
double cur_font_size;
long long cur_fs_id;
bool font_changed;
ofstream html_fout, allcss_fout;

View File

@ -124,7 +124,6 @@ po::variables_map parse_options (int argc, char **argv)
("hdpi", po::value<double>(&param.h_dpi)->default_value(72.0), "horizontal DPI")
("vdpi", po::value<double>(&param.v_dpi)->default_value(72.0), "vertical DPI")
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")
("readable", po::value<int>(&param.readable)->default_value(0), "make the ouptut human readable")
;