diff --git a/bin/pdf2htmlEX b/bin/pdf2htmlEX
index 96ba05e..eec482c 100755
--- a/bin/pdf2htmlEX
+++ b/bin/pdf2htmlEX
@@ -12,7 +12,6 @@ echo -n "Converting fonts: "
for f in *.ttf; do
if [ -f $f ]; then
- cp $f $f.old
fontforge -script "${SCRIPT_DIR}/convert.pe" $f 2>/dev/null
echo -n "."
fi
@@ -20,7 +19,6 @@ done
for f in *.pfa; do
if [ -f $f ]; then
- cp $f $f.old
fontforge -script "${SCRIPT_DIR}/convert.pe" $f 2>/dev/null
rm $f
echo -n "."
diff --git a/src/BackgroundRenderer.cc b/src/BackgroundRenderer.cc
index 5941eee..1044a63 100644
--- a/src/BackgroundRenderer.cc
+++ b/src/BackgroundRenderer.cc
@@ -14,7 +14,7 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y,
CharCode code, int nBytes, Unicode *u, int uLen)
{
auto font = state->getFont();
-// if((font->getType() == fontType3) || (font->getWMode()))
+ if((font->getType() == fontType3) || (font->getWMode()) || (uLen == 0))
{
SplashOutputDev::drawChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen);
}
diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc
index fdef4ff..3915ce6 100644
--- a/src/HTMLRenderer.cc
+++ b/src/HTMLRenderer.cc
@@ -45,58 +45,8 @@
*
*/
-TextString::TextString(GfxState *state)
- :unicodes()
- ,x(state->getCurX())
- ,y(state->getCurY())
- ,width(0),height(0)
- ,state(state)
-{
- state->transform(x,y,&x,&y);
-}
-
-TextString::~TextString()
-{
- delete state;
- state = nullptr;
-}
-
-void TextString::addChars(GfxState *state, double x, double y,
- double dx, double dy, CharCode code, int nbytes)
-{
- if(nbytes > 0)
- {
- CharCode mask = (0xffLL) << (8*(nbytes-1));
- while(nbytes > 0)
- {
- unicodes.push_back((Unicode)((code & mask) >> (8 * (nbytes-1))));
- --nbytes;
- mask >>= 8;
- }
- }
-
- width += dx;
- height += dy;
-}
-
-void TextString::addUnicodes(GfxState *state, double x, double y,
- double dx, double dy, Unicode * u, int uLen)
-{
- /*
- if (0 < u && u != 9 && u < 32) // skip non-printable not-tab character
- return;
- */
-
- for(int i = 0; i < uLen; ++i)
- unicodes.push_back(u[i]);
-
- width += dx;
- height += dy;
-}
-
-
HTMLRenderer::HTMLRenderer(const Param * param)
- :cur_string(nullptr), cur_line(nullptr)
+ :line_opened(false)
,html_fout(param->output_filename.c_str(), ofstream::binary), allcss_fout("all.css")
,param(param)
{
@@ -166,8 +116,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
this->pageWidth = state->getPageWidth();
this->pageHeight = state->getPageHeight();
- assert(cur_line == nullptr);
- assert(cur_string == nullptr);
+ assert(!line_opened);
html_fout << boost::format("
readable) html_fout << endl;
cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0;
- cur_line_x_offset = 0;
cur_tx = cur_ty = 0;
cur_font_size = 0;
+ memcpy(cur_ctm, id_matrix, sizeof(cur_ctm));
memcpy(draw_ctm, id_matrix, sizeof(draw_ctm));
draw_font_size = 0;
draw_scale = 1.0;
+ draw_tx = draw_ty = 0;
cur_color.r = cur_color.g = cur_color.b = 0;
@@ -197,48 +147,28 @@ void HTMLRenderer::endPage() {
if(param->readable) html_fout << endl;
}
-bool HTMLRenderer::at_same_line(const TextString * ts1, const TextString * ts2) const
-{
- // TODO, this is not accurate, with transforms
- if(!(std::abs(ts1->getY() - ts2->getY()) < param->v_eps))
- return false;
-
- GfxState * s1 = ts1->getState();
- GfxState * s2 = ts2->getState();
-
- // TODO, track this instead of check here
- if(!(_equal(s1->getCharSpace(), s2->getCharSpace())
- && _equal(s1->getWordSpace(), s2->getWordSpace())
- && _equal(s1->getHorizScaling(), s2->getHorizScaling())))
- return false;
-
- return true;
-}
-
void HTMLRenderer::close_cur_line()
{
- if(cur_line != nullptr)
+ if(line_opened)
{
html_fout << "
";
if(param->readable) html_fout << endl;
- delete cur_line;
- cur_line = nullptr;
- cur_line_x_offset = 0;
+ line_opened = false;
}
}
-void HTMLRenderer::outputTextString(TextString * str)
+void HTMLRenderer::outputUnicodes(const Unicode * u, int uLen)
{
- for (auto u : str->getUnicodes())
+ for(int i = 0; i < uLen; ++i)
{
- switch(u)
+ switch(u[i])
{
case '&':
html_fout << "&";
break;
- case '\"':
- html_fout << """;
+ case '\"':
+ html_fout << """;
break;
case '\'':
html_fout << "'";
@@ -252,203 +182,186 @@ void HTMLRenderer::outputTextString(TextString * str)
default:
{
char buf[4];
- auto n = mapUTF8(u, buf, 4);
- if(n > 0)
- html_fout.write(buf, n);
+ auto n = mapUTF8(u[i], buf, 4);
+ html_fout.write(buf, n);
}
}
}
}
-void HTMLRenderer::updateAll(GfxState *state)
+void HTMLRenderer::updateAll(GfxState * state)
+{
+ all_changed = true;
+ updateTextPos(state);
+}
+
+void HTMLRenderer::updateFont(GfxState * state)
{
- font_changed = true;
- text_mat_changed = true;
- ctm_changed = true;
- pos_changed = true;
- color_changed = true;
+ font_changed = true;
}
-void HTMLRenderer::updateFont(GfxState *state)
+void HTMLRenderer::updateTextMat(GfxState * state)
{
- font_changed = true;
+ text_mat_changed = true;
}
-void HTMLRenderer::updateTextMat(GfxState * state)
+void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
{
- text_mat_changed = true;
+ ctm_changed = true;
}
-void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
+void HTMLRenderer::updateTextPos(GfxState * state)
{
- ctm_changed = true;
+ text_pos_changed = true;
+ cur_tx = state->getLineX();
+ cur_ty = state->getLineY();
}
-void HTMLRenderer::updateTextPos(GfxState * state)
+void HTMLRenderer::updateTextShift(GfxState * state, double shift)
{
- pos_changed = true;
+ text_pos_changed = true;
+ cur_tx -= shift * 0.001 * state->getFontSize() * state->getHorizScaling();
}
-void HTMLRenderer::updateFillColor(GfxState * state)
+void HTMLRenderer::updateFillColor(GfxState * state)
{
- color_changed = true;
+ color_changed = true;
}
-void HTMLRenderer::beginString(GfxState *state, GooString *s) {
- check_state_change(state);
-
- // TODO: remove this
- GfxState * new_state = state->copy(gTrue);
-
- cur_string = new TextString(new_state);
-}
-
-void HTMLRenderer::endString(GfxState *state) {
- if (cur_string->getSize() == 0) {
- delete cur_string ;
- cur_string = nullptr;
- return;
- }
-
- // try to merge with last line
- if(cur_line != nullptr)
- {
- if(at_same_line(cur_line, cur_string))
- {
- // TODO: this is not correct
- double x1 = cur_line->getState()->getLineX() + cur_line->getWidth();
- double x2 = cur_string->getState()->getLineX();
- double target = (x2-x1-cur_line_x_offset) * draw_scale;
-
- if(target > -param->h_eps)
- {
- if(target > param->h_eps)
- {
- double w;
- auto wid = install_whitespace(target, w);
- cur_line_x_offset = w-target;
- html_fout << boost::format(" ") % wid;
- }
- else
- {
- cur_line_x_offset = -target;
- }
-
- outputTextString(cur_string);
-
- delete cur_line;
- cur_line = cur_string;
- cur_string = nullptr;
- return;
- }
- }
- }
-
- close_cur_line();
-
- GfxState * cur_state = cur_string -> getState();
-
- // open a new line
- // classes
- html_fout << "getY() + cur_state->getFont()->getDescent() * draw_font_size) << "px;"
- << "top:" << (pageHeight - cur_string->getY() - cur_state->getFont()->getAscent() * draw_font_size) << "px;"
- << "left:" << cur_string->getX() << "px;"
- ;
-
- // letter & word spacing
- if(_is_positive(cur_state->getCharSpace()))
- html_fout << "letter-spacing:" << cur_state->getCharSpace() << "px;";
- if(_is_positive(cur_state->getWordSpace()))
- html_fout << "word-spacing:" << cur_state->getWordSpace() << "px;";
-
- //debug
- //real pos & hori_scale
- {
- html_fout << "\"";
- double x,y;
- cur_state->transform(cur_state->getCurX(), cur_state->getCurY(), &x, &y);
- html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-scale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%")
- %x%y%(cur_state->getHorizScaling())%draw_scale%cur_state->getLineX()%cur_state->getLineY();
- }
-
- html_fout << "\">";
-
- outputTextString(cur_string);
-
- cur_line = cur_string;
- cur_string = nullptr;
- cur_line_x_offset = 0;
-}
-
-void HTMLRenderer::drawChar(GfxState *state, double x, double y,
- double dx, double dy,
- double originX, double originY,
- CharCode code, int nBytes, Unicode *u, int uLen)
-{
- // if it is hidden, then return
- if ((state->getRender() & 3) == 3)
- return ;
-
- if(uLen > 0)
- cur_string->addUnicodes(state, x, y, dx, dy, u, uLen);
- else
- {
- cur_string->addChars(state, x, y, dx, dy, code, nBytes);
- }
-}
-
-// TODO
void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
- check_state_change(state);
+ if(s->getLength() == 0)
+ return;
auto font = state->getFont();
- if(font->getWMode())
+ if((font == nullptr) || (font->getWMode()))
{
- //TODO
return;
}
- // from poppler
- double dx = 0;
- double dy = 0;
- double dx2, dy2;
- double ox, oy;
+ // see if the line has to be closed due to state change
+ check_state_change(state);
+
+ // if the line is still open, try to merge with it
+ if(line_opened)
+ {
+ double target = (cur_tx - draw_tx) * draw_scale;
+ if(target > -param->h_eps)
+ {
+ if(target > param->h_eps)
+ {
+ double w;
+ auto wid = install_whitespace(target, w);
+ html_fout << boost::format("
") % wid;
+ draw_tx += w / draw_scale;
+ }
+ }
+ else
+ {
+ // or can we shift left using simple tags?
+ close_cur_line();
+ }
+ }
+ if(!line_opened)
+ {
+ // have to open a new line
+
+ // classes
+ html_fout << "
transform(state->getCurX(), state->getCurY(), &x, &y);
+ // TODO: recheck descent/ascent
+ html_fout << "\" style=\""
+ << "bottom:" << (y + state->getFont()->getDescent() * draw_font_size) << "px;"
+ << "top:" << (pageHeight - y - state->getFont()->getAscent() * draw_font_size) << "px;"
+ << "left:" << x << "px;"
+ ;
+ }
+
+ // TODO: tracking
+ // letter & word spacing
+ if(_is_positive(state->getCharSpace()))
+ html_fout << "letter-spacing:" << state->getCharSpace() << "px;";
+ if(_is_positive(state->getWordSpace()))
+ html_fout << "word-spacing:" << state->getWordSpace() << "px;";
+
+ //debug
+ //real pos & hori_scale
+ {
+ html_fout << "\"";
+ double x,y;
+ state->transform(state->getCurX(), state->getCurY(), &x, &y);
+ html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-drawscale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%")
+ %x%y%(state->getHorizScaling())%draw_scale%state->getLineX()%state->getLineY();
+ }
+
+ html_fout << "\">";
+
+ line_opened = true;
+
+ draw_tx = cur_tx;
+ }
+
+
+ // Now ready to output
+ // get the unicodes
char *p = s->getCString();
int len = s->getLength();
+
+ double dx = 0;
+ double dy = 0;
+ double dx1,dy1;
+ double ox, oy;
+
int nChars = 0;
int nSpaces = 0;
int uLen;
+
CharCode code;
Unicode *u = nullptr;
while (len > 0) {
- auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx2, &dy2, &ox, &oy);
- dx += dx2;
- dy += dy2;
- if (n == 1 && *p == ' ') {
+ auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy);
+
+ if(!(_equal(ox, 0) && _equal(oy, 0)))
+ {
+ std::cerr << "TODO: non-zero origins" << std::endl;
+ }
+
+ outputUnicodes(u, uLen);
+
+ dx += dx1;
+ dy += dy1;
+
+ if (n == 1 && *p == ' ')
+ {
++nSpaces;
}
+
++nChars;
p += n;
len -= n;
}
- dx = dx * state->getFontSize()
- + nChars * state->getCharSpace()
- + nSpaces * state->getWordSpace();
- dx *= state->getHorizScaling();
+ dx = (dx * state->getFontSize()
+ + nChars * state->getCharSpace()
+ + nSpaces * state->getWordSpace()) * state->getHorizScaling();
+
dy *= state->getFontSize();
+ cur_tx += dx;
+ cur_ty += dy;
+
+ draw_tx += dx;
+ draw_ty += dy;
}
// The font installation code is stolen from PSOutputDev.cc in poppler
@@ -829,7 +742,7 @@ void HTMLRenderer::export_remote_font(long long fn_id, const string & suffix, Gf
// TODO: this function is called when some font is unable to process, may use the name there as a hint
void HTMLRenderer::export_remote_default_font(long long fn_id)
{
- allcss_fout << boost::format(".f%|1$x|{font-family:sans-serif;color:transparent;}")%fn_id;
+ allcss_fout << boost::format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id;
if(param->readable) allcss_fout << endl;
}
@@ -923,35 +836,39 @@ void HTMLRenderer::export_color(long long color_id, const GfxRGB * rgb)
void HTMLRenderer::check_state_change(GfxState * state)
{
- if(pos_changed)
+ bool close_line = false;
+
+ if(all_changed || text_pos_changed)
{
- if(!_equal(state->getLineY(), cur_ty))
+ if(!(std::abs(cur_ty - draw_ty) * draw_scale < param->v_eps))
{
- close_cur_line();
- cur_ty = state->getLineY();
+ close_line = true;
+ draw_ty = cur_ty;
+ draw_tx = cur_tx;
}
}
- if(color_changed)
+ // TODO, we may use nested span if only color has been changed
+ if(all_changed || color_changed)
{
GfxRGB new_color;
state->getFillRGB(&new_color);
if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b)))
{
- close_cur_line();
+ close_line = true;
cur_color = new_color;
cur_color_id = install_color(&new_color);
}
}
bool need_rescale_font = false;
- if(font_changed)
+ if(all_changed || font_changed)
{
long long new_fn_id = install_font(state->getFont());
if(!(new_fn_id == cur_fn_id))
{
- close_cur_line();
+ close_line = true;
cur_fn_id = new_fn_id;
}
@@ -964,10 +881,9 @@ void HTMLRenderer::check_state_change(GfxState * state)
// TODO
// Rise, HorizScale etc
- double new_ctm[6];
- memcpy(new_ctm, draw_ctm, sizeof(new_ctm));
- if(text_mat_changed || ctm_changed)
+ if(all_changed || text_mat_changed || ctm_changed)
{
+ double new_ctm[6];
double * m1 = state->getCTM();
double * m2 = state->getTextMat();
new_ctm[0] = m1[0] * m2[0] + m1[2] * m2[1];
@@ -976,52 +892,60 @@ void HTMLRenderer::check_state_change(GfxState * state)
new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3];
new_ctm[4] = new_ctm[5] = 0;
- // TODO: this is not correct
- // what to check?
- if(!_tm_equal(new_ctm, draw_ctm, 4)) { }
+ if(!_tm_equal(new_ctm, cur_ctm))
{
need_rescale_font = true;
+ memcpy(cur_ctm, new_ctm, sizeof(cur_ctm));
}
}
if(need_rescale_font)
{
- draw_scale = std::sqrt(new_ctm[2] * new_ctm[2] + new_ctm[3] * new_ctm[3]);
+ draw_scale = std::sqrt(cur_ctm[2] * cur_ctm[2] + cur_ctm[3] * cur_ctm[3]);
+
double new_draw_font_size = cur_font_size;
if(_is_positive(draw_scale))
{
new_draw_font_size *= draw_scale;
for(int i = 0; i < 4; ++i)
- new_ctm[i] /= draw_scale;
+ cur_ctm[i] /= draw_scale;
}
else
{
draw_scale = 1.0;
}
- bool flag = false;
if(!(_equal(new_draw_font_size, draw_font_size)))
{
draw_font_size = new_draw_font_size;
cur_fs_id = install_font_size(draw_font_size);
- flag = true;
+ close_line = true;
}
- if(!(_tm_equal(new_ctm, draw_ctm)))
+ if(!(_tm_equal(cur_ctm, draw_ctm)))
{
- memcpy(draw_ctm, new_ctm, sizeof(draw_ctm));
+ memcpy(draw_ctm, cur_ctm, sizeof(draw_ctm));
cur_tm_id = install_transform_matrix(draw_ctm);
- flag = true;
+ close_line = true;
}
- if(flag)
- close_cur_line();
}
+ // TODO: track these
+ /*
+ if(!(_equal(s1->getCharSpace(), s2->getCharSpace()) && _equal(s1->getWordSpace(), s2->getWordSpace())
+ && _equal(s1->getHorizScaling(), s2->getHorizScaling())))
+ return false;
+ */
+
reset_state_track();
+
+ if(close_line)
+ close_cur_line();
}
void HTMLRenderer::reset_state_track()
{
- pos_changed = false;
+ all_changed = false;
+ text_pos_changed = false;
ctm_changed = false;
text_mat_changed = false;
font_changed = false;
diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h
index 523d24e..7574be0 100644
--- a/src/HTMLRenderer.h
+++ b/src/HTMLRenderer.h
@@ -43,39 +43,6 @@ inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6)
return true;
}
-class TextString
-{
- public:
- TextString(GfxState *state);
- ~TextString();
-
- void addChars(GfxState * state, double x, double y,
- double dx, double dy,
- CharCode code, int nbytes);
- void addUnicodes(GfxState *state, double x, double y,
- double dx, double dy,
- Unicode *u, int uLen);
- double getX() const {return x;}
- double getY() const {return y;}
- double getWidth() const {return width;}
- double getHeight() const {return height;}
-
- GfxState * getState() const { return state; }
-
- const std::vector
& getUnicodes() const { return unicodes; }
- size_t getSize() const { return unicodes.size(); }
-
-
- private:
- std::vector unicodes;
- double x, y;
- double width, height;
-
- // TODO:
- // remove this, track state change in the converter
- GfxState * state;
-};
-
class HTMLRenderer : public OutputDev
{
public:
@@ -91,7 +58,7 @@ class HTMLRenderer : public OutputDev
virtual GBool upsideDown() { return gFalse; }
// Does this device use drawChar() or drawString()?
- virtual GBool useDrawChar() { return gTrue; }
+ virtual GBool useDrawChar() { return gFalse; }
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
@@ -121,29 +88,25 @@ class HTMLRenderer : public OutputDev
virtual void endPage();
//----- update state
+ /*
+ * To optimize away false alarms,
+ * we just mark the state as changed, and recheck whether it has actually changed when we are about to output a new string
+ */
virtual void updateAll(GfxState * state);
virtual void updateFont(GfxState * state);
virtual void updateTextMat(GfxState * state);
virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
virtual void updateTextPos(GfxState * state);
-
+ virtual void updateTextShift(GfxState * state, double shift);
virtual void updateFillColor(GfxState * state);
//----- text drawing
- virtual void beginString(GfxState *state, GooString *s);
- virtual void endString(GfxState *state);
- virtual void drawChar(GfxState *state, double x, double y,
- double dx, double dy,
- double originX, double originY,
- CharCode code, int nBytes, Unicode *u, int uLen);
-
virtual void drawString(GfxState * state, GooString * s);
private:
- bool at_same_line(const TextString * ts1, const TextString * ts2) const;
-
void close_cur_line();
- void outputTextString(TextString * str);
+
+ void outputUnicodes(const Unicode * u, int uLen);
// return the mapped font name
long long install_font(GfxFont * font);
@@ -184,23 +147,19 @@ class HTMLRenderer : public OutputDev
double pageHeight ;
- // state maintained when processing pdf
+ // state tracking when processing pdf
void check_state_change(GfxState * state);
void reset_state_track();
+ bool all_changed;
- // the string being processed
- TextString * cur_string;
- // the last word of current line
- // if it's not nullptr, there's an open waiting for new strings in the same line
- TextString * cur_line;
- // (actual x) - (supposed x)
- double cur_line_x_offset;
+ // if we have a pending opened line
+ bool line_opened;
// current position
- double cur_tx, cur_ty; // in text coords
- bool pos_changed;
+ double cur_tx, cur_ty; // real text position, in text coords
+ bool text_pos_changed;
long long cur_fn_id;
double cur_font_size;
@@ -210,6 +169,10 @@ class HTMLRenderer : public OutputDev
long long cur_tm_id;
bool ctm_changed;
bool text_mat_changed;
+
+ // this is CTM * TextMAT in PDF, not only CTM
+ // [4] and [5] are ignored, we'll calculate the position of the origin separately
+ double cur_ctm[6]; // unscaled
long long cur_color_id;
GfxRGB cur_color;
@@ -218,9 +181,15 @@ class HTMLRenderer : public OutputDev
// optmize for web
// we try to render the final font size directly
// to reduce the effect of ctm as much as possible
+
+ // draw_ctm is cur_ctm scaled by 1/draw_scale,
+ // so everything rendered should be multiplied by draw_scale
double draw_ctm[6];
double draw_font_size;
- double draw_scale;
+ double draw_scale;
+
+ // the position of next char, in text coords
+ double draw_tx, draw_ty;
ofstream html_fout, allcss_fout;