diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a64c7c..60e47f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") -add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h) +add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h src/Consts.h) target_link_libraries(pdftohtmlEX poppler boost_program_options) diff --git a/README.md b/README.md index f277104..5bdf438 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,9 @@ pdf2html**EX** Introduction ----------------------------- -Traditional pdf -> html conversion tools are more likely pdf -> text tools. +pdf2htmlEX renders PDF files in HTML, utilizing modern technologies of html/css, aims to provide an accuracy rendering, while keeping optimized for Web display. -For those who are not satisfied with them, this might be the right one for you. - -pdf2htmlEX utilizes latest technologies of html/css, aims to provide an accuracy rendering, while keeping optimized for Web display. - -pdf2htmlEX is optimized for recent versions of moderm web browsers such as Mozilla Firefox & Google Chrome. +pdf2htmlEX is optimized for recent versions of modern web browsers such as Mozilla Firefox & Google Chrome. Features ---------------------------- diff --git a/src/BackgroundRenderer.cc b/src/BackgroundRenderer.cc index 2913947..5941eee 100644 --- a/src/BackgroundRenderer.cc +++ b/src/BackgroundRenderer.cc @@ -14,7 +14,7 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y, CharCode code, int nBytes, Unicode *u, int uLen) { auto font = state->getFont(); - if((font->getType() == fontType3) || (font->getWMode())) +// if((font->getType() == fontType3) || (font->getWMode())) { SplashOutputDev::drawChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); } diff --git a/src/Consts.h b/src/Consts.h new file mode 100644 index 0000000..7c8b677 --- /dev/null +++ b/src/Consts.h @@ -0,0 +1,79 @@ +/* + * Constants + * + * by WangLu + * 2012.08.07 + */ + +#ifndef CONSTS_H__ +#define CONSTS_H__ +#include +#include + +const char * HTML_HEAD = "\n\ +\ +\ +\ +\ +
"; + +const char * HTML_TAIL = "
"; + +const std::map BASE_14_FONT_CSS_FONT_MAP({\ + { "Courier", "Courier,monospace" },\ + { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },\ + { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" },\ + { "Symbol", "Symbol,\"Standard Symbols L\"" },\ + { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },\ +}); + +const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + +#endif //CONSTS_H__ diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index 92bbe0d..fdef4ff 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -25,6 +25,7 @@ #include "HTMLRenderer.h" #include "BackgroundRenderer.h" +#include "Consts.h" /* * CSS classes @@ -41,74 +42,9 @@ * w - White space * t - Transform matrix * c - Color + * */ -const char * HTML_HEAD = "\n\ -\ -\ -\ -\ -
"; - -const char * HTML_TAIL = "
"; - -const std::map BASE_14_FONT_CSS_FONT_MAP({\ - { "Courier", "Courier,monospace" },\ - { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },\ - { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" },\ - { "Symbol", "Symbol" },\ - { "ZapfDingbats", "ZapfDingbats" },\ -}); - -const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; - TextString::TextString(GfxState *state) :unicodes() ,x(state->getCurX()) @@ -242,7 +178,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0; cur_line_x_offset = 0; - cur_line_y = 0; + cur_tx = cur_ty = 0; cur_font_size = 0; memcpy(draw_ctm, id_matrix, sizeof(draw_ctm)); @@ -370,6 +306,7 @@ void HTMLRenderer::beginString(GfxState *state, GooString *s) { void HTMLRenderer::endString(GfxState *state) { if (cur_string->getSize() == 0) { delete cur_string ; + cur_string = nullptr; return; } @@ -378,8 +315,9 @@ void HTMLRenderer::endString(GfxState *state) { { if(at_same_line(cur_line, cur_string)) { - double x1 = cur_line->getX() + cur_line->getWidth(); - double x2 = cur_string->getX(); + // TODO: this is not correct + double x1 = cur_line->getState()->getLineX() + cur_line->getWidth(); + double x2 = cur_string->getState()->getLineX(); double target = (x2-x1-cur_line_x_offset) * draw_scale; if(target > -param->h_eps) @@ -437,7 +375,8 @@ void HTMLRenderer::endString(GfxState *state) { html_fout << "\""; double x,y; cur_state->transform(cur_state->getCurX(), cur_state->getCurY(), &x, &y); - html_fout << boost::format(" data-x=\"%1%\" data-y=\"%2%\" hs=\"%3%")%x%y%(cur_state->getHorizScaling()); + html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-scale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%") + %x%y%(cur_state->getHorizScaling())%draw_scale%cur_state->getLineX()%cur_state->getLineY(); } html_fout << "\">"; @@ -462,17 +401,6 @@ void HTMLRenderer::drawChar(GfxState *state, double x, double y, cur_string->addUnicodes(state, x, y, dx, dy, u, uLen); else { - if(nBytes > 0) - { - std::cerr << "Cannot map to Unicode!" << std::endl; - std::cerr << cur_fn_id << std::endl; - std::cerr << "*"; - for(int i = 0; i < nBytes; ++i) - { - std::cerr << (int)(((char*)&code)[i]); - } - std::cerr << std::endl; - } cur_string->addChars(state, x, y, dx, dy, code, nBytes); } } @@ -480,11 +408,16 @@ void HTMLRenderer::drawChar(GfxState *state, double x, double y, // TODO void HTMLRenderer::drawString(GfxState * state, GooString * s) { + check_state_change(state); + auto font = state->getFont(); if(font->getWMode()) - std::cerr << "TODO: writing mode" << std::endl; + { + //TODO + return; + } - // stolen from poppler + // from poppler double dx = 0; double dy = 0; double dx2, dy2; @@ -992,12 +925,11 @@ void HTMLRenderer::check_state_change(GfxState * state) { if(pos_changed) { - if(!_equal(state->getLineY(), cur_line_y)) + if(!_equal(state->getLineY(), cur_ty)) { close_cur_line(); - cur_line_y = state->getLineY(); + cur_ty = state->getLineY(); } - } if(color_changed) @@ -1012,7 +944,7 @@ void HTMLRenderer::check_state_change(GfxState * state) } } - bool need_rescale_font = true; + bool need_rescale_font = false; if(font_changed) { long long new_fn_id = install_font(state->getFont()); @@ -1022,6 +954,7 @@ void HTMLRenderer::check_state_change(GfxState * state) close_cur_line(); cur_fn_id = new_fn_id; } + if(!_equal(cur_font_size, state->getFontSize())) { cur_font_size = state->getFontSize(); @@ -1043,7 +976,9 @@ void HTMLRenderer::check_state_change(GfxState * state) new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3]; new_ctm[4] = new_ctm[5] = 0; - if(!_tm_equal(new_ctm, draw_ctm, 4)) + // TODO: this is not correct + // what to check? + if(!_tm_equal(new_ctm, draw_ctm, 4)) { } { need_rescale_font = true; } diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index dd1fe54..523d24e 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -199,7 +199,7 @@ class HTMLRenderer : public OutputDev double cur_line_x_offset; // current position - double cur_line_y; + double cur_tx, cur_ty; // in text coords bool pos_changed; long long cur_fn_id;