From 917dbc6233d827d08e4e732979d684d1dd24032e Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 9 Aug 2012 22:47:22 +0800 Subject: [PATCH 1/2] dump tounicode map --- CMakeLists.txt | 4 +-- src/HTMLRenderer.cc | 64 +++++++++++++++++++++++++++++++-------------- src/Param.h | 2 ++ src/pdftohtmlEX.cc | 1 + 4 files changed, 50 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60e47f0..39700f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,8 +14,8 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wunused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") -#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h src/Consts.h) target_link_libraries(pdftohtmlEX poppler boost_program_options) diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index 3907ec4..45094c5 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include "HTMLRenderer.h" #include "BackgroundRenderer.h" @@ -87,29 +88,31 @@ void HTMLRenderer::process(PDFDoc *doc) std::cerr.flush(); } std::cerr << std::endl; - std::cerr << "Processing Others: "; - // Render non-text objects as image - // copied from poppler - SplashColor color; - color[0] = color[1] = color[2] = 255; - - auto bg_renderer = new BackgroundRenderer(splashModeRGB8, 4, gFalse, color); - bg_renderer->startDoc(doc); - - for(int i = param->first_page; i <= param->last_page ; ++i) + if(param->process_nontext) { - doc->displayPage(bg_renderer, i, 4*param->h_dpi, 4*param->v_dpi, - 0, true, false, false, - nullptr, nullptr, nullptr, nullptr); - bg_renderer->getBitmap()->writeImgFile(splashFormatPng, (char*)(boost::format("p%|1$x|.png")%i).str().c_str(), 4*param->h_dpi, 4*param->v_dpi); + // Render non-text objects as image + std::cerr << "Processing Others: "; + // copied from poppler + SplashColor color; + color[0] = color[1] = color[2] = 255; - std::cerr << "."; - std::cerr.flush(); + auto bg_renderer = new BackgroundRenderer(splashModeRGB8, 4, gFalse, color); + bg_renderer->startDoc(doc); + + for(int i = param->first_page; i <= param->last_page ; ++i) + { + doc->displayPage(bg_renderer, i, 4*param->h_dpi, 4*param->v_dpi, + 0, true, false, false, + nullptr, nullptr, nullptr, nullptr); + bg_renderer->getBitmap()->writeImgFile(splashFormatPng, (char*)(boost::format("p%|1$x|.png")%i).str().c_str(), 4*param->h_dpi, 4*param->v_dpi); + + std::cerr << "."; + std::cerr.flush(); + } + delete bg_renderer; + std::cerr << std::endl; } - delete bg_renderer; - - std::cerr << std::endl; } void HTMLRenderer::startPage(int pageNum, GfxState *state) @@ -303,6 +306,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) //debug //real pos & hori_scale + if(0) { html_fout << "\""; double x,y; @@ -438,6 +442,28 @@ void HTMLRenderer::install_embedded_font (GfxFont * font, long long fn_id) { //generate script for fontforge fontscript_fout << boost::format("Open(\"%1%(%2%)\",1)") % param->input_filename % font->getName()->getCString() << endl; + if(font->hasToUnicodeCMap()) + { + auto ctu = font->getToUnicode(); + ofstream map_fout((boost::format("f%|1$x|.encoding") % fn_id).str().c_str()); + for(int i = 0; i < 256; ++i) + { + Unicode * u; + auto n = ctu->mapToUnicode(i, &u); + // not sure what to do when n > 1 + if(n > 0) + { + map_fout << boost::format("0x%|1$X|") % i; + for(int j = 0; j < n; ++j) + map_fout << boost::format(" 0x%|1$X|") % u[i]; + map_fout << " #" << endl; + } + } + + fontscript_fout << boost::format("LoadEncodingFile(\"f%|1$x|.encoding\")") % fn_id << endl; + fontscript_fout << boost::format("Reencode(\"f%|1$x|.encoding\")") % fn_id << endl; + } + fontscript_fout << boost::format("Generate(\"f%|1$x|.ttf\")") % fn_id << endl; export_remote_font(fn_id, ".ttf", "truetype", font); diff --git a/src/Param.h b/src/Param.h index 0b3747f..39f8462 100644 --- a/src/Param.h +++ b/src/Param.h @@ -19,6 +19,8 @@ struct Param double h_dpi, v_dpi; double h_eps, v_eps; + + int process_nontext; }; diff --git a/src/pdftohtmlEX.cc b/src/pdftohtmlEX.cc index 35bfdfd..366324f 100644 --- a/src/pdftohtmlEX.cc +++ b/src/pdftohtmlEX.cc @@ -125,6 +125,7 @@ po::variables_map parse_options (int argc, char **argv) ("vdpi", po::value(¶m.v_dpi)->default_value(72.0), "vertical DPI") ("heps", po::value(¶m.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)") ("veps", po::value(¶m.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)") + ("process-nontext", po::value(¶m.process_nontext)->default_value(1), "process nontext objects") ; opt_hidden.add_options() From 648f7a1e2a212d24b44423b3e94c87dcae19031e Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 10 Aug 2012 01:16:04 +0800 Subject: [PATCH 2/2] dump correct gid->unicode map for fontforge --- bin/pdf2htmlEX | 4 +++- src/HTMLRenderer.cc | 45 +++++++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/bin/pdf2htmlEX b/bin/pdf2htmlEX index 3c1e8bd..ffd80e8 100755 --- a/bin/pdf2htmlEX +++ b/bin/pdf2htmlEX @@ -12,7 +12,9 @@ if [ -f convert.pe ]; then echo -n "Converting fonts: " fontforge -script convert.pe 2>/dev/null echo "." -# rm convert.pe + rm convert.pe fi +rm *.encoding 2>/dev/null + echo "Done." diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index 45094c5..716f5e7 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -442,26 +442,43 @@ void HTMLRenderer::install_embedded_font (GfxFont * font, long long fn_id) { //generate script for fontforge fontscript_fout << boost::format("Open(\"%1%(%2%)\",1)") % param->input_filename % font->getName()->getCString() << endl; - if(font->hasToUnicodeCMap()) + if(font->hasToUnicodeCMap() && (font->getType() == fontTrueType)) { - auto ctu = font->getToUnicode(); - ofstream map_fout((boost::format("f%|1$x|.encoding") % fn_id).str().c_str()); - for(int i = 0; i < 256; ++i) + char * buf; + int buflen; + FoFiTrueType * ttf; + if((buf = font->readEmbFontFile(xref, &buflen))) { - Unicode * u; - auto n = ctu->mapToUnicode(i, &u); - // not sure what to do when n > 1 - if(n > 0) + if((ttf = FoFiTrueType::make(buf, buflen))) { - map_fout << boost::format("0x%|1$X|") % i; - for(int j = 0; j < n; ++j) - map_fout << boost::format(" 0x%|1$X|") % u[i]; - map_fout << " #" << endl; + auto ctg = dynamic_cast(font)->getCodeToGIDMap(ttf); + auto ctu = font->getToUnicode(); + ofstream map_fout((boost::format("f%|1$x|.encoding") % fn_id).str().c_str()); + + for(int i = 0; i < 256; ++i) + { + int code = ctg[i]; + Unicode * u; + auto n = ctu->mapToUnicode(i, &u); + // not sure what to do when n > 1 + if(n > 0) + { + map_fout << boost::format("0x%|1$X|") % code; + for(int j = 0; j < n; ++j) + map_fout << boost::format(" 0x%|1$X|") % u[j]; + map_fout << boost::format(" # 0x%|1$X|") % i << endl; + } + } + + fontscript_fout << boost::format("LoadEncodingFile(\"f%|1$x|.encoding\", \"f%|1$x|\")") % fn_id << endl; + fontscript_fout << boost::format("Reencode(\"f%|1$x|\", 1)") % fn_id << endl; + + ctu->decRefCnt(); + delete ttf; } + gfree(buf); } - fontscript_fout << boost::format("LoadEncodingFile(\"f%|1$x|.encoding\")") % fn_id << endl; - fontscript_fout << boost::format("Reencode(\"f%|1$x|.encoding\")") % fn_id << endl; } fontscript_fout << boost::format("Generate(\"f%|1$x|.ttf\")") % fn_id << endl;