From 5a4eccc632b68f16619360f57f5f6795dbfaa673 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 10 Aug 2012 21:30:41 +0800 Subject: [PATCH 01/11] support external font --- CMakeLists.txt | 2 +- src/Consts.cc | 85 +++++++++++++++++++++++++++++++++++++++++++++ src/Consts.h | 71 +++++-------------------------------- src/HTMLRenderer.cc | 58 ++++++++++--------------------- src/HTMLRenderer.h | 17 ++------- src/util.h | 63 +++++++++++++++++++++++++++++++++ 6 files changed, 179 insertions(+), 117 deletions(-) create mode 100644 src/Consts.cc create mode 100644 src/util.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 39700f3..7b9a208 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") -add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h src/Consts.h) +add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h src/Consts.h src/Consts.cc src/util.h) target_link_libraries(pdftohtmlEX poppler boost_program_options) diff --git a/src/Consts.cc b/src/Consts.cc new file mode 100644 index 0000000..87ee1cf --- /dev/null +++ b/src/Consts.cc @@ -0,0 +1,85 @@ +/* + * Constants + * + * by WangLu + * 2012.08.10 + */ + +#include "Consts.h" + +const double EPS = 1e-6; + +const char * HTML_HEAD = "\n\ +\ +\ +\ +\ +
"; + +const char * HTML_TAIL = "
"; + +const std::map BASE_14_FONT_CSS_FONT_MAP({\ + { "Courier", "Courier,monospace" },\ + { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },\ + { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" },\ + { "Symbol", "Symbol,\"Standard Symbols L\"" },\ + { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },\ +}); + +const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + +const std::map GB_ENCODED_FONT_NAME_MAP({\ + {"\xCB\xCE\xCC\xE5", "SimSun"},\ + {"\xBA\xDA\xCC\xE5", "SimHei"},\ + {"\xBF\xAC\xCC\xE5_GB2312", "SimKai"},\ + {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"},\ + {"\xC1\xA5\xCA\xE9", "SimLi"},\ +}); diff --git a/src/Consts.h b/src/Consts.h index e39b67e..23fdc4c 100644 --- a/src/Consts.h +++ b/src/Consts.h @@ -9,72 +9,17 @@ #define CONSTS_H__ #include #include +#include -const char * HTML_HEAD = "\n\ -\ -\ -\ -\ -
"; +extern const double EPS; -const char * HTML_TAIL = "
"; +extern const char * HTML_HEAD; +extern const char * HTML_TAIL; -const std::map BASE_14_FONT_CSS_FONT_MAP({\ - { "Courier", "Courier,monospace" },\ - { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },\ - { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" },\ - { "Symbol", "Symbol,\"Standard Symbols L\"" },\ - { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },\ -}); +extern const std::map BASE_14_FONT_CSS_FONT_MAP; -const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; +extern const double id_matrix[6]; + +extern const std::map GB_ENCODED_FONT_NAME_MAP; #endif //CONSTS_H__ diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index 1367399..0529050 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -27,6 +26,7 @@ #include "HTMLRenderer.h" #include "BackgroundRenderer.h" #include "Consts.h" +#include "util.h" /* * CSS classes @@ -163,37 +163,6 @@ void HTMLRenderer::close_cur_line() } } -void HTMLRenderer::outputUnicodes(const Unicode * u, int uLen) -{ - for(int i = 0; i < uLen; ++i) - { - switch(u[i]) - { - case '&': - html_fout << "&"; - break; - case '\"': - html_fout << """; - break; - case '\'': - html_fout << "'"; - break; - case '<': - html_fout << "<"; - break; - case '>': - html_fout << ">"; - break; - default: - { - char buf[4]; - auto n = mapUTF8(u[i], buf, 4); - html_fout.write(buf, n); - } - } - } -} - void HTMLRenderer::updateAll(GfxState * state) { all_changed = true; @@ -359,12 +328,12 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { Unicode u = (c&0xff); c >>= 8; - outputUnicodes(&u, 1); + outputUnicodes(html_fout, &u, 1); } } else { - outputUnicodes(u, uLen); + outputUnicodes(html_fout, u, uLen); } dx += dx1; @@ -436,8 +405,7 @@ long long HTMLRenderer::install_font(GfxFont * font) install_embedded_font(font, new_fn_id); break; case gfxFontLocExternal: - std::cerr << "TODO: external font" << std::endl; - export_remote_default_font(new_fn_id); + install_external_font(font, new_fn_id); break; case gfxFontLocResident: install_base_font(font, font_loc, new_fn_id); @@ -502,8 +470,20 @@ void HTMLRenderer::install_embedded_font (GfxFont * font, long long fn_id) export_remote_font(fn_id, ".ttf", "truetype", font); } + +void HTMLRenderer::install_external_font( GfxFont * font, long long fn_id) +{ + std::string fontname(font->getName()->getCString()); + + // resolve bad encodings in GB + auto iter = GB_ENCODED_FONT_NAME_MAP.find(fontname); + if(iter != GB_ENCODED_FONT_NAME_MAP.end()) + fontname = iter->second; + + export_local_font(fn_id, font, fontname, ""); +} -void HTMLRenderer::install_base_font( GfxFont * font, GfxFontLoc * font_loc, long long fn_id) +void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id) { std::string psname(font_loc->path->getCString()); string basename = psname.substr(0, psname.find('-')); @@ -517,7 +497,7 @@ void HTMLRenderer::install_base_font( GfxFont * font, GfxFontLoc * font_loc, lon else cssfont = iter->second; - export_local_font(fn_id, font, font_loc, psname, cssfont); + export_local_font(fn_id, font, psname, cssfont); } long long HTMLRenderer::install_font_size(double font_size) @@ -601,7 +581,7 @@ void HTMLRenderer::export_remote_default_font(long long fn_id) allcss_fout << endl; } -void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, GfxFontLoc * font_loc, const string & original_font_name, const string & cssfont) +void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, const string & original_font_name, const string & cssfont) { allcss_fout << boost::format(".f%|1$x|{") % fn_id; allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name+","+general_font_family(font)) : cssfont) << ";"; diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index f4bc00e..727dc9a 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -29,20 +29,10 @@ #include #include "Param.h" +#include "util.h" using namespace std; -static const double EPS = 1e-6; -inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; } -inline bool _is_positive(double x) { return x > EPS; } -inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6) -{ - for(int i = 0; i < size; ++i) - if(!_equal(tm1[i], tm2[i])) - return false; - return true; -} - class HTMLRenderer : public OutputDev { public: @@ -106,13 +96,12 @@ class HTMLRenderer : public OutputDev private: void close_cur_line(); - void outputUnicodes(const Unicode * u, int uLen); - // return the mapped font name long long install_font(GfxFont * font); static void output_to_file(void * outf, const char * data, int len); void install_embedded_font (GfxFont * font, long long fn_id); + void install_external_font (GfxFont * font, long long fn_id); void install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id); long long install_font_size(double font_size); @@ -126,7 +115,7 @@ class HTMLRenderer : public OutputDev */ void export_remote_font(long long fn_id, const string & suffix, const string & format, GfxFont * font); void export_remote_default_font(long long fn_id); - void export_local_font(long long fn_id, GfxFont * font, GfxFontLoc * font_loc, const string & original_font_name, const string & cssfont); + void export_local_font(long long fn_id, GfxFont * font, const string & original_font_name, const string & cssfont); std::string general_font_family(GfxFont * font); void export_font_size(long long fs_id, double font_size); diff --git a/src/util.h b/src/util.h new file mode 100644 index 0000000..8d9af9e --- /dev/null +++ b/src/util.h @@ -0,0 +1,63 @@ +/* + * Misc functions + * + * + * by WangLu + * 2012.08.10 + */ + + +#ifndef UTIL_H__ +#define UTIL_H__ + +#include +#include + +#include + +#include "Consts.h" + +static inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; } +static inline bool _is_positive(double x) { return x > EPS; } +static inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6) +{ + for(int i = 0; i < size; ++i) + if(!_equal(tm1[i], tm2[i])) + return false; + return true; +} + +static inline void outputUnicodes(std::ostream & out, const Unicode * u, int uLen) +{ + for(int i = 0; i < uLen; ++i) + { + switch(u[i]) + { + case '&': + out << "&"; + break; + case '\"': + out << """; + break; + case '\'': + out << "'"; + break; + case '<': + out << "<"; + break; + case '>': + out << ">"; + break; + default: + { + char buf[4]; + auto n = mapUTF8(u[i], buf, 4); + out.write(buf, n); + } + } + } +} + + + +#endif //UTIL_H__ From 548e7efbbcc8ed270fbfaac8de94e9b446877f69 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 10 Aug 2012 21:41:24 +0800 Subject: [PATCH 02/11] use tmp dir --- bin/pdf2htmlEX | 14 ++++++++++---- src/Consts.cc | 8 ++++++-- src/Consts.h | 6 ++++-- src/HTMLRenderer.cc | 6 +++--- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/bin/pdf2htmlEX b/bin/pdf2htmlEX index 8469cdc..a6e5682 100755 --- a/bin/pdf2htmlEX +++ b/bin/pdf2htmlEX @@ -1,6 +1,12 @@ #!/bin/bash set -e +TMPDIR=/tmp/pdf2htmlEX + +# prepare the temporary directory +test -d $TMPDIR || mkdir -p $TMPDIR +rm -f $TMPDIR/* 2>/dev/null + # Get directory of the script SOURCE="${BASH_SOURCE[0]}" while [ -h "$SOURCE" ] ; do SOURCE="$(readlink "$SOURCE")"; done @@ -8,13 +14,13 @@ SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"/ # Execute ${SCRIPT_DIR}/pdftohtmlEX $* -if [ -f convert.pe ]; then +if [ -f $TMPDIR/convert.pe ]; then echo -n "Converting fonts: " - fontforge -script convert.pe 2>/dev/null + fontforge -script $TMPDIR/convert.pe 2>/dev/null echo "." -# rm convert.pe fi -rm *.encoding 2>/dev/null +#clean +rm -f $TMPDIR/* 2>/dev/null echo "Done." diff --git a/src/Consts.cc b/src/Consts.cc index 87ee1cf..07abc62 100644 --- a/src/Consts.cc +++ b/src/Consts.cc @@ -9,7 +9,7 @@ const double EPS = 1e-6; -const char * HTML_HEAD = "\n\ +const std::string HTML_HEAD = "\n\ \ \