diff --git a/CMakeLists.txt b/CMakeLists.txt index c7bfe32..9a1ee61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ include_directories(${Boost_INCLUDE_DIRS}) link_directories ( ${Boost_LIBRARY_DIRS} ) include_directories(${CMAKE_SOURCE_DIR}/src) -set(PDF2HTMLEX_VERSION "0.1") +set(PDF2HTMLEX_VERSION "0.2") set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION}) add_custom_target(dist COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD diff --git a/share/all.css b/share/all.css index e3f9f89..a52e522 100644 --- a/share/all.css +++ b/share/all.css @@ -1,3 +1,5 @@ +/* Base CSS */ +/* by Wang Lu */ #pdf-main { position:absolute; top:0; @@ -42,3 +44,4 @@ span { .i { position:absolute; } +/* Base CSS END */ diff --git a/share/head.html b/share/head.html index 7c404ba..4df3c44 100644 --- a/share/head.html +++ b/share/head.html @@ -4,3 +4,4 @@ + diff --git a/share/neck.html b/share/neck.html index df9901c..a34e3f9 100644 --- a/share/neck.html +++ b/share/neck.html @@ -20,3 +20,4 @@ function show_pages()
+ diff --git a/share/tail.html b/share/tail.html index 3d0757e..4624e30 100644 --- a/share/tail.html +++ b/share/tail.html @@ -2,3 +2,4 @@
+ diff --git a/share/unify.pe b/share/unify.pe index c1724a2..82f05ec 100644 --- a/share/unify.pe +++ b/share/unify.pe @@ -19,7 +19,7 @@ if (-td > d) d = -td endif if (-hd > d) - d = -hd) + d = -hd endif SetOS2Value("WinAscent", a) SetOS2Value("WinDescent", d) @@ -28,3 +28,4 @@ SetOS2Value("HHeadDescent", -d) Print(ta-td) Print(a) Print(d) +# script end diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 61c9892..88e20c2 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -125,15 +125,16 @@ class HTMLRenderer : public OutputDev //////////////////////////////////////////////////// void add_tmp_file (const std::string & fn); void clean_tmp_files (); - std::string dump_embedded_font (GfxFont * font, long long fn_id); + boost::filesystem::path dump_embedded_font (GfxFont * font, long long fn_id); + void embed_font(const boost::filesystem::path & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false); //////////////////////////////////////////////////// // manage styles //////////////////////////////////////////////////// FontInfo install_font(GfxFont * font); - void install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id, FontInfo & info); - void install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id); - void install_external_font (GfxFont * font, long long fn_id); + void install_embedded_font(GfxFont * font, FontInfo & info); + void install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info); + void install_external_font (GfxFont * font, FontInfo & info); long long install_font_size(double font_size); long long install_transform_matrix(const double * tm); @@ -152,7 +153,7 @@ class HTMLRenderer : public OutputDev */ void export_remote_font(const FontInfo & info, const std::string & suffix, const std::string & fontfileformat, GfxFont * font); void export_remote_default_font(long long fn_id); - void export_local_font(long long fn_id, GfxFont * font, const std::string & original_font_name, const std::string & cssfont); + void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont); void export_font_size(long long fs_id, double font_size); void export_transform_matrix(long long tm_id, const double * tm); diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index 1e14d70..a61c7e6 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -50,9 +50,9 @@ void HTMLRenderer::export_remote_default_font(long long fn_id) allcss_fout << format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id << endl; } -void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, const string & original_font_name, const string & cssfont) +void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont) { - allcss_fout << format(".f%|1$x|{") % fn_id; + allcss_fout << format(".f%|1$x|{") % info.id; allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";"; if(font->isBold() || ifind_first(original_font_name, "bold")) @@ -63,7 +63,7 @@ void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, const stri else if(font->isItalic() || ifind_first(original_font_name, "italic")) allcss_fout << "font-style:italic;"; - allcss_fout << "line-height:" << (font->getAscent() - font->getDescent()) << ";"; + allcss_fout << "line-height:" << (info.ascent - info.descent) << ";"; allcss_fout << "}" << endl; } diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index 3f5c636..daf41ed 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -13,16 +13,11 @@ #include -#include -#include +#include "Param.h" #include "HTMLRenderer.h" #include "namespace.h" -#include "config.h" - -using std::all_of; -using std::max; -using std::min; +#include "util.h" FontInfo HTMLRenderer::install_font(GfxFont * font) { @@ -69,23 +64,13 @@ FontInfo HTMLRenderer::install_font(GfxFont * font) switch(font_loc -> locType) { case gfxFontLocEmbedded: - { - string suffix = dump_embedded_font(font, new_fn_id); - if(suffix != "") - { - install_embedded_font(font, suffix, new_fn_id, cur_info_iter->second); - } - else - { - export_remote_default_font(new_fn_id); - } - } + install_embedded_font(font, cur_info_iter->second); break; case gfxFontLocExternal: - install_external_font(font, new_fn_id); + install_external_font(font, cur_info_iter->second); break; case gfxFontLocResident: - install_base_font(font, font_loc, new_fn_id); + install_base_font(font, font_loc, cur_info_iter->second); break; default: cerr << "TODO: other font loc" << endl; @@ -102,204 +87,38 @@ FontInfo HTMLRenderer::install_font(GfxFont * font) return cur_info_iter->second; } -// TODO -// add a new function and move to text.cc -void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix, long long fn_id, FontInfo & info) +void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info) { - string fn = (format("f%|1$x|") % fn_id).str(); - - path script_path = tmp_dir / (fn + ".pe"); - ofstream script_fout(script_path, ofstream::binary); - add_tmp_file(fn+".pe"); - - script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + suffix)) << endl; - - int * code2GID = nullptr; - int code2GID_len = 0; - int maxcode = 0; - - Gfx8BitFont * font_8bit = nullptr; - - /* - * Step 1 - * dump the font file directly from the font descriptor and put the glyphs into the correct slots - * - * for 8bit + nonTrueType - * re-encoding the font using a PostScript encoding list (glyph id <-> glpyh name) - * - * for 8bit + TrueType - * sort the glpyhs as the original order, and later will map GID (instead of char code) to Unicode - * - * for CID + nonTrueType - * Flatten the font - * - * for CID Truetype - * same as 8bitTrueType, except for that we have to check 65536 charcodes - */ - if(!font->isCIDFont()) + auto path = dump_embedded_font(font, info.id); + if(path != "") { - font_8bit = dynamic_cast(font); - maxcode = 0xff; - if(suffix == ".ttf") - { - script_fout << "Reencode(\"original\")" << endl; - int buflen; - char * buf = nullptr; - if((buf = font->readEmbFontFile(xref, &buflen))) - { - FoFiTrueType *fftt = nullptr; - if((fftt = FoFiTrueType::make(buf, buflen))) - { - code2GID = font_8bit->getCodeToGIDMap(fftt); - code2GID_len = 256; - delete fftt; - } - gfree(buf); - } - } - else - { - // move the slot such that it's consistent with the encoding seen in PDF - ofstream out(tmp_dir / (fn + "_.encoding")); - add_tmp_file(fn+"_.encoding"); - - out << format("/%1% [") % fn << endl; - for(int i = 0; i < 256; ++i) - { - auto cn = font_8bit->getCharName(i); - out << "/" << ((cn == nullptr) ? ".notdef" : cn) << endl; - } - out << "] def" << endl; - - script_fout << format("LoadEncodingFile(%1%)") % (tmp_dir / (fn+"_.encoding")) << endl; - script_fout << format("Reencode(\"%1%\")") % fn << endl; - } + embed_font(path, font, info); + export_remote_font(info, param->font_suffix, param->font_format, font); } else { - maxcode = 0xffff; - - if(suffix == ".ttf") - { - script_fout << "Reencode(\"original\")" << endl; - - GfxCIDFont * _font = dynamic_cast(font); - - // code2GID has been stored for embedded CID fonts - code2GID = _font->getCIDToGID(); - code2GID_len = _font->getCIDToGIDLen(); - } - else - { - script_fout << "CIDFlatten()" << endl; - } + export_remote_default_font(info.id); } - - /* - * Step 2 - * map charcode (or GID for CID truetype) - * generate an Consortium encoding file and let fontforge handle it. - * - * - Always map to Unicode for 8bit TrueType fonts and CID fonts - * - * - For 8bit nonTruetype fonts: - * Try to calculate the correct Unicode value from the glyph names, unless param->always_apply_tounicode is set - * - */ - - info.use_tounicode = ((suffix == ".ttf") || (font->isCIDFont()) || (param->always_apply_tounicode)); - - auto ctu = font->getToUnicode(); - - ofstream map_fout(tmp_dir / (fn + ".encoding")); - add_tmp_file(fn+".encoding"); - - int cnt = 0; - for(int i = 0; i <= maxcode; ++i) - { - if((suffix != ".ttf") && (font_8bit != nullptr) && (font_8bit->getCharName(i) == nullptr)) - continue; - - ++ cnt; - map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i); - - Unicode u, *pu=&u; - - if(info.use_tounicode) - { - int n = 0; - if(ctu) - n = ctu->mapToUnicode(i, &pu); - u = check_unicode(pu, n, i, font); - } - else - { - u = unicode_from_font(i, font); - } - - map_fout << format(" 0x%|1$X|") % u; - map_fout << format(" # 0x%|1$X|") % i; - - map_fout << endl; - } - - if(cnt > 0) - { - script_fout << format("LoadEncodingFile(%1%, \"%2%\")") % (tmp_dir / (fn+".encoding")) % fn << endl; - script_fout << format("Reencode(\"%1%\", 1)") % fn << endl; - } - - if(ctu) - ctu->decRefCnt(); - - auto dest = ((param->single_html ? tmp_dir : dest_dir) / (fn+(param->font_suffix))); - if(param->single_html) - add_tmp_file(fn+(param->font_suffix)); - - /* - * [Win|Typo|HHead][Ascent|Descent] - * Firefox & Chrome interprets the values in different ways - * Trying to unify them - */ - script_fout << format("Generate(%1%)") % dest << endl; - script_fout << "Close()" << endl; - script_fout << format("Open(%1%, 1)") % dest << endl; - script_fout << ifstream(PDF2HTMLEX_DATA_PATH / UNIFY_SCRIPT_FILENAME).rdbuf(); - script_fout << format("Generate(%1%)") % dest << endl; - script_fout.close(); - - if(system((boost::format("fontforge -script %1% 1>%2% 2>%3%") % script_path % (tmp_dir / (fn+".info")) % (tmp_dir / NULL_FILENAME)).str().c_str()) != 0) - cerr << "Warning: fontforge failed." << endl; - - add_tmp_file(fn+".info"); - add_tmp_file(NULL_FILENAME); - - // read metric - int em, ascent, descent; - if(ifstream(tmp_dir / (fn+".info")) >> em >> ascent >> descent) - { - if(em != 0) - { - info.ascent = ((double)ascent) / em; - info.descent = -((double)descent) / em; - } - else - { - info.ascent = 0; - info.descent = 0; - } - } - - if(param->debug) - { - cerr << "Ascent: " << info.ascent << " Descent: " << info.descent << endl; - } - - export_remote_font(info, param->font_suffix, param->font_format, font); } -void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id) +void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info) { + GfxFontLoc * localfontloc = font->locateFont(xref, gFalse); + if(param->embed_base_font) + { + if(localfontloc != nullptr) + { + embed_font(path(localfontloc->path->getCString()), font, info); + export_remote_font(info, param->font_suffix, param->font_format, font); + return; + } + else + { + cerr << format("Cannot embed base font: f%|1$x|") % info.id << endl; + } + + } + string psname(font_loc->path->getCString()); string basename = psname.substr(0, psname.find('-')); string cssfont; @@ -312,10 +131,22 @@ void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, long else cssfont = iter->second; - export_local_font(fn_id, font, psname, cssfont); + // still try to get an idea of read ascent/descent + if(localfontloc != nullptr) + { + // fill in ascent/descent only, do not embed + embed_font(path(localfontloc->path->getCString()), font, info, true); + } + else + { + info.ascent = font->getAscent(); + info.descent = font->getDescent(); + } + + export_local_font(info, font, psname, cssfont); } -void HTMLRenderer::install_external_font( GfxFont * font, long long fn_id) +void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info) { string fontname(font->getName()->getCString()); @@ -327,7 +158,18 @@ void HTMLRenderer::install_external_font( GfxFont * font, long long fn_id) cerr << "Warning: workaround for font names in bad encodings." << endl; } - export_local_font(fn_id, font, fontname, ""); + //debug + GooString gfn(fontname.c_str()); + GooString * path = globalParams->findFontFile(&gfn); + + cerr << "Find: " << fontname << endl; + if(path) + { + cerr << "MATCHED: " << path->getCString() << endl; + delete path; + } + + export_local_font(info, font, fontname, ""); } long long HTMLRenderer::install_font_size(double font_size) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 6759765..00f853b 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -1,7 +1,7 @@ /* * text.ccc * - * Handling text and relative stuffs + * Handling text & font, and relative stuffs * * by WangLu * 2012.08.14 @@ -11,19 +11,24 @@ #include #include +#include +#include -#include +#include +#include #include "HTMLRenderer.h" #include "namespace.h" +#include "config.h" -using std::all_of; +using boost::algorithm::to_lower; -string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) +path HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) { Object obj, obj1, obj2; Object font_obj, font_obj2, fontdesc_obj; string suffix; + path filepath; try { @@ -119,7 +124,8 @@ string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) string fn = (format("f%|1$x|")%fn_id).str(); ofstream outf; - outf.open(tmp_dir / (fn + suffix), ofstream::binary); + filepath = tmp_dir / (fn + suffix); + outf.open(filepath, ofstream::binary); add_tmp_file(fn+suffix); char buf[1024]; @@ -144,7 +150,210 @@ string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) font_obj2.free(); font_obj.free(); - return suffix; + return filepath; +} + +void HTMLRenderer::embed_font(const path & filepath, GfxFont * font, FontInfo & info, bool get_metric_only) +{ + string suffix = filepath.extension().string(); + to_lower(suffix); + + string fn = (format("f%|1$x|") % info.id).str(); + + path script_path = tmp_dir / (fn + ".pe"); + ofstream script_fout(script_path, ofstream::binary); + add_tmp_file(fn+".pe"); + + script_fout << format("Open(%1%, 1)") % filepath << endl; + + int * code2GID = nullptr; + int code2GID_len = 0; + int maxcode = 0; + + Gfx8BitFont * font_8bit = nullptr; + + if(get_metric_only) + { + info.use_tounicode = false; + } + else + { + /* + * Step 1 + * dump the font file directly from the font descriptor and put the glyphs into the correct slots + * + * for 8bit + nonTrueType + * re-encoding the font using a PostScript encoding list (glyph id <-> glpyh name) + * + * for 8bit + TrueType + * sort the glpyhs as the original order, and later will map GID (instead of char code) to Unicode + * + * for CID + nonTrueType + * Flatten the font + * + * for CID Truetype + * same as 8bitTrueType, except for that we have to check 65536 charcodes + */ + if(!font->isCIDFont()) + { + font_8bit = dynamic_cast(font); + maxcode = 0xff; + if((suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf")) + { + script_fout << "Reencode(\"original\")" << endl; + FoFiTrueType *fftt = nullptr; + if((fftt = FoFiTrueType::load((char*)filepath.c_str())) != nullptr) + { + code2GID = font_8bit->getCodeToGIDMap(fftt); + code2GID_len = 256; + delete fftt; + } + } + else + { + // move the slot such that it's consistent with the encoding seen in PDF + ofstream out(tmp_dir / (fn + "_.encoding")); + add_tmp_file(fn+"_.encoding"); + + out << format("/%1% [") % fn << endl; + for(int i = 0; i < 256; ++i) + { + auto cn = font_8bit->getCharName(i); + out << "/" << ((cn == nullptr) ? ".notdef" : cn) << endl; + } + out << "] def" << endl; + + script_fout << format("LoadEncodingFile(%1%)") % (tmp_dir / (fn+"_.encoding")) << endl; + script_fout << format("Reencode(\"%1%\")") % fn << endl; + } + } + else + { + maxcode = 0xffff; + + if(suffix == ".ttf") + { + script_fout << "Reencode(\"original\")" << endl; + + GfxCIDFont * _font = dynamic_cast(font); + + // code2GID has been stored for embedded CID fonts + code2GID = _font->getCIDToGID(); + code2GID_len = _font->getCIDToGIDLen(); + } + else + { + script_fout << "CIDFlatten()" << endl; + } + } + + /* + * Step 2 + * map charcode (or GID for CID truetype) + * generate an Consortium encoding file and let fontforge handle it. + * + * - Always map to Unicode for 8bit TrueType fonts and CID fonts + * + * - For 8bit nonTruetype fonts: + * Try to calculate the correct Unicode value from the glyph names, unless param->always_apply_tounicode is set + * + */ + + info.use_tounicode = ((suffix == ".ttf") || (font->isCIDFont()) || (param->always_apply_tounicode)); + + auto ctu = font->getToUnicode(); + + ofstream map_fout(tmp_dir / (fn + ".encoding")); + add_tmp_file(fn+".encoding"); + + int cnt = 0; + for(int i = 0; i <= maxcode; ++i) + { + if((suffix != ".ttf") && (font_8bit != nullptr) && (font_8bit->getCharName(i) == nullptr)) + continue; + + ++ cnt; + map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i); + + Unicode u, *pu=&u; + + if(info.use_tounicode) + { + int n = 0; + if(ctu) + n = ctu->mapToUnicode(i, &pu); + u = check_unicode(pu, n, i, font); + } + else + { + u = unicode_from_font(i, font); + } + + map_fout << format(" 0x%|1$X|") % u; + map_fout << format(" # 0x%|1$X|") % i; + + map_fout << endl; + } + + if(cnt > 0) + { + script_fout << format("LoadEncodingFile(%1%, \"%2%\")") % (tmp_dir / (fn+".encoding")) % fn << endl; + script_fout << format("Reencode(\"%1%\", 1)") % fn << endl; + } + + if(ctu) + ctu->decRefCnt(); + } + + auto dest = ((param->single_html ? tmp_dir : dest_dir) / (fn+(param->font_suffix))); + if(param->single_html) + add_tmp_file(fn+(param->font_suffix)); + + /* + * [Win|Typo|HHead][Ascent|Descent] + * Firefox & Chrome interprets the values in different ways + * Trying to unify them + */ + add_tmp_file(fn + "_.ttf"); + script_fout << format("Generate(%1%)") % (tmp_dir / (fn + "_.ttf")) << endl; + script_fout << "Close()" << endl; + script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + "_.ttf")) << endl; + script_fout << ifstream(PDF2HTMLEX_DATA_PATH / UNIFY_SCRIPT_FILENAME).rdbuf(); + script_fout << format("Generate(%1%)") % dest << endl; + script_fout.close(); + + if(system((boost::format("fontforge -script %1% 1>%2% 2>%3%") % script_path % (tmp_dir / (fn+".info")) % (tmp_dir / NULL_FILENAME)).str().c_str()) != 0) + cerr << "Warning: fontforge failed." << endl; + + add_tmp_file(fn+".info"); + add_tmp_file(NULL_FILENAME); + + // read metric + int em, ascent, descent; + if(ifstream(tmp_dir / (fn+".info")) >> em >> ascent >> descent) + { + if(em != 0) + { + info.ascent = ((double)ascent) / em; + info.descent = -((double)descent) / em; + } + else + { + info.ascent = 0; + info.descent = 0; + } + } + else + { + cerr << "Warning: cannot read font info for " << fn << endl; + info.ascent = font->getAscent(); + info.descent = font->getDescent(); + } + + if(param->debug) + { + cerr << "Ascent: " << info.ascent << " Descent: " << info.descent << endl; + } } void HTMLRenderer::drawString(GfxState * state, GooString * s) diff --git a/src/Param.h b/src/Param.h index 6dccd68..136d6d6 100644 --- a/src/Param.h +++ b/src/Param.h @@ -28,6 +28,8 @@ struct Param int process_nontext; int single_html; + int embed_base_font; + int embed_external_font; // Advanced tweak double h_eps, v_eps; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 8b2ffe8..8e3aee8 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -76,6 +76,8 @@ po::variables_map parse_options (int argc, char **argv) ("process-nontext", po::value(¶m.process_nontext)->default_value(1), "process nontext objects") ("single-html", po::value(¶m.single_html)->default_value(1), "combine everything into one single HTML file") + ("embed-base-font", po::value(¶m.embed_base_font)->default_value(1), "embed local matched font for base 14 fonts in the PDF file") + ("embed-external-font", po::value(¶m.embed_external_font)->default_value(0), "embed local matched font for external fonts in the PDF file") ("heps", po::value(¶m.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)") ("veps", po::value(¶m.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")