From 1642b4a37eb99b7e375794060a811b224d9af290 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Mon, 27 Aug 2012 23:09:01 +0800 Subject: [PATCH] disable toUnicode for non-ttf fonts by default --- src/HTMLRenderer.h | 4 +- src/HTMLRenderer/general.cc | 2 +- src/HTMLRenderer/install.cc | 47 ++++---- src/HTMLRenderer/state.cc | 8 +- src/HTMLRenderer/text.cc | 215 +++++++++++++++++++++--------------- src/Param.h | 1 + src/pdf2htmlEX.cc | 1 + src/util.h | 24 ++-- 8 files changed, 173 insertions(+), 129 deletions(-) diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index 2e6fd2f..1214788 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -129,7 +129,7 @@ class HTMLRenderer : public OutputDev //////////////////////////////////////////////////// // manage styles //////////////////////////////////////////////////// - long long install_font(GfxFont * font); + FontInfo install_font(GfxFont * font); void install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id); void install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id); void install_external_font (GfxFont * font, long long fn_id); @@ -207,7 +207,7 @@ class HTMLRenderer : public OutputDev bool text_pos_changed; // font & size - long long cur_fn_id; + FontInfo cur_font_info; double cur_font_size; long long cur_fs_id; bool font_changed; diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index fcc2d24..5e307e1 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -155,7 +155,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) draw_scale = 1.0; - cur_fn_id = install_font(nullptr); + cur_font_info = install_font(nullptr); cur_font_size = draw_font_size = 0; cur_fs_id = install_font_size(cur_font_size); diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index 7c78d71..16569ec 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -19,7 +19,7 @@ using std::all_of; -long long 
HTMLRenderer::install_font(GfxFont * font) +FontInfo HTMLRenderer::install_font(GfxFont * font) { assert(sizeof(long long) == 2*sizeof(int)); @@ -27,16 +27,16 @@ long long HTMLRenderer::install_font(GfxFont * font) auto iter = font_name_map.find(fn_id); if(iter != font_name_map.end()) - return iter->second.fn_id; + return iter->second; long long new_fn_id = font_name_map.size(); - font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id}))); + auto cur_info_iter = font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id, true}))).first; if(font == nullptr) { export_remote_default_font(new_fn_id); - return new_fn_id; + return cur_info_iter->second; } if(param->debug) @@ -48,12 +48,12 @@ long long HTMLRenderer::install_font(GfxFont * font) if(font->getType() == fontType3) { cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl; export_remote_default_font(new_fn_id); - return new_fn_id; + return cur_info_iter->second; } if(font->getWMode()) { cerr << "Writing mode is unsupported and will be rendered as Image" << endl; export_remote_default_font(new_fn_id); - return new_fn_id; + return cur_info_iter->second; } auto * font_loc = font->locateFont(xref, gTrue); @@ -66,6 +66,10 @@ long long HTMLRenderer::install_font(GfxFont * font) string suffix = dump_embedded_font(font, new_fn_id); if(suffix != "") { + if(!((suffix == ".ttf") || (param->always_apply_tounicode))) + { + cur_info_iter->second.use_tounicode = false; + } install_embedded_font(font, suffix, new_fn_id); } else @@ -92,8 +96,7 @@ long long HTMLRenderer::install_font(GfxFont * font) export_remote_default_font(new_fn_id); } - return new_fn_id; - + return cur_info_iter->second; } // TODO @@ -129,7 +132,6 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix, script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + suffix)) << endl; - auto ctu = font->getToUnicode(); int * code2GID = nullptr; int code2GID_len = 0; int maxcode = 0; @@ -154,10 +156,6 @@ void 
HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix, gfree(buf); } } - else if (suffix == ".cff") - { - script_fout << "Reencode(\"unicode\")" << endl; - } else { // pass @@ -183,6 +181,9 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix, } } + bool use_tounicode = ((suffix == ".ttf") || (param->always_apply_tounicode)); + auto ctu = font->getToUnicode(); + ofstream map_fout(tmp_dir / (fn + ".encoding")); add_tmp_file(fn+".encoding"); @@ -190,19 +191,23 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix, { map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i); - Unicode u, *pu; - int n = 0; - if(ctu) - n = ctu->mapToUnicode(i, &pu); + Unicode u, *pu=&u; - u = check_unicode(pu, n, i, font); + if(use_tounicode) + { + int n = 0; + if(ctu) + n = ctu->mapToUnicode(i, &pu); + u = check_unicode(pu, n, i, font); + } + else + { + u = isLegalUnicode(i) ? i : map_to_private(i); + } map_fout << format(" 0x%|1$X|") % u; map_fout << format(" # 0x%|1$X|") % i; - for(int j = 0; j < n; ++j) - map_fout << format(" 0x%|1$X|") % pu[j]; - map_fout << endl; } diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 773aa24..06c918d 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -92,12 +92,12 @@ void HTMLRenderer::check_state_change(GfxState * state) // font name & size if(all_changed || font_changed) { - long long new_fn_id = install_font(state->getFont()); + FontInfo new_font_info = install_font(state->getFont()); - if(!(new_fn_id == cur_fn_id)) + if(!(new_font_info.id == cur_font_info.id)) { new_line_status = max(new_line_status, LineStatus::SPAN); - cur_fn_id = new_fn_id; + cur_font_info = new_font_info; } double new_font_size = state->getFontSize(); @@ -369,7 +369,7 @@ void HTMLRenderer::prepare_line(GfxState * state) } html_fout << format("") - % cur_fn_id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id; + % 
cur_font_info.id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id; line_status = LineStatus::SPAN; } diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index b29deae..6a0359f 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -12,6 +12,8 @@ #include +#include + #include "HTMLRenderer.h" #include "namespace.h" @@ -19,116 +21,143 @@ using std::all_of; string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) { - // mupdf consulted - - Object ref_obj, font_obj, font_obj2, fontdesc_obj; Object obj, obj1, obj2; - Dict * dict = nullptr; + Object font_obj, font_obj2, fontdesc_obj; + string suffix; - string suffix, subtype; - - char buf[1024]; - int len; - - string fn; - ofstream outf; - - auto * id = font->getID(); - ref_obj.initRef(id->num, id->gen); - ref_obj.fetch(xref, &font_obj); - ref_obj.free(); - - if(!font_obj.isDict()) + try { - cerr << "Font object is not a dictionary" << endl; - goto err; - } + // mupdf consulted + string subtype; - dict = font_obj.getDict(); - if(dict->lookup("DescendantFonts", &font_obj2)->isArray()) - { - if(font_obj2.arrayGetLength() == 0) + auto * id = font->getID(); + + Object ref_obj; + ref_obj.initRef(id->num, id->gen); + ref_obj.fetch(xref, &font_obj); + ref_obj.free(); + + if(!font_obj.isDict()) { - cerr << "Warning: empty DescendantFonts array" << endl; + cerr << "Font object is not a dictionary" << endl; + throw 0; } - else + + Dict * dict = font_obj.getDict(); + if(dict->lookup("DescendantFonts", &font_obj2)->isArray()) { - if(font_obj2.arrayGetLength() > 1) - cerr << "TODO: multiple entries in DescendantFonts array" << endl; - - if(font_obj2.arrayGet(0, &obj2)->isDict()) + if(font_obj2.arrayGetLength() == 0) { - dict = obj2.getDict(); - } - } - } - - if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict()) - { - cerr << "Cannot find FontDescriptor " << endl; - goto err; - } - - dict = fontdesc_obj.getDict(); - - if(dict->lookup("FontFile3", &obj)->isStream()) 
- { - if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName()) - { - subtype = obj1.getName(); - if(subtype == "Type1C") - { - suffix = ".cff"; - } - else if (subtype == "CIDFontType0C") - { - suffix = ".cid"; + cerr << "Warning: empty DescendantFonts array" << endl; } else { - cerr << "Unknown subtype: " << subtype << endl; - goto err; + if(font_obj2.arrayGetLength() > 1) + cerr << "TODO: multiple entries in DescendantFonts array" << endl; + + if(font_obj2.arrayGet(0, &obj2)->isDict()) + { + dict = obj2.getDict(); + } } } + + if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict()) + { + cerr << "Cannot find FontDescriptor " << endl; + throw 0; + } + + dict = fontdesc_obj.getDict(); + + if(dict->lookup("FontFile3", &obj)->isStream()) + { + if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName()) + { + subtype = obj1.getName(); + if(subtype == "Type1C") + { + suffix = ".cff"; + } + else if (subtype == "CIDFontType0C") + { + suffix = ".cid"; + } + else + { + cerr << "Unknown subtype: " << subtype << endl; + throw 0; + } + } + else + { + cerr << "Invalid subtype in font descriptor" << endl; + throw 0; + } + } + else if (dict->lookup("FontFile2", &obj)->isStream()) + { + suffix = ".ttf"; + } + else if (dict->lookup("FontFile", &obj)->isStream()) + { + suffix = ".pfa"; + } else { - cerr << "Invalid subtype in font descriptor" << endl; - goto err; + cerr << "Cannot find FontFile for dump" << endl; + throw 0; + } + + if(suffix == "") + { + cerr << "Font type unrecognized" << endl; + throw 0; + } + + obj.streamReset(); + + string fn = (format("f%|1$x|")%fn_id).str(); + ofstream outf; + outf.open(tmp_dir / (fn + suffix), ofstream::binary); + add_tmp_file(fn+suffix); + + char buf[1024]; + int len; + while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0) + { + outf.write(buf, len); + } + outf.close(); + obj.streamClose(); + + /* + * Pre re-encode the font such that it's consistent with the encoding used by PDF + */ + auto output_to_file = [](void * stream, const 
char * data, int len)->void + { + reinterpret_cast(stream)->write(data, len); + }; + + if(suffix == ".cff") + { + auto f = FoFiType1C::load((char*)((tmp_dir/(fn+suffix)).c_str())); + + suffix = ".pfa"; + outf.open(tmp_dir / (fn + suffix), ofstream::binary); + add_tmp_file(fn+suffix); + + f->convertToType1(nullptr, (const char **)dynamic_cast(font)->getEncoding(), false, output_to_file, &outf); + outf.close(); + + delete f; } } - else if (dict->lookup("FontFile2", &obj)->isStream()) - { - suffix = ".ttf"; - } - else if (dict->lookup("FontFile", &obj)->isStream()) + catch(int) { - suffix = ".pfa"; - } - else - { - cerr << "Cannot find FontFile for dump" << endl; - goto err; + cerr << format("Someting wrong when trying to dump font %|1$x|") % fn_id << endl; } - if(suffix == "") - { - cerr << "Font type unrecognized" << endl; - goto err; - } - - obj.streamReset(); - - fn = (format("f%|1$x|%2%")%fn_id%suffix).str(); - outf.open(tmp_dir / fn , ofstream::binary); - add_tmp_file(fn); - while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0) - { - outf.write(buf, len); - } - outf.close(); - obj.streamClose(); - -err: obj2.free(); obj1.free(); obj.free(); @@ -136,6 +165,7 @@ err: fontdesc_obj.free(); font_obj2.free(); font_obj.free(); + return suffix; } @@ -196,7 +226,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) ++nSpaces; } - Unicode uu = check_unicode(u, uLen, code, font); + Unicode uu = (cur_font_info.use_tounicode + ? check_unicode(u, uLen, code, font) + : (isLegalUnicode(code) ? 
code : map_to_private(code)) + ); outputUnicodes(html_fout, &uu, 1); dx += dx1; diff --git a/src/Param.h b/src/Param.h index 20f0b7a..3570f5d 100644 --- a/src/Param.h +++ b/src/Param.h @@ -27,6 +27,7 @@ struct Param int process_nontext; int single_html; + int always_apply_tounicode; int debug; int clean_tmp; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 99d8f37..37b6efb 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -76,6 +76,7 @@ po::variables_map parse_options (int argc, char **argv) ("heps", po::value(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)") ("veps", po::value(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)") ("single-html", po::value(&param.single_html)->default_value(1), "combine everything into one single HTML file") + ("always-apply-tounicode", po::value(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on") ("process-nontext", po::value(&param.process_nontext)->default_value(1), "process nontext objects") ("debug", po::value(&param.debug)->default_value(0), "output debug information") ("clean-tmp", po::value(&param.clean_tmp)->default_value(1), "clean temporary files after processing") diff --git a/src/util.h b/src/util.h index d44f74a..d6b9b20 100644 --- a/src/util.h +++ b/src/util.h @@ -69,12 +69,7 @@ static inline bool isLegalUnicode(Unicode u) return true; } -/* - * We have to use a single Unicode value to reencode fonts - * if we got multi-unicode values, it might be expanded ligature, try to restore it - * if we cannot figure it out at the end, use a private mapping - */ -static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) +static inline Unicode map_to_private(CharCode code) { Unicode private_mapping = (Unicode)(code + 0xE000); if(private_mapping > 0xF8FF) @@ -89,10 +84,18 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont } } } + return private_mapping; 
+} - +/* + * We have to use a single Unicode value to reencode fonts + * if we got multi-unicode values, it might be expanded ligature, try to restore it + * if we cannot figure it out at the end, use a private mapping + */ +static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) +{ if(len == 0) - return private_mapping; + return map_to_private(code); if(len == 1) { @@ -113,7 +116,7 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont } } - return private_mapping; + return map_to_private(code); } static inline void outputUnicodes(std::ostream & out, const Unicode * u, int uLen) @@ -165,7 +168,8 @@ static inline bool operator == (const GfxRGB & rgb1, const GfxRGB & rgb2) class FontInfo { public: - long long fn_id; + long long id; + bool use_tounicode; }; // wrapper of the transform matrix double[6]