diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h
index 2e6fd2f..1214788 100644
--- a/src/HTMLRenderer.h
+++ b/src/HTMLRenderer.h
@@ -129,7 +129,7 @@ class HTMLRenderer : public OutputDev
////////////////////////////////////////////////////
// manage styles
////////////////////////////////////////////////////
- long long install_font(GfxFont * font);
+ FontInfo install_font(GfxFont * font);
void install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id);
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id);
void install_external_font (GfxFont * font, long long fn_id);
@@ -207,7 +207,7 @@ class HTMLRenderer : public OutputDev
bool text_pos_changed;
// font & size
- long long cur_fn_id;
+ FontInfo cur_font_info;
double cur_font_size;
long long cur_fs_id;
bool font_changed;
diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc
index fcc2d24..5e307e1 100644
--- a/src/HTMLRenderer/general.cc
+++ b/src/HTMLRenderer/general.cc
@@ -155,7 +155,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
draw_scale = 1.0;
- cur_fn_id = install_font(nullptr);
+ cur_font_info = install_font(nullptr);
cur_font_size = draw_font_size = 0;
cur_fs_id = install_font_size(cur_font_size);
diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc
index 7c78d71..16569ec 100644
--- a/src/HTMLRenderer/install.cc
+++ b/src/HTMLRenderer/install.cc
@@ -19,7 +19,7 @@
using std::all_of;
-long long HTMLRenderer::install_font(GfxFont * font)
+FontInfo HTMLRenderer::install_font(GfxFont * font)
{
assert(sizeof(long long) == 2*sizeof(int));
@@ -27,16 +27,16 @@ long long HTMLRenderer::install_font(GfxFont * font)
auto iter = font_name_map.find(fn_id);
if(iter != font_name_map.end())
- return iter->second.fn_id;
+ return iter->second;
long long new_fn_id = font_name_map.size();
- font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id})));
+ auto cur_info_iter = font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id, true}))).first;
if(font == nullptr)
{
export_remote_default_font(new_fn_id);
- return new_fn_id;
+ return cur_info_iter->second;
}
if(param->debug)
@@ -48,12 +48,12 @@ long long HTMLRenderer::install_font(GfxFont * font)
if(font->getType() == fontType3) {
cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl;
export_remote_default_font(new_fn_id);
- return new_fn_id;
+ return cur_info_iter->second;
}
if(font->getWMode()) {
cerr << "Writing mode is unsupported and will be rendered as Image" << endl;
export_remote_default_font(new_fn_id);
- return new_fn_id;
+ return cur_info_iter->second;
}
auto * font_loc = font->locateFont(xref, gTrue);
@@ -66,6 +66,10 @@ long long HTMLRenderer::install_font(GfxFont * font)
string suffix = dump_embedded_font(font, new_fn_id);
if(suffix != "")
{
+ if(!((suffix == ".ttf") || (param->always_apply_tounicode)))
+ {
+ cur_info_iter->second.use_tounicode = false;
+ }
install_embedded_font(font, suffix, new_fn_id);
}
else
@@ -92,8 +96,7 @@ long long HTMLRenderer::install_font(GfxFont * font)
export_remote_default_font(new_fn_id);
}
- return new_fn_id;
-
+ return cur_info_iter->second;
}
// TODO
@@ -129,7 +132,6 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + suffix)) << endl;
- auto ctu = font->getToUnicode();
int * code2GID = nullptr;
int code2GID_len = 0;
int maxcode = 0;
@@ -154,10 +156,6 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
gfree(buf);
}
}
- else if (suffix == ".cff")
- {
- script_fout << "Reencode(\"unicode\")" << endl;
- }
else
{
// pass
@@ -183,6 +181,9 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
}
}
+ bool use_tounicode = ((suffix == ".ttf") || (param->always_apply_tounicode));
+ auto ctu = font->getToUnicode();
+
ofstream map_fout(tmp_dir / (fn + ".encoding"));
add_tmp_file(fn+".encoding");
@@ -190,19 +191,23 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
{
map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i);
- Unicode u, *pu;
- int n = 0;
- if(ctu)
- n = ctu->mapToUnicode(i, &pu);
+ Unicode u, *pu=&u;
- u = check_unicode(pu, n, i, font);
+ if(use_tounicode)
+ {
+ int n = 0;
+ if(ctu)
+ n = ctu->mapToUnicode(i, &pu);
+ u = check_unicode(pu, n, i, font);
+ }
+ else
+ {
+ u = isLegalUnicode(i) ? i : map_to_private(i);
+ }
map_fout << format(" 0x%|1$X|") % u;
map_fout << format(" # 0x%|1$X|") % i;
- for(int j = 0; j < n; ++j)
- map_fout << format(" 0x%|1$X|") % pu[j];
-
map_fout << endl;
}
diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc
index 773aa24..06c918d 100644
--- a/src/HTMLRenderer/state.cc
+++ b/src/HTMLRenderer/state.cc
@@ -92,12 +92,12 @@ void HTMLRenderer::check_state_change(GfxState * state)
// font name & size
if(all_changed || font_changed)
{
- long long new_fn_id = install_font(state->getFont());
+ FontInfo new_font_info = install_font(state->getFont());
- if(!(new_fn_id == cur_fn_id))
+ if(!(new_font_info.id == cur_font_info.id))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
- cur_fn_id = new_fn_id;
+ cur_font_info = new_font_info;
}
double new_font_size = state->getFontSize();
@@ -369,7 +369,7 @@ void HTMLRenderer::prepare_line(GfxState * state)
}
html_fout << format("")
- % cur_fn_id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
+ % cur_font_info.id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
line_status = LineStatus::SPAN;
}
diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc
index b29deae..6a0359f 100644
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@@ -12,6 +12,8 @@
#include
+#include
+
#include "HTMLRenderer.h"
#include "namespace.h"
@@ -19,116 +21,143 @@ using std::all_of;
string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
{
- // mupdf consulted
-
- Object ref_obj, font_obj, font_obj2, fontdesc_obj;
Object obj, obj1, obj2;
- Dict * dict = nullptr;
+ Object font_obj, font_obj2, fontdesc_obj;
+ string suffix;
- string suffix, subtype;
-
- char buf[1024];
- int len;
-
- string fn;
- ofstream outf;
-
- auto * id = font->getID();
- ref_obj.initRef(id->num, id->gen);
- ref_obj.fetch(xref, &font_obj);
- ref_obj.free();
-
- if(!font_obj.isDict())
+ try
{
- cerr << "Font object is not a dictionary" << endl;
- goto err;
- }
+ // mupdf consulted
+ string subtype;
- dict = font_obj.getDict();
- if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
- {
- if(font_obj2.arrayGetLength() == 0)
+ auto * id = font->getID();
+
+ Object ref_obj;
+ ref_obj.initRef(id->num, id->gen);
+ ref_obj.fetch(xref, &font_obj);
+ ref_obj.free();
+
+ if(!font_obj.isDict())
{
- cerr << "Warning: empty DescendantFonts array" << endl;
+ cerr << "Font object is not a dictionary" << endl;
+ throw 0;
}
- else
+
+ Dict * dict = font_obj.getDict();
+ if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
{
- if(font_obj2.arrayGetLength() > 1)
- cerr << "TODO: multiple entries in DescendantFonts array" << endl;
-
- if(font_obj2.arrayGet(0, &obj2)->isDict())
+ if(font_obj2.arrayGetLength() == 0)
{
- dict = obj2.getDict();
- }
- }
- }
-
- if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
- {
- cerr << "Cannot find FontDescriptor " << endl;
- goto err;
- }
-
- dict = fontdesc_obj.getDict();
-
- if(dict->lookup("FontFile3", &obj)->isStream())
- {
- if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
- {
- subtype = obj1.getName();
- if(subtype == "Type1C")
- {
- suffix = ".cff";
- }
- else if (subtype == "CIDFontType0C")
- {
- suffix = ".cid";
+ cerr << "Warning: empty DescendantFonts array" << endl;
}
else
{
- cerr << "Unknown subtype: " << subtype << endl;
- goto err;
+ if(font_obj2.arrayGetLength() > 1)
+ cerr << "TODO: multiple entries in DescendantFonts array" << endl;
+
+ if(font_obj2.arrayGet(0, &obj2)->isDict())
+ {
+ dict = obj2.getDict();
+ }
}
}
+
+ if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
+ {
+ cerr << "Cannot find FontDescriptor " << endl;
+ throw 0;
+ }
+
+ dict = fontdesc_obj.getDict();
+
+ if(dict->lookup("FontFile3", &obj)->isStream())
+ {
+ if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
+ {
+ subtype = obj1.getName();
+ if(subtype == "Type1C")
+ {
+ suffix = ".cff";
+ }
+ else if (subtype == "CIDFontType0C")
+ {
+ suffix = ".cid";
+ }
+ else
+ {
+ cerr << "Unknown subtype: " << subtype << endl;
+ throw 0;
+ }
+ }
+ else
+ {
+ cerr << "Invalid subtype in font descriptor" << endl;
+ throw 0;
+ }
+ }
+ else if (dict->lookup("FontFile2", &obj)->isStream())
+ {
+ suffix = ".ttf";
+ }
+ else if (dict->lookup("FontFile", &obj)->isStream())
+ {
+ suffix = ".pfa";
+ }
else
{
- cerr << "Invalid subtype in font descriptor" << endl;
- goto err;
+ cerr << "Cannot find FontFile for dump" << endl;
+ throw 0;
+ }
+
+ if(suffix == "")
+ {
+ cerr << "Font type unrecognized" << endl;
+ throw 0;
+ }
+
+ obj.streamReset();
+
+ string fn = (format("f%|1$x|")%fn_id).str();
+ ofstream outf;
+ outf.open(tmp_dir / (fn + suffix), ofstream::binary);
+ add_tmp_file(fn+suffix);
+
+ char buf[1024];
+ int len;
+ while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
+ {
+ outf.write(buf, len);
+ }
+ outf.close();
+ obj.streamClose();
+
+        /*
+         * Pre re-encode the font so that it is consistent with the encoding used by the PDF; output_to_file is the write callback handed to the converter, and `stream` must point at the ofstream passed along with it
+         */
+        auto output_to_file = [](void * stream, const char * data, int len)->void
+        {
+            static_cast<ofstream*>(stream)->write(data, len);
+        };
+
+        if(suffix == ".cff")
+        {
+            // Neither the downcast nor load() is guaranteed to succeed; on failure report it, clear suffix (empty means "nothing dumped" to the caller) and bail out via the catch below
+            auto * font_8bit = dynamic_cast<Gfx8BitFont*>(font);
+            auto * f = font_8bit ? FoFiType1C::load((char*)((tmp_dir/(fn+suffix)).c_str())) : nullptr;
+            if(f == nullptr) { cerr << "Cannot re-encode CFF font" << endl; suffix.clear(); throw 0; }
+            suffix = ".pfa";
+            outf.open(tmp_dir / (fn + suffix), ofstream::binary);
+            add_tmp_file(fn+suffix);
+            f->convertToType1(nullptr, (const char **)font_8bit->getEncoding(), false, output_to_file, &outf);
+            outf.close();
+            delete f;
+        }
}
- else if (dict->lookup("FontFile2", &obj)->isStream())
- {
- suffix = ".ttf";
- }
- else if (dict->lookup("FontFile", &obj)->isStream())
+ catch(int)
{
- suffix = ".pfa";
- }
- else
- {
- cerr << "Cannot find FontFile for dump" << endl;
- goto err;
+        cerr << format("Something went wrong when trying to dump font %|1$x|") % fn_id << endl;
}
- if(suffix == "")
- {
- cerr << "Font type unrecognized" << endl;
- goto err;
- }
-
- obj.streamReset();
-
- fn = (format("f%|1$x|%2%")%fn_id%suffix).str();
- outf.open(tmp_dir / fn , ofstream::binary);
- add_tmp_file(fn);
- while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
- {
- outf.write(buf, len);
- }
- outf.close();
- obj.streamClose();
-
-err:
obj2.free();
obj1.free();
obj.free();
@@ -136,6 +165,7 @@ err:
fontdesc_obj.free();
font_obj2.free();
font_obj.free();
+
return suffix;
}
@@ -196,7 +226,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
++nSpaces;
}
- Unicode uu = check_unicode(u, uLen, code, font);
+ Unicode uu = (cur_font_info.use_tounicode
+ ? check_unicode(u, uLen, code, font)
+ : (isLegalUnicode(code) ? code : map_to_private(code))
+ );
outputUnicodes(html_fout, &uu, 1);
dx += dx1;
diff --git a/src/Param.h b/src/Param.h
index 20f0b7a..3570f5d 100644
--- a/src/Param.h
+++ b/src/Param.h
@@ -27,6 +27,7 @@ struct Param
int process_nontext;
int single_html;
+ int always_apply_tounicode;
int debug;
int clean_tmp;
diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc
index 99d8f37..37b6efb 100644
--- a/src/pdf2htmlEX.cc
+++ b/src/pdf2htmlEX.cc
@@ -76,6 +76,7 @@ po::variables_map parse_options (int argc, char **argv)
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")
("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file")
+ ("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignored for non-TTF fonts unless this switch is on")
("process-nontext", po::value<int>(&param.process_nontext)->default_value(1), "process nontext objects")
("debug", po::value<int>(&param.debug)->default_value(0), "output debug information")
("clean-tmp", po::value<int>(&param.clean_tmp)->default_value(1), "clean temporary files after processing")
diff --git a/src/util.h b/src/util.h
index d44f74a..d6b9b20 100644
--- a/src/util.h
+++ b/src/util.h
@@ -69,12 +69,7 @@ static inline bool isLegalUnicode(Unicode u)
return true;
}
-/*
- * We have to use a single Unicode value to reencode fonts
- * if we got multi-unicode values, it might be expanded ligature, try to restore it
- * if we cannot figure it out at the end, use a private mapping
- */
-static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
+static inline Unicode map_to_private(CharCode code)
{
Unicode private_mapping = (Unicode)(code + 0xE000);
if(private_mapping > 0xF8FF)
@@ -89,10 +84,18 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont
}
}
}
+ return private_mapping;
+}
-
+/*
+ * We have to use a single Unicode value to reencode fonts
+ * if we got multi-unicode values, it might be expanded ligature, try to restore it
+ * if we cannot figure it out at the end, use a private mapping
+ */
+static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
+{
if(len == 0)
- return private_mapping;
+ return map_to_private(code);
if(len == 1)
{
@@ -113,7 +116,7 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont
}
}
- return private_mapping;
+ return map_to_private(code);
}
static inline void outputUnicodes(std::ostream & out, const Unicode * u, int uLen)
@@ -165,7 +168,8 @@ static inline bool operator == (const GfxRGB & rgb1, const GfxRGB & rgb2)
class FontInfo
{
public:
- long long fn_id;
+ long long id;
+ bool use_tounicode;
};
// wrapper of the transform matrix double[6]