mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
disable toUnicode for non-ttf fonts by default
This commit is contained in:
parent
45c7c1e2da
commit
1642b4a37e
@ -129,7 +129,7 @@ class HTMLRenderer : public OutputDev
|
||||
////////////////////////////////////////////////////
|
||||
// manage styles
|
||||
////////////////////////////////////////////////////
|
||||
long long install_font(GfxFont * font);
|
||||
FontInfo install_font(GfxFont * font);
|
||||
void install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id);
|
||||
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id);
|
||||
void install_external_font (GfxFont * font, long long fn_id);
|
||||
@ -207,7 +207,7 @@ class HTMLRenderer : public OutputDev
|
||||
bool text_pos_changed;
|
||||
|
||||
// font & size
|
||||
long long cur_fn_id;
|
||||
FontInfo cur_font_info;
|
||||
double cur_font_size;
|
||||
long long cur_fs_id;
|
||||
bool font_changed;
|
||||
|
@ -155,7 +155,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||
|
||||
draw_scale = 1.0;
|
||||
|
||||
cur_fn_id = install_font(nullptr);
|
||||
cur_font_info = install_font(nullptr);
|
||||
cur_font_size = draw_font_size = 0;
|
||||
cur_fs_id = install_font_size(cur_font_size);
|
||||
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
using std::all_of;
|
||||
|
||||
long long HTMLRenderer::install_font(GfxFont * font)
|
||||
FontInfo HTMLRenderer::install_font(GfxFont * font)
|
||||
{
|
||||
assert(sizeof(long long) == 2*sizeof(int));
|
||||
|
||||
@ -27,16 +27,16 @@ long long HTMLRenderer::install_font(GfxFont * font)
|
||||
|
||||
auto iter = font_name_map.find(fn_id);
|
||||
if(iter != font_name_map.end())
|
||||
return iter->second.fn_id;
|
||||
return iter->second;
|
||||
|
||||
long long new_fn_id = font_name_map.size();
|
||||
|
||||
font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id})));
|
||||
auto cur_info_iter = font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id, true}))).first;
|
||||
|
||||
if(font == nullptr)
|
||||
{
|
||||
export_remote_default_font(new_fn_id);
|
||||
return new_fn_id;
|
||||
return cur_info_iter->second;
|
||||
}
|
||||
|
||||
if(param->debug)
|
||||
@ -48,12 +48,12 @@ long long HTMLRenderer::install_font(GfxFont * font)
|
||||
if(font->getType() == fontType3) {
|
||||
cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl;
|
||||
export_remote_default_font(new_fn_id);
|
||||
return new_fn_id;
|
||||
return cur_info_iter->second;
|
||||
}
|
||||
if(font->getWMode()) {
|
||||
cerr << "Writing mode is unsupported and will be rendered as Image" << endl;
|
||||
export_remote_default_font(new_fn_id);
|
||||
return new_fn_id;
|
||||
return cur_info_iter->second;
|
||||
}
|
||||
|
||||
auto * font_loc = font->locateFont(xref, gTrue);
|
||||
@ -66,6 +66,10 @@ long long HTMLRenderer::install_font(GfxFont * font)
|
||||
string suffix = dump_embedded_font(font, new_fn_id);
|
||||
if(suffix != "")
|
||||
{
|
||||
if(!((suffix == ".ttf") || (param->always_apply_tounicode)))
|
||||
{
|
||||
cur_info_iter->second.use_tounicode = false;
|
||||
}
|
||||
install_embedded_font(font, suffix, new_fn_id);
|
||||
}
|
||||
else
|
||||
@ -92,8 +96,7 @@ long long HTMLRenderer::install_font(GfxFont * font)
|
||||
export_remote_default_font(new_fn_id);
|
||||
}
|
||||
|
||||
return new_fn_id;
|
||||
|
||||
return cur_info_iter->second;
|
||||
}
|
||||
|
||||
// TODO
|
||||
@ -129,7 +132,6 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
|
||||
|
||||
script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + suffix)) << endl;
|
||||
|
||||
auto ctu = font->getToUnicode();
|
||||
int * code2GID = nullptr;
|
||||
int code2GID_len = 0;
|
||||
int maxcode = 0;
|
||||
@ -154,10 +156,6 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
|
||||
gfree(buf);
|
||||
}
|
||||
}
|
||||
else if (suffix == ".cff")
|
||||
{
|
||||
script_fout << "Reencode(\"unicode\")" << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pass
|
||||
@ -183,6 +181,9 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
|
||||
}
|
||||
}
|
||||
|
||||
bool use_tounicode = ((suffix == ".ttf") || (param->always_apply_tounicode));
|
||||
auto ctu = font->getToUnicode();
|
||||
|
||||
ofstream map_fout(tmp_dir / (fn + ".encoding"));
|
||||
add_tmp_file(fn+".encoding");
|
||||
|
||||
@ -190,19 +191,23 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
|
||||
{
|
||||
map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i);
|
||||
|
||||
Unicode u, *pu;
|
||||
int n = 0;
|
||||
if(ctu)
|
||||
n = ctu->mapToUnicode(i, &pu);
|
||||
Unicode u, *pu=&u;
|
||||
|
||||
u = check_unicode(pu, n, i, font);
|
||||
if(use_tounicode)
|
||||
{
|
||||
int n = 0;
|
||||
if(ctu)
|
||||
n = ctu->mapToUnicode(i, &pu);
|
||||
u = check_unicode(pu, n, i, font);
|
||||
}
|
||||
else
|
||||
{
|
||||
u = isLegalUnicode(i) ? i : map_to_private(i);
|
||||
}
|
||||
|
||||
map_fout << format(" 0x%|1$X|") % u;
|
||||
map_fout << format(" # 0x%|1$X|") % i;
|
||||
|
||||
for(int j = 0; j < n; ++j)
|
||||
map_fout << format(" 0x%|1$X|") % pu[j];
|
||||
|
||||
map_fout << endl;
|
||||
}
|
||||
|
||||
|
@ -92,12 +92,12 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
// font name & size
|
||||
if(all_changed || font_changed)
|
||||
{
|
||||
long long new_fn_id = install_font(state->getFont());
|
||||
FontInfo new_font_info = install_font(state->getFont());
|
||||
|
||||
if(!(new_fn_id == cur_fn_id))
|
||||
if(!(new_font_info.id == cur_font_info.id))
|
||||
{
|
||||
new_line_status = max(new_line_status, LineStatus::SPAN);
|
||||
cur_fn_id = new_fn_id;
|
||||
cur_font_info = new_font_info;
|
||||
}
|
||||
|
||||
double new_font_size = state->getFontSize();
|
||||
@ -369,7 +369,7 @@ void HTMLRenderer::prepare_line(GfxState * state)
|
||||
}
|
||||
|
||||
html_fout << format("<span class=\"f%|1$x| s%|2$x| c%|3$x| l%|4$x| w%|5$x| r%|6$x|\">")
|
||||
% cur_fn_id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
|
||||
% cur_font_info.id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
|
||||
|
||||
line_status = LineStatus::SPAN;
|
||||
}
|
||||
|
@ -12,6 +12,8 @@
|
||||
|
||||
#include <boost/format.hpp>
|
||||
|
||||
#include <fofi/FoFiType1C.h>
|
||||
|
||||
#include "HTMLRenderer.h"
|
||||
#include "namespace.h"
|
||||
|
||||
@ -19,116 +21,143 @@ using std::all_of;
|
||||
|
||||
string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
|
||||
{
|
||||
// mupdf consulted
|
||||
|
||||
Object ref_obj, font_obj, font_obj2, fontdesc_obj;
|
||||
Object obj, obj1, obj2;
|
||||
Dict * dict = nullptr;
|
||||
Object font_obj, font_obj2, fontdesc_obj;
|
||||
string suffix;
|
||||
|
||||
string suffix, subtype;
|
||||
|
||||
char buf[1024];
|
||||
int len;
|
||||
|
||||
string fn;
|
||||
ofstream outf;
|
||||
|
||||
auto * id = font->getID();
|
||||
ref_obj.initRef(id->num, id->gen);
|
||||
ref_obj.fetch(xref, &font_obj);
|
||||
ref_obj.free();
|
||||
|
||||
if(!font_obj.isDict())
|
||||
try
|
||||
{
|
||||
cerr << "Font object is not a dictionary" << endl;
|
||||
goto err;
|
||||
}
|
||||
// mupdf consulted
|
||||
string subtype;
|
||||
|
||||
dict = font_obj.getDict();
|
||||
if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
|
||||
{
|
||||
if(font_obj2.arrayGetLength() == 0)
|
||||
auto * id = font->getID();
|
||||
|
||||
Object ref_obj;
|
||||
ref_obj.initRef(id->num, id->gen);
|
||||
ref_obj.fetch(xref, &font_obj);
|
||||
ref_obj.free();
|
||||
|
||||
if(!font_obj.isDict())
|
||||
{
|
||||
cerr << "Warning: empty DescendantFonts array" << endl;
|
||||
cerr << "Font object is not a dictionary" << endl;
|
||||
throw 0;
|
||||
}
|
||||
else
|
||||
|
||||
Dict * dict = font_obj.getDict();
|
||||
if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
|
||||
{
|
||||
if(font_obj2.arrayGetLength() > 1)
|
||||
cerr << "TODO: multiple entries in DescendantFonts array" << endl;
|
||||
|
||||
if(font_obj2.arrayGet(0, &obj2)->isDict())
|
||||
if(font_obj2.arrayGetLength() == 0)
|
||||
{
|
||||
dict = obj2.getDict();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
|
||||
{
|
||||
cerr << "Cannot find FontDescriptor " << endl;
|
||||
goto err;
|
||||
}
|
||||
|
||||
dict = fontdesc_obj.getDict();
|
||||
|
||||
if(dict->lookup("FontFile3", &obj)->isStream())
|
||||
{
|
||||
if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
|
||||
{
|
||||
subtype = obj1.getName();
|
||||
if(subtype == "Type1C")
|
||||
{
|
||||
suffix = ".cff";
|
||||
}
|
||||
else if (subtype == "CIDFontType0C")
|
||||
{
|
||||
suffix = ".cid";
|
||||
cerr << "Warning: empty DescendantFonts array" << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Unknown subtype: " << subtype << endl;
|
||||
goto err;
|
||||
if(font_obj2.arrayGetLength() > 1)
|
||||
cerr << "TODO: multiple entries in DescendantFonts array" << endl;
|
||||
|
||||
if(font_obj2.arrayGet(0, &obj2)->isDict())
|
||||
{
|
||||
dict = obj2.getDict();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
|
||||
{
|
||||
cerr << "Cannot find FontDescriptor " << endl;
|
||||
throw 0;
|
||||
}
|
||||
|
||||
dict = fontdesc_obj.getDict();
|
||||
|
||||
if(dict->lookup("FontFile3", &obj)->isStream())
|
||||
{
|
||||
if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
|
||||
{
|
||||
subtype = obj1.getName();
|
||||
if(subtype == "Type1C")
|
||||
{
|
||||
suffix = ".cff";
|
||||
}
|
||||
else if (subtype == "CIDFontType0C")
|
||||
{
|
||||
suffix = ".cid";
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Unknown subtype: " << subtype << endl;
|
||||
throw 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Invalid subtype in font descriptor" << endl;
|
||||
throw 0;
|
||||
}
|
||||
}
|
||||
else if (dict->lookup("FontFile2", &obj)->isStream())
|
||||
{
|
||||
suffix = ".ttf";
|
||||
}
|
||||
else if (dict->lookup("FontFile", &obj)->isStream())
|
||||
{
|
||||
suffix = ".pfa";
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Invalid subtype in font descriptor" << endl;
|
||||
goto err;
|
||||
cerr << "Cannot find FontFile for dump" << endl;
|
||||
throw 0;
|
||||
}
|
||||
|
||||
if(suffix == "")
|
||||
{
|
||||
cerr << "Font type unrecognized" << endl;
|
||||
throw 0;
|
||||
}
|
||||
|
||||
obj.streamReset();
|
||||
|
||||
string fn = (format("f%|1$x|")%fn_id).str();
|
||||
ofstream outf;
|
||||
outf.open(tmp_dir / (fn + suffix), ofstream::binary);
|
||||
add_tmp_file(fn+suffix);
|
||||
|
||||
char buf[1024];
|
||||
int len;
|
||||
while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
|
||||
{
|
||||
outf.write(buf, len);
|
||||
}
|
||||
outf.close();
|
||||
obj.streamClose();
|
||||
|
||||
/*
|
||||
* Pre re-encode the font such that it's consistent with the encoding used by PDF
|
||||
*/
|
||||
auto output_to_file = [](void * stream, const char * data, int len)->void
|
||||
{
|
||||
reinterpret_cast<ostream*>(stream)->write(data, len);
|
||||
};
|
||||
|
||||
if(suffix == ".cff")
|
||||
{
|
||||
auto f = FoFiType1C::load((char*)((tmp_dir/(fn+suffix)).c_str()));
|
||||
|
||||
suffix = ".pfa";
|
||||
outf.open(tmp_dir / (fn + suffix), ofstream::binary);
|
||||
add_tmp_file(fn+suffix);
|
||||
|
||||
f->convertToType1(nullptr, (const char **)dynamic_cast<Gfx8BitFont*>(font)->getEncoding(), false, output_to_file, &outf);
|
||||
outf.close();
|
||||
|
||||
delete f;
|
||||
}
|
||||
}
|
||||
else if (dict->lookup("FontFile2", &obj)->isStream())
|
||||
catch(int)
|
||||
{
|
||||
suffix = ".ttf";
|
||||
}
|
||||
else if (dict->lookup("FontFile", &obj)->isStream())
|
||||
{
|
||||
suffix = ".pfa";
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Cannot find FontFile for dump" << endl;
|
||||
goto err;
|
||||
cerr << format("Someting wrong when trying to dump font %|1$x|") % fn_id << endl;
|
||||
}
|
||||
|
||||
if(suffix == "")
|
||||
{
|
||||
cerr << "Font type unrecognized" << endl;
|
||||
goto err;
|
||||
}
|
||||
|
||||
obj.streamReset();
|
||||
|
||||
fn = (format("f%|1$x|%2%")%fn_id%suffix).str();
|
||||
outf.open(tmp_dir / fn , ofstream::binary);
|
||||
add_tmp_file(fn);
|
||||
while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
|
||||
{
|
||||
outf.write(buf, len);
|
||||
}
|
||||
outf.close();
|
||||
obj.streamClose();
|
||||
|
||||
err:
|
||||
obj2.free();
|
||||
obj1.free();
|
||||
obj.free();
|
||||
@ -136,6 +165,7 @@ err:
|
||||
fontdesc_obj.free();
|
||||
font_obj2.free();
|
||||
font_obj.free();
|
||||
|
||||
return suffix;
|
||||
}
|
||||
|
||||
@ -196,7 +226,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
++nSpaces;
|
||||
}
|
||||
|
||||
Unicode uu = check_unicode(u, uLen, code, font);
|
||||
Unicode uu = (cur_font_info.use_tounicode
|
||||
? check_unicode(u, uLen, code, font)
|
||||
: (isLegalUnicode(code) ? code : map_to_private(code))
|
||||
);
|
||||
outputUnicodes(html_fout, &uu, 1);
|
||||
|
||||
dx += dx1;
|
||||
|
@ -27,6 +27,7 @@ struct Param
|
||||
|
||||
int process_nontext;
|
||||
int single_html;
|
||||
int always_apply_tounicode;
|
||||
|
||||
int debug;
|
||||
int clean_tmp;
|
||||
|
@ -76,6 +76,7 @@ po::variables_map parse_options (int argc, char **argv)
|
||||
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
|
||||
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")
|
||||
("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file")
|
||||
("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
|
||||
("process-nontext", po::value<int>(&param.process_nontext)->default_value(1), "process nontext objects")
|
||||
("debug", po::value<int>(&param.debug)->default_value(0), "output debug information")
|
||||
("clean-tmp", po::value<int>(&param.clean_tmp)->default_value(1), "clean temporary files after processing")
|
||||
|
24
src/util.h
24
src/util.h
@ -69,12 +69,7 @@ static inline bool isLegalUnicode(Unicode u)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* We have to use a single Unicode value to reencode fonts
|
||||
* if we got multi-unicode values, it might be expanded ligature, try to restore it
|
||||
* if we cannot figure it out at the end, use a private mapping
|
||||
*/
|
||||
static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
|
||||
static inline Unicode map_to_private(CharCode code)
|
||||
{
|
||||
Unicode private_mapping = (Unicode)(code + 0xE000);
|
||||
if(private_mapping > 0xF8FF)
|
||||
@ -89,10 +84,18 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont
|
||||
}
|
||||
}
|
||||
}
|
||||
return private_mapping;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* We have to use a single Unicode value to reencode fonts
|
||||
* if we got multi-unicode values, it might be expanded ligature, try to restore it
|
||||
* if we cannot figure it out at the end, use a private mapping
|
||||
*/
|
||||
static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
|
||||
{
|
||||
if(len == 0)
|
||||
return private_mapping;
|
||||
return map_to_private(code);
|
||||
|
||||
if(len == 1)
|
||||
{
|
||||
@ -113,7 +116,7 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont
|
||||
}
|
||||
}
|
||||
|
||||
return private_mapping;
|
||||
return map_to_private(code);
|
||||
}
|
||||
|
||||
static inline void outputUnicodes(std::ostream & out, const Unicode * u, int uLen)
|
||||
@ -165,7 +168,8 @@ static inline bool operator == (const GfxRGB & rgb1, const GfxRGB & rgb2)
|
||||
class FontInfo
|
||||
{
|
||||
public:
|
||||
long long fn_id;
|
||||
long long id;
|
||||
bool use_tounicode;
|
||||
};
|
||||
|
||||
// wrapper of the transform matrix double[6]
|
||||
|
Loading…
Reference in New Issue
Block a user