1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-09-28 17:21:29 +00:00

disable toUnicode for non-ttf fonts by default

This commit is contained in:
Lu Wang 2012-08-27 23:09:01 +08:00
parent 45c7c1e2da
commit 1642b4a37e
8 changed files with 173 additions and 129 deletions

View File

@ -129,7 +129,7 @@ class HTMLRenderer : public OutputDev
////////////////////////////////////////////////////
// manage styles
////////////////////////////////////////////////////
long long install_font(GfxFont * font);
FontInfo install_font(GfxFont * font);
void install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id);
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id);
void install_external_font (GfxFont * font, long long fn_id);
@ -207,7 +207,7 @@ class HTMLRenderer : public OutputDev
bool text_pos_changed;
// font & size
long long cur_fn_id;
FontInfo cur_font_info;
double cur_font_size;
long long cur_fs_id;
bool font_changed;

View File

@ -155,7 +155,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
draw_scale = 1.0;
cur_fn_id = install_font(nullptr);
cur_font_info = install_font(nullptr);
cur_font_size = draw_font_size = 0;
cur_fs_id = install_font_size(cur_font_size);

View File

@ -19,7 +19,7 @@
using std::all_of;
long long HTMLRenderer::install_font(GfxFont * font)
FontInfo HTMLRenderer::install_font(GfxFont * font)
{
assert(sizeof(long long) == 2*sizeof(int));
@ -27,16 +27,16 @@ long long HTMLRenderer::install_font(GfxFont * font)
auto iter = font_name_map.find(fn_id);
if(iter != font_name_map.end())
return iter->second.fn_id;
return iter->second;
long long new_fn_id = font_name_map.size();
font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id})));
auto cur_info_iter = font_name_map.insert(make_pair(fn_id, FontInfo({new_fn_id, true}))).first;
if(font == nullptr)
{
export_remote_default_font(new_fn_id);
return new_fn_id;
return cur_info_iter->second;
}
if(param->debug)
@ -48,12 +48,12 @@ long long HTMLRenderer::install_font(GfxFont * font)
if(font->getType() == fontType3) {
cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl;
export_remote_default_font(new_fn_id);
return new_fn_id;
return cur_info_iter->second;
}
if(font->getWMode()) {
cerr << "Writing mode is unsupported and will be rendered as Image" << endl;
export_remote_default_font(new_fn_id);
return new_fn_id;
return cur_info_iter->second;
}
auto * font_loc = font->locateFont(xref, gTrue);
@ -66,6 +66,10 @@ long long HTMLRenderer::install_font(GfxFont * font)
string suffix = dump_embedded_font(font, new_fn_id);
if(suffix != "")
{
if(!((suffix == ".ttf") || (param->always_apply_tounicode)))
{
cur_info_iter->second.use_tounicode = false;
}
install_embedded_font(font, suffix, new_fn_id);
}
else
@ -92,8 +96,7 @@ long long HTMLRenderer::install_font(GfxFont * font)
export_remote_default_font(new_fn_id);
}
return new_fn_id;
return cur_info_iter->second;
}
// TODO
@ -129,7 +132,6 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + suffix)) << endl;
auto ctu = font->getToUnicode();
int * code2GID = nullptr;
int code2GID_len = 0;
int maxcode = 0;
@ -154,10 +156,6 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
gfree(buf);
}
}
else if (suffix == ".cff")
{
script_fout << "Reencode(\"unicode\")" << endl;
}
else
{
// pass
@ -183,6 +181,9 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
}
}
bool use_tounicode = ((suffix == ".ttf") || (param->always_apply_tounicode));
auto ctu = font->getToUnicode();
ofstream map_fout(tmp_dir / (fn + ".encoding"));
add_tmp_file(fn+".encoding");
@ -190,19 +191,23 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix,
{
map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i);
Unicode u, *pu;
int n = 0;
if(ctu)
n = ctu->mapToUnicode(i, &pu);
Unicode u, *pu=&u;
u = check_unicode(pu, n, i, font);
if(use_tounicode)
{
int n = 0;
if(ctu)
n = ctu->mapToUnicode(i, &pu);
u = check_unicode(pu, n, i, font);
}
else
{
u = isLegalUnicode(i) ? i : map_to_private(i);
}
map_fout << format(" 0x%|1$X|") % u;
map_fout << format(" # 0x%|1$X|") % i;
for(int j = 0; j < n; ++j)
map_fout << format(" 0x%|1$X|") % pu[j];
map_fout << endl;
}

View File

@ -92,12 +92,12 @@ void HTMLRenderer::check_state_change(GfxState * state)
// font name & size
if(all_changed || font_changed)
{
long long new_fn_id = install_font(state->getFont());
FontInfo new_font_info = install_font(state->getFont());
if(!(new_fn_id == cur_fn_id))
if(!(new_font_info.id == cur_font_info.id))
{
new_line_status = max(new_line_status, LineStatus::SPAN);
cur_fn_id = new_fn_id;
cur_font_info = new_font_info;
}
double new_font_size = state->getFontSize();
@ -369,7 +369,7 @@ void HTMLRenderer::prepare_line(GfxState * state)
}
html_fout << format("<span class=\"f%|1$x| s%|2$x| c%|3$x| l%|4$x| w%|5$x| r%|6$x|\">")
% cur_fn_id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
% cur_font_info.id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id;
line_status = LineStatus::SPAN;
}

View File

@ -12,6 +12,8 @@
#include <boost/format.hpp>
#include <fofi/FoFiType1C.h>
#include "HTMLRenderer.h"
#include "namespace.h"
@ -19,116 +21,143 @@ using std::all_of;
string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
{
// mupdf consulted
Object ref_obj, font_obj, font_obj2, fontdesc_obj;
Object obj, obj1, obj2;
Dict * dict = nullptr;
Object font_obj, font_obj2, fontdesc_obj;
string suffix;
string suffix, subtype;
char buf[1024];
int len;
string fn;
ofstream outf;
auto * id = font->getID();
ref_obj.initRef(id->num, id->gen);
ref_obj.fetch(xref, &font_obj);
ref_obj.free();
if(!font_obj.isDict())
try
{
cerr << "Font object is not a dictionary" << endl;
goto err;
}
// mupdf consulted
string subtype;
dict = font_obj.getDict();
if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
{
if(font_obj2.arrayGetLength() == 0)
auto * id = font->getID();
Object ref_obj;
ref_obj.initRef(id->num, id->gen);
ref_obj.fetch(xref, &font_obj);
ref_obj.free();
if(!font_obj.isDict())
{
cerr << "Warning: empty DescendantFonts array" << endl;
cerr << "Font object is not a dictionary" << endl;
throw 0;
}
else
Dict * dict = font_obj.getDict();
if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
{
if(font_obj2.arrayGetLength() > 1)
cerr << "TODO: multiple entries in DescendantFonts array" << endl;
if(font_obj2.arrayGet(0, &obj2)->isDict())
if(font_obj2.arrayGetLength() == 0)
{
dict = obj2.getDict();
}
}
}
if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
{
cerr << "Cannot find FontDescriptor " << endl;
goto err;
}
dict = fontdesc_obj.getDict();
if(dict->lookup("FontFile3", &obj)->isStream())
{
if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
{
subtype = obj1.getName();
if(subtype == "Type1C")
{
suffix = ".cff";
}
else if (subtype == "CIDFontType0C")
{
suffix = ".cid";
cerr << "Warning: empty DescendantFonts array" << endl;
}
else
{
cerr << "Unknown subtype: " << subtype << endl;
goto err;
if(font_obj2.arrayGetLength() > 1)
cerr << "TODO: multiple entries in DescendantFonts array" << endl;
if(font_obj2.arrayGet(0, &obj2)->isDict())
{
dict = obj2.getDict();
}
}
}
if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
{
cerr << "Cannot find FontDescriptor " << endl;
throw 0;
}
dict = fontdesc_obj.getDict();
if(dict->lookup("FontFile3", &obj)->isStream())
{
if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
{
subtype = obj1.getName();
if(subtype == "Type1C")
{
suffix = ".cff";
}
else if (subtype == "CIDFontType0C")
{
suffix = ".cid";
}
else
{
cerr << "Unknown subtype: " << subtype << endl;
throw 0;
}
}
else
{
cerr << "Invalid subtype in font descriptor" << endl;
throw 0;
}
}
else if (dict->lookup("FontFile2", &obj)->isStream())
{
suffix = ".ttf";
}
else if (dict->lookup("FontFile", &obj)->isStream())
{
suffix = ".pfa";
}
else
{
cerr << "Invalid subtype in font descriptor" << endl;
goto err;
cerr << "Cannot find FontFile for dump" << endl;
throw 0;
}
if(suffix == "")
{
cerr << "Font type unrecognized" << endl;
throw 0;
}
obj.streamReset();
string fn = (format("f%|1$x|")%fn_id).str();
ofstream outf;
outf.open(tmp_dir / (fn + suffix), ofstream::binary);
add_tmp_file(fn+suffix);
char buf[1024];
int len;
while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
{
outf.write(buf, len);
}
outf.close();
obj.streamClose();
/*
* Pre re-encode the font such that it's consistent with the encoding used by PDF
*/
auto output_to_file = [](void * stream, const char * data, int len)->void
{
reinterpret_cast<ostream*>(stream)->write(data, len);
};
if(suffix == ".cff")
{
auto f = FoFiType1C::load((char*)((tmp_dir/(fn+suffix)).c_str()));
suffix = ".pfa";
outf.open(tmp_dir / (fn + suffix), ofstream::binary);
add_tmp_file(fn+suffix);
f->convertToType1(nullptr, (const char **)dynamic_cast<Gfx8BitFont*>(font)->getEncoding(), false, output_to_file, &outf);
outf.close();
delete f;
}
}
else if (dict->lookup("FontFile2", &obj)->isStream())
{
suffix = ".ttf";
}
else if (dict->lookup("FontFile", &obj)->isStream())
catch(int)
{
suffix = ".pfa";
}
else
{
cerr << "Cannot find FontFile for dump" << endl;
goto err;
cerr << format("Someting wrong when trying to dump font %|1$x|") % fn_id << endl;
}
if(suffix == "")
{
cerr << "Font type unrecognized" << endl;
goto err;
}
obj.streamReset();
fn = (format("f%|1$x|%2%")%fn_id%suffix).str();
outf.open(tmp_dir / fn , ofstream::binary);
add_tmp_file(fn);
while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
{
outf.write(buf, len);
}
outf.close();
obj.streamClose();
err:
obj2.free();
obj1.free();
obj.free();
@ -136,6 +165,7 @@ err:
fontdesc_obj.free();
font_obj2.free();
font_obj.free();
return suffix;
}
@ -196,7 +226,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
++nSpaces;
}
Unicode uu = check_unicode(u, uLen, code, font);
Unicode uu = (cur_font_info.use_tounicode
? check_unicode(u, uLen, code, font)
: (isLegalUnicode(code) ? code : map_to_private(code))
);
outputUnicodes(html_fout, &uu, 1);
dx += dx1;

View File

@ -27,6 +27,7 @@ struct Param
int process_nontext;
int single_html;
int always_apply_tounicode;
int debug;
int clean_tmp;

View File

@ -76,6 +76,7 @@ po::variables_map parse_options (int argc, char **argv)
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")
("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file")
("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
("process-nontext", po::value<int>(&param.process_nontext)->default_value(1), "process nontext objects")
("debug", po::value<int>(&param.debug)->default_value(0), "output debug information")
("clean-tmp", po::value<int>(&param.clean_tmp)->default_value(1), "clean temporary files after processing")

View File

@ -69,12 +69,7 @@ static inline bool isLegalUnicode(Unicode u)
return true;
}
/*
* We have to use a single Unicode value to reencode fonts
* if we got multi-unicode values, it might be expanded ligature, try to restore it
* if we cannot figure it out at the end, use a private mapping
*/
static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
static inline Unicode map_to_private(CharCode code)
{
Unicode private_mapping = (Unicode)(code + 0xE000);
if(private_mapping > 0xF8FF)
@ -89,10 +84,18 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont
}
}
}
return private_mapping;
}
/*
* We have to use a single Unicode value per character code to re-encode fonts.
* If we get multiple Unicode values, they might be an expanded ligature; try to restore it.
* If we cannot figure it out in the end, use a private mapping.
*/
static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
{
if(len == 0)
return private_mapping;
return map_to_private(code);
if(len == 1)
{
@ -113,7 +116,7 @@ static inline Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont
}
}
return private_mapping;
return map_to_private(code);
}
static inline void outputUnicodes(std::ostream & out, const Unicode * u, int uLen)
@ -165,7 +168,8 @@ static inline bool operator == (const GfxRGB & rgb1, const GfxRGB & rgb2)
class FontInfo
{
public:
long long fn_id;
long long id;
bool use_tounicode;
};
// wrapper of the transform matrix double[6]