mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-09-19 05:40:07 +00:00
461 lines
13 KiB
C++
461 lines
13 KiB
C++
/*
|
|
* text.cc
|
|
*
|
|
* Handling text & font, and related stuff
|
|
*
|
|
* by WangLu
|
|
* 2012.08.14
|
|
*/
|
|
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <unordered_set>
|
|
|
|
#include <boost/format.hpp>
|
|
#include <boost/algorithm/string.hpp>
|
|
#include <boost/filesystem.hpp>
|
|
|
|
#include <CharCodeToUnicode.h>
|
|
#include <fofi/FoFiTrueType.h>
|
|
|
|
#include "HTMLRenderer.h"
|
|
#include "namespace.h"
|
|
#include "config.h"
|
|
|
|
using boost::algorithm::to_lower;
|
|
using std::unordered_set;
|
|
|
|
/*
 * Dump the font program embedded in the PDF for `font` into tmp_dir.
 *
 * The font dictionary is fetched through the font's object ID. When it has a
 * non-empty DescendantFonts array whose first entry is a dictionary, that
 * dictionary is used to look up the FontDescriptor instead.
 *
 * The stream found in the FontDescriptor is written verbatim to
 * "f<hex fn_id><suffix>", where the suffix is chosen as follows:
 *   FontFile3 with Subtype Type1C        -> .cff
 *   FontFile3 with Subtype CIDFontType0C -> .cid
 *   FontFile2                            -> .ttf
 *   FontFile                             -> .pfa
 *
 * font:  the font whose embedded program is dumped
 * fn_id: numeric id used (in hex) to build the output file name
 *
 * Returns the path of the dumped file, or an empty path when the font could
 * not be dumped (a diagnostic is written to cerr in that case).
 */
path HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
{
    Object obj, obj1, obj2;
    Object font_obj, font_obj2, fontdesc_obj;
    string suffix;
    path filepath;

    try
    {
        // mupdf consulted
        string subtype;

        auto * id = font->getID();

        Object ref_obj;
        ref_obj.initRef(id->num, id->gen);
        ref_obj.fetch(xref, &font_obj);
        ref_obj.free();

        if(!font_obj.isDict())
        {
            cerr << "Font object is not a dictionary" << endl;
            throw 0;
        }

        Dict * dict = font_obj.getDict();
        if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
        {
            if(font_obj2.arrayGetLength() == 0)
            {
                cerr << "Warning: empty DescendantFonts array" << endl;
            }
            else
            {
                if(font_obj2.arrayGetLength() > 1)
                    cerr << "TODO: multiple entries in DescendantFonts array" << endl;

                // only the first descendant is considered
                if(font_obj2.arrayGet(0, &obj2)->isDict())
                {
                    dict = obj2.getDict();
                }
            }
        }

        if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
        {
            cerr << "Cannot find FontDescriptor " << endl;
            throw 0;
        }

        dict = fontdesc_obj.getDict();

        if(dict->lookup("FontFile3", &obj)->isStream())
        {
            if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
            {
                subtype = obj1.getName();
                if(subtype == "Type1C")
                {
                    suffix = ".cff";
                }
                else if (subtype == "CIDFontType0C")
                {
                    suffix = ".cid";
                }
                else
                {
                    cerr << "Unknown subtype: " << subtype << endl;
                    throw 0;
                }
            }
            else
            {
                cerr << "Invalid subtype in font descriptor" << endl;
                throw 0;
            }
        }
        else
        {
            // FontFile3 is missing or not a stream:
            // release that lookup result before obj is reused
            obj.free();
            if(dict->lookup("FontFile2", &obj)->isStream())
            {
                suffix = ".ttf";
            }
            else
            {
                // same as above, for the failed FontFile2 lookup
                obj.free();
                if(dict->lookup("FontFile", &obj)->isStream())
                {
                    suffix = ".pfa";
                }
                else
                {
                    cerr << "Cannot find FontFile for dump" << endl;
                    throw 0;
                }
            }
        }

        if(suffix == "")
        {
            cerr << "Font type unrecognized" << endl;
            throw 0;
        }

        obj.streamReset();

        string fn = (format("f%|1$x|")%fn_id).str();
        ofstream outf;
        // filepath is only set once the dump succeeded,
        // so every failure path returns an empty path
        const path dump_path = tmp_dir / (fn + suffix);
        outf.open(dump_path, ofstream::binary);
        add_tmp_file(fn+suffix);

        if(!outf)
        {
            cerr << "Cannot open " << dump_path << " for writing" << endl;
            obj.streamClose();
            throw 0;
        }

        // copy the stream to the file chunk by chunk
        const int BUF_SIZE = 1024;
        char buf[BUF_SIZE];
        int len;
        while((len = obj.streamGetChars(BUF_SIZE, (Guchar*)buf)) > 0)
        {
            outf.write(buf, len);
        }
        outf.close();
        obj.streamClose();

        if(!outf)
        {
            cerr << "Failed to write " << dump_path << endl;
            throw 0;
        }

        filepath = dump_path;
    }
    catch(int)
    {
        cerr << format("Someting wrong when trying to dump font %|1$x|") % fn_id << endl;
    }

    // release everything, no matter whether the dump succeeded
    obj2.free();
    obj1.free();
    obj.free();

    fontdesc_obj.free();
    font_obj2.free();
    font_obj.free();

    return filepath;
}
|
|
|
|
void HTMLRenderer::embed_font(const path & filepath, GfxFont * font, FontInfo & info, bool get_metric_only)
|
|
{
|
|
string suffix = filepath.extension().string();
|
|
to_lower(suffix);
|
|
|
|
string fn = (format("f%|1$x|") % info.id).str();
|
|
|
|
path script_path = tmp_dir / (fn + ".pe");
|
|
ofstream script_fout(script_path, ofstream::binary);
|
|
add_tmp_file(fn+".pe");
|
|
|
|
script_fout << format("Open(%1%, 1)") % filepath << endl;
|
|
|
|
int * code2GID = nullptr;
|
|
int code2GID_len = 0;
|
|
int maxcode = 0;
|
|
|
|
Gfx8BitFont * font_8bit = nullptr;
|
|
|
|
info.use_tounicode = ((suffix == ".ttf") || (font->isCIDFont()) || (param->always_apply_tounicode));
|
|
|
|
if(!get_metric_only)
|
|
{
|
|
/*
|
|
* Step 1
|
|
* dump the font file directly from the font descriptor and put the glyphs into the correct slots
|
|
*
|
|
* for 8bit + nonTrueType
|
|
* re-encoding the font using a PostScript encoding list (glyph id <-> glpyh name)
|
|
*
|
|
* for 8bit + TrueType
|
|
* sort the glpyhs as the original order, and later will map GID (instead of char code) to Unicode
|
|
*
|
|
* for CID + nonTrueType
|
|
* Flatten the font
|
|
*
|
|
* for CID Truetype
|
|
* same as 8bitTrueType, except for that we have to check 65536 charcodes
|
|
*/
|
|
if(!font->isCIDFont())
|
|
{
|
|
font_8bit = dynamic_cast<Gfx8BitFont*>(font);
|
|
maxcode = 0xff;
|
|
if((suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"))
|
|
{
|
|
script_fout << "Reencode(\"original\")" << endl;
|
|
FoFiTrueType *fftt = nullptr;
|
|
if((fftt = FoFiTrueType::load((char*)filepath.c_str())) != nullptr)
|
|
{
|
|
code2GID = font_8bit->getCodeToGIDMap(fftt);
|
|
code2GID_len = 256;
|
|
delete fftt;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// move the slot such that it's consistent with the encoding seen in PDF
|
|
ofstream out(tmp_dir / (fn + "_.encoding"));
|
|
add_tmp_file(fn+"_.encoding");
|
|
|
|
unordered_set<string> nameset;
|
|
bool name_conflict_warned = false;
|
|
|
|
out << format("/%1% [") % fn << endl;
|
|
for(int i = 0; i < 256; ++i)
|
|
{
|
|
auto cn = font_8bit->getCharName(i);
|
|
if(cn == nullptr)
|
|
{
|
|
out << "/.notdef" << endl;
|
|
}
|
|
else
|
|
{
|
|
if(nameset.insert(string(cn)).second)
|
|
{
|
|
out << "/" << cn << endl;
|
|
}
|
|
else
|
|
{
|
|
if(!name_conflict_warned)
|
|
{
|
|
name_conflict_warned = false;
|
|
//TODO: may be resolved using advanced font properties?
|
|
cerr << "Warning: encoding confliction detected in font: " << fn << endl;
|
|
}
|
|
out << "/.notdef" << endl;
|
|
}
|
|
}
|
|
}
|
|
out << "] def" << endl;
|
|
|
|
script_fout << format("LoadEncodingFile(%1%)") % (tmp_dir / (fn+"_.encoding")) << endl;
|
|
script_fout << format("Reencode(\"%1%\")") % fn << endl;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
maxcode = 0xffff;
|
|
|
|
if(suffix == ".ttf")
|
|
{
|
|
script_fout << "Reencode(\"original\")" << endl;
|
|
|
|
GfxCIDFont * _font = dynamic_cast<GfxCIDFont*>(font);
|
|
|
|
// code2GID has been stored for embedded CID fonts
|
|
code2GID = _font->getCIDToGID();
|
|
code2GID_len = _font->getCIDToGIDLen();
|
|
}
|
|
else
|
|
{
|
|
script_fout << "CIDFlatten()" << endl;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Step 2
|
|
* map charcode (or GID for CID truetype)
|
|
* generate an Consortium encoding file and let fontforge handle it.
|
|
*
|
|
* - Always map to Unicode for 8bit TrueType fonts and CID fonts
|
|
*
|
|
* - For 8bit nonTruetype fonts:
|
|
* Try to calculate the correct Unicode value from the glyph names, unless param->always_apply_tounicode is set
|
|
*
|
|
*/
|
|
|
|
auto ctu = font->getToUnicode();
|
|
|
|
ofstream map_fout(tmp_dir / (fn + ".encoding"));
|
|
add_tmp_file(fn+".encoding");
|
|
|
|
int cnt = 0;
|
|
for(int i = 0; i <= maxcode; ++i)
|
|
{
|
|
if((suffix != ".ttf") && (font_8bit != nullptr) && (font_8bit->getCharName(i) == nullptr))
|
|
continue;
|
|
|
|
++ cnt;
|
|
map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i);
|
|
|
|
Unicode u, *pu=&u;
|
|
|
|
if(info.use_tounicode)
|
|
{
|
|
int n = 0;
|
|
if(ctu)
|
|
n = ctu->mapToUnicode(i, &pu);
|
|
u = check_unicode(pu, n, i, font);
|
|
}
|
|
else
|
|
{
|
|
u = unicode_from_font(i, font);
|
|
}
|
|
|
|
map_fout << format(" 0x%|1$X|") % u;
|
|
map_fout << format(" # 0x%|1$X|") % i;
|
|
|
|
map_fout << endl;
|
|
}
|
|
|
|
if(cnt > 0)
|
|
{
|
|
script_fout << format("LoadEncodingFile(%1%, \"%2%\")") % (tmp_dir / (fn+".encoding")) % fn << endl;
|
|
script_fout << format("Reencode(\"%1%\", 1)") % fn << endl;
|
|
}
|
|
|
|
if(ctu)
|
|
ctu->decRefCnt();
|
|
}
|
|
|
|
auto dest = ((param->single_html ? tmp_dir : dest_dir) / (fn+(param->font_suffix)));
|
|
if(param->single_html)
|
|
add_tmp_file(fn+(param->font_suffix));
|
|
|
|
/*
|
|
* [Win|Typo|HHead][Ascent|Descent]
|
|
* Firefox & Chrome interprets the values in different ways
|
|
* Trying to unify them
|
|
*/
|
|
// Generate an intermediate ttf font in order to retrieve the metrics
|
|
add_tmp_file(fn + "_.ttf");
|
|
script_fout << format("Generate(%1%)") % (tmp_dir / (fn + "_.ttf")) << endl;
|
|
script_fout << "Close()" << endl;
|
|
script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + "_.ttf")) << endl;
|
|
script_fout << ifstream(PDF2HTMLEX_DATA_PATH / UNIFY_SCRIPT_FILENAME).rdbuf();
|
|
script_fout << format("Generate(%1%)") % dest << endl;
|
|
script_fout.close();
|
|
|
|
if(system((boost::format("fontforge -script %1% 1>%2% 2>%3%") % script_path % (tmp_dir / (fn+".info")) % (tmp_dir / NULL_FILENAME)).str().c_str()) != 0)
|
|
cerr << "Warning: fontforge failed." << endl;
|
|
|
|
add_tmp_file(fn+".info");
|
|
add_tmp_file(NULL_FILENAME);
|
|
|
|
// read metric
|
|
int em, ascent, descent;
|
|
if(ifstream(tmp_dir / (fn+".info")) >> em >> ascent >> descent)
|
|
{
|
|
if(em != 0)
|
|
{
|
|
info.ascent = ((double)ascent) / em;
|
|
info.descent = -((double)descent) / em;
|
|
}
|
|
else
|
|
{
|
|
info.ascent = 0;
|
|
info.descent = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
cerr << "Warning: cannot read font info for " << fn << endl;
|
|
info.ascent = font->getAscent();
|
|
info.descent = font->getDescent();
|
|
}
|
|
|
|
if(param->debug)
|
|
{
|
|
cerr << "Ascent: " << info.ascent << " Descent: " << info.descent << endl;
|
|
}
|
|
}
|
|
|
|
/*
 * Output the characters of the string `s` into the current line buffer and
 * advance the text position accordingly.
 *
 * Nothing is done for an empty string, when the state has no font, when
 * font->getWMode() is non-zero, or when the two low bits of the render mode
 * are both set (hidden text).
 *
 * state: graphics state providing the font, render mode and text spacing values
 * s:     the string to output
 */
void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
    if(s->getLength() == 0)
        return;

    auto font = state->getFont();
    if(font == nullptr)
        return;
    if(font->getWMode())
        return;

    // hidden
    if((state->getRender() & 3) == 3)
        return;

    // see if the line has to be closed due to state change
    check_state_change(state);
    prepare_line(state);

    // Now ready to output
    // get the unicodes
    char * cursor = s->getCString();
    int remaining = s->getLength();

    // accumulated advance of the whole string
    double total_dx = 0;
    double total_dy = 0;
    // always zero in this function, so the term using it below adds nothing
    double dxerr = 0;
    // per-character advance and origin, filled in by getNextChar
    double char_dx, char_dy;
    double origin_x, origin_y;

    int char_count = 0;
    int space_count = 0;
    int unicode_len;

    CharCode code;
    Unicode * unicodes = nullptr;

    double font_size = state->getFontSize();
    double char_space = state->getCharSpace();
    double word_space = state->getWordSpace();
    double horiz_scaling = state->getHorizScaling();

    for(; remaining > 0; ++char_count)
    {
        auto consumed = font->getNextChar(cursor, remaining, &code, &unicodes, &unicode_len,
                                          &char_dx, &char_dy, &origin_x, &origin_y);

        if(!_equal(origin_x, 0) || !_equal(origin_y, 0))
            cerr << "TODO: non-zero origins" << endl;

        // a single-byte space counts for word spacing
        if((consumed == 1) && (*cursor == ' '))
            ++space_count;

        Unicode out_unicode;
        if(cur_font_info.use_tounicode)
            out_unicode = check_unicode(unicodes, unicode_len, code, font);
        else
            out_unicode = unicode_from_font(code, font);
        outputUnicodes(line_buf, &out_unicode, 1);

        total_dx += char_dx;
        total_dy += char_dy;

        cursor += consumed;
        remaining -= consumed;
    }

    // horiz_scaling is merged into ctm now,
    // so the coordinate system is ugly
    total_dx = (total_dx * font_size + char_count * char_space + space_count * word_space) * horiz_scaling;

    total_dy *= font_size;

    cur_tx += total_dx;
    cur_ty += total_dy;

    draw_tx += total_dx + dxerr * state->getFontSize() * state->getHorizScaling();
    draw_ty += total_dy;
}
|