1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-05 09:38:40 +00:00

Merge branch 'devv'

This commit is contained in:
Lu Wang 2012-08-31 17:06:27 +08:00
commit f9298adbcf
12 changed files with 291 additions and 228 deletions

View File

@ -10,7 +10,7 @@ include_directories(${Boost_INCLUDE_DIRS})
link_directories ( ${Boost_LIBRARY_DIRS} )
include_directories(${CMAKE_SOURCE_DIR}/src)
set(PDF2HTMLEX_VERSION "0.1")
set(PDF2HTMLEX_VERSION "0.2")
set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION})
add_custom_target(dist
COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD

View File

@ -1,3 +1,5 @@
/* Base CSS */
/* by Wang Lu */
#pdf-main {
position:absolute;
top:0;
@ -42,3 +44,4 @@ span {
.i {
position:absolute;
}
/* Base CSS END */

View File

@ -4,3 +4,4 @@
<html>
<head>
<meta charset="utf-8">
<!-- head.html END -->

View File

@ -20,3 +20,4 @@ function show_pages()
</head>
<body onload="show_pages();">
<div id="pdf-main">
<!-- neck.html END -->

View File

@ -2,3 +2,4 @@
</div>
</body>
</html>
<!-- tail.html END -->

View File

@ -19,7 +19,7 @@ if (-td > d)
d = -td
endif
if (-hd > d)
d = -hd)
d = -hd
endif
SetOS2Value("WinAscent", a)
SetOS2Value("WinDescent", d)
@ -28,3 +28,4 @@ SetOS2Value("HHeadDescent", -d)
Print(ta-td)
Print(a)
Print(d)
# script end

View File

@ -125,15 +125,16 @@ class HTMLRenderer : public OutputDev
////////////////////////////////////////////////////
void add_tmp_file (const std::string & fn);
void clean_tmp_files ();
std::string dump_embedded_font (GfxFont * font, long long fn_id);
boost::filesystem::path dump_embedded_font (GfxFont * font, long long fn_id);
void embed_font(const boost::filesystem::path & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false);
////////////////////////////////////////////////////
// manage styles
////////////////////////////////////////////////////
FontInfo install_font(GfxFont * font);
void install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id, FontInfo & info);
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id);
void install_external_font (GfxFont * font, long long fn_id);
void install_embedded_font(GfxFont * font, FontInfo & info);
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info);
void install_external_font (GfxFont * font, FontInfo & info);
long long install_font_size(double font_size);
long long install_transform_matrix(const double * tm);
@ -152,7 +153,7 @@ class HTMLRenderer : public OutputDev
*/
void export_remote_font(const FontInfo & info, const std::string & suffix, const std::string & fontfileformat, GfxFont * font);
void export_remote_default_font(long long fn_id);
void export_local_font(long long fn_id, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
void export_font_size(long long fs_id, double font_size);
void export_transform_matrix(long long tm_id, const double * tm);

View File

@ -50,9 +50,9 @@ void HTMLRenderer::export_remote_default_font(long long fn_id)
allcss_fout << format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id << endl;
}
void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, const string & original_font_name, const string & cssfont)
void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
{
allcss_fout << format(".f%|1$x|{") % fn_id;
allcss_fout << format(".f%|1$x|{") % info.id;
allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
if(font->isBold() || ifind_first(original_font_name, "bold"))
@ -63,7 +63,7 @@ void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, const stri
else if(font->isItalic() || ifind_first(original_font_name, "italic"))
allcss_fout << "font-style:italic;";
allcss_fout << "line-height:" << (font->getAscent() - font->getDescent()) << ";";
allcss_fout << "line-height:" << (info.ascent - info.descent) << ";";
allcss_fout << "}" << endl;
}

View File

@ -13,16 +13,11 @@
#include <boost/format.hpp>
#include <CharCodeToUnicode.h>
#include <fofi/FoFiTrueType.h>
#include "Param.h"
#include "HTMLRenderer.h"
#include "namespace.h"
#include "config.h"
using std::all_of;
using std::max;
using std::min;
#include "util.h"
FontInfo HTMLRenderer::install_font(GfxFont * font)
{
@ -69,23 +64,13 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
switch(font_loc -> locType)
{
case gfxFontLocEmbedded:
{
string suffix = dump_embedded_font(font, new_fn_id);
if(suffix != "")
{
install_embedded_font(font, suffix, new_fn_id, cur_info_iter->second);
}
else
{
export_remote_default_font(new_fn_id);
}
}
install_embedded_font(font, cur_info_iter->second);
break;
case gfxFontLocExternal:
install_external_font(font, new_fn_id);
install_external_font(font, cur_info_iter->second);
break;
case gfxFontLocResident:
install_base_font(font, font_loc, new_fn_id);
install_base_font(font, font_loc, cur_info_iter->second);
break;
default:
cerr << "TODO: other font loc" << endl;
@ -102,204 +87,38 @@ FontInfo HTMLRenderer::install_font(GfxFont * font)
return cur_info_iter->second;
}
// TODO
// add a new function and move to text.cc
void HTMLRenderer::install_embedded_font(GfxFont * font, const string & suffix, long long fn_id, FontInfo & info)
void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info)
{
string fn = (format("f%|1$x|") % fn_id).str();
path script_path = tmp_dir / (fn + ".pe");
ofstream script_fout(script_path, ofstream::binary);
add_tmp_file(fn+".pe");
script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + suffix)) << endl;
int * code2GID = nullptr;
int code2GID_len = 0;
int maxcode = 0;
Gfx8BitFont * font_8bit = nullptr;
/*
* Step 1
* dump the font file directly from the font descriptor and put the glyphs into the correct slots
*
* for 8bit + nonTrueType
* re-encoding the font using a PostScript encoding list (glyph id <-> glpyh name)
*
* for 8bit + TrueType
* sort the glpyhs as the original order, and later will map GID (instead of char code) to Unicode
*
* for CID + nonTrueType
* Flatten the font
*
* for CID Truetype
* same as 8bitTrueType, except for that we have to check 65536 charcodes
*/
if(!font->isCIDFont())
auto path = dump_embedded_font(font, info.id);
if(path != "")
{
font_8bit = dynamic_cast<Gfx8BitFont*>(font);
maxcode = 0xff;
if(suffix == ".ttf")
{
script_fout << "Reencode(\"original\")" << endl;
int buflen;
char * buf = nullptr;
if((buf = font->readEmbFontFile(xref, &buflen)))
{
FoFiTrueType *fftt = nullptr;
if((fftt = FoFiTrueType::make(buf, buflen)))
{
code2GID = font_8bit->getCodeToGIDMap(fftt);
code2GID_len = 256;
delete fftt;
}
gfree(buf);
}
}
else
{
// move the slot such that it's consistent with the encoding seen in PDF
ofstream out(tmp_dir / (fn + "_.encoding"));
add_tmp_file(fn+"_.encoding");
out << format("/%1% [") % fn << endl;
for(int i = 0; i < 256; ++i)
{
auto cn = font_8bit->getCharName(i);
out << "/" << ((cn == nullptr) ? ".notdef" : cn) << endl;
}
out << "] def" << endl;
script_fout << format("LoadEncodingFile(%1%)") % (tmp_dir / (fn+"_.encoding")) << endl;
script_fout << format("Reencode(\"%1%\")") % fn << endl;
}
embed_font(path, font, info);
export_remote_font(info, param->font_suffix, param->font_format, font);
}
else
{
maxcode = 0xffff;
if(suffix == ".ttf")
{
script_fout << "Reencode(\"original\")" << endl;
GfxCIDFont * _font = dynamic_cast<GfxCIDFont*>(font);
// code2GID has been stored for embedded CID fonts
code2GID = _font->getCIDToGID();
code2GID_len = _font->getCIDToGIDLen();
}
else
{
script_fout << "CIDFlatten()" << endl;
}
export_remote_default_font(info.id);
}
/*
* Step 2
* map charcode (or GID for CID truetype)
* generate an Consortium encoding file and let fontforge handle it.
*
* - Always map to Unicode for 8bit TrueType fonts and CID fonts
*
* - For 8bit nonTruetype fonts:
* Try to calculate the correct Unicode value from the glyph names, unless param->always_apply_tounicode is set
*
*/
info.use_tounicode = ((suffix == ".ttf") || (font->isCIDFont()) || (param->always_apply_tounicode));
auto ctu = font->getToUnicode();
ofstream map_fout(tmp_dir / (fn + ".encoding"));
add_tmp_file(fn+".encoding");
int cnt = 0;
for(int i = 0; i <= maxcode; ++i)
{
if((suffix != ".ttf") && (font_8bit != nullptr) && (font_8bit->getCharName(i) == nullptr))
continue;
++ cnt;
map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i);
Unicode u, *pu=&u;
if(info.use_tounicode)
{
int n = 0;
if(ctu)
n = ctu->mapToUnicode(i, &pu);
u = check_unicode(pu, n, i, font);
}
else
{
u = unicode_from_font(i, font);
}
map_fout << format(" 0x%|1$X|") % u;
map_fout << format(" # 0x%|1$X|") % i;
map_fout << endl;
}
if(cnt > 0)
{
script_fout << format("LoadEncodingFile(%1%, \"%2%\")") % (tmp_dir / (fn+".encoding")) % fn << endl;
script_fout << format("Reencode(\"%1%\", 1)") % fn << endl;
}
if(ctu)
ctu->decRefCnt();
auto dest = ((param->single_html ? tmp_dir : dest_dir) / (fn+(param->font_suffix)));
if(param->single_html)
add_tmp_file(fn+(param->font_suffix));
/*
* [Win|Typo|HHead][Ascent|Descent]
* Firefox & Chrome interprets the values in different ways
* Trying to unify them
*/
script_fout << format("Generate(%1%)") % dest << endl;
script_fout << "Close()" << endl;
script_fout << format("Open(%1%, 1)") % dest << endl;
script_fout << ifstream(PDF2HTMLEX_DATA_PATH / UNIFY_SCRIPT_FILENAME).rdbuf();
script_fout << format("Generate(%1%)") % dest << endl;
script_fout.close();
if(system((boost::format("fontforge -script %1% 1>%2% 2>%3%") % script_path % (tmp_dir / (fn+".info")) % (tmp_dir / NULL_FILENAME)).str().c_str()) != 0)
cerr << "Warning: fontforge failed." << endl;
add_tmp_file(fn+".info");
add_tmp_file(NULL_FILENAME);
// read metric
int em, ascent, descent;
if(ifstream(tmp_dir / (fn+".info")) >> em >> ascent >> descent)
{
if(em != 0)
{
info.ascent = ((double)ascent) / em;
info.descent = -((double)descent) / em;
}
else
{
info.ascent = 0;
info.descent = 0;
}
}
if(param->debug)
{
cerr << "Ascent: " << info.ascent << " Descent: " << info.descent << endl;
}
export_remote_font(info, param->font_suffix, param->font_format, font);
}
void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id)
void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info)
{
GfxFontLoc * localfontloc = font->locateFont(xref, gFalse);
if(param->embed_base_font)
{
if(localfontloc != nullptr)
{
embed_font(path(localfontloc->path->getCString()), font, info);
export_remote_font(info, param->font_suffix, param->font_format, font);
return;
}
else
{
cerr << format("Cannot embed base font: f%|1$x|") % info.id << endl;
}
}
string psname(font_loc->path->getCString());
string basename = psname.substr(0, psname.find('-'));
string cssfont;
@ -312,10 +131,22 @@ void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, long
else
cssfont = iter->second;
export_local_font(fn_id, font, psname, cssfont);
// still try to get an idea of read ascent/descent
if(localfontloc != nullptr)
{
// fill in ascent/descent only, do not embed
embed_font(path(localfontloc->path->getCString()), font, info, true);
}
else
{
info.ascent = font->getAscent();
info.descent = font->getDescent();
}
export_local_font(info, font, psname, cssfont);
}
void HTMLRenderer::install_external_font( GfxFont * font, long long fn_id)
void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info)
{
string fontname(font->getName()->getCString());
@ -327,7 +158,18 @@ void HTMLRenderer::install_external_font( GfxFont * font, long long fn_id)
cerr << "Warning: workaround for font names in bad encodings." << endl;
}
export_local_font(fn_id, font, fontname, "");
//debug
GooString gfn(fontname.c_str());
GooString * path = globalParams->findFontFile(&gfn);
cerr << "Find: " << fontname << endl;
if(path)
{
cerr << "MATCHED: " << path->getCString() << endl;
delete path;
}
export_local_font(info, font, fontname, "");
}
long long HTMLRenderer::install_font_size(double font_size)

View File

@ -1,7 +1,7 @@
/*
* text.cc
*
* Handling text and relative stuffs
* Handling text & font, and related stuff
*
* by WangLu
* 2012.08.14
@ -11,19 +11,24 @@
#include <algorithm>
#include <boost/format.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>
#include <fofi/FoFiType1C.h>
#include <CharCodeToUnicode.h>
#include <fofi/FoFiTrueType.h>
#include "HTMLRenderer.h"
#include "namespace.h"
#include "config.h"
using std::all_of;
using boost::algorithm::to_lower;
string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
path HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
{
Object obj, obj1, obj2;
Object font_obj, font_obj2, fontdesc_obj;
string suffix;
path filepath;
try
{
@ -119,7 +124,8 @@ string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
string fn = (format("f%|1$x|")%fn_id).str();
ofstream outf;
outf.open(tmp_dir / (fn + suffix), ofstream::binary);
filepath = tmp_dir / (fn + suffix);
outf.open(filepath, ofstream::binary);
add_tmp_file(fn+suffix);
char buf[1024];
@ -144,7 +150,210 @@ string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
font_obj2.free();
font_obj.free();
return suffix;
return filepath;
}
/*
 * Process the font file at `filepath` with a generated fontforge script and
 * store the resulting ascent/descent in `info`.
 *
 * filepath        - font file to open in fontforge; its lower-cased extension
 *                   selects the re-encoding strategy
 * font            - the PDF font object the file belongs to
 * info            - receives use_tounicode, ascent and descent; info.id is used
 *                   to build all intermediate file names ("f<hex id>...")
 * get_metric_only - when true, no re-encoding is performed and no output font
 *                   is generated; only the metrics in `info` are filled in
 *
 * The script is written to <tmp_dir>/<fn>.pe and run through
 * `fontforge -script`; its standard output is captured in <tmp_dir>/<fn>.info,
 * from which three integers (em, ascent, descent) are read.
 * If they cannot be read, the values reported by `font` itself are used.
 */
void HTMLRenderer::embed_font(const path & filepath, GfxFont * font, FontInfo & info, bool get_metric_only)
{
    string suffix = filepath.extension().string();
    to_lower(suffix);

    string fn = (format("f%|1$x|") % info.id).str();

    path script_path = tmp_dir / (fn + ".pe");
    ofstream script_fout(script_path, ofstream::binary);
    add_tmp_file(fn+".pe");

    script_fout << format("Open(%1%, 1)") % filepath << endl;

    int * code2GID = nullptr;
    int code2GID_len = 0;
    int maxcode = 0;

    Gfx8BitFont * font_8bit = nullptr;

    if(get_metric_only)
    {
        info.use_tounicode = false;
    }
    else
    {
        /*
         * Step 1
         * dump the font file directly from the font descriptor and put the glyphs into the correct slots
         *
         * for 8bit + nonTrueType
         * re-encoding the font using a PostScript encoding list (glyph id <-> glyph name)
         *
         * for 8bit + TrueType
         * sort the glyphs as the original order, and later will map GID (instead of char code) to Unicode
         *
         * for CID + nonTrueType
         * Flatten the font
         *
         * for CID Truetype
         * same as 8bitTrueType, except for that we have to check 65536 charcodes
         */
        if(!font->isCIDFont())
        {
            font_8bit = dynamic_cast<Gfx8BitFont*>(font);
            maxcode = 0xff;
            if((suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"))
            {
                script_fout << "Reencode(\"original\")" << endl;
                FoFiTrueType *fftt = nullptr;
                if((fftt = FoFiTrueType::load((char*)filepath.c_str())) != nullptr)
                {
                    // NOTE(review): the returned map is never released in this function;
                    // if the callee hands ownership to the caller this leaks - TODO confirm
                    code2GID = font_8bit->getCodeToGIDMap(fftt);
                    code2GID_len = 256;
                    delete fftt;
                }
            }
            else
            {
                // move the slot such that it's consistent with the encoding seen in PDF
                ofstream out(tmp_dir / (fn + "_.encoding"));
                add_tmp_file(fn+"_.encoding");

                out << format("/%1% [") % fn << endl;
                for(int i = 0; i < 256; ++i)
                {
                    auto cn = font_8bit->getCharName(i);
                    out << "/" << ((cn == nullptr) ? ".notdef" : cn) << endl;
                }
                out << "] def" << endl;

                script_fout << format("LoadEncodingFile(%1%)") % (tmp_dir / (fn+"_.encoding")) << endl;
                script_fout << format("Reencode(\"%1%\")") % fn << endl;
            }
        }
        else
        {
            maxcode = 0xffff;

            if(suffix == ".ttf")
            {
                script_fout << "Reencode(\"original\")" << endl;

                GfxCIDFont * _font = dynamic_cast<GfxCIDFont*>(font);

                // code2GID has been stored for embedded CID fonts
                code2GID = _font->getCIDToGID();
                code2GID_len = _font->getCIDToGIDLen();
            }
            else
            {
                script_fout << "CIDFlatten()" << endl;
            }
        }

        /*
         * Step 2
         * map charcode (or GID for CID truetype)
         * generate a Consortium encoding file and let fontforge handle it.
         *
         * - Always map to Unicode for 8bit TrueType fonts and CID fonts
         *
         * - For 8bit nonTruetype fonts:
         * Try to calculate the correct Unicode value from the glyph names, unless param->always_apply_tounicode is set
         *
         */

        // NOTE(review): Step 1 treats .ttf/.ttc/.otf alike for 8-bit fonts, while the
        // checks below only test for ".ttf" - confirm whether that is intended
        info.use_tounicode = ((suffix == ".ttf") || (font->isCIDFont()) || (param->always_apply_tounicode));

        auto ctu = font->getToUnicode();

        ofstream map_fout(tmp_dir / (fn + ".encoding"));
        add_tmp_file(fn+".encoding");

        // number of encoding entries actually written
        int cnt = 0;
        for(int i = 0; i <= maxcode; ++i)
        {
            // for 8-bit non-".ttf" fonts, skip codes that have no glyph name
            if((suffix != ".ttf") && (font_8bit != nullptr) && (font_8bit->getCharName(i) == nullptr))
                continue;

            ++ cnt;
            // first column: GID when a code -> GID map covers this code, else the code itself
            map_fout << format("0x%|1$X|") % ((code2GID && (i < code2GID_len))? code2GID[i] : i);

            Unicode u, *pu=&u;

            if(info.use_tounicode)
            {
                int n = 0;
                if(ctu)
                    n = ctu->mapToUnicode(i, &pu);
                u = check_unicode(pu, n, i, font);
            }
            else
            {
                u = unicode_from_font(i, font);
            }

            map_fout << format(" 0x%|1$X|") % u;
            map_fout << format(" # 0x%|1$X|") % i;

            map_fout << endl;
        }

        // only re-encode when at least one mapping was emitted
        if(cnt > 0)
        {
            script_fout << format("LoadEncodingFile(%1%, \"%2%\")") % (tmp_dir / (fn+".encoding")) % fn << endl;
            script_fout << format("Reencode(\"%1%\", 1)") % fn << endl;
        }

        if(ctu)
            ctu->decRefCnt();
    }

    auto dest = ((param->single_html ? tmp_dir : dest_dir) / (fn+(param->font_suffix)));
    // The output font is only produced when actually embedding, so there is
    // nothing to register for cleanup in metric-only mode.
    if(param->single_html && !get_metric_only)
        add_tmp_file(fn+(param->font_suffix));

    /*
     * [Win|Typo|HHead][Ascent|Descent]
     * Firefox & Chrome interprets the values in different ways
     * Trying to unify them
     */
    add_tmp_file(fn + "_.ttf");
    script_fout << format("Generate(%1%)") % (tmp_dir / (fn + "_.ttf")) << endl;
    script_fout << "Close()" << endl;
    script_fout << format("Open(%1%, 1)") % (tmp_dir / (fn + "_.ttf")) << endl;
    script_fout << ifstream(PDF2HTMLEX_DATA_PATH / UNIFY_SCRIPT_FILENAME).rdbuf();

    // In metric-only mode the caller does not want an embedded font: skip the
    // final Generate so that no converted font file is left in the output
    // directory. The metrics have already been emitted by the script above.
    if(!get_metric_only)
        script_fout << format("Generate(%1%)") % dest << endl;

    script_fout.close();

    // stdout of fontforge goes to <fn>.info, stderr is discarded into NULL_FILENAME
    if(system((boost::format("fontforge -script %1% 1>%2% 2>%3%") % script_path % (tmp_dir / (fn+".info")) % (tmp_dir / NULL_FILENAME)).str().c_str()) != 0)
        cerr << "Warning: fontforge failed." << endl;

    add_tmp_file(fn+".info");
    add_tmp_file(NULL_FILENAME);

    // read metric
    int em, ascent, descent;
    if(ifstream(tmp_dir / (fn+".info")) >> em >> ascent >> descent)
    {
        if(em != 0)
        {
            // normalise by the em size; descent is stored with flipped sign
            info.ascent = ((double)ascent) / em;
            info.descent = -((double)descent) / em;
        }
        else
        {
            info.ascent = 0;
            info.descent = 0;
        }
    }
    else
    {
        // fall back to the metrics reported by the PDF font object
        cerr << "Warning: cannot read font info for " << fn << endl;
        info.ascent = font->getAscent();
        info.descent = font->getDescent();
    }

    if(param->debug)
    {
        cerr << "Ascent: " << info.ascent << " Descent: " << info.descent << endl;
    }
}
void HTMLRenderer::drawString(GfxState * state, GooString * s)

View File

@ -28,6 +28,8 @@ struct Param
int process_nontext;
int single_html;
int embed_base_font;
int embed_external_font;
// Advanced tweak
double h_eps, v_eps;

View File

@ -76,6 +76,8 @@ po::variables_map parse_options (int argc, char **argv)
("process-nontext", po::value<int>(&param.process_nontext)->default_value(1), "process nontext objects")
("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file")
("embed-base-font", po::value<int>(&param.embed_base_font)->default_value(1), "embed local matched font for base 14 fonts in the PDF file")
("embed-external-font", po::value<int>(&param.embed_external_font)->default_value(0), "embed local matched font for external fonts in the PDF file")
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")