1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-03 08:38:39 +00:00

working on integrating splash/cairo

This commit is contained in:
Lu Wang 2012-10-02 20:56:40 +08:00
parent a9790d68b3
commit 8c5851863c
9 changed files with 658 additions and 54 deletions

View File

@ -0,0 +1,26 @@
/*
* CairoBackgroundRenderer.cc
*
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
*/
#include "pdf2htmlEX-config.h"
#if HAVE_CAIRO
#include "CairoBackgroundRenderer.h"
namespace pdf2htmlEX {
void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen)
{
// CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen);
}
} // namespace pdf2htmlEX
#endif // HAVE_CAIRO

View File

@ -3,7 +3,7 @@
*
* Handling general stuffs
*
* by WangLu
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
* 2012.08.14
*/
@ -13,8 +13,6 @@
#include <algorithm>
#include <vector>
#include <splash/SplashBitmap.h>
#include "HTMLRenderer.h"
#include "BackgroundRenderer.h"
#include "namespace.h"
@ -117,12 +115,7 @@ void HTMLRenderer::process(PDFDoc *doc)
BackgroundRenderer * bg_renderer = nullptr;
if(param->process_nontext)
{
// Render non-text objects as image
// copied from poppler
SplashColor color;
color[0] = color[1] = color[2] = 255;
bg_renderer = new BackgroundRenderer(splashModeRGB8, 4, gFalse, color);
bg_renderer = new BackgroundRenderer();
bg_renderer->startDoc(doc);
}

View File

@ -0,0 +1,567 @@
/*
* text.cc
*
* Handling text & font, and relative stuffs
*
* by WangLu
* 2012.08.14
*/
#include <iostream>
#include <algorithm>
#include <unordered_set>
#include <cctype>
#include <cmath>
#include <CharCodeToUnicode.h>
#include <fofi/FoFiTrueType.h>
#include "ffw.h"
#include "HTMLRenderer.h"
#include "namespace.h"
namespace pdf2htmlEX {
using std::unordered_set;
using std::min;
using std::all_of;
using std::floor;
using std::swap;
string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
{
Object obj, obj1, obj2;
Object font_obj, font_obj2, fontdesc_obj;
string suffix;
string filepath;
try
{
// mupdf consulted
string subtype;
auto * id = font->getID();
Object ref_obj;
ref_obj.initRef(id->num, id->gen);
ref_obj.fetch(xref, &font_obj);
ref_obj.free();
if(!font_obj.isDict())
{
cerr << "Font object is not a dictionary" << endl;
throw 0;
}
Dict * dict = font_obj.getDict();
if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
{
if(font_obj2.arrayGetLength() == 0)
{
cerr << "Warning: empty DescendantFonts array" << endl;
}
else
{
if(font_obj2.arrayGetLength() > 1)
cerr << "TODO: multiple entries in DescendantFonts array" << endl;
if(font_obj2.arrayGet(0, &obj2)->isDict())
{
dict = obj2.getDict();
}
}
}
if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
{
cerr << "Cannot find FontDescriptor " << endl;
throw 0;
}
dict = fontdesc_obj.getDict();
if(dict->lookup("FontFile3", &obj)->isStream())
{
if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
{
subtype = obj1.getName();
if(subtype == "Type1C")
{
suffix = ".cff";
}
else if (subtype == "CIDFontType0C")
{
suffix = ".cid";
}
else
{
cerr << "Unknown subtype: " << subtype << endl;
throw 0;
}
}
else
{
cerr << "Invalid subtype in font descriptor" << endl;
throw 0;
}
}
else if (dict->lookup("FontFile2", &obj)->isStream())
{
suffix = ".ttf";
}
else if (dict->lookup("FontFile", &obj)->isStream())
{
suffix = ".pfa";
}
else
{
cerr << "Cannot find FontFile for dump" << endl;
throw 0;
}
if(suffix == "")
{
cerr << "Font type unrecognized" << endl;
throw 0;
}
obj.streamReset();
filepath = (char*)str_fmt("%s/f%llx%s", param->tmp_dir.c_str(), fn_id, suffix.c_str());
add_tmp_file(filepath);
ofstream outf(filepath, ofstream::binary);
if(!outf)
throw string("Cannot open file ") + filepath + " for writing";
char buf[1024];
int len;
while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
{
outf.write(buf, len);
}
outf.close();
obj.streamClose();
}
catch(int)
{
cerr << "Someting wrong when trying to dump font " << hex << fn_id << dec << endl;
}
obj2.free();
obj1.free();
obj.free();
fontdesc_obj.free();
font_obj2.free();
font_obj.free();
return filepath;
}
void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only)
{
if(param->debug)
{
cerr << "Embed font: " << filepath << " " << info.id << endl;
}
ffw_load_font(filepath.c_str());
int * code2GID = nullptr;
int code2GID_len = 0;
int maxcode = 0;
Gfx8BitFont * font_8bit = nullptr;
GfxCIDFont * font_cid = nullptr;
string suffix = get_suffix(filepath);
for(auto iter = suffix.begin(); iter != suffix.end(); ++iter)
*iter = tolower(*iter);
/*
* if parm->tounicode is 0, try the provided tounicode map first
*/
info.use_tounicode = (is_truetype_suffix(suffix) || (param->tounicode >= 0));
info.has_space = false;
const char * used_map = nullptr;
info.em_size = ffw_get_em_size();
if(get_metric_only)
return;
used_map = preprocessor.get_code_map(hash_ref(font->getID()));
/*
* Step 1
* dump the font file directly from the font descriptor and put the glyphs into the correct slots
*
* for 8bit + nonTrueType
* re-encoding the font using a PostScript encoding list (glyph id <-> glpyh name)
*
* for 8bit + TrueType
* sort the glpyhs as the original order, and later will map GID (instead of char code) to Unicode
*
* for CID + nonTrueType
* Flatten the font
*
* for CID Truetype
* same as 8bitTrueType, except for that we have to check 65536 charcodes
*/
if(!font->isCIDFont())
{
font_8bit = dynamic_cast<Gfx8BitFont*>(font);
maxcode = 0xff;
if(is_truetype_suffix(suffix))
{
ffw_reencode_glyph_order();
FoFiTrueType *fftt = nullptr;
if((fftt = FoFiTrueType::load((char*)filepath.c_str())) != nullptr)
{
code2GID = font_8bit->getCodeToGIDMap(fftt);
code2GID_len = 256;
delete fftt;
}
}
else
{
// move the slot such that it's consistent with the encoding seen in PDF
unordered_set<string> nameset;
bool name_conflict_warned = false;
memset(cur_mapping2, 0, 0x100 * sizeof(char*));
for(int i = 0; i < 256; ++i)
{
if(!used_map[i]) continue;
auto cn = font_8bit->getCharName(i);
if(cn == nullptr)
{
continue;
}
else
{
if(nameset.insert(string(cn)).second)
{
cur_mapping2[i] = cn;
}
else
{
if(!name_conflict_warned)
{
name_conflict_warned = true;
//TODO: may be resolved using advanced font properties?
cerr << "Warning: encoding confliction detected in font: " << hex << info.id << dec << endl;
}
}
}
}
ffw_reencode_raw2(cur_mapping2, 256, 0);
}
}
else
{
font_cid = dynamic_cast<GfxCIDFont*>(font);
maxcode = 0xffff;
if(is_truetype_suffix(suffix))
{
ffw_reencode_glyph_order();
GfxCIDFont * _font = dynamic_cast<GfxCIDFont*>(font);
// code2GID has been stored for embedded CID fonts
code2GID = _font->getCIDToGID();
code2GID_len = _font->getCIDToGIDLen();
}
else
{
ffw_cidflatten();
}
}
/*
* Step 2
* map charcode (or GID for CID truetype)
* generate an Consortium encoding file and let fontforge handle it.
*
* - Always map to Unicode for 8bit TrueType fonts and CID fonts
*
* - For 8bit nonTruetype fonts:
* Try to calculate the correct Unicode value from the glyph names, unless param->always_apply_tounicode is set
*
*
* Also fill in the width_list, and set widths accordingly
*/
{
unordered_set<int> codeset;
bool name_conflict_warned = false;
auto ctu = font->getToUnicode();
memset(cur_mapping, -1, 0x10000 * sizeof(*cur_mapping));
memset(width_list, -1, 0x10000 * sizeof(*width_list));
if(code2GID)
maxcode = min(maxcode, code2GID_len - 1);
bool is_truetype = is_truetype_suffix(suffix);
int max_key = maxcode;
/*
* Traverse all possible codes
*/
bool retried = false; // avoid infinite loop
for(int i = 0; i <= maxcode; ++i)
{
if(!used_map[i])
continue;
/*
* Skip glyphs without names (only for non-ttf fonts)
*/
if(!is_truetype && (font_8bit != nullptr)
&& (font_8bit->getCharName(i) == nullptr))
{
continue;
}
int k = i;
if(code2GID)
{
if((k = code2GID[i]) == 0) continue;
}
if(k > max_key)
max_key = k;
Unicode u, *pu=&u;
if(info.use_tounicode)
{
int n = ctu ? (ctu->mapToUnicode(i, &pu)) : 0;
u = check_unicode(pu, n, i, font);
}
else
{
u = unicode_from_font(i, font);
}
if(u == ' ')
info.has_space = true;
if(codeset.insert(u).second)
{
cur_mapping[k] = u;
}
else
{
// collision detected
if(param->tounicode == 0)
{
// in auto mode, just drop the tounicode map
if(!retried)
{
cerr << "ToUnicode CMap is not valid and got dropped" << endl;
retried = true;
codeset.clear();
info.use_tounicode = false;
memset(cur_mapping, -1, 0x10000 * sizeof(*cur_mapping));
memset(width_list, -1, 0x10000 * sizeof(*width_list));
i = -1;
continue;
}
}
if(!name_conflict_warned)
{
name_conflict_warned = true;
//TODO: may be resolved using advanced font properties?
cerr << "Warning: encoding confliction detected in font: " << hex << info.id << dec << endl;
}
}
if(font_8bit)
{
width_list[k] = (int)floor(font_8bit->getWidth(i) * info.em_size + 0.5);
}
else
{
char buf[2];
buf[0] = (i >> 8) & 0xff;
buf[1] = (i & 0xff);
width_list[k] = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
}
}
ffw_reencode_raw(cur_mapping, max_key + 1, 1);
ffw_set_widths(width_list, max_key + 1);
if(ctu)
ctu->decRefCnt();
}
/*
* Step 3
*
* Generate the font as desired
*
*/
string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1%s", param->tmp_dir.c_str(), param->font_suffix.c_str());
add_tmp_file(cur_tmp_fn);
string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2%s", param->tmp_dir.c_str(), param->font_suffix.c_str());
add_tmp_file(other_tmp_fn);
ffw_save(cur_tmp_fn.c_str());
ffw_close();
/*
* Step 4
* Font Hinting
*/
bool hinted = false;
// Call external hinting program if specified
if(param->external_hint_tool != "")
{
hinted = (system((char*)str_fmt("%s \"%s\" \"%s\"", param->external_hint_tool.c_str(), cur_tmp_fn.c_str(), other_tmp_fn.c_str())) == 0);
}
// Call internal hinting procedure if specified
if((!hinted) && (param->auto_hint))
{
ffw_load_font(cur_tmp_fn.c_str());
ffw_auto_hint();
ffw_save(other_tmp_fn.c_str());
ffw_close();
hinted = true;
}
if(hinted)
{
swap(cur_tmp_fn, other_tmp_fn);
}
/*
* Step 5
* Generate the font
* Reload to retrieve/fix accurate ascent/descent
*/
string fn = (char*)str_fmt("%s/f%llx%s",
(param->single_html ? param->tmp_dir : param->dest_dir).c_str(),
info.id, param->font_suffix.c_str());
if(param->single_html)
add_tmp_file(fn);
ffw_load_font(cur_tmp_fn.c_str());
ffw_metric(&info.ascent, &info.descent);
ffw_save(fn.c_str());
ffw_close();
}
void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
if(s->getLength() == 0)
return;
auto font = state->getFont();
if((font == nullptr) || (font->getWMode()))
{
return;
}
//hidden
if((state->getRender() & 3) == 3)
{
return;
}
// see if the line has to be closed due to state change
check_state_change(state);
prepare_line(state);
// Now ready to output
// get the unicodes
char *p = s->getCString();
int len = s->getLength();
double dx = 0;
double dy = 0;
double dxerr = 0;
double dx1,dy1;
double ox, oy;
int nChars = 0;
int nSpaces = 0;
int uLen;
CharCode code;
Unicode *u = nullptr;
while (len > 0) {
auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy);
if(!(_equal(ox, 0) && _equal(oy, 0)))
{
cerr << "TODO: non-zero origins" << endl;
}
bool is_space = false;
if (n == 1 && *p == ' ')
{
++nSpaces;
is_space = true;
}
if(is_space && (param->space_as_offset))
{
// ignore horiz_scaling, as it's merged in CTM
line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_scale);
}
else
{
if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
{
line_buf.append_unicodes(u, uLen);
}
else
{
if(cur_font_info->use_tounicode)
{
Unicode uu = check_unicode(u, uLen, code, font);
line_buf.append_unicodes(&uu, 1);
}
else
{
Unicode uu = unicode_from_font(code, font);
line_buf.append_unicodes(&uu, 1);
}
}
}
dx += dx1;
dy += dy1;
++nChars;
p += n;
len -= n;
}
double hs = state->getHorizScaling();
// horiz_scaling is merged into ctm now,
// so the coordinate system is ugly
dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs;
dy *= cur_font_size;
cur_tx += dx;
cur_ty += dy;
draw_tx += dx + dxerr * cur_font_size * hs;
draw_ty += dy;
}
} // namespace pdf2htmlEX

View File

@ -1,19 +1,16 @@
/*
* BackgroundRenderer.cc
* SplashBackgroundRenderer.cc
*
* Copyright (C) 2012 by Lu Wang coolwanglu<at>gmail.com
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
*/
#include <algorithm>
#include "SplashBackgroundRenderer.h"
#include "GfxFont.h"
namespace pdf2htmlEX {
#include "BackgroundRenderer.h"
#include "util.h"
const SplashColor SplashBackgroundRenderer::white = {255,255,255};
using namespace pdf2htmlEX;
void BackgroundRenderer::drawChar(GfxState *state, double x, double y,
void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen)
@ -21,3 +18,4 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y,
// SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen);
}
} // namespace pdf2htmlEX

View File

@ -2,37 +2,31 @@
* Background renderer
* Render all those things not supported as Image
*
* by WangLu
* 2012.08.06
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
*/
#ifndef BACKGROUND_RENDERER_H__
#define BACKGROUND_RENDERER_H__
#include <SplashOutputDev.h>
#include "pdf2htmlEX-config.h"
#if HAVE_CAIRO
#include "CairoBackgroundRenderer.h"
namespace pdf2htmlEX {
// Based on BackgroundRenderer from poppler
class BackgroundRenderer : public SplashOutputDev {
public:
BackgroundRenderer(SplashColorMode colorModeA, int bitmapRowPadA,
GBool reverseVideoA, SplashColorPtr paperColorA,
GBool bitmapTopDownA = gTrue,
GBool allowAntialiasA = gTrue)
: SplashOutputDev(colorModeA,
bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA,
allowAntialiasA)
{ }
virtual ~BackgroundRenderer() { }
virtual void drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen);
};
typedef CairoBackgroundRenderer BackgroundRenderer;
}
#else
#include "SplashBackgroundRenderer.h"
namespace pdf2htmlEX {
typedef SplashBackgroundRenderer BackgroundRenderer;
}
#endif // HAVE_CAIRO
#endif //BACKGROUND_RENDERER_H__

View File

@ -6,8 +6,8 @@
*/
#ifndef SPLASH_BACKGROUND_RENDERER_H__
#define SPLASH_BACKGROUND_RENDERER_H__
#ifndef CAIRO_BACKGROUND_RENDERER_H__
#define CAIRO_BACKGROUND_RENDERER_H__
#include <splash/SplashBitmap.h>
#include <SplashOutputDev.h>
@ -34,4 +34,4 @@ public:
}
#endif //SPLASH_BACKGROUND_RENDERER_H__
#endif //CAIRO_BACKGROUND_RENDERER_H__

View File

@ -7,8 +7,8 @@
*/
#ifndef BACKGROUND_RENDERER_H__
#define BACKGROUND_RENDERER_H__
#ifndef SPLASH_BACKGROUND_RENDERER_H__
#define SPLASH_BACKGROUND_RENDERER_H__
#include <splash/SplashBitmap.h>
#include <SplashOutputDev.h>
@ -19,13 +19,13 @@ namespace pdf2htmlEX {
class SplashBackgroundRenderer : public SplashOutputDev
{
public:
static const SplashColor white;
SplashBackgroundRenderer()
{
SplashColor color;
color[0] = color[1] = color[2] = 255;
SplashOutputDev(splashModeRGB8, 4, gFlase, color, gTrue, gTrue)`
}
virtual ~BackgroundRenderer() { }
: SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)&white, gTrue, gTrue)
{ }
virtual ~SplashBackgroundRenderer() { }
virtual void drawChar(GfxState *state, double x, double y,
double dx, double dy,
@ -33,6 +33,6 @@ public:
CharCode code, int nBytes, Unicode *u, int uLen);
};
}
} // namespace pdf2htmlEX
#endif //BACKGROUND_RENDERER_H__
#endif // SPLASH_BACKGROUND_RENDERER_H__

View File

@ -0,0 +1,24 @@
/*
* config.h
* Compile time constants
*
* by WangLu
*/
#ifndef PDF2HTMLEX_CONFIG_H__
#define PDF2HTMLEX_CONFIG_H__
#include <string>
#define HAVE_CAIRO 0
namespace pdf2htmlEX {
static const std::string PDF2HTMLEX_VERSION = "0.5";
static const std::string PDF2HTMLEX_PREFIX = "/usr/local";
static const std::string PDF2HTMLEX_DATA_PATH = "/usr/local""/share/pdf2htmlEX";
} // namespace pdf2htmlEX
#endif //PDF2HTMLEX_CONFIG_H__

View File

@ -11,6 +11,8 @@
#include <string>
#define HAVE_CAIRO @HAVE_CAIRO@
namespace pdf2htmlEX {
static const std::string PDF2HTMLEX_VERSION = "@PDF2HTMLEX_VERSION@";