diff --git a/src/CairoBackgroundRenderer.cc b/src/CairoBackgroundRenderer.cc new file mode 100644 index 0000000..9005ba9 --- /dev/null +++ b/src/CairoBackgroundRenderer.cc @@ -0,0 +1,26 @@ +/* + * CairoBackgroundRenderer.cc + * + * Copyright (C) 2012 Lu Wang + */ + +#include "pdf2htmlEX-config.h" + +#if HAVE_CAIRO + +#include "CairoBackgroundRenderer.h" + +namespace pdf2htmlEX { + +void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen) +{ + // CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen); +} + +} // namespace pdf2htmlEX + +#endif // HAVE_CAIRO + diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index b48227c..427af7b 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -3,7 +3,7 @@ * * Handling general stuffs * - * by WangLu + * Copyright (C) 2012 Lu Wang * 2012.08.14 */ @@ -13,8 +13,6 @@ #include #include -#include - #include "HTMLRenderer.h" #include "BackgroundRenderer.h" #include "namespace.h" @@ -117,12 +115,7 @@ void HTMLRenderer::process(PDFDoc *doc) BackgroundRenderer * bg_renderer = nullptr; if(param->process_nontext) { - // Render non-text objects as image - // copied from poppler - SplashColor color; - color[0] = color[1] = color[2] = 255; - - bg_renderer = new BackgroundRenderer(splashModeRGB8, 4, gFalse, color); + bg_renderer = new BackgroundRenderer(); bg_renderer->startDoc(doc); } diff --git a/src/HTMLRenderer/text.cc.orig b/src/HTMLRenderer/text.cc.orig new file mode 100644 index 0000000..1a4b44f --- /dev/null +++ b/src/HTMLRenderer/text.cc.orig @@ -0,0 +1,567 @@ +/* + * text.cc + * + * Handling text & font, and relative stuffs + * + * by WangLu + * 2012.08.14 + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "ffw.h" +#include "HTMLRenderer.h" +#include "namespace.h" + +namespace pdf2htmlEX { + +using std::unordered_set; +using std::min; +using std::all_of; +using std::floor; +using std::swap; + +string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) +{ + Object obj, obj1, obj2; + Object font_obj, font_obj2, fontdesc_obj; + string suffix; + string filepath; + + try + { + // mupdf consulted + string subtype; + + auto * id = font->getID(); + + Object ref_obj; + ref_obj.initRef(id->num, id->gen); + ref_obj.fetch(xref, &font_obj); + ref_obj.free(); + + if(!font_obj.isDict()) + { + cerr << "Font object is not a dictionary" << endl; + throw 0; + } + + Dict * dict = font_obj.getDict(); + if(dict->lookup("DescendantFonts", &font_obj2)->isArray()) + { + if(font_obj2.arrayGetLength() == 0) + { + cerr << "Warning: empty DescendantFonts array" << endl; + } + else + { + if(font_obj2.arrayGetLength() > 1) + cerr << "TODO: multiple entries in DescendantFonts array" << endl; + + if(font_obj2.arrayGet(0, &obj2)->isDict()) + { + dict = obj2.getDict(); + } + } + } + + if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict()) + { + cerr << "Cannot find FontDescriptor " << endl; + throw 0; + } + + dict = fontdesc_obj.getDict(); + + if(dict->lookup("FontFile3", &obj)->isStream()) + { + if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName()) + { + subtype = obj1.getName(); + if(subtype == "Type1C") + { + suffix = ".cff"; + } + else if (subtype == "CIDFontType0C") + { + suffix = ".cid"; + } + else + { + cerr << "Unknown subtype: " << subtype << endl; + throw 0; + } + } + else + { + cerr << "Invalid subtype in font descriptor" << endl; + throw 0; + } + } + else if (dict->lookup("FontFile2", &obj)->isStream()) + { + suffix = ".ttf"; + } + else if (dict->lookup("FontFile", &obj)->isStream()) + { + suffix = ".pfa"; + } + else + { + cerr << "Cannot find FontFile for dump" << endl; + throw 0; + } + + if(suffix == "") + { + cerr << "Font type unrecognized" << endl; + throw 0; + } + + obj.streamReset(); + + filepath = (char*)str_fmt("%s/f%llx%s", param->tmp_dir.c_str(), fn_id, suffix.c_str()); + add_tmp_file(filepath); + + ofstream outf(filepath, ofstream::binary); + if(!outf) + throw string("Cannot open file ") + filepath + " for writing"; + + char buf[1024]; + int len; + while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0) + { + outf.write(buf, len); + } + outf.close(); + obj.streamClose(); + } + catch(int) + { + cerr << "Someting wrong when trying to dump font " << hex << fn_id << dec << endl; + } + + obj2.free(); + obj1.free(); + obj.free(); + + fontdesc_obj.free(); + font_obj2.free(); + font_obj.free(); + + return filepath; +} + +void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only) +{ + if(param->debug) + { + cerr << "Embed font: " << filepath << " " << info.id << endl; + } + + ffw_load_font(filepath.c_str()); + int * code2GID = nullptr; + int code2GID_len = 0; + int maxcode = 0; + + Gfx8BitFont * font_8bit = nullptr; + GfxCIDFont * font_cid = nullptr; + + string suffix = get_suffix(filepath); + for(auto iter = suffix.begin(); iter != suffix.end(); ++iter) + *iter = tolower(*iter); + + /* + * if parm->tounicode is 0, try the provided tounicode map first + */ + info.use_tounicode = (is_truetype_suffix(suffix) || (param->tounicode >= 0)); + info.has_space = false; + + const char * used_map = nullptr; + + info.em_size = ffw_get_em_size(); + + if(get_metric_only) + return; + + used_map = preprocessor.get_code_map(hash_ref(font->getID())); + + /* + * Step 1 + * dump the font file directly from the font descriptor and put the glyphs into the correct slots + * + * for 8bit + nonTrueType + * re-encoding the font using a PostScript encoding list (glyph id <-> glpyh name) + * + * for 8bit + TrueType + * sort the glpyhs as the original order, and later will map GID (instead of char code) to Unicode + * + * for CID + nonTrueType + * Flatten the font + * + * for CID Truetype + * same as 8bitTrueType, except for that we have to check 65536 charcodes + */ + if(!font->isCIDFont()) + { + font_8bit = dynamic_cast(font); + maxcode = 0xff; + if(is_truetype_suffix(suffix)) + { + ffw_reencode_glyph_order(); + FoFiTrueType *fftt = nullptr; + if((fftt = FoFiTrueType::load((char*)filepath.c_str())) != nullptr) + { + code2GID = font_8bit->getCodeToGIDMap(fftt); + code2GID_len = 256; + delete fftt; + } + } + else + { + // move the slot such that it's consistent with the encoding seen in PDF + unordered_set nameset; + bool name_conflict_warned = false; + + memset(cur_mapping2, 0, 0x100 * sizeof(char*)); + + for(int i = 0; i < 256; ++i) + { + if(!used_map[i]) continue; + + auto cn = font_8bit->getCharName(i); + if(cn == nullptr) + { + continue; + } + else + { + if(nameset.insert(string(cn)).second) + { + cur_mapping2[i] = cn; + } + else + { + if(!name_conflict_warned) + { + name_conflict_warned = true; + //TODO: may be resolved using advanced font properties? + cerr << "Warning: encoding confliction detected in font: " << hex << info.id << dec << endl; + } + } + } + } + + ffw_reencode_raw2(cur_mapping2, 256, 0); + } + } + else + { + font_cid = dynamic_cast(font); + maxcode = 0xffff; + + if(is_truetype_suffix(suffix)) + { + ffw_reencode_glyph_order(); + + GfxCIDFont * _font = dynamic_cast(font); + + // code2GID has been stored for embedded CID fonts + code2GID = _font->getCIDToGID(); + code2GID_len = _font->getCIDToGIDLen(); + } + else + { + ffw_cidflatten(); + } + } + + /* + * Step 2 + * map charcode (or GID for CID truetype) + * generate an Consortium encoding file and let fontforge handle it. + * + * - Always map to Unicode for 8bit TrueType fonts and CID fonts + * + * - For 8bit nonTruetype fonts: + * Try to calculate the correct Unicode value from the glyph names, unless param->always_apply_tounicode is set + * + * + * Also fill in the width_list, and set widths accordingly + */ + + + { + unordered_set codeset; + bool name_conflict_warned = false; + + auto ctu = font->getToUnicode(); + memset(cur_mapping, -1, 0x10000 * sizeof(*cur_mapping)); + memset(width_list, -1, 0x10000 * sizeof(*width_list)); + + if(code2GID) + maxcode = min(maxcode, code2GID_len - 1); + + bool is_truetype = is_truetype_suffix(suffix); + int max_key = maxcode; + /* + * Traverse all possible codes + */ + bool retried = false; // avoid infinite loop + for(int i = 0; i <= maxcode; ++i) + { + if(!used_map[i]) + continue; + + /* + * Skip glyphs without names (only for non-ttf fonts) + */ + if(!is_truetype && (font_8bit != nullptr) + && (font_8bit->getCharName(i) == nullptr)) + { + continue; + } + + int k = i; + if(code2GID) + { + if((k = code2GID[i]) == 0) continue; + } + + if(k > max_key) + max_key = k; + + Unicode u, *pu=&u; + if(info.use_tounicode) + { + int n = ctu ? (ctu->mapToUnicode(i, &pu)) : 0; + u = check_unicode(pu, n, i, font); + } + else + { + u = unicode_from_font(i, font); + } + + if(u == ' ') + info.has_space = true; + + if(codeset.insert(u).second) + { + cur_mapping[k] = u; + } + else + { + // collision detected + if(param->tounicode == 0) + { + // in auto mode, just drop the tounicode map + if(!retried) + { + cerr << "ToUnicode CMap is not valid and got dropped" << endl; + retried = true; + codeset.clear(); + info.use_tounicode = false; + memset(cur_mapping, -1, 0x10000 * sizeof(*cur_mapping)); + memset(width_list, -1, 0x10000 * sizeof(*width_list)); + i = -1; + continue; + } + } + if(!name_conflict_warned) + { + name_conflict_warned = true; + //TODO: may be resolved using advanced font properties? + cerr << "Warning: encoding confliction detected in font: " << hex << info.id << dec << endl; + } + } + + if(font_8bit) + { + width_list[k] = (int)floor(font_8bit->getWidth(i) * info.em_size + 0.5); + } + else + { + char buf[2]; + buf[0] = (i >> 8) & 0xff; + buf[1] = (i & 0xff); + width_list[k] = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); + } + } + + ffw_reencode_raw(cur_mapping, max_key + 1, 1); + ffw_set_widths(width_list, max_key + 1); + + if(ctu) + ctu->decRefCnt(); + } + + /* + * Step 3 + * + * Generate the font as desired + * + */ + string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); + add_tmp_file(cur_tmp_fn); + string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); + add_tmp_file(other_tmp_fn); + + ffw_save(cur_tmp_fn.c_str()); + ffw_close(); + + /* + * Step 4 + * Font Hinting + */ + bool hinted = false; + + // Call external hinting program if specified + if(param->external_hint_tool != "") + { + hinted = (system((char*)str_fmt("%s \"%s\" \"%s\"", param->external_hint_tool.c_str(), cur_tmp_fn.c_str(), other_tmp_fn.c_str())) == 0); + } + + // Call internal hinting procedure if specified + if((!hinted) && (param->auto_hint)) + { + ffw_load_font(cur_tmp_fn.c_str()); + ffw_auto_hint(); + ffw_save(other_tmp_fn.c_str()); + ffw_close(); + hinted = true; + } + + if(hinted) + { + swap(cur_tmp_fn, other_tmp_fn); + } + + /* + * Step 5 + * Generate the font + * Reload to retrieve/fix accurate ascent/descent + */ + string fn = (char*)str_fmt("%s/f%llx%s", + (param->single_html ? param->tmp_dir : param->dest_dir).c_str(), + info.id, param->font_suffix.c_str()); + + if(param->single_html) + add_tmp_file(fn); + + ffw_load_font(cur_tmp_fn.c_str()); + ffw_metric(&info.ascent, &info.descent); + ffw_save(fn.c_str()); + ffw_close(); +} + +void HTMLRenderer::drawString(GfxState * state, GooString * s) +{ + if(s->getLength() == 0) + return; + + auto font = state->getFont(); + if((font == nullptr) || (font->getWMode())) + { + return; + } + + //hidden + if((state->getRender() & 3) == 3) + { + return; + } + + // see if the line has to be closed due to state change + check_state_change(state); + prepare_line(state); + + // Now ready to output + // get the unicodes + char *p = s->getCString(); + int len = s->getLength(); + + double dx = 0; + double dy = 0; + double dxerr = 0; + double dx1,dy1; + double ox, oy; + + int nChars = 0; + int nSpaces = 0; + int uLen; + + CharCode code; + Unicode *u = nullptr; + + while (len > 0) { + auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); + + if(!(_equal(ox, 0) && _equal(oy, 0))) + { + cerr << "TODO: non-zero origins" << endl; + } + + bool is_space = false; + if (n == 1 && *p == ' ') + { + ++nSpaces; + is_space = true; + } + + if(is_space && (param->space_as_offset)) + { + // ignore horiz_scaling, as it's merged in CTM + line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_scale); + } + else + { + if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) + { + line_buf.append_unicodes(u, uLen); + } + else + { + if(cur_font_info->use_tounicode) + { + Unicode uu = check_unicode(u, uLen, code, font); + line_buf.append_unicodes(&uu, 1); + } + else + { + Unicode uu = unicode_from_font(code, font); + line_buf.append_unicodes(&uu, 1); + } + } + } + + dx += dx1; + dy += dy1; + + ++nChars; + p += n; + len -= n; + } + + double hs = state->getHorizScaling(); + + // horiz_scaling is merged into ctm now, + // so the coordinate system is ugly + dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs; + + dy *= cur_font_size; + + cur_tx += dx; + cur_ty += dy; + + draw_tx += dx + dxerr * cur_font_size * hs; + draw_ty += dy; +} + +} // namespace pdf2htmlEX diff --git a/src/SplashBackgroundRenderer.cc b/src/SplashBackgroundRenderer.cc index 6956408..4492160 100644 --- a/src/SplashBackgroundRenderer.cc +++ b/src/SplashBackgroundRenderer.cc @@ -1,19 +1,16 @@ /* - * BackgroundRenderer.cc + * SplashBackgroundRenderer.cc * - * Copyright (C) 2012 by Lu Wang coolwanglugmail.com + * Copyright (C) 2012 Lu Wang */ -#include +#include "SplashBackgroundRenderer.h" -#include "GfxFont.h" +namespace pdf2htmlEX { -#include "BackgroundRenderer.h" -#include "util.h" +const SplashColor SplashBackgroundRenderer::white = {255,255,255}; -using namespace pdf2htmlEX; - -void BackgroundRenderer::drawChar(GfxState *state, double x, double y, +void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode *u, int uLen) @@ -21,3 +18,4 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y, // SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen); } +} // namespace pdf2htmlEX diff --git a/src/include/BackgroundRenderer.h b/src/include/BackgroundRenderer.h index 39cbd07..b3cd623 100644 --- a/src/include/BackgroundRenderer.h +++ b/src/include/BackgroundRenderer.h @@ -2,37 +2,31 @@ * Background renderer * Render all those things not supported as Image * - * by WangLu - * 2012.08.06 + * Copyright (C) 2012 Lu Wang */ #ifndef BACKGROUND_RENDERER_H__ #define BACKGROUND_RENDERER_H__ -#include +#include "pdf2htmlEX-config.h" + +#if HAVE_CAIRO + +#include "CairoBackgroundRenderer.h" namespace pdf2htmlEX { - -// Based on BackgroundRenderer from poppler -class BackgroundRenderer : public SplashOutputDev { -public: - BackgroundRenderer(SplashColorMode colorModeA, int bitmapRowPadA, - GBool reverseVideoA, SplashColorPtr paperColorA, - GBool bitmapTopDownA = gTrue, - GBool allowAntialiasA = gTrue) - : SplashOutputDev(colorModeA, - bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA, - allowAntialiasA) - { } - virtual ~BackgroundRenderer() { } - - virtual void drawChar(GfxState *state, double x, double y, - double dx, double dy, - double originX, double originY, - CharCode code, int nBytes, Unicode *u, int uLen); -}; - + typedef CairoBackgroundRenderer BackgroundRenderer; } +#else + +#include "SplashBackgroundRenderer.h" + +namespace pdf2htmlEX { + typedef SplashBackgroundRenderer BackgroundRenderer; +} + +#endif // HAVE_CAIRO + #endif //BACKGROUND_RENDERER_H__ diff --git a/src/include/CairoBackgroundRenderer.h b/src/include/CairoBackgroundRenderer.h index dfa47c1..5a2d2ab 100644 --- a/src/include/CairoBackgroundRenderer.h +++ b/src/include/CairoBackgroundRenderer.h @@ -6,8 +6,8 @@ */ -#ifndef SPLASH_BACKGROUND_RENDERER_H__ -#define SPLASH_BACKGROUND_RENDERER_H__ +#ifndef CAIRO_BACKGROUND_RENDERER_H__ +#define CAIRO_BACKGROUND_RENDERER_H__ #include #include @@ -34,4 +34,4 @@ public: } -#endif //SPLASH_BACKGROUND_RENDERER_H__ +#endif //CAIRO_BACKGROUND_RENDERER_H__ diff --git a/src/include/SplashBackgroundRenderer.h b/src/include/SplashBackgroundRenderer.h index 119046f..aced0fb 100644 --- a/src/include/SplashBackgroundRenderer.h +++ b/src/include/SplashBackgroundRenderer.h @@ -7,8 +7,8 @@ */ -#ifndef BACKGROUND_RENDERER_H__ -#define BACKGROUND_RENDERER_H__ +#ifndef SPLASH_BACKGROUND_RENDERER_H__ +#define SPLASH_BACKGROUND_RENDERER_H__ #include #include @@ -19,13 +19,13 @@ namespace pdf2htmlEX { class SplashBackgroundRenderer : public SplashOutputDev { public: + static const SplashColor white; + SplashBackgroundRenderer() - { - SplashColor color; - color[0] = color[1] = color[2] = 255; - SplashOutputDev(splashModeRGB8, 4, gFlase, color, gTrue, gTrue)` - } - virtual ~BackgroundRenderer() { } + : SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)&white, gTrue, gTrue) + { } + + virtual ~SplashBackgroundRenderer() { } virtual void drawChar(GfxState *state, double x, double y, double dx, double dy, @@ -33,6 +33,6 @@ public: CharCode code, int nBytes, Unicode *u, int uLen); }; -} +} // namespace pdf2htmlEX -#endif //BACKGROUND_RENDERER_H__ +#endif // SPLASH_BACKGROUND_RENDERER_H__ diff --git a/src/include/pdf2htmlEX-config.h b/src/include/pdf2htmlEX-config.h new file mode 100644 index 0000000..ffb69e8 --- /dev/null +++ b/src/include/pdf2htmlEX-config.h @@ -0,0 +1,24 @@ +/* + * config.h + * Compile time constants + * + * by WangLu + */ + + +#ifndef PDF2HTMLEX_CONFIG_H__ +#define PDF2HTMLEX_CONFIG_H__ + +#include + +#define HAVE_CAIRO 0 + +namespace pdf2htmlEX { + +static const std::string PDF2HTMLEX_VERSION = "0.5"; +static const std::string PDF2HTMLEX_PREFIX = "/usr/local"; +static const std::string PDF2HTMLEX_DATA_PATH = "/usr/local""/share/pdf2htmlEX"; + +} // namespace pdf2htmlEX + +#endif //PDF2HTMLEX_CONFIG_H__ diff --git a/src/include/pdf2htmlEX-config.h.in b/src/include/pdf2htmlEX-config.h.in index 7a7ef62..695a8dd 100644 --- a/src/include/pdf2htmlEX-config.h.in +++ b/src/include/pdf2htmlEX-config.h.in @@ -11,6 +11,8 @@ #include +#define HAVE_CAIRO @HAVE_CAIRO@ + namespace pdf2htmlEX { static const std::string PDF2HTMLEX_VERSION = "@PDF2HTMLEX_VERSION@";