diff --git a/3rdparty/poppler/git/CairoOutputDev.h b/3rdparty/poppler/git/CairoOutputDev.h index c9ae33d..727c687 100644 --- a/3rdparty/poppler/git/CairoOutputDev.h +++ b/3rdparty/poppler/git/CairoOutputDev.h @@ -272,7 +272,7 @@ protected: cairo_filter_t getFilterForSurface(cairo_surface_t *image, GBool interpolate); GBool getStreamData (Stream *str, char **buffer, int *length); - void setMimeData(Stream *str, Object *ref, cairo_surface_t *image); + virtual void setMimeData(Stream *str, Object *ref, cairo_surface_t *image); void fillToStrokePathClip(GfxState *state); void alignStrokeCoords(GfxSubpath *subpath, int i, double *x, double *y); diff --git a/AUTHORS b/AUTHORS index 95f0be3..ed7cc51 100644 --- a/AUTHORS +++ b/AUTHORS @@ -6,6 +6,7 @@ Chris Cinelli Daniel Bonniot de Ruisselet Deepak Denis Sablic +Duan Yao filodej hasufell Herbert Jones diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 8b01c34..b63b1d9 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -248,6 +248,16 @@ If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for .B --bg-format (Default: png) Specify the background image format. Run `pdf2htmlEX -v` to check all supported formats. +.TP +.B --svg-node-count-limit (Default: -1) +If the node count in an svg background image exceeds this limit, fall back to a bitmap background for this page; a negative value means no limit. +This option is only useful when '--bg-format svg' is specified. Note that the node count of an svg is only calculated approximately. + +.TP +.B --svg-embed-bitmap <0|1> (Default: 1) +Whether to embed bitmaps in the svg background image. 1: embed bitmaps in the svg background; 0: dump bitmaps to external files if possible. +JPEG images in a PDF are the ones most likely to be dumped. This option is only useful when '--bg-format svg' is specified. 
+ .SS PDF Protection .TP diff --git a/share/pdf2htmlEX.js.in b/share/pdf2htmlEX.js.in index 25352fe..f21d3df 100644 --- a/share/pdf2htmlEX.js.in +++ b/share/pdf2htmlEX.js.in @@ -372,9 +372,8 @@ Viewer.prototype = { var _idx = idx; var xhr = new XMLHttpRequest(); xhr.open('GET', url, true); - xhr.onreadystatechange = function(){ - if (xhr.readyState != 4) return; - if (xhr.status === 200) { + xhr.onload = function(){ + if (xhr.status === 200 || xhr.status === 0) { // find the page element in the data var div = document.createElement('div'); div.innerHTML = xhr.responseText; diff --git a/src/BackgroundRenderer/BackgroundRenderer.cc b/src/BackgroundRenderer/BackgroundRenderer.cc index e6cf59c..1ae298c 100644 --- a/src/BackgroundRenderer/BackgroundRenderer.cc +++ b/src/BackgroundRenderer/BackgroundRenderer.cc @@ -23,13 +23,13 @@ BackgroundRenderer * BackgroundRenderer::getBackgroundRenderer(const std::string #ifdef ENABLE_LIBPNG if(format == "png") { - return new SplashBackgroundRenderer(html_renderer, param); + return new SplashBackgroundRenderer(format, html_renderer, param); } #endif #ifdef ENABLE_LIBJPEG if(format == "jpg") { - return new SplashBackgroundRenderer(html_renderer, param); + return new SplashBackgroundRenderer(format, html_renderer, param); } #endif #if ENABLE_SVG @@ -42,4 +42,11 @@ BackgroundRenderer * BackgroundRenderer::getBackgroundRenderer(const std::string return nullptr; } +BackgroundRenderer * BackgroundRenderer::getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) +{ + if (param.bg_format == "svg" && param.svg_node_count_limit >= 0) + return new SplashBackgroundRenderer("", html_renderer, param); + return nullptr; +} + } // namespace pdf2htmlEX diff --git a/src/BackgroundRenderer/BackgroundRenderer.h b/src/BackgroundRenderer/BackgroundRenderer.h index f6d898e..29e03b6 100644 --- a/src/BackgroundRenderer/BackgroundRenderer.h +++ b/src/BackgroundRenderer/BackgroundRenderer.h @@ -22,12 +22,16 @@ class 
BackgroundRenderer public: // return nullptr upon failure static BackgroundRenderer * getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param); + // Return a fallback bg renderer according to param.bg_format. + // Currently only svg bg format might need a bitmap fallback. + static BackgroundRenderer * getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param); BackgroundRenderer() {} virtual ~BackgroundRenderer() {} virtual void init(PDFDoc * doc) = 0; - virtual void render_page(PDFDoc * doc, int pageno) = 0; + //return true on success, false otherwise (e.g. need a fallback) + virtual bool render_page(PDFDoc * doc, int pageno) = 0; virtual void embed_image(int pageno) = 0; }; diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc index 074e4c1..63a6a81 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -15,11 +15,33 @@ #if ENABLE_SVG #include "CairoBackgroundRenderer.h" +#include "SplashBackgroundRenderer.h" namespace pdf2htmlEX { using std::string; using std::ifstream; +using std::ofstream; +using std::vector; +using std::unordered_map; + +CairoBackgroundRenderer::CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) + : CairoOutputDev() + , html_renderer(html_renderer) + , param(param) + , surface(nullptr) +{ } + +CairoBackgroundRenderer::~CairoBackgroundRenderer() +{ + for(auto itr = bitmaps_ref_count.begin(); itr != bitmaps_ref_count.end(); ++itr) + { + if (itr->second == 0) + { + html_renderer->tmp_files.add(this->build_bitmap_path(itr->first)); + } + } +} void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, double dx, double dy, @@ -52,7 +74,7 @@ static GBool annot_cb(Annot *, void * pflag) { return (*((bool*)pflag)) ? 
gTrue : gFalse; }; -void CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) +bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) { double page_width; double page_height; @@ -67,19 +89,19 @@ void CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) page_height = doc->getPageMediaHeight(pageno); } - { - auto fn = html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno); - if(param.embed_image) - html_renderer->tmp_files.add((char*)fn); + string fn = (char*)html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno); + if(param.embed_image) + html_renderer->tmp_files.add(fn); - surface = cairo_svg_surface_create((char*)fn, page_width * param.h_dpi / DEFAULT_DPI, page_height * param.v_dpi / DEFAULT_DPI); - } + surface = cairo_svg_surface_create(fn.c_str(), page_width * param.h_dpi / DEFAULT_DPI, page_height * param.v_dpi / DEFAULT_DPI); cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2); cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi); cairo_t * cr = cairo_create(surface); setCairo(cr); + bitmaps_in_current_page.clear(); + bool process_annotation = param.process_annotation; doc->displayPage(this, pageno, param.h_dpi, param.v_dpi, 0, @@ -105,13 +127,50 @@ void CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) if(status) throw string("Error in cairo: ") + cairo_status_to_string(status); } + + //check node count in the svg file, fall back to bitmap_renderer if necessary. + if (param.svg_node_count_limit >= 0) + { + int n = 0; + char c; + ifstream svgfile(fn); + //count of '<' in the file should be an approximation of node count. + while(svgfile >> c) + { + if (c == '<') + ++n; + if (n > param.svg_node_count_limit) + { + html_renderer->tmp_files.add(fn); + return false; + } + } + } + + // the svg file is actually used, so add its bitmaps' ref count. 
+ for (auto itr = bitmaps_in_current_page.begin(); itr != bitmaps_in_current_page.end(); itr++) + ++bitmaps_ref_count[*itr]; + + return true; } void CairoBackgroundRenderer::embed_image(int pageno) { auto & f_page = *(html_renderer->f_curpage); - f_page << " or background-image can't have external resources; + // SVGs introduced by and can, but they are more expensive for browsers. + // So we use if the SVG contains no external bitmaps, and use otherwise. + // See also: + // https://developer.mozilla.org/en-US/docs/Web/SVG/SVG_as_an_Image + // http://stackoverflow.com/questions/4476526/do-i-use-img-object-or-embed-for-svg-files + + if (param.svg_embed_bitmap || bitmaps_in_current_page.empty()) + f_page << ""; } +string CairoBackgroundRenderer::build_bitmap_path(int id) +{ + // "o" for "PDF Object" + return string(html_renderer->str_fmt("%s/o%d.jpg", param.dest_dir.c_str(), id)); +} +// Override CairoOutputDev::setMimeData() and dump bitmaps in SVG to external files. +void CairoBackgroundRenderer::setMimeData(Stream *str, Object *ref, cairo_surface_t *image) +{ + if (param.svg_embed_bitmap) + { + CairoOutputDev::setMimeData(str, ref, image); + return; + } + + // TODO dump bitmaps in other formats. + if (str->getKind() != strDCT) + return; + + // TODO inline image? 
+ if (ref == nullptr || !ref->isRef()) + return; + + int imgId = ref->getRef().num; + auto uri = strdup((char*) html_renderer->str_fmt("o%d.jpg", imgId)); + auto st = cairo_surface_set_mime_data(image, CAIRO_MIME_TYPE_URI, + (unsigned char*) uri, strlen(uri), free, uri); + if (st) + { + free(uri); + return; + } + bitmaps_in_current_page.push_back(imgId); + + if(bitmaps_ref_count.find(imgId) != bitmaps_ref_count.end()) + return; + + bitmaps_ref_count[imgId] = 0; + + char *strBuffer; + int len; + if (getStreamData(str->getNextStream(), &strBuffer, &len)) + { + ofstream imgfile(build_bitmap_path(imgId), ofstream::binary); + imgfile.write(strBuffer, len); + free(strBuffer); + } +} + } // namespace pdf2htmlEX #endif // ENABLE_SVG diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h index 37372f1..8abe5f3 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.h +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.h @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include "pdf2htmlEX-config.h" @@ -24,17 +27,12 @@ namespace pdf2htmlEX { class CairoBackgroundRenderer : public BackgroundRenderer, CairoOutputDev { public: - CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) - : CairoOutputDev() - , html_renderer(html_renderer) - , param(param) - , surface(nullptr) - { } + CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param); - virtual ~CairoBackgroundRenderer() { } + virtual ~CairoBackgroundRenderer(); virtual void init(PDFDoc * doc); - virtual void render_page(PDFDoc * doc, int pageno); + virtual bool render_page(PDFDoc * doc, int pageno); virtual void embed_image(int pageno); // Does this device use beginType3Char/endType3Char? 
Otherwise, @@ -46,10 +44,23 @@ public: double originX, double originY, CharCode code, int nBytes, Unicode *u, int uLen); +protected: + virtual void setMimeData(Stream *str, Object *ref, cairo_surface_t *image); + protected: HTMLRenderer * html_renderer; const Param & param; cairo_surface_t * surface; + +private: + // convert bitmap stream id to bitmap file name. No pageno prefix, + // because a bitmap may be shared by multiple pages. + std::string build_bitmap_path(int id); + // map + // note: if a svg bg fallbacks to bitmap bg, its bitmaps are not taken into account. + std::unordered_map bitmaps_ref_count; + // id of bitmaps' stream used by current page + std::vector bitmaps_in_current_page; }; } diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc index 0e42b32..c596508 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -28,6 +28,29 @@ using std::unique_ptr; const SplashColor SplashBackgroundRenderer::white = {255,255,255}; +SplashBackgroundRenderer::SplashBackgroundRenderer(const string & imgFormat, HTMLRenderer * html_renderer, const Param & param) + : SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)(&white), gTrue, gTrue) + , html_renderer(html_renderer) + , param(param) + , format(imgFormat) +{ + bool supported = false; +#ifdef ENABLE_LIBPNG + if (format.empty()) + format = "png"; + supported = supported || format == "png"; +#endif +#ifdef ENABLE_LIBJPEG + if (format.empty()) + format = "jpg"; + supported = supported || format == "jpg"; +#endif + if (!supported) + { + throw string("Image format not supported: ") + format; + } +} + /* * SplashOutputDev::startPage would paint the whole page with the background color * And thus have modified region set to the whole page area @@ -76,7 +99,7 @@ static GBool annot_cb(Annot *, void * pflag) { return (*((bool*)pflag)) ? 
gTrue : gFalse; }; -void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno) +bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno) { bool process_annotation = param.process_annotation; doc->displayPage(this, pageno, param.h_dpi, param.v_dpi, @@ -84,6 +107,7 @@ void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno) (!(param.use_cropbox)), false, false, nullptr, nullptr, &annot_cb, &process_annotation); + return true; } void SplashBackgroundRenderer::embed_image(int pageno) @@ -96,7 +120,7 @@ void SplashBackgroundRenderer::embed_image(int pageno) if((xmin <= xmax) && (ymin <= ymax)) { { - auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, param.bg_format.c_str()); + auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, format.c_str()); if(param.embed_image) html_renderer->tmp_files.add((char*)fn); @@ -118,21 +142,21 @@ void SplashBackgroundRenderer::embed_image(int pageno) if(param.embed_image) { - auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, param.bg_format.c_str()); + auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, format.c_str()); ifstream fin((char*)path, ifstream::binary); if(!fin) throw string("Cannot read background image ") + (char*)path; - auto iter = FORMAT_MIME_TYPE_MAP.find(param.bg_format); + auto iter = FORMAT_MIME_TYPE_MAP.find(format); if(iter == FORMAT_MIME_TYPE_MAP.end()) - throw string("Image format not supported: ") + param.bg_format; + throw string("Image format not supported: ") + format; string mime_type = iter->second; f_page << "data:" << mime_type << ";base64," << Base64Stream(fin); } else { - f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, param.bg_format.c_str()); + f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, format.c_str()); } f_page << "\"/>"; } @@ -155,20 +179,20 @@ void 
SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1, if(false) { } #ifdef ENABLE_LIBPNG - else if(param.bg_format == "png") + else if(format == "png") { writer = unique_ptr(new PNGWriter); } #endif #ifdef ENABLE_LIBJPEG - else if(param.bg_format == "jpg") + else if(format == "jpg") { writer = unique_ptr(new JpegWriter); } #endif else { - throw string("Image format not supported: ") + param.bg_format; + throw string("Image format not supported: ") + format; } if(!writer->init(f, width, height, param.h_dpi, param.v_dpi)) diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h index e999a10..9ec8de9 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.h +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h @@ -26,17 +26,13 @@ class SplashBackgroundRenderer : public BackgroundRenderer, SplashOutputDev { public: static const SplashColor white; - - SplashBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) - : SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)(&white), gTrue, gTrue) - , html_renderer(html_renderer) - , param(param) - { } + //format: "png" or "jpg", or "" for a default format + SplashBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param); virtual ~SplashBackgroundRenderer() { } virtual void init(PDFDoc * doc); - virtual void render_page(PDFDoc * doc, int pageno); + virtual bool render_page(PDFDoc * doc, int pageno); virtual void embed_image(int pageno); // Does this device use beginType3Char/endType3Char? 
Otherwise, @@ -68,6 +64,7 @@ protected: void dump_image(const char * filename, int x1, int y1, int x2, int y2); HTMLRenderer * html_renderer; const Param & param; + std::string format; }; } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 7d67f70..73929ab 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -327,7 +327,8 @@ protected: friend class CairoBackgroundRenderer; // ugly! #endif BackgroundRenderer * bg_renderer; - + BackgroundRenderer * fallback_bg_renderer; + bool fallback_bg_required; struct { std::ofstream fs; diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 9c85a97..803bc2d 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -98,12 +98,17 @@ void HTMLRenderer::process(PDFDoc *doc) // Process pages bg_renderer = nullptr; + fallback_bg_renderer = nullptr; if(param.process_nontext) { bg_renderer = BackgroundRenderer::getBackgroundRenderer(param.bg_format, this, param); if(!bg_renderer) throw "Cannot initialize background renderer, unsupported format"; bg_renderer->init(doc); + + fallback_bg_renderer = BackgroundRenderer::getFallbackBackgroundRenderer(this, param); + if (fallback_bg_renderer) + fallback_bg_renderer->init(doc); } int page_count = (param.last_page - param.first_page + 1); @@ -130,7 +135,9 @@ void HTMLRenderer::process(PDFDoc *doc) if(param.process_nontext) { - bg_renderer->render_page(doc, i); + fallback_bg_required = !bg_renderer->render_page(doc, i); + if (fallback_bg_required && fallback_bg_renderer != nullptr) + fallback_bg_renderer->render_page(doc, i); } doc->displayPage(this, i, @@ -163,6 +170,11 @@ void HTMLRenderer::process(PDFDoc *doc) delete bg_renderer; bg_renderer = nullptr; } + if(fallback_bg_renderer) + { + delete fallback_bg_renderer; + fallback_bg_renderer = nullptr; + } cerr << endl; } @@ -219,7 +231,10 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) 
if(param.process_nontext) { - bg_renderer->embed_image(pageNum); + if (!fallback_bg_required) + bg_renderer->embed_image(pageNum); + else if (fallback_bg_renderer != nullptr) + fallback_bg_renderer->embed_image(pageNum); } reset_state(); diff --git a/src/Param.h b/src/Param.h index 9d42620..8c16802 100644 --- a/src/Param.h +++ b/src/Param.h @@ -63,6 +63,8 @@ struct Param // background image std::string bg_format; + int svg_node_count_limit; + int svg_embed_bitmap; // encryption std::string owner_password, user_password; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 23e8d73..f20dc2b 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -190,6 +190,9 @@ void parse_options (int argc, char **argv) // background image .add("bg-format", &param.bg_format, "png", "specify background image format") + .add("svg-node-count-limit", &param.svg_node_count_limit, -1, "if node count in a svg background image exceeds this limit," + " fall back this page to bitmap background; negative value means no limit.") + .add("svg-embed-bitmap", &param.svg_embed_bitmap, 1, "1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible.") // encryption .add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true)