1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

Merge pull request #364 from duanyao/svg_extern_bitmap

Svg extern bitmap & svg node count limit
This commit is contained in:
Lu Wang 2014-06-16 21:56:55 -07:00
commit 04630b3020
14 changed files with 223 additions and 42 deletions

View File

@ -272,7 +272,7 @@ protected:
cairo_filter_t getFilterForSurface(cairo_surface_t *image,
GBool interpolate);
GBool getStreamData (Stream *str, char **buffer, int *length);
void setMimeData(Stream *str, Object *ref, cairo_surface_t *image);
virtual void setMimeData(Stream *str, Object *ref, cairo_surface_t *image);
void fillToStrokePathClip(GfxState *state);
void alignStrokeCoords(GfxSubpath *subpath, int i, double *x, double *y);

View File

@ -6,6 +6,7 @@ Chris Cinelli <chris@allestelle.com>
Daniel Bonniot de Ruisselet <dbonniot@chemaxon.com>
Deepak <iapain@gmail.com>
Denis Sablic <denis.sablic@gmail.com>
Duan Yao <duanyao@ustc.edu>
filodej <philode@gmail.com>
hasufell <julian.ospald@googlemail.com>
Herbert Jones <herbert@mediafire.com>

View File

@ -248,6 +248,16 @@ If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for
.B --bg-format <format> (Default: png)
Specify the background image format. Run `pdf2htmlEX -v` to check all supported formats.
.TP
.B --svg-node-count-limit <limit> (Default: -1)
If the node count in an SVG background image exceeds this limit, fall back to a bitmap background for that page; a negative value means no limit.
This option is only useful when '--bg-format svg' is specified. Note that the node count of an SVG is only an approximation.
.TP
.B --svg-embed-bitmap <0|1> (Default: 1)
Whether to embed bitmaps in the SVG background image. 1: embed bitmaps in the SVG background; 0: dump bitmaps to external files if possible.
JPEG images in a PDF are the most likely to be dumped. This option is only useful when '--bg-format svg' is specified.
.SS PDF Protection
.TP

View File

@ -372,9 +372,8 @@ Viewer.prototype = {
var _idx = idx;
var xhr = new XMLHttpRequest();
xhr.open('GET', url, true);
xhr.onreadystatechange = function(){
if (xhr.readyState != 4) return;
if (xhr.status === 200) {
xhr.onload = function(){
if (xhr.status === 200 || xhr.status === 0) {
// find the page element in the data
var div = document.createElement('div');
div.innerHTML = xhr.responseText;

View File

@ -23,13 +23,13 @@ BackgroundRenderer * BackgroundRenderer::getBackgroundRenderer(const std::string
#ifdef ENABLE_LIBPNG
if(format == "png")
{
return new SplashBackgroundRenderer(html_renderer, param);
return new SplashBackgroundRenderer(format, html_renderer, param);
}
#endif
#ifdef ENABLE_LIBJPEG
if(format == "jpg")
{
return new SplashBackgroundRenderer(html_renderer, param);
return new SplashBackgroundRenderer(format, html_renderer, param);
}
#endif
#if ENABLE_SVG
@ -42,4 +42,11 @@ BackgroundRenderer * BackgroundRenderer::getBackgroundRenderer(const std::string
return nullptr;
}
// Build the bitmap renderer used when the primary (SVG) renderer rejects a page.
// Returns nullptr unless the background format is "svg" and a non-negative
// svg_node_count_limit is configured. The returned object is heap-allocated
// with `new`; the caller is responsible for deleting it.
BackgroundRenderer * BackgroundRenderer::getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
{
    const bool svg_may_fall_back = (param.bg_format == "svg") && !(param.svg_node_count_limit < 0);
    if (!svg_may_fall_back)
        return nullptr;

    // An empty format name asks SplashBackgroundRenderer to pick its default format.
    return new SplashBackgroundRenderer("", html_renderer, param);
}
} // namespace pdf2htmlEX

View File

@ -22,12 +22,16 @@ class BackgroundRenderer
public:
// return nullptr upon failure
static BackgroundRenderer * getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param);
// Return a fallback bg renderer according to param.bg_format.
// Currently only svg bg format might need a bitmap fallback.
static BackgroundRenderer * getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param);
BackgroundRenderer() {}
virtual ~BackgroundRenderer() {}
virtual void init(PDFDoc * doc) = 0;
virtual void render_page(PDFDoc * doc, int pageno) = 0;
//return true on success, false otherwise (e.g. need a fallback)
virtual bool render_page(PDFDoc * doc, int pageno) = 0;
virtual void embed_image(int pageno) = 0;
};

View File

@ -15,11 +15,33 @@
#if ENABLE_SVG
#include "CairoBackgroundRenderer.h"
#include "SplashBackgroundRenderer.h"
namespace pdf2htmlEX {
using std::string;
using std::ifstream;
using std::ofstream;
using std::vector;
using std::unordered_map;
// Construct an SVG background renderer.
// Stores the owning HTMLRenderer pointer and a reference to the conversion
// parameters; no cairo surface exists yet (surface starts as nullptr).
CairoBackgroundRenderer::CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
: CairoOutputDev()
, html_renderer(html_renderer)
, param(param)
, surface(nullptr)
{ }
// On destruction, every dumped bitmap whose reference count is still zero
// (i.e. no kept SVG page uses it) has its file path registered in the
// HTMLRenderer's tmp_files list.
CairoBackgroundRenderer::~CairoBackgroundRenderer()
{
    for (const auto & bitmap : bitmaps_ref_count)
    {
        const bool unreferenced = (bitmap.second == 0);
        if (!unreferenced)
            continue;

        html_renderer->tmp_files.add(this->build_bitmap_path(bitmap.first));
    }
}
void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
double dx, double dy,
@ -52,7 +74,7 @@ static GBool annot_cb(Annot *, void * pflag) {
return (*((bool*)pflag)) ? gTrue : gFalse;
};
void CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
{
double page_width;
double page_height;
@ -67,19 +89,19 @@ void CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
page_height = doc->getPageMediaHeight(pageno);
}
{
auto fn = html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno);
if(param.embed_image)
html_renderer->tmp_files.add((char*)fn);
string fn = (char*)html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno);
if(param.embed_image)
html_renderer->tmp_files.add(fn);
surface = cairo_svg_surface_create((char*)fn, page_width * param.h_dpi / DEFAULT_DPI, page_height * param.v_dpi / DEFAULT_DPI);
}
surface = cairo_svg_surface_create(fn.c_str(), page_width * param.h_dpi / DEFAULT_DPI, page_height * param.v_dpi / DEFAULT_DPI);
cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2);
cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi);
cairo_t * cr = cairo_create(surface);
setCairo(cr);
bitmaps_in_current_page.clear();
bool process_annotation = param.process_annotation;
doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
0,
@ -105,13 +127,50 @@ void CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
if(status)
throw string("Error in cairo: ") + cairo_status_to_string(status);
}
//check node count in the svg file, fall back to bitmap_renderer if necessary.
if (param.svg_node_count_limit >= 0)
{
int n = 0;
char c;
ifstream svgfile(fn);
//count of '<' in the file should be an approximation of node count.
while(svgfile >> c)
{
if (c == '<')
++n;
if (n > param.svg_node_count_limit)
{
html_renderer->tmp_files.add(fn);
return false;
}
}
}
// the svg file is actually used, so add its bitmaps' ref count.
for (auto itr = bitmaps_in_current_page.begin(); itr != bitmaps_in_current_page.end(); itr++)
++bitmaps_ref_count[*itr];
return true;
}
void CairoBackgroundRenderer::embed_image(int pageno)
{
auto & f_page = *(html_renderer->f_curpage);
f_page << "<img class=\"" << CSS::FULL_BACKGROUND_IMAGE_CN
// SVGs introduced by <img> or background-image can't have external resources;
// SVGs introduced by <embed> and <object> can, but they are more expensive for browsers.
// So we use <img> if the SVG contains no external bitmaps, and use <embed> otherwise.
// See also:
// https://developer.mozilla.org/en-US/docs/Web/SVG/SVG_as_an_Image
// http://stackoverflow.com/questions/4476526/do-i-use-img-object-or-embed-for-svg-files
if (param.svg_embed_bitmap || bitmaps_in_current_page.empty())
f_page << "<img";
else
f_page << "<embed";
f_page << " class=\"" << CSS::FULL_BACKGROUND_IMAGE_CN
<< "\" alt=\"\" src=\"";
if(param.embed_image)
@ -129,6 +188,54 @@ void CairoBackgroundRenderer::embed_image(int pageno)
f_page << "\"/>";
}
// Map a bitmap stream id to the path of its dumped file inside param.dest_dir.
string CairoBackgroundRenderer::build_bitmap_path(int id)
{
    // The "o" prefix stands for "PDF Object".
    string path = (char*)html_renderer->str_fmt("%s/o%d.jpg", param.dest_dir.c_str(), id);
    return path;
}
// Override CairoOutputDev::setMimeData() and dump bitmaps in SVG to external files.
// Override of CairoOutputDev::setMimeData() that can dump bitmaps used by the
// SVG background to external files instead of embedding them.
//
// - When param.svg_embed_bitmap is set, the call is forwarded unchanged to
//   CairoOutputDev::setMimeData().
// - Otherwise only DCT (JPEG) streams that are referenced PDF objects are
//   handled: the stream data is written once to build_bitmap_path(id) and a
//   CAIRO_MIME_TYPE_URI of "o<id>.jpg" is attached to `image`. Every other
//   image gets no mime data from this override.
//
// The file is dumped and verified BEFORE the URI is attached and before the id
// is registered. If the stream cannot be read or the file cannot be written,
// the surface is left without a URI, so the SVG never links to a missing or
// truncated file, and a later page using the same bitmap will retry the dump.
void CairoBackgroundRenderer::setMimeData(Stream *str, Object *ref, cairo_surface_t *image)
{
    if (param.svg_embed_bitmap)
    {
        CairoOutputDev::setMimeData(str, ref, image);
        return;
    }

    // TODO dump bitmaps in other formats.
    if (str->getKind() != strDCT)
        return;

    // TODO inline image?
    if (ref == nullptr || !ref->isRef())
        return;

    const int imgId = ref->getRef().num;

    // Dump the bitmap only the first time its id is seen.
    if (bitmaps_ref_count.find(imgId) == bitmaps_ref_count.end())
    {
        char *strBuffer = nullptr;
        int len = 0;
        if (!getStreamData(str->getNextStream(), &strBuffer, &len))
            return;

        const string path = build_bitmap_path(imgId);
        ofstream imgfile(path, ofstream::binary);
        imgfile.write(strBuffer, len);
        imgfile.flush();
        free(strBuffer);
        if (!imgfile)
        {
            // Open or write failed: hand the (possibly partial) file to the
            // tmp_files list, as the destructor does for unreferenced bitmaps.
            html_renderer->tmp_files.add(path);
            return;
        }

        // Registered with a zero count; render_page() increments it once the
        // page's SVG is actually kept.
        bitmaps_ref_count[imgId] = 0;
    }

    char *uri = strdup((char*) html_renderer->str_fmt("o%d.jpg", imgId));
    if (uri == nullptr)
        return; // out of memory: leave the surface without a URI

    // On success cairo takes ownership of `uri` and releases it through `free`.
    auto st = cairo_surface_set_mime_data(image, CAIRO_MIME_TYPE_URI,
        (unsigned char*) uri, strlen(uri), free, uri);
    if (st)
    {
        // cairo did not take ownership, so release the string here.
        free(uri);
        return;
    }

    bitmaps_in_current_page.push_back(imgId);
}
} // namespace pdf2htmlEX
#endif // ENABLE_SVG

View File

@ -12,6 +12,9 @@
#include <CairoOutputDev.h>
#include <cairo.h>
#include <cairo-svg.h>
#include <unordered_map>
#include <vector>
#include <string>
#include "pdf2htmlEX-config.h"
@ -24,17 +27,12 @@ namespace pdf2htmlEX {
class CairoBackgroundRenderer : public BackgroundRenderer, CairoOutputDev
{
public:
CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
: CairoOutputDev()
, html_renderer(html_renderer)
, param(param)
, surface(nullptr)
{ }
CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param);
virtual ~CairoBackgroundRenderer() { }
virtual ~CairoBackgroundRenderer();
virtual void init(PDFDoc * doc);
virtual void render_page(PDFDoc * doc, int pageno);
virtual bool render_page(PDFDoc * doc, int pageno);
virtual void embed_image(int pageno);
// Does this device use beginType3Char/endType3Char? Otherwise,
@ -46,10 +44,23 @@ public:
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen);
protected:
virtual void setMimeData(Stream *str, Object *ref, cairo_surface_t *image);
protected:
HTMLRenderer * html_renderer;
const Param & param;
cairo_surface_t * surface;
private:
// convert bitmap stream id to bitmap file name. No pageno prefix,
// because a bitmap may be shared by multiple pages.
std::string build_bitmap_path(int id);
// map<id_of_bitmap_stream, usage_count_in_all_svgs>
// note: if a svg bg fallbacks to bitmap bg, its bitmaps are not taken into account.
std::unordered_map<int, int> bitmaps_ref_count;
// id of bitmaps' stream used by current page
std::vector<int> bitmaps_in_current_page;
};
}

View File

@ -28,6 +28,29 @@ using std::unique_ptr;
const SplashColor SplashBackgroundRenderer::white = {255,255,255};
// Construct a bitmap background renderer for the given image format.
// imgFormat is "png", "jpg", or "" to select a default: the first format that
// was compiled in, checked in the order png, jpg.
// Throws a std::string message if the resulting format is not compiled in.
SplashBackgroundRenderer::SplashBackgroundRenderer(const string & imgFormat, HTMLRenderer * html_renderer, const Param & param)
    : SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)(&white), gTrue, gTrue)
    , html_renderer(html_renderer)
    , param(param)
    , format(imgFormat)
{
    bool recognized = false;

#ifdef ENABLE_LIBPNG
    // PNG is the preferred default when it is available.
    if (format.empty())
        format = "png";
    if (format == "png")
        recognized = true;
#endif

#ifdef ENABLE_LIBJPEG
    // Only reached with an empty format when PNG support is not compiled in.
    if (format.empty())
        format = "jpg";
    if (format == "jpg")
        recognized = true;
#endif

    if (recognized)
        return;

    throw string("Image format not supported: ") + format;
}
/*
* SplashOutputDev::startPage would paint the whole page with the background color
* And thus have modified region set to the whole page area
@ -76,7 +99,7 @@ static GBool annot_cb(Annot *, void * pflag) {
return (*((bool*)pflag)) ? gTrue : gFalse;
};
void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
{
bool process_annotation = param.process_annotation;
doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
@ -84,6 +107,7 @@ void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
(!(param.use_cropbox)),
false, false,
nullptr, nullptr, &annot_cb, &process_annotation);
return true;
}
void SplashBackgroundRenderer::embed_image(int pageno)
@ -96,7 +120,7 @@ void SplashBackgroundRenderer::embed_image(int pageno)
if((xmin <= xmax) && (ymin <= ymax))
{
{
auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, param.bg_format.c_str());
auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, format.c_str());
if(param.embed_image)
html_renderer->tmp_files.add((char*)fn);
@ -118,21 +142,21 @@ void SplashBackgroundRenderer::embed_image(int pageno)
if(param.embed_image)
{
auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, param.bg_format.c_str());
auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, format.c_str());
ifstream fin((char*)path, ifstream::binary);
if(!fin)
throw string("Cannot read background image ") + (char*)path;
auto iter = FORMAT_MIME_TYPE_MAP.find(param.bg_format);
auto iter = FORMAT_MIME_TYPE_MAP.find(format);
if(iter == FORMAT_MIME_TYPE_MAP.end())
throw string("Image format not supported: ") + param.bg_format;
throw string("Image format not supported: ") + format;
string mime_type = iter->second;
f_page << "data:" << mime_type << ";base64," << Base64Stream(fin);
}
else
{
f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, param.bg_format.c_str());
f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, format.c_str());
}
f_page << "\"/>";
}
@ -155,20 +179,20 @@ void SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1,
if(false) { }
#ifdef ENABLE_LIBPNG
else if(param.bg_format == "png")
else if(format == "png")
{
writer = unique_ptr<ImgWriter>(new PNGWriter);
}
#endif
#ifdef ENABLE_LIBJPEG
else if(param.bg_format == "jpg")
else if(format == "jpg")
{
writer = unique_ptr<ImgWriter>(new JpegWriter);
}
#endif
else
{
throw string("Image format not supported: ") + param.bg_format;
throw string("Image format not supported: ") + format;
}
if(!writer->init(f, width, height, param.h_dpi, param.v_dpi))

View File

@ -26,17 +26,13 @@ class SplashBackgroundRenderer : public BackgroundRenderer, SplashOutputDev
{
public:
static const SplashColor white;
SplashBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
: SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)(&white), gTrue, gTrue)
, html_renderer(html_renderer)
, param(param)
{ }
//format: "png" or "jpg", or "" for a default format
SplashBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param);
virtual ~SplashBackgroundRenderer() { }
virtual void init(PDFDoc * doc);
virtual void render_page(PDFDoc * doc, int pageno);
virtual bool render_page(PDFDoc * doc, int pageno);
virtual void embed_image(int pageno);
// Does this device use beginType3Char/endType3Char? Otherwise,
@ -68,6 +64,7 @@ protected:
void dump_image(const char * filename, int x1, int y1, int x2, int y2);
HTMLRenderer * html_renderer;
const Param & param;
std::string format;
};
} // namespace pdf2htmlEX

View File

@ -327,7 +327,8 @@ protected:
friend class CairoBackgroundRenderer; // ugly!
#endif
BackgroundRenderer * bg_renderer;
BackgroundRenderer * fallback_bg_renderer;
bool fallback_bg_required;
struct {
std::ofstream fs;

View File

@ -98,12 +98,17 @@ void HTMLRenderer::process(PDFDoc *doc)
// Process pages
bg_renderer = nullptr;
fallback_bg_renderer = nullptr;
if(param.process_nontext)
{
bg_renderer = BackgroundRenderer::getBackgroundRenderer(param.bg_format, this, param);
if(!bg_renderer)
throw "Cannot initialize background renderer, unsupported format";
bg_renderer->init(doc);
fallback_bg_renderer = BackgroundRenderer::getFallbackBackgroundRenderer(this, param);
if (fallback_bg_renderer)
fallback_bg_renderer->init(doc);
}
int page_count = (param.last_page - param.first_page + 1);
@ -130,7 +135,9 @@ void HTMLRenderer::process(PDFDoc *doc)
if(param.process_nontext)
{
bg_renderer->render_page(doc, i);
fallback_bg_required = !bg_renderer->render_page(doc, i);
if (fallback_bg_required && fallback_bg_renderer != nullptr)
fallback_bg_renderer->render_page(doc, i);
}
doc->displayPage(this, i,
@ -163,6 +170,11 @@ void HTMLRenderer::process(PDFDoc *doc)
delete bg_renderer;
bg_renderer = nullptr;
}
if(fallback_bg_renderer)
{
delete fallback_bg_renderer;
fallback_bg_renderer = nullptr;
}
cerr << endl;
}
@ -219,7 +231,10 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
if(param.process_nontext)
{
bg_renderer->embed_image(pageNum);
if (!fallback_bg_required)
bg_renderer->embed_image(pageNum);
else if (fallback_bg_renderer != nullptr)
fallback_bg_renderer->embed_image(pageNum);
}
reset_state();

View File

@ -63,6 +63,8 @@ struct Param
// background image
std::string bg_format;
int svg_node_count_limit;
int svg_embed_bitmap;
// encryption
std::string owner_password, user_password;

View File

@ -190,6 +190,9 @@ void parse_options (int argc, char **argv)
// background image
.add("bg-format", &param.bg_format, "png", "specify background image format")
.add("svg-node-count-limit", &param.svg_node_count_limit, -1, "if node count in a svg background image exceeds this limit,"
" fall back this page to bitmap background; negative value means no limit.")
.add("svg-embed-bitmap", &param.svg_embed_bitmap, 1, "1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible.")
// encryption
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true)