1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

new option: --bg-format

This commit is contained in:
Lu Wang 2013-09-18 18:01:56 +08:00
parent bcadc6be6a
commit dd9d21cb19
13 changed files with 203 additions and 156 deletions

View File

@ -181,6 +181,7 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC}
src/HTMLRenderer/state.cc src/HTMLRenderer/state.cc
src/HTMLRenderer/text.cc src/HTMLRenderer/text.cc
src/BackgroundRenderer/BackgroundRenderer.h src/BackgroundRenderer/BackgroundRenderer.h
src/BackgroundRenderer/BackgroundRenderer.cc
src/BackgroundRenderer/SplashBackgroundRenderer.h src/BackgroundRenderer/SplashBackgroundRenderer.h
src/BackgroundRenderer/SplashBackgroundRenderer.cc src/BackgroundRenderer/SplashBackgroundRenderer.cc
src/BackgroundRenderer/CairoBackgroundRenderer.h src/BackgroundRenderer/CairoBackgroundRenderer.h

View File

@ -223,6 +223,12 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above.
.B --optimize-text <0|1> (Default: 0) .B --optimize-text <0|1> (Default: 0)
If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong. If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong.
.SS Background Image
.TP
.B --bg-format <format> (Default: "png")
Specify the format for background images, currently "png" and "svg" are supported.
.SS PDF Protection .SS PDF Protection
.TP .TP

View File

@ -0,0 +1,39 @@
/*
* Background renderer
* Render all those things not supported as Image
*
* Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
*/
#include "HTMLRenderer/HTMLRenderer.h"
#include "Param.h"
#include "BackgroundRenderer.h"
#include "SplashBackgroundRenderer.h"
#if ENABLE_SVG
#include "CairoBackgroundRenderer.h"
#endif
namespace pdf2htmlEX {
/*
 * Factory: create the background renderer matching `format`.
 *
 * "png" yields a SplashBackgroundRenderer; "svg" yields a
 * CairoBackgroundRenderer when the build has ENABLE_SVG set.
 * Any other format (including "svg" in a build without ENABLE_SVG)
 * yields nullptr. The returned object is allocated with `new`.
 */
BackgroundRenderer * BackgroundRenderer::getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param)
{
    if(format == "png")
        return new SplashBackgroundRenderer(html_renderer, param);

#if ENABLE_SVG
    // Only compiled in when SVG support was configured
    if(format == "svg")
        return new CairoBackgroundRenderer(html_renderer, param);
#endif

    // Unknown format, or a format this build does not support
    return nullptr;
}
} // namespace pdf2htmlEX

View File

@ -2,31 +2,36 @@
* Background renderer * Background renderer
* Render all those things not supported as Image * Render all those things not supported as Image
* *
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com> * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
*/ */
#ifndef BACKGROUND_RENDERER_H__ #ifndef BACKGROUND_RENDERER_H__
#define BACKGROUND_RENDERER_H__ #define BACKGROUND_RENDERER_H__
#include "pdf2htmlEX-config.h" #include <string>
#if ENABLE_SVG class PDFDoc;
#include "CairoBackgroundRenderer.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
typedef CairoBackgroundRenderer BackgroundRenderer;
}
#else class Param;
class HTMLRenderer;
class BackgroundRenderer
{
public:
// return nullptr upon failure
static BackgroundRenderer * getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param);
#include "SplashBackgroundRenderer.h" BackgroundRenderer() {}
virtual ~BackgroundRenderer() {}
namespace pdf2htmlEX { virtual void init(PDFDoc * doc) = 0;
typedef SplashBackgroundRenderer BackgroundRenderer; virtual void render_page(PDFDoc * doc, int pageno) = 0;
} virtual void embed_image(int pageno) = 0;
#endif // ENABLE_SVG };
} // namespace pdf2htmlEX
#endif //BACKGROUND_RENDERER_H__ #endif //BACKGROUND_RENDERER_H__

View File

@ -1,29 +0,0 @@
/*
* Background renderer
* Render all those things not supported as Image
*
* Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
*/
/*
* This file contains forward declarations, since HTMLRenderer and BackgroundRenderer reference each other
*/
#ifndef BACKGROUND_RENDERER_FORWARD_H__
#define BACKGROUND_RENDERER_FORWARD_H__
#include "pdf2htmlEX-config.h"
namespace pdf2htmlEX {
#if ENABLE_SVG
class CairoBackgroundRenderer;
typedef CairoBackgroundRenderer BackgroundRenderer;
#else
class SplashBackgroundRenderer;
typedef SplashBackgroundRenderer BackgroundRenderer;
#endif // ENABLE_SVG
}
#endif //BACKGROUND_RENDERER_FORWARD_H__

View File

@ -1,7 +1,7 @@
/* /*
* CairoBackgroundRenderer.cc * CairoBackgroundRenderer.cc
* *
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com> * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
*/ */
#include <string> #include <string>
@ -29,6 +29,11 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
// CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen); // CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen);
} }
void CairoBackgroundRenderer::init(PDFDoc * doc)
{
startDoc(doc);
}
static GBool annot_cb(Annot *, void *) { static GBool annot_cb(Annot *, void *) {
return false; return false;
}; };

View File

@ -2,7 +2,7 @@
* Cairo Background renderer * Cairo Background renderer
* Render all those things not supported as Image, with Cairo * Render all those things not supported as Image, with Cairo
* *
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com> * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
*/ */
@ -21,7 +21,7 @@
namespace pdf2htmlEX { namespace pdf2htmlEX {
// Based on BackgroundRenderer from poppler // Based on BackgroundRenderer from poppler
class CairoBackgroundRenderer : public CairoOutputDev class CairoBackgroundRenderer : public BackgroundRenderer, CairoOutputDev
{ {
public: public:
CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
@ -33,14 +33,15 @@ public:
virtual ~CairoBackgroundRenderer() { } virtual ~CairoBackgroundRenderer() { }
virtual void init(PDFDoc * doc);
virtual void render_page(PDFDoc * doc, int pageno);
virtual void embed_image(int pageno);
virtual void drawChar(GfxState *state, double x, double y, virtual void drawChar(GfxState *state, double x, double y,
double dx, double dy, double dx, double dy,
double originX, double originY, double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen); CharCode code, int nBytes, Unicode *u, int uLen);
void render_page(PDFDoc * doc, int pageno);
void embed_image(int pageno);
protected: protected:
HTMLRenderer * html_renderer; HTMLRenderer * html_renderer;
const Param & param; const Param & param;

View File

@ -1,11 +1,12 @@
/* /*
* SplashBackgroundRenderer.cc * SplashBackgroundRenderer.cc
* *
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com> * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
*/ */
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include <memory>
#include <PDFDoc.h> #include <PDFDoc.h>
#include <goo/PNGWriter.h> #include <goo/PNGWriter.h>
@ -19,6 +20,7 @@ namespace pdf2htmlEX {
using std::string; using std::string;
using std::ifstream; using std::ifstream;
using std::vector; using std::vector;
using std::unique_ptr;
const SplashColor SplashBackgroundRenderer::white = {255,255,255}; const SplashColor SplashBackgroundRenderer::white = {255,255,255};
@ -63,6 +65,11 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
} }
} }
void SplashBackgroundRenderer::init(PDFDoc * doc)
{
startDoc(doc);
}
static GBool annot_cb(Annot *, void *) { static GBool annot_cb(Annot *, void *) {
return false; return false;
}; };
@ -134,7 +141,8 @@ void SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1,
if(!f) if(!f)
throw string("Cannot open file for background image " ) + filename; throw string("Cannot open file for background image " ) + filename;
ImgWriter * writer = new PNGWriter(); // use unique_ptr to auto delete the object upon exception
auto writer = unique_ptr<ImgWriter>(new PNGWriter);
if(!writer->init(f, width, height, param.h_dpi, param.v_dpi)) if(!writer->init(f, width, height, param.h_dpi, param.v_dpi))
throw "Cannot initialize PNGWriter"; throw "Cannot initialize PNGWriter";
@ -157,7 +165,6 @@ void SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1,
throw "Cannot write background image"; throw "Cannot write background image";
} }
delete writer;
fclose(f); fclose(f);
} }

View File

@ -2,8 +2,7 @@
* Splash Background renderer * Splash Background renderer
* Render all those things not supported as Image, with Splash * Render all those things not supported as Image, with Splash
* *
* by WangLu * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
* 2012.08.06
*/ */
@ -23,7 +22,7 @@
namespace pdf2htmlEX { namespace pdf2htmlEX {
// Based on BackgroundRenderer from poppler // Based on BackgroundRenderer from poppler
class SplashBackgroundRenderer : public SplashOutputDev class SplashBackgroundRenderer : public BackgroundRenderer, SplashOutputDev
{ {
public: public:
static const SplashColor white; static const SplashColor white;
@ -36,6 +35,10 @@ public:
virtual ~SplashBackgroundRenderer() { } virtual ~SplashBackgroundRenderer() { }
virtual void init(PDFDoc * doc);
virtual void render_page(PDFDoc * doc, int pageno);
virtual void embed_image(int pageno);
#if POPPLER_OLDER_THAN_0_23_0 #if POPPLER_OLDER_THAN_0_23_0
virtual void startPage(int pageNum, GfxState *state); virtual void startPage(int pageNum, GfxState *state);
#else #else
@ -57,11 +60,8 @@ public:
SplashOutputDev::fill(state); SplashOutputDev::fill(state);
} }
void render_page(PDFDoc * doc, int pageno);
void embed_image(int pageno);
void dump_image(const char * filename, int x1, int y1, int x2, int y2);
protected: protected:
void dump_image(const char * filename, int x1, int y1, int x2, int y2);
HTMLRenderer * html_renderer; HTMLRenderer * html_renderer;
const Param & param; const Param & param;
}; };

View File

@ -30,7 +30,7 @@
#include "StateManager.h" #include "StateManager.h"
#include "HTMLTextPage.h" #include "HTMLTextPage.h"
#include "BackgroundRenderer/BackgroundRenderer_forward.h" #include "BackgroundRenderer/BackgroundRenderer.h"
#include "util/const.h" #include "util/const.h"
#include "util/misc.h" #include "util/misc.h"
@ -321,8 +321,10 @@ protected:
StringFormatter str_fmt; StringFormatter str_fmt;
// render background image // render background image
friend class SplashBackgroundRenderer; friend class SplashBackgroundRenderer; // ugly!
friend class CairoBackgroundRenderer; #if ENABLE_SVG
friend class CairoBackgroundRenderer; // ugly!
#endif
BackgroundRenderer * bg_renderer; BackgroundRenderer * bg_renderer;

View File

@ -100,8 +100,10 @@ void HTMLRenderer::process(PDFDoc *doc)
bg_renderer = nullptr; bg_renderer = nullptr;
if(param.process_nontext) if(param.process_nontext)
{ {
bg_renderer = new BackgroundRenderer(this, param); bg_renderer = BackgroundRenderer::getBackgroundRenderer(param.bg_format, this, param);
bg_renderer->startDoc(doc); if(!bg_renderer)
throw "Cannot initialize background renderer, unsupported format";
bg_renderer->init(doc);
} }
int page_count = (param.last_page - param.first_page + 1); int page_count = (param.last_page - param.first_page + 1);

View File

@ -57,6 +57,9 @@ struct Param
int space_as_offset; int space_as_offset;
int tounicode; int tounicode;
int optimize_text; int optimize_text;
// background image
std::string bg_format;
// encryption // encryption
std::string owner_password, user_password; std::string owner_password, user_password;
@ -67,9 +70,7 @@ struct Param
std::string data_dir; std::string data_dir;
int css_draw; int css_draw;
int debug; int debug;
int wa_unicode;
// non-optional
std::string input_filename, output_filename; std::string input_filename, output_filename;
// not a parameter // not a parameter

View File

@ -36,18 +36,6 @@ using namespace pdf2htmlEX;
Param param; Param param;
ArgParser argparser; ArgParser argparser;
void deprecated_single_html(const char * dummy = nullptr)
{
cerr << "--single_html is deprecated. Use `--embed CFIJO` instead." << endl;
exit(EXIT_FAILURE);
}
void removed_remove_unsed_glyph(const char * dummy = nullptr)
{
cerr << "--remove-unsed-glyph is removed. Use a PDF optimization tool instead." << endl;
exit(EXIT_FAILURE);
}
void show_usage_and_exit(const char * dummy = nullptr) void show_usage_and_exit(const char * dummy = nullptr)
{ {
cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << endl; cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << endl;
@ -141,6 +129,9 @@ void parse_options (int argc, char **argv)
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets") .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
.add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)") .add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
.add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text") .add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
// background image
.add("bg-format", &param.bg_format, "png", "specify background image format")
// encryption // encryption
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true) .add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true)
@ -158,10 +149,6 @@ void parse_options (int argc, char **argv)
.add("version,v", "print copyright and version info", &show_version_and_exit) .add("version,v", "print copyright and version info", &show_version_and_exit)
.add("help,h", "print usage information", &show_usage_and_exit) .add("help,h", "print usage information", &show_usage_and_exit)
// deprecated
.add("single-html", "", &deprecated_single_html)
.add("remove-unused-glyph", "", &removed_remove_unsed_glyph)
.add("", &param.input_filename, "", "") .add("", &param.input_filename, "", "")
.add("", &param.output_filename, "", "") .add("", &param.output_filename, "", "")
; ;
@ -192,14 +179,103 @@ void parse_options (int argc, char **argv)
} }
} }
int main(int argc, char **argv) void check_param()
{ {
parse_options(argc, argv);
if (param.input_filename == "") if (param.input_filename == "")
{ {
show_usage_and_exit(); show_usage_and_exit();
} }
if(param.output_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.output_filename = s.substr(0, s.size() - 4) + ".html";
}
else
{
param.output_filename = s + ".html";
}
}
if(param.page_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.page_filename = s.substr(0, s.size() - 4) + "%d.page";
}
else
{
param.page_filename = s + "%d.page";
}
sanitize_filename(param.page_filename);
}
else
{
// Need to make sure we have a page number placeholder in the filename
if(!sanitize_filename(param.page_filename))
{
// Inject the placeholder just before the file extension
const string suffix = get_suffix(param.page_filename);
param.page_filename = param.page_filename.substr(0, param.page_filename.size() - suffix.size()) + "%d" + suffix;
sanitize_filename(param.page_filename);
}
}
if(param.css_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.css_filename = s.substr(0, s.size() - 4) + ".css";
}
else
{
if(!param.split_pages)
param.css_filename = s + ".css";
}
}
if(param.outline_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.outline_filename = s.substr(0, s.size() - 4) + ".outline";
}
else
{
if(!param.split_pages)
param.outline_filename = s + ".outline";
}
}
if(param.bg_format == "svg")
{
#if not ENABLE_SVG
cerr << "SVG support is not built" << endl;
exit(EXIT_FAILURE);
#endif
}
else if (param.bg_format == "png")
{
// pass
}
else
{
cerr << "Unknown format for background: " << param.bg_format << endl;
exit(EXIT_FAILURE);
}
}
int main(int argc, char **argv)
{
parse_options(argc, argv);
check_param();
//prepare the directories //prepare the directories
{ {
char buf[] = "/tmp/pdf2htmlEX-XXXXXX"; char buf[] = "/tmp/pdf2htmlEX-XXXXXX";
@ -243,89 +319,20 @@ int main(int argc, char **argv)
delete ownerPW; delete ownerPW;
} }
if (!doc->isOk()) { if (!doc->isOk())
throw "Cannot read the file"; throw "Cannot read the file";
}
// check for copy permission // check for copy permission
if (!doc->okToCopy()) { if (!doc->okToCopy())
if (param.no_drm == 0) { {
if (param.no_drm == 0)
throw "Copying of text from this document is not allowed."; throw "Copying of text from this document is not allowed.";
}
cerr << "Document has copy-protection bit set." << endl; cerr << "Document has copy-protection bit set." << endl;
} }
param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages()); param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages());
param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages()); param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages());
if(param.output_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.output_filename = s.substr(0, s.size() - 4) + ".html";
}
else
{
param.output_filename = s + ".html";
}
}
if(param.page_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.page_filename = s.substr(0, s.size() - 4) + "%d.page";
}
else
{
param.page_filename = s + "%d.page";
}
sanitize_filename(param.page_filename);
}
else
{
// Need to make sure we have a page number placeholder in the filename
if(!sanitize_filename(param.page_filename))
{
// Inject the placeholder just before the file extension
const string suffix = get_suffix(param.page_filename);
param.page_filename = param.page_filename.substr(0, param.page_filename.size() - suffix.size()) + "%d" + suffix;
sanitize_filename(param.page_filename);
}
}
if(param.css_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.css_filename = s.substr(0, s.size() - 4) + ".css";
}
else
{
if(!param.split_pages)
param.css_filename = s + ".css";
}
}
if(param.outline_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.outline_filename = s.substr(0, s.size() - 4) + ".outline";
}
else
{
if(!param.split_pages)
param.outline_filename = s + ".outline";
}
}
unique_ptr<HTMLRenderer>(new HTMLRenderer(param))->process(doc); unique_ptr<HTMLRenderer>(new HTMLRenderer(param))->process(doc);