From a8d0fd83d01e3b017b3a909fe7a2a188a064d922 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Sep 2012 00:45:00 +0800 Subject: [PATCH 1/5] fix build on CYGWIN --- CMakeLists.txt | 8 +++++++- src/include/util.h | 24 ++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8cdee45..5f4cb10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,10 +50,16 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wunused-function") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ggdb") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wunused-function") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++0x") #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") +# CYGWIN bug +if(CYGWIN) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=gnu++0x") +else() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++0x") +endif() + configure_file (${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h.in ${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h) add_executable(pdf2htmlEX diff --git a/src/include/util.h b/src/include/util.h index a677228..7638fa6 100644 --- a/src/include/util.h +++ b/src/include/util.h @@ -10,6 +10,7 @@ #ifndef UTIL_H__ #define UTIL_H__ +#include #include #include #include @@ -19,11 +20,6 @@ #include -using std::istream; -using std::ostream; -using std::max; -using std::abs; - #ifndef nullptr #define nullptr (NULL) #endif @@ -133,10 +129,10 @@ class base64stream { public: - base64stream(istream & in) : in(&in) { } - base64stream(istream && in) : in(&in) { } + base64stream(std::istream & in) : in(&in) { } + base64stream(std::istream && in) : in(&in) { } - ostream & dumpto(ostream & out) + std::ostream & dumpto(std::ostream & out) { unsigned char buf[3]; while(in->read((char*)buf, 3)) @@ -170,12 +166,12 @@ public: } private: - istream * in; + std::istream * in; static const char * base64_encoding; }; -static inline ostream & operator << (ostream & out, base64stream & bf) { 
return bf.dumpto(out); } -static inline ostream & operator << (ostream & out, base64stream && bf) { return bf.dumpto(out); } +static inline std::ostream & operator << (std::ostream & out, base64stream & bf) { return bf.dumpto(out); } +static inline std::ostream & operator << (std::ostream & out, base64stream && bf) { return bf.dumpto(out); } class string_formatter { @@ -200,13 +196,13 @@ public: va_list vlist; va_start(vlist, format); - int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + int l = std::vsnprintf(&buf.front(), buf.capacity(), format, vlist); va_end(vlist); if(l >= (int)buf.capacity()) { - buf.reserve(max((long)(l+1), (long)buf.capacity() * 2)); + buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); va_start(vlist, format); - l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + l = std::vsnprintf(&buf.front(), buf.capacity(), format, vlist); va_end(vlist); } assert(l >= 0); // we should fail when vsnprintf fail From 1f5ebb1756c20d0cf409e6ae5e490358633cf0e0 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Sep 2012 01:21:08 +0800 Subject: [PATCH 2/5] .. --- pdf2htmlEX.1 | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 pdf2htmlEX.1 diff --git a/pdf2htmlEX.1 b/pdf2htmlEX.1 new file mode 100644 index 0000000..292c3bb --- /dev/null +++ b/pdf2htmlEX.1 @@ -0,0 +1,136 @@ +.TH pdf2htmlEX 1 "Aug 31, 2012" "pdf2htmlEX 0.1" +.SH NAME +.PP +.nf + pdf2htmlEX \- converts PDF to HTML without losing text and format. +.fi + +.SH USAGE +.PP +.nf + pdf2htmlEX [options] [] +.fi + +.SH DESCRIPTION +.PP +pdf2htmlEX is a utility that converts PDF files to HTML files. + +pdf2htmlEX tries its best to render the PDF precisely, maintain proper styling, while retaining text and optimizing for Web. + +Fonts are extracted from PDF and then embedded into HTML (Type 3 fonts are not supported). Text in the converted HTML file is usually selectable and copyable. 
+ +Other objects are rendered as images and also embedded. + +.SH OPTIONS +.TP +.B --help +Show all options +.TP +.B -v, --version +Show copyright and version +.TP +.B -o, --owner-password +Specify owner password +.TP +.B -u, --user-password +Specify user password +.TP +.B --dest-dir (Default: ".") +Specify destination folder +.TP +.B -f, --first-page (Default: 1) +Specify the first page to process +.TP +.B -l, --last-page (Default: last page) +Specify the last page to process +.TP +.B --zoom (Default: 1.0) +Specify the zoom ratio of the HTML file +.TP +.B --hdpi , --vdpi (Default: 144) +Specify the horizontal and vertical DPI for images +.TP +.B --process-nontext <0|1> (Default: 1) +Whether to process non-text objects (as images) +.TP +.B --single-html <0|1> (Default: 1) +Whether to embed everything into one HTML file. + +If switched off, there will be several files generated along with the HTML file including files for fonts, css, images. +.TP +.B --embed-base-font <0|1> (Default: 1) +Whether to embed base 14 fonts. + +There are several base fonts defined in PDF standards, which are supposed to be provided by the PDF reader. + +If this switch is on, locally matched fonts will be used and embedded; otherwise only font names are exported such that web browsers may try to find proper fonts themselves. +.TP +.B --embed-external-font <0|1> (Default: 0) +Similar to the above, but for non-base fonts. +.TP +.B --decompose-ligature <0|1> (Default: 0) +Decompose ligatures. For example 'fi' -> 'f''i'. +.TP +.B --heps , --veps (Default: 1) +Specify the maximum tolerable horizontal/vertical offset (in pixels). + +pdf2htmlEX would try to optimize the generated HTML file by moving text within this distance. +.TP +.B --space-threshold (Default: 1.0/6) +pdf2htmlEX would insert a whitespace character ' ' if the distance between two consecutive letters in the same line is wider than ratio * font_size. 
+.TP +.B --font-size-multiplier (Default: 10) +Many web browsers limit the minimum font size, and many would round the given font size, which results in incorrect rendering. + +Specifying a ratio greater than 1 would resolve this issue. + +For some versions of Firefox, however, there will be a problem when the font size is too large, in which case a smaller value should be specified here. +.TP +.B --tounicode <-1|0|1> (Default: 0) +A ToUnicode map may be provided for each font in PDF which indicates the 'meaning' of the characters. However, often there is better "ToUnicode" info in Type 0/1 fonts, and sometimes the ToUnicode map provided is wrong. + +If this value is set to 1, the ToUnicode Map is always applied, if provided in PDF, and characters may not render correctly in HTML if there are collisions. + +If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste. + +If set to 0, pdf2htmlEX would try its best to balance the two methods above. +.TP +.B --space-as-offset <0|1> (Default: 0) +Treat space characters as offsets, which may increase the size of the output. + +Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. +.TP +.B --font-suffix (Default: ".ttf"), --font-format (Default: "truetype") +Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. +.TP +.B --debug <0|1> (Default: 0) +Show debug information. +.TP +.B --clean-tmp <0|1> (Default: 1) +If switched off, intermediate files won't be cleaned in the end. + +.SH EXAMPLE +.TP +.B pdf2htmlEX /path/to/file.pdf +Convert file.pdf into file.html +.TP +.B pdf2htmlEX --tmp-dir tmp --clean-tmp 0 --debug 1 /path/to/file.pdf +Convert file.pdf and leave all intermediate files. 
+.TP +.B pdf2htmlEX --dest-dir out --single-html 0 --debug 1 /path/to/file.pdf +Convert file.pdf into out/file.html and leave font/image files separated. + +.SH COPYRIGHT +.PP +Copyright 2012 Lu Wang + +pdf2htmlEX is GPLv2 & GPLv3 dual licensed + +.SH AUTHOR +.PP +pdf2htmlEX is written by Lu Wang + +.SH SEE ALSO +.TP +Home page +http://github.com/coolwanglu/pdf2htmlEX From 181d0242cc920597f34c27a251775c336a156372 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Sep 2012 01:53:33 +0800 Subject: [PATCH 3/5] successfully built with CYGWIN --- CMakeLists.txt | 6 +++++- src/ArgParser.cc | 18 ++++++++++++++++-- src/HTMLRenderer/LineBuffer.cc | 1 + src/ff.c | 1 - src/include/ArgParser.h | 4 ++++ src/include/pdf2htmlEX-config.h | 4 ++-- src/include/util.h | 4 ++-- src/util.cc | 4 +++- 8 files changed, 33 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f4cb10..5242e82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,12 +11,16 @@ link_directories(${POPPLER_LIBRARY_DIRS}) find_path(FF_INCLUDE_PATH fontforge/fontforge.h) if(FF_INCLUDE_PATH) message("Found fontforge.h: ${FF_INCLUDE_PATH}/fontforge/fontforge.h") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${FF_INCLUDE_PATH}/fontforge/config.h") include_directories(${FF_INCLUDE_PATH}/fontforge) else() message(FATAL_ERROR "Error: cannot locate fontforge.h") endif() +find_path(FF_CONFIG_INCLUDE_PATH fontforge/config.h) +if(FF_CONFIG_INCLUDE_PATH) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${FF_INCLUDE_PATH}/fontforge/config.h") +endif() + foreach(FF_LIB_NAME ${CMAKE_IMPORT_LIBRARY_PREFIX}fontforge${CMAKE_IMPORT_LIBRARY_SUFFIX} ${CMAKE_SHARED_LIBRARY_PREFIX}fontforge${CMAKE_SHARED_LIBRARY_SUFFIX} diff --git a/src/ArgParser.cc b/src/ArgParser.cc index 858b579..a61bf2b 100644 --- a/src/ArgParser.cc +++ b/src/ArgParser.cc @@ -64,7 +64,14 @@ void ArgParser::parse(int argc, char ** argv) const if(p->name != "") { int v = (256 + (iter - arg_entries.begin())); - 
longopts.push_back({p->name.c_str(), ((p->need_arg) ? required_argument : no_argument), nullptr, v}); + longopts.resize(longopts.size() + 1); + { + auto & cur = longopts.back(); + cur.name = p->name.c_str(); + cur.has_arg = ((p->need_arg) ? required_argument : no_argument); + cur.flag = nullptr; + cur.val = v; + } if(!(opt_map.insert(make_pair(v, p)).second)) { cerr << "Warning: duplicated shortname '" << v << "' used by --" << (p->name) << " and --" << (opt_map[p->shortname]->name) << endl; @@ -73,7 +80,14 @@ void ArgParser::parse(int argc, char ** argv) const } optstring.push_back(0); - longopts.push_back({0,0,0,0}); + longopts.resize(longopts.size() + 1); + { + auto & cur = longopts.back(); + cur.name = 0; + cur.has_arg = 0; + cur.flag = 0; + cur.val = 0; + } { opterr = 1; diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index dcb8995..c1c705d 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -15,6 +15,7 @@ using std::min; using std::max; using std::vector; +using std::ostream; void HTMLRenderer::LineBuffer::reset(GfxState * state) { diff --git a/src/ff.c b/src/ff.c index 44b4211..b65811f 100644 --- a/src/ff.c +++ b/src/ff.c @@ -13,7 +13,6 @@ #include #include -#include #include #include diff --git a/src/include/ArgParser.h b/src/include/ArgParser.h index 44424b4..9d242fe 100644 --- a/src/include/ArgParser.h +++ b/src/include/ArgParser.h @@ -14,6 +14,10 @@ #include #include +#ifndef nullptr +#define nullptr (NULL) +#endif + class ArgParser { public: diff --git a/src/include/pdf2htmlEX-config.h b/src/include/pdf2htmlEX-config.h index 5503307..b70ba4e 100644 --- a/src/include/pdf2htmlEX-config.h +++ b/src/include/pdf2htmlEX-config.h @@ -12,8 +12,8 @@ #include static const std::string PDF2HTMLEX_VERSION = "0.3"; -static const std::string PDF2HTMLEX_PREFIX = "/usr/local"; -static const std::string PDF2HTMLEX_DATA_PATH = "/usr/local""/share/pdf2htmlEX"; +static const std::string PDF2HTMLEX_PREFIX = "/usr"; 
+static const std::string PDF2HTMLEX_DATA_PATH = "/usr""/share/pdf2htmlEX"; #endif //PDF2HTMLEX_CONFIG_H__ diff --git a/src/include/util.h b/src/include/util.h index 7638fa6..2cfae36 100644 --- a/src/include/util.h +++ b/src/include/util.h @@ -196,13 +196,13 @@ public: va_list vlist; va_start(vlist, format); - int l = std::vsnprintf(&buf.front(), buf.capacity(), format, vlist); + int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); va_end(vlist); if(l >= (int)buf.capacity()) { buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); va_start(vlist, format); - l = std::vsnprintf(&buf.front(), buf.capacity(), format, vlist); + l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); va_end(vlist); } assert(l >= 0); // we should fail when vsnprintf fail diff --git a/src/util.cc b/src/util.cc index 954a1da..0b07c5b 100644 --- a/src/util.cc +++ b/src/util.cc @@ -6,6 +6,8 @@ * 2012.08.10 */ +#include + #include #include #include @@ -22,7 +24,7 @@ using std::cerr; using std::endl; using std::string; using std::map; - +using std::ostream; const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; From 0e4f418766d8d7c83bfbe1846f32c01575822c86 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Sep 2012 02:14:47 +0800 Subject: [PATCH 4/5] add pdf2htmlEX-config.h --- src/include/pdf2htmlEX-config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/pdf2htmlEX-config.h b/src/include/pdf2htmlEX-config.h index b70ba4e..5503307 100644 --- a/src/include/pdf2htmlEX-config.h +++ b/src/include/pdf2htmlEX-config.h @@ -12,8 +12,8 @@ #include static const std::string PDF2HTMLEX_VERSION = "0.3"; -static const std::string PDF2HTMLEX_PREFIX = "/usr"; -static const std::string PDF2HTMLEX_DATA_PATH = "/usr""/share/pdf2htmlEX"; +static const std::string PDF2HTMLEX_PREFIX = "/usr/local"; +static const std::string PDF2HTMLEX_DATA_PATH = "/usr/local""/share/pdf2htmlEX"; #endif //PDF2HTMLEX_CONFIG_H__ From 
efbb2a52040bca4d728bc57d50cfa611d2c65944 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Sep 2012 02:15:15 +0800 Subject: [PATCH 5/5] remove pdf2htmlEX-config.h --- src/include/pdf2htmlEX-config.h | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 src/include/pdf2htmlEX-config.h diff --git a/src/include/pdf2htmlEX-config.h b/src/include/pdf2htmlEX-config.h deleted file mode 100644 index 5503307..0000000 --- a/src/include/pdf2htmlEX-config.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * config.h - * Compile time constants - * - * by WangLu - */ - - -#ifndef PDF2HTMLEX_CONFIG_H__ -#define PDF2HTMLEX_CONFIG_H__ - -#include - -static const std::string PDF2HTMLEX_VERSION = "0.3"; -static const std::string PDF2HTMLEX_PREFIX = "/usr/local"; -static const std::string PDF2HTMLEX_DATA_PATH = "/usr/local""/share/pdf2htmlEX"; - - -#endif //PDF2HTMLEX_CONFIG_H__