From ef720388947b1bd2563b928826ebd8bce25be986 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 16 Nov 2012 20:41:44 +0800 Subject: [PATCH 01/38] correct page count in prompt --- src/HTMLRenderer/general.cc | 2 +- src/Preprocessor.cc | 2 +- src/include/util.h | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 9080e0f..3cfec86 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -76,7 +76,7 @@ void HTMLRenderer::process(PDFDoc *doc) bg_renderer->startDoc(doc); } - int page_count = (param->last_page - param->first_page); + int page_count = (param->last_page - param->first_page + 1); for(int i = param->first_page; i <= param->last_page ; ++i) { cerr << "Working: " << (i-param->first_page) << "/" << page_count << '\r' << flush; diff --git a/src/Preprocessor.cc b/src/Preprocessor.cc index b2a9677..317c9cd 100644 --- a/src/Preprocessor.cc +++ b/src/Preprocessor.cc @@ -41,7 +41,7 @@ Preprocessor::~Preprocessor(void) void Preprocessor::process(PDFDoc * doc) { - int page_count = (param->last_page - param->first_page); + int page_count = (param->last_page - param->first_page + 1); for(int i = param->first_page; i <= param->last_page ; ++i) { cerr << "Preprocessing: " << (i-param->first_page) << "/" << page_count << '\r' << flush; diff --git a/src/include/util.h b/src/include/util.h index bb09467..e147230 100644 --- a/src/include/util.h +++ b/src/include/util.h @@ -57,15 +57,14 @@ static inline long long hash_ref(const Ref * id) } /* + * Check if the unicode is valid for HTML * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering */ bool isLegalUnicode(Unicode u); Unicode map_to_private(CharCode code); -/* - * Try to determine the Unicode value directly from the information in the font - */ +/* * Try to determine the Unicode value directly from the information in the font */ Unicode unicode_from_font (CharCode code, GfxFont * font); /* From 
210a2451e6c163a35dfbdd83f3c0d21a1cc05b4d Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 16 Nov 2012 20:46:52 +0800 Subject: [PATCH 02/38] text in writing mode are rendered as image --- src/SplashBackgroundRenderer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/SplashBackgroundRenderer.cc b/src/SplashBackgroundRenderer.cc index cf60bb2..808e22f 100644 --- a/src/SplashBackgroundRenderer.cc +++ b/src/SplashBackgroundRenderer.cc @@ -19,7 +19,8 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, double originX, double originY, CharCode code, int nBytes, Unicode *u, int uLen) { - if((state->getRender() & 3) == 3) + if(((state->getRender() & 3) == 3) + || ((state->getFont()) && (state->getFont()->getWMode()))) { SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen); } From 521fc5b09bc3cc2b3f00b1ad566259e55ef63cdd Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 16 Nov 2012 20:51:24 +0800 Subject: [PATCH 03/38] typo in manpage --- pdf2htmlEX.1.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 9775233..50a82dc 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -120,7 +120,7 @@ Turn it on if space characters are not displayed correctly, or you want to remov .B --stretch-narrow-glyph <0|1> (Default: 0) If set to 1, glyphs narrower than described in PDF will be strecth; otherwise space will be padded to the right of the glyphs .TP -.B --squeeze_wide_glyph <0|1> (Default: 1) +.B --squeeze-wide-glyph <0|1> (Default: 1) If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. 
.TP .B --remove-unused-glyph <0|1> (Default: 1) From 8078528795f7c3e4c2f59a9620389885768c164b Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 16 Nov 2012 22:08:01 +0800 Subject: [PATCH 04/38] TODO --- TODO | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/TODO b/TODO index bd481ba..db21a95 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,7 @@ +combine lines (unwarp) + +type 3 fonts + word space/offset before the first letter (calendar pdf) don't dump image when there is nothing @@ -6,11 +10,10 @@ Integrate splash/cairo native support for image native support for draw -draw non-orthogonal lines with CSS - -position history stack (popstate) ==Wait until someone asks== +position history stack (popstate) +draw non-orthogonal lines with CSS try harder finding glyph names (using fontforge) for CID Type 0 rename single-html -> embed-font/image/css ... merge sub/sup into one line @@ -23,6 +26,5 @@ use absolute positioning for long whitespace color invert detect duplicate base fonts when embedding disable selection if we know unicode is wrong -combine lines (unwarp) -Printing check if we can add information to the font, and let browsers show ligatures automatically +Printing From 462e8dc541cbe7a1ce26aa201ed3e2b500638d91 Mon Sep 17 00:00:00 2001 From: filodej Date: Mon, 26 Nov 2012 22:23:13 +0100 Subject: [PATCH 05/38] Add missing includes to the utils.h file --- src/include/util.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/include/util.h b/src/include/util.h index e147230..26da1fa 100644 --- a/src/include/util.h +++ b/src/include/util.h @@ -18,6 +18,9 @@ #include #include +#include +#include + #ifndef nullptr #define nullptr (NULL) #endif From a5ccc7f073ab6c34e31628d975abfd3e3b7ca224 Mon Sep 17 00:00:00 2001 From: filodej Date: Mon, 26 Nov 2012 22:38:13 +0100 Subject: [PATCH 06/38] merge filodej --- CMakeLists.txt | 1 + src/HTMLRenderer/TmpFiles.cc | 48 ++++++++++++++++++++++++++++++++++++ src/HTMLRenderer/general.cc | 35 
+++----------------------- src/HTMLRenderer/text.cc | 10 ++++---- src/include/HTMLRenderer.h | 5 ++-- src/include/TmpFiles.h | 29 ++++++++++++++++++++++ 6 files changed, 89 insertions(+), 39 deletions(-) create mode 100644 src/HTMLRenderer/TmpFiles.cc create mode 100644 src/include/TmpFiles.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d31641..43b2e95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,6 +154,7 @@ add_executable(pdf2htmlEX src/HTMLRenderer/link.cc src/include/namespace.h src/HTMLRenderer/LineBuffer.cc + src/HTMLRenderer/TmpFiles.cc src/include/ffw.h src/ffw.c src/include/BackgroundRenderer.h diff --git a/src/HTMLRenderer/TmpFiles.cc b/src/HTMLRenderer/TmpFiles.cc new file mode 100644 index 0000000..2025b03 --- /dev/null +++ b/src/HTMLRenderer/TmpFiles.cc @@ -0,0 +1,48 @@ +#include "TmpFiles.h" +#include "Param.h" +#include + +using namespace std; + +namespace pdf2htmlEX { + + +TmpFiles::TmpFiles( Param const& param_ ) + : param( param_ ) +{ +} + +TmpFiles::~TmpFiles() +{ + clean(); +} + +void TmpFiles::add(const string & fn) +{ + if(!param.clean_tmp) + return; + + if(tmp_files.insert(fn).second && param.debug) + cerr << "Add new temporary file: " << fn << endl; +} + +void TmpFiles::clean() +{ + if(!param.clean_tmp) + return; + + for(auto iter = tmp_files.begin(); iter != tmp_files.end(); ++iter) + { + const string & fn = *iter; + remove(fn.c_str()); + if(param.debug) + cerr << "Remove temporary file: " << fn << endl; + } + + remove(param.tmp_dir.c_str()); + if(param.debug) + cerr << "Remove temporary directory: " << param.tmp_dir << endl; +} + + +} // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 3cfec86..b47685e 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -38,6 +38,7 @@ HTMLRenderer::HTMLRenderer(const Param * param) ,line_opened(false) ,line_buf(this) ,preprocessor(param) + ,tmp_files(*param) ,image_count(0) ,param(param) { @@ -56,7 +57,6 @@ 
HTMLRenderer::HTMLRenderer(const Param * param) HTMLRenderer::~HTMLRenderer() { ffw_finalize(); - clean_tmp_files(); delete [] cur_mapping; delete [] cur_mapping2; delete [] width_list; @@ -94,7 +94,7 @@ void HTMLRenderer::process(PDFDoc *doc) { auto fn = str_fmt("%s/p%x.png", (param->single_html ? param->tmp_dir : param->dest_dir).c_str(), i); if(param->single_html) - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); bg_renderer->render_page(doc, i, (char*)fn); } @@ -280,7 +280,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc) : str_fmt("%s/%s", param->dest_dir.c_str(), param->css_filename.c_str()); if(param->single_html && (!param->split_pages)) - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); css_path = (char*)fn, css_fout.open(css_path, ofstream::binary); @@ -301,7 +301,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc) * Otherwise just generate it */ auto fn = str_fmt("%s/__pages", param->tmp_dir.c_str()); - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); html_path = (char*)fn; html_fout.open(html_path, ofstream::binary); @@ -392,33 +392,6 @@ void HTMLRenderer::fix_stream (std::ostream & out) out << hex << fixed; } -void HTMLRenderer::add_tmp_file(const string & fn) -{ - if(!param->clean_tmp) - return; - - if(tmp_files.insert(fn).second && param->debug) - cerr << "Add new temporary file: " << fn << endl; -} - -void HTMLRenderer::clean_tmp_files() -{ - if(!param->clean_tmp) - return; - - for(auto iter = tmp_files.begin(); iter != tmp_files.end(); ++iter) - { - const string & fn = *iter; - remove(fn.c_str()); - if(param->debug) - cerr << "Remove temporary file: " << fn << endl; - } - - remove(param->tmp_dir.c_str()); - if(param->debug) - cerr << "Remove temporary directory: " << param->tmp_dir << endl; -} - void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy) { string fn = get_filename(path); diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 7e9b566..4f61d43 100644 --- 
a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -127,7 +127,7 @@ string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) obj.streamReset(); filepath = (char*)str_fmt("%s/f%llx%s", param->tmp_dir.c_str(), fn_id, suffix.c_str()); - add_tmp_file(filepath); + tmp_files.add(filepath); ofstream outf(filepath, ofstream::binary); if(!outf) @@ -171,7 +171,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo if(param->debug) { auto fn = str_fmt("%s/__raw_font_%lld", param->tmp_dir.c_str(), info.id, param->font_suffix.c_str()); - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); ofstream((char*)fn, ofstream::binary) << ifstream(filepath).rdbuf(); } @@ -437,9 +437,9 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo * */ string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); - add_tmp_file(cur_tmp_fn); + tmp_files.add(cur_tmp_fn); string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); - add_tmp_file(other_tmp_fn); + tmp_files.add(other_tmp_fn); ffw_save(cur_tmp_fn.c_str()); @@ -482,7 +482,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo info.id, param->font_suffix.c_str()); if(param->single_html) - add_tmp_file(fn); + tmp_files.add(fn); ffw_load_font(cur_tmp_fn.c_str()); ffw_metric(&info.ascent, &info.descent); diff --git a/src/include/HTMLRenderer.h b/src/include/HTMLRenderer.h index 54e7654..19add6c 100644 --- a/src/include/HTMLRenderer.h +++ b/src/include/HTMLRenderer.h @@ -27,6 +27,7 @@ #include "Param.h" #include "util.h" #include "Preprocessor.h" +#include "TmpFiles.h" /* * Naming Convention @@ -156,8 +157,6 @@ class HTMLRenderer : public OutputDev // set flags void fix_stream (std::ostream & out); - void add_tmp_file (const std::string & fn); - void clean_tmp_files (); std::string dump_embedded_font (GfxFont * font, long long 
fn_id); void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false); @@ -408,6 +407,7 @@ class HTMLRenderer : public OutputDev char ** cur_mapping2; int * width_list; Preprocessor preprocessor; + TmpFiles tmp_files; // for string formatting string_formatter str_fmt; @@ -431,7 +431,6 @@ class HTMLRenderer : public OutputDev const Param * param; std::ofstream html_fout, css_fout; std::string html_path, css_path; - std::set tmp_files; static const std::string MANIFEST_FILENAME; }; diff --git a/src/include/TmpFiles.h b/src/include/TmpFiles.h new file mode 100644 index 0000000..3084585 --- /dev/null +++ b/src/include/TmpFiles.h @@ -0,0 +1,29 @@ +#ifndef TMPFILES_H__ +#define TMPFILES_H__ + +#include +#include +#include "Param.h" + +namespace pdf2htmlEX { + +class TmpFiles +{ +public: + explicit TmpFiles( Param const& param ); + virtual ~TmpFiles(); + + void add(std::string const& fn); + +private: + void clean(); + +private: + Param const& param; + std::set tmp_files; + +}; + +} // namespace pdf2htmlEX + +#endif //TMPFILES_H__ From db23e31e6a183ec37823d9b4725734129e5f79e9 Mon Sep 17 00:00:00 2001 From: filodej Date: Tue, 27 Nov 2012 17:17:29 +0100 Subject: [PATCH 07/38] Add changes requested in code review --- src/HTMLRenderer/TmpFiles.cc | 15 ++++++++++++--- src/include/TmpFiles.h | 8 ++++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/HTMLRenderer/TmpFiles.cc b/src/HTMLRenderer/TmpFiles.cc index 2025b03..958ab6f 100644 --- a/src/HTMLRenderer/TmpFiles.cc +++ b/src/HTMLRenderer/TmpFiles.cc @@ -1,13 +1,22 @@ +/* + * TmpFiles.cc + * + * Collect and clean-up temporary files + * + * implemented by WangLu + * split off by Filodej + */ + +#include #include "TmpFiles.h" #include "Param.h" -#include using namespace std; namespace pdf2htmlEX { -TmpFiles::TmpFiles( Param const& param_ ) +TmpFiles::TmpFiles( const Param& param_ ) : param( param_ ) { } @@ -17,7 +26,7 @@ TmpFiles::~TmpFiles() clean(); } -void 
TmpFiles::add(const string & fn) +void TmpFiles::add( const string & fn) { if(!param.clean_tmp) return; diff --git a/src/include/TmpFiles.h b/src/include/TmpFiles.h index 3084585..f036593 100644 --- a/src/include/TmpFiles.h +++ b/src/include/TmpFiles.h @@ -10,16 +10,16 @@ namespace pdf2htmlEX { class TmpFiles { public: - explicit TmpFiles( Param const& param ); - virtual ~TmpFiles(); + explicit TmpFiles( const Param& param ); + ~TmpFiles(); - void add(std::string const& fn); + void add( const std::string& fn); private: void clean(); private: - Param const& param; + const Param& param; std::set tmp_files; }; From 63287ce49186f71ba59e65782f33f989615e70c6 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 17:28:05 +0800 Subject: [PATCH 08/38] reorganize files --- CMakeLists.txt | 55 ++++++++++--------- .../BackgroundRenderer.h | 0 .../CairoBackgroundRenderer.cc | 0 .../CairoBackgroundRenderer.h | 0 .../CairoOutputDev/CairoFontEngine.cc | 0 .../CairoOutputDev/CairoFontEngine.h | 0 .../CairoOutputDev/CairoOutputDev.cc | 0 .../CairoOutputDev/CairoOutputDev.h | 0 .../CairoOutputDev/CairoRescaleBox.cc | 0 .../CairoOutputDev/CairoRescaleBox.h | 0 .../SplashBackgroundRenderer.cc | 0 .../SplashBackgroundRenderer.h | 2 +- src/{include => HTMLRenderer}/HTMLRenderer.h | 4 +- src/HTMLRenderer/LineBuffer.cc | 2 +- src/{ => HTMLRenderer}/Preprocessor.cc | 2 +- src/{include => HTMLRenderer}/Preprocessor.h | 0 src/HTMLRenderer/draw.cc | 4 +- src/HTMLRenderer/export.cc | 2 +- src/HTMLRenderer/general.cc | 6 +- src/HTMLRenderer/image.cc | 2 +- src/HTMLRenderer/install.cc | 4 +- src/HTMLRenderer/link.cc | 4 +- src/HTMLRenderer/state.cc | 4 +- src/HTMLRenderer/text.cc | 4 +- src/{include => }/Param.h | 0 src/{include => }/pdf2htmlEX-config.h.in | 0 src/pdf2htmlEX.cc | 4 +- src/{ => util}/ArgParser.cc | 0 src/{include => util}/ArgParser.h | 0 src/{HTMLRenderer => util}/TmpFiles.cc | 0 src/{include => util}/TmpFiles.h | 0 src/{ => util}/ffw.c | 0 src/{include => util}/ffw.h | 0 
src/{include => util}/namespace.h | 0 src/{ => util}/util.cc | 0 src/{include => util}/util.h | 0 36 files changed, 51 insertions(+), 48 deletions(-) rename src/{include => BackgroundRenderer}/BackgroundRenderer.h (100%) rename src/{ => BackgroundRenderer}/CairoBackgroundRenderer.cc (100%) rename src/{include => BackgroundRenderer}/CairoBackgroundRenderer.h (100%) rename src/{ => BackgroundRenderer}/CairoOutputDev/CairoFontEngine.cc (100%) rename src/{ => BackgroundRenderer}/CairoOutputDev/CairoFontEngine.h (100%) rename src/{ => BackgroundRenderer}/CairoOutputDev/CairoOutputDev.cc (100%) rename src/{ => BackgroundRenderer}/CairoOutputDev/CairoOutputDev.h (100%) rename src/{ => BackgroundRenderer}/CairoOutputDev/CairoRescaleBox.cc (100%) rename src/{ => BackgroundRenderer}/CairoOutputDev/CairoRescaleBox.h (100%) rename src/{ => BackgroundRenderer}/SplashBackgroundRenderer.cc (100%) rename src/{include => BackgroundRenderer}/SplashBackgroundRenderer.h (97%) rename src/{include => HTMLRenderer}/HTMLRenderer.h (99%) rename src/{ => HTMLRenderer}/Preprocessor.cc (99%) rename src/{include => HTMLRenderer}/Preprocessor.h (100%) rename src/{include => }/Param.h (100%) rename src/{include => }/pdf2htmlEX-config.h.in (100%) rename src/{ => util}/ArgParser.cc (100%) rename src/{include => util}/ArgParser.h (100%) rename src/{HTMLRenderer => util}/TmpFiles.cc (100%) rename src/{include => util}/TmpFiles.h (100%) rename src/{ => util}/ffw.c (100%) rename src/{include => util}/ffw.h (100%) rename src/{include => util}/namespace.h (100%) rename src/{ => util}/util.cc (100%) rename src/{include => util}/util.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43b2e95..7e984a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build configuration (Debug, Release, project(pdf2htmlEX) cmake_minimum_required(VERSION 2.6.0 FATAL_ERROR) -include_directories(${CMAKE_SOURCE_DIR}/src/include) 
+include_directories(${CMAKE_SOURCE_DIR}/src) set(PDF2HTMLEX_VERSION "0.6") set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION}) @@ -138,37 +138,40 @@ if(NOT CXX0X_SUPPORT) endif() -configure_file (${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h.in ${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h) +configure_file (${CMAKE_SOURCE_DIR}/src/pdf2htmlEX-config.h.in ${CMAKE_SOURCE_DIR}/src/pdf2htmlEX-config.h) configure_file (${CMAKE_SOURCE_DIR}/pdf2htmlEX.1.in ${CMAKE_SOURCE_DIR}/pdf2htmlEX.1) add_executable(pdf2htmlEX + src/Param.h + src/config.h + src/pdf2htmlEX-config.h src/pdf2htmlEX.cc - src/include/HTMLRenderer.h - src/HTMLRenderer/general.cc - src/HTMLRenderer/state.cc - src/HTMLRenderer/install.cc - src/HTMLRenderer/export.cc - src/HTMLRenderer/text.cc - src/HTMLRenderer/image.cc + src/HTMLRenderer/HTMLRenderer.h src/HTMLRenderer/draw.cc - src/HTMLRenderer/link.cc - src/include/namespace.h + src/HTMLRenderer/export.cc + src/HTMLRenderer/general.cc + src/HTMLRenderer/image.cc + src/HTMLRenderer/install.cc src/HTMLRenderer/LineBuffer.cc - src/HTMLRenderer/TmpFiles.cc - src/include/ffw.h - src/ffw.c - src/include/BackgroundRenderer.h - src/include/SplashBackgroundRenderer.h - src/SplashBackgroundRenderer.cc - src/include/CairoBackgroundRenderer.h - src/CairoBackgroundRenderer.cc - src/include/Preprocessor.h - src/Preprocessor.cc - src/include/util.h - src/util.cc - src/include/ArgParser.h - src/ArgParser.cc - src/include/pdf2htmlEX-config.h + src/HTMLRenderer/link.cc + src/HTMLRenderer/state.cc + src/HTMLRenderer/text.cc + src/HTMLRenderer/Preprocessor.h + src/HTMLRenderer/Preprocessor.cc + src/BackgroundRenderer/BackgroundRenderer.h + src/BackgroundRenderer/SplashBackgroundRenderer.h + src/BackgroundRenderer/SplashBackgroundRenderer.cc + src/BackgroundRenderer/CairoBackgroundRenderer.h + src/BackgroundRenderer/CairoBackgroundRenderer.cc + src/util/namespace.h + src/util/ffw.h + src/util/ffw.c + src/util/util.h + src/util/util.cc + src/util/TmpFiles.h + 
src/util/TmpFiles.cc + src/util/ArgParser.h + src/util/ArgParser.cc ) target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS}) diff --git a/src/include/BackgroundRenderer.h b/src/BackgroundRenderer/BackgroundRenderer.h similarity index 100% rename from src/include/BackgroundRenderer.h rename to src/BackgroundRenderer/BackgroundRenderer.h diff --git a/src/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc similarity index 100% rename from src/CairoBackgroundRenderer.cc rename to src/BackgroundRenderer/CairoBackgroundRenderer.cc diff --git a/src/include/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h similarity index 100% rename from src/include/CairoBackgroundRenderer.h rename to src/BackgroundRenderer/CairoBackgroundRenderer.h diff --git a/src/CairoOutputDev/CairoFontEngine.cc b/src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.cc similarity index 100% rename from src/CairoOutputDev/CairoFontEngine.cc rename to src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.cc diff --git a/src/CairoOutputDev/CairoFontEngine.h b/src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.h similarity index 100% rename from src/CairoOutputDev/CairoFontEngine.h rename to src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.h diff --git a/src/CairoOutputDev/CairoOutputDev.cc b/src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.cc similarity index 100% rename from src/CairoOutputDev/CairoOutputDev.cc rename to src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.cc diff --git a/src/CairoOutputDev/CairoOutputDev.h b/src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.h similarity index 100% rename from src/CairoOutputDev/CairoOutputDev.h rename to src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.h diff --git a/src/CairoOutputDev/CairoRescaleBox.cc b/src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.cc similarity index 100% rename from src/CairoOutputDev/CairoRescaleBox.cc rename to 
src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.cc diff --git a/src/CairoOutputDev/CairoRescaleBox.h b/src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.h similarity index 100% rename from src/CairoOutputDev/CairoRescaleBox.h rename to src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.h diff --git a/src/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc similarity index 100% rename from src/SplashBackgroundRenderer.cc rename to src/BackgroundRenderer/SplashBackgroundRenderer.cc diff --git a/src/include/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h similarity index 97% rename from src/include/SplashBackgroundRenderer.h rename to src/BackgroundRenderer/SplashBackgroundRenderer.h index 8ba0cd5..ebf6c74 100644 --- a/src/include/SplashBackgroundRenderer.h +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h @@ -15,8 +15,8 @@ #include #include -#include "HTMLRenderer.h" #include "Param.h" +#include "HTMLRenderer/HTMLRenderer.h" namespace pdf2htmlEX { diff --git a/src/include/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h similarity index 99% rename from src/include/HTMLRenderer.h rename to src/HTMLRenderer/HTMLRenderer.h index 19add6c..2922a8e 100644 --- a/src/include/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -25,9 +25,9 @@ #include #include "Param.h" -#include "util.h" #include "Preprocessor.h" -#include "TmpFiles.h" +#include "util/util.h" +#include "util/TmpFiles.h" /* * Naming Convention diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index acbb944..c70c24f 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -10,7 +10,7 @@ #include #include "HTMLRenderer.h" -#include "namespace.h" +#include "util/namespace.h" namespace pdf2htmlEX { diff --git a/src/Preprocessor.cc b/src/HTMLRenderer/Preprocessor.cc similarity index 99% rename from src/Preprocessor.cc rename to src/HTMLRenderer/Preprocessor.cc index 
317c9cd..3214d99 100644 --- a/src/Preprocessor.cc +++ b/src/HTMLRenderer/Preprocessor.cc @@ -15,7 +15,7 @@ #include #include "Preprocessor.h" -#include "util.h" +#include "util/util.h" namespace pdf2htmlEX { diff --git a/src/include/Preprocessor.h b/src/HTMLRenderer/Preprocessor.h similarity index 100% rename from src/include/Preprocessor.h rename to src/HTMLRenderer/Preprocessor.h diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index e745fe4..5132d2d 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -14,8 +14,8 @@ #include #include "HTMLRenderer.h" -#include "util.h" -#include "namespace.h" +#include "util/util.h" +#include "util/namespace.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index 6caec63..febb5b9 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -11,7 +11,7 @@ #include #include "HTMLRenderer.h" -#include "namespace.h" +#include "util/namespace.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index b47685e..a921c12 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -14,10 +14,10 @@ #include #include "HTMLRenderer.h" -#include "BackgroundRenderer.h" -#include "namespace.h" -#include "ffw.h" #include "pdf2htmlEX-config.h" +#include "BackgroundRenderer/BackgroundRenderer.h" +#include "util/namespace.h" +#include "util/ffw.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc index cac7c2c..9c3da52 100644 --- a/src/HTMLRenderer/image.cc +++ b/src/HTMLRenderer/image.cc @@ -8,7 +8,7 @@ */ #include "HTMLRenderer.h" -#include "namespace.h" +#include "util/namespace.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index b741e26..ee839cd 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -15,8 +15,8 @@ #include "Param.h" #include "HTMLRenderer.h" -#include 
"namespace.h" -#include "util.h" +#include "util/namespace.h" +#include "util/util.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc index 83cf6aa..6b1f9ff 100644 --- a/src/HTMLRenderer/link.cc +++ b/src/HTMLRenderer/link.cc @@ -11,10 +11,10 @@ #include #include -#include #include -#include "namespace.h" +#include "HTMLRenderer.h" +#include "util/namespace.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index e5a1dac..1474469 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -16,8 +16,8 @@ #include #include "HTMLRenderer.h" -#include "namespace.h" -#include "util.h" +#include "util/namespace.h" +#include "util/util.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 4f61d43..0c07c88 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -15,9 +15,9 @@ #include #include -#include "ffw.h" #include "HTMLRenderer.h" -#include "namespace.h" +#include "util/ffw.h" +#include "util/namespace.h" namespace pdf2htmlEX { diff --git a/src/include/Param.h b/src/Param.h similarity index 100% rename from src/include/Param.h rename to src/Param.h diff --git a/src/include/pdf2htmlEX-config.h.in b/src/pdf2htmlEX-config.h.in similarity index 100% rename from src/include/pdf2htmlEX-config.h.in rename to src/pdf2htmlEX-config.h.in diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index f56b778..f971243 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -19,10 +19,10 @@ #include #include -#include "HTMLRenderer.h" #include "Param.h" #include "pdf2htmlEX-config.h" -#include "ArgParser.h" +#include "HTMLRenderer/HTMLRenderer.h" +#include "util/ArgParser.h" using namespace std; using namespace pdf2htmlEX; diff --git a/src/ArgParser.cc b/src/util/ArgParser.cc similarity index 100% rename from src/ArgParser.cc rename to src/util/ArgParser.cc diff --git a/src/include/ArgParser.h b/src/util/ArgParser.h similarity 
index 100% rename from src/include/ArgParser.h rename to src/util/ArgParser.h diff --git a/src/HTMLRenderer/TmpFiles.cc b/src/util/TmpFiles.cc similarity index 100% rename from src/HTMLRenderer/TmpFiles.cc rename to src/util/TmpFiles.cc diff --git a/src/include/TmpFiles.h b/src/util/TmpFiles.h similarity index 100% rename from src/include/TmpFiles.h rename to src/util/TmpFiles.h diff --git a/src/ffw.c b/src/util/ffw.c similarity index 100% rename from src/ffw.c rename to src/util/ffw.c diff --git a/src/include/ffw.h b/src/util/ffw.h similarity index 100% rename from src/include/ffw.h rename to src/util/ffw.h diff --git a/src/include/namespace.h b/src/util/namespace.h similarity index 100% rename from src/include/namespace.h rename to src/util/namespace.h diff --git a/src/util.cc b/src/util/util.cc similarity index 100% rename from src/util.cc rename to src/util/util.cc diff --git a/src/include/util.h b/src/util/util.h similarity index 100% rename from src/include/util.h rename to src/util/util.h From 35fccdc28c764ae9eebffab9cae1fc9924cc1e85 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 17:45:26 +0800 Subject: [PATCH 09/38] reorganizaing --- CMakeLists.txt | 18 ++-- src/HTMLRenderer/LineBuffer.cc | 1 + src/HTMLRenderer/export.cc | 2 +- src/HTMLRenderer/general.cc | 4 +- src/HTMLRenderer/text.cc | 1 + src/util/const.cc | 39 ++++++++ src/util/const.h | 31 +++++++ src/util/unicode.cc | 157 +++++++++++++++++++++++++++++++ src/util/unicode.h | 41 +++++++++ src/util/util.cc | 163 --------------------------------- src/util/util.h | 41 +-------- 11 files changed, 289 insertions(+), 209 deletions(-) create mode 100644 src/util/const.cc create mode 100644 src/util/const.h create mode 100644 src/util/unicode.cc create mode 100644 src/util/unicode.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e984a3..068bcee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,15 +163,19 @@ add_executable(pdf2htmlEX src/BackgroundRenderer/SplashBackgroundRenderer.cc 
src/BackgroundRenderer/CairoBackgroundRenderer.h src/BackgroundRenderer/CairoBackgroundRenderer.cc - src/util/namespace.h - src/util/ffw.h - src/util/ffw.c - src/util/util.h - src/util/util.cc - src/util/TmpFiles.h - src/util/TmpFiles.cc src/util/ArgParser.h src/util/ArgParser.cc + src/util/const.h + src/util/const.cc + src/util/ffw.h + src/util/ffw.c + src/util/namespace.h + src/util/TmpFiles.h + src/util/TmpFiles.cc + src/util/unicode.h + src/util/unicode.cc + src/util/util.h + src/util/util.cc ) target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS}) diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index c70c24f..ebbf7ad 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -11,6 +11,7 @@ #include "HTMLRenderer.h" #include "util/namespace.h" +#include "util/unicode.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index febb5b9..b3fa713 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -99,7 +99,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) // we have already shifted the origin // TODO: recognize common matices - if(_tm_equal(tm, id_matrix, 4)) + if(_tm_equal(tm, ID_MATRIX, 4)) { auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index a921c12..a8253bc 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -170,8 +170,8 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) cur_font_size = draw_font_size = 0; cur_fs_id = install_font_size(cur_font_size); - memcpy(cur_text_tm, id_matrix, sizeof(cur_text_tm)); - memcpy(draw_text_tm, id_matrix, sizeof(draw_text_tm)); + memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm)); + memcpy(draw_text_tm, ID_MATRIX, sizeof(draw_text_tm)); cur_ttm_id = install_transform_matrix(draw_text_tm); 
cur_letter_space = cur_word_space = 0; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 0c07c88..2a4dda9 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -18,6 +18,7 @@ #include "HTMLRenderer.h" #include "util/ffw.h" #include "util/namespace.h" +#include "util/unicode.h" namespace pdf2htmlEX { diff --git a/src/util/const.cc b/src/util/const.cc new file mode 100644 index 0000000..1a5d2d4 --- /dev/null +++ b/src/util/const.cc @@ -0,0 +1,39 @@ +/* + * Constants + * + * by WangLu + * 2012.11.29 + */ + +#include "const.h" + +namespace pdf2htmlEX { + +using std::map; +using std::string; + +const double ID_MATRIX[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + +const map BASE_14_FONT_CSS_FONT_MAP({ + { "Courier", "Courier,monospace" }, + { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" }, + { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" }, + { "Symbol", "Symbol,\"Standard Symbols L\"" }, + { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" }, +}); + +const map GB_ENCODED_FONT_NAME_MAP({ + {"\xCB\xCE\xCC\xE5", "SimSun"}, + {"\xBA\xDA\xCC\xE5", "SimHei"}, + {"\xBF\xAC\xCC\xE5_GB2312", "SimKai"}, + {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"}, + {"\xC1\xA5\xCA\xE9", "SimLi"}, +}); + +const std::map, std::pair > EMBED_STRING_MAP({ + {{".css", 0}, {""}}, + {{".css", 1}, {""}}, + {{".js", 0}, {""}}, + {{".js", 1}, {""}} +}); +} //namespace pdf2htmlEX diff --git a/src/util/const.h b/src/util/const.h new file mode 100644 index 0000000..1fde46f --- /dev/null +++ b/src/util/const.h @@ -0,0 +1,31 @@ +/* + * Constants + * + * by WangLu + * 2012.11.29 + */ + +#ifndef CONST_H__ +#define CONST_H__ + +#include +#include + +namespace pdf2htmlEX { + +static const double EPS = 1e-6; +static const double DEFAULT_DPI = 72.0; +extern const double ID_MATRIX[6]; + +// PDF base 14 font name -> CSS font name +extern const std::map BASE_14_FONT_CSS_FONT_MAP; +// For GB encoded font names +extern const std::map GB_ENCODED_FONT_NAME_MAP; +// 
map to embed files into html +// key: (suffix, if_embed_content) +// value: (prefix string, suffix string) +extern const std::map, std::pair > EMBED_STRING_MAP; + +} // namespace pdf2htmlEX + +#endif //CONST_H__ diff --git a/src/util/unicode.cc b/src/util/unicode.cc new file mode 100644 index 0000000..86c85f1 --- /dev/null +++ b/src/util/unicode.cc @@ -0,0 +1,157 @@ +/* + * Unicode manipulation functions + * + * by WangLu + * 2012.11.29 + */ + +#include + +#include "unicode.h" + +namespace pdf2htmlEX { + +using std::cerr; +using std::endl; +using std::ostream; + +bool isLegalUnicode(Unicode u) +{ + /* + if((u == 9) || (u == 10) || (u == 13)) + return true; + */ + + if(u <= 31) + return false; + + if((u >= 127) && (u <= 159)) + return false; + + if((u >= 0xd800) && (u <= 0xdfff)) + return false; + + return true; +} + +Unicode map_to_private(CharCode code) +{ + Unicode private_mapping = (Unicode)(code + 0xE000); + if(private_mapping > 0xF8FF) + { + private_mapping = (Unicode)((private_mapping - 0xF8FF) + 0xF0000); + if(private_mapping > 0xFFFFD) + { + private_mapping = (Unicode)((private_mapping - 0xFFFFD) + 0x100000); + if(private_mapping > 0x10FFFD) + { + cerr << "Warning: all private use unicode are used" << endl; + } + } + } + return private_mapping; +} + +Unicode unicode_from_font (CharCode code, GfxFont * font) +{ + if(!font->isCIDFont()) + { + char * cname = dynamic_cast(font)->getCharName(code); + // may be untranslated ligature + if(cname) + { + Unicode ou = globalParams->mapNameToUnicode(cname); + + if(isLegalUnicode(ou)) + return ou; + } + } + + return map_to_private(code); +} + +Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) +{ + if(len == 0) + return map_to_private(code); + + if(len == 1) + { + if(isLegalUnicode(*u)) + return *u; + } + + return unicode_from_font(code, font); +} + +/* + * Copied from UTF.h / UTF8.h in poppler + */ +static int mapUTF8(Unicode u, char *buf, int bufSize) { + if (u <= 0x0000007f) { + if (bufSize < 
1) { + return 0; + } + buf[0] = (char)u; + return 1; + } else if (u <= 0x000007ff) { + if (bufSize < 2) { + return 0; + } + buf[0] = (char)(0xc0 + (u >> 6)); + buf[1] = (char)(0x80 + (u & 0x3f)); + return 2; + } else if (u <= 0x0000ffff) { + if (bufSize < 3) { + return 0; + } + buf[0] = (char)(0xe0 + (u >> 12)); + buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[2] = (char)(0x80 + (u & 0x3f)); + return 3; + } else if (u <= 0x0010ffff) { + if (bufSize < 4) { + return 0; + } + buf[0] = (char)(0xf0 + (u >> 18)); + buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); + buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[3] = (char)(0x80 + (u & 0x3f)); + return 4; + } else { + return 0; + } +} + +void outputUnicodes(ostream & out, const Unicode * u, int uLen) +{ + for(int i = 0; i < uLen; ++i) + { + switch(u[i]) + { + case '&': + out << "&"; + break; + case '\"': + out << """; + break; + case '\'': + out << "'"; + break; + case '<': + out << "<"; + break; + case '>': + out << ">"; + break; + default: + { + char buf[4]; + auto n = mapUTF8(u[i], buf, 4); + out.write(buf, n); + } + } + } +} + +} //namespace pdf2htmlEX diff --git a/src/util/unicode.h b/src/util/unicode.h new file mode 100644 index 0000000..9cc9dc6 --- /dev/null +++ b/src/util/unicode.h @@ -0,0 +1,41 @@ +/* + * Unicode manipulation functions + * + * by WangLu + * 2012.11.29 + */ + +#ifndef UNICODE_H__ +#define UNICODE_H__ + +#include + +#include +#include + +namespace pdf2htmlEX { + +/* + * Check if the unicode is valid for HTML + * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering + */ +bool isLegalUnicode(Unicode u); + +Unicode map_to_private(CharCode code); + +/* * Try to determine the Unicode value directly from the information in the font */ +Unicode unicode_from_font (CharCode code, GfxFont * font); + +/* + * We have to use a single Unicode value to reencode fonts + * if we got multi-unicode values, it might be expanded ligature, try to restore it + * if we cannot figure it out at the end, use a 
private mapping + */ +Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font); + +void outputUnicodes(std::ostream & out, const Unicode * u, int uLen); + + +} // namespace pdf2htmlEX + +#endif //UNICODE_H__ diff --git a/src/util/util.cc b/src/util/util.cc index a69654e..606b33d 100644 --- a/src/util/util.cc +++ b/src/util/util.cc @@ -29,31 +29,6 @@ using std::ostream; namespace pdf2htmlEX { -const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; - -const map BASE_14_FONT_CSS_FONT_MAP({ - { "Courier", "Courier,monospace" }, - { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" }, - { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" }, - { "Symbol", "Symbol,\"Standard Symbols L\"" }, - { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" }, -}); - -const map GB_ENCODED_FONT_NAME_MAP({ - {"\xCB\xCE\xCC\xE5", "SimSun"}, - {"\xBA\xDA\xCC\xE5", "SimHei"}, - {"\xBF\xAC\xCC\xE5_GB2312", "SimKai"}, - {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"}, - {"\xC1\xA5\xCA\xE9", "SimLi"}, -}); - -const std::map, std::pair > EMBED_STRING_MAP({ - {{".css", 0}, {""}}, - {{".css", 1}, {""}}, - {{".js", 0}, {""}}, - {{".js", 1}, {""}} -}); - void _tm_transform(const double * tm, double & x, double & y, bool is_delta) { double xx = x, yy = y; @@ -79,144 +54,6 @@ void _tm_multiply(double * tm_left, const double * tm_right) tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5]; } -bool isLegalUnicode(Unicode u) -{ - /* - if((u == 9) || (u == 10) || (u == 13)) - return true; - */ - - if(u <= 31) - return false; - - if((u >= 127) && (u <= 159)) - return false; - - if((u >= 0xd800) && (u <= 0xdfff)) - return false; - - return true; -} - -Unicode map_to_private(CharCode code) -{ - Unicode private_mapping = (Unicode)(code + 0xE000); - if(private_mapping > 0xF8FF) - { - private_mapping = (Unicode)((private_mapping - 0xF8FF) + 0xF0000); - if(private_mapping > 0xFFFFD) - { - private_mapping = (Unicode)((private_mapping - 0xFFFFD) + 0x100000); - 
if(private_mapping > 0x10FFFD) - { - cerr << "Warning: all private use unicode are used" << endl; - } - } - } - return private_mapping; -} - -Unicode unicode_from_font (CharCode code, GfxFont * font) -{ - if(!font->isCIDFont()) - { - char * cname = dynamic_cast(font)->getCharName(code); - // may be untranslated ligature - if(cname) - { - Unicode ou = globalParams->mapNameToUnicode(cname); - - if(isLegalUnicode(ou)) - return ou; - } - } - - return map_to_private(code); -} - -Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) -{ - if(len == 0) - return map_to_private(code); - - if(len == 1) - { - if(isLegalUnicode(*u)) - return *u; - } - - return unicode_from_font(code, font); -} - -/* - * Copied from UTF.h / UTF8.h in poppler - */ -static int mapUTF8(Unicode u, char *buf, int bufSize) { - if (u <= 0x0000007f) { - if (bufSize < 1) { - return 0; - } - buf[0] = (char)u; - return 1; - } else if (u <= 0x000007ff) { - if (bufSize < 2) { - return 0; - } - buf[0] = (char)(0xc0 + (u >> 6)); - buf[1] = (char)(0x80 + (u & 0x3f)); - return 2; - } else if (u <= 0x0000ffff) { - if (bufSize < 3) { - return 0; - } - buf[0] = (char)(0xe0 + (u >> 12)); - buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[2] = (char)(0x80 + (u & 0x3f)); - return 3; - } else if (u <= 0x0010ffff) { - if (bufSize < 4) { - return 0; - } - buf[0] = (char)(0xf0 + (u >> 18)); - buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); - buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[3] = (char)(0x80 + (u & 0x3f)); - return 4; - } else { - return 0; - } -} - -void outputUnicodes(ostream & out, const Unicode * u, int uLen) -{ - for(int i = 0; i < uLen; ++i) - { - switch(u[i]) - { - case '&': - out << "&"; - break; - case '\"': - out << """; - break; - case '\'': - out << "'"; - break; - case '<': - out << "<"; - break; - case '>': - out << ">"; - break; - default: - { - char buf[4]; - auto n = mapUTF8(u[i], buf, 4); - out.write(buf, n); - } - } - } -} const char * base64stream::base64_encoding = 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; diff --git a/src/util/util.h b/src/util/util.h index 26da1fa..d10ef2e 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -1,6 +1,5 @@ /* - * Constants & Misc functions - * + * Help classes and Functions * * by WangLu * 2012.08.10 @@ -19,7 +18,8 @@ #include #include -#include + +#include "const.h" #ifndef nullptr #define nullptr (NULL) @@ -27,18 +27,6 @@ namespace pdf2htmlEX { -static const double EPS = 1e-6; -extern const double id_matrix[6]; - -static const double DEFAULT_DPI = 72.0; - -extern const std::map BASE_14_FONT_CSS_FONT_MAP; -extern const std::map GB_ENCODED_FONT_NAME_MAP; -// map to embed files into html -// key: (suffix, if_embed_content) -// value: (prefix string, suffix string) -extern const std::map, std::pair > EMBED_STRING_MAP; - static inline double _round(double x) { return (std::abs(x) > EPS) ? x : 0.0; } static inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; } static inline bool _is_positive(double x) { return x > EPS; } @@ -59,26 +47,6 @@ static inline long long hash_ref(const Ref * id) return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen); } -/* - * Check if the unicode is valid for HTML - * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering - */ -bool isLegalUnicode(Unicode u); - -Unicode map_to_private(CharCode code); - -/* * Try to determine the Unicode value directly from the information in the font */ -Unicode unicode_from_font (CharCode code, GfxFont * font); - -/* - * We have to use a single Unicode value to reencode fonts - * if we got multi-unicode values, it might be expanded ligature, try to restore it - * if we cannot figure it out at the end, use a private mapping - */ -Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font); - -void outputUnicodes(std::ostream & out, const Unicode * u, int uLen); - class GfxRGB_hash { public: @@ -233,5 +201,6 @@ void css_fix_rectangle_border_width(double x1, 
double y1, double x2, double y2, std::ostream & operator << (std::ostream & out, const GfxRGB & rgb); -} // namespace util +} // namespace pdf2htmlEX + #endif //UTIL_H__ From ab28c44034fc6ad1353c566a4aaba3bb5bd694d5 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 17:50:40 +0800 Subject: [PATCH 10/38] moving base64/string_formatter out --- CMakeLists.txt | 3 ++ src/HTMLRenderer/HTMLRenderer.h | 1 + src/HTMLRenderer/export.cc | 1 + src/HTMLRenderer/general.cc | 1 + src/util/base64.cc | 3 ++ src/util/base64.h | 62 +++++++++++++++++++++++ src/util/string_formatter.h | 53 +++++++++++++++++++ src/util/util.cc | 3 -- src/util/util.h | 90 --------------------------------- 9 files changed, 124 insertions(+), 93 deletions(-) create mode 100644 src/util/base64.cc create mode 100644 src/util/base64.h create mode 100644 src/util/string_formatter.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 068bcee..66a2cd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,11 +165,14 @@ add_executable(pdf2htmlEX src/BackgroundRenderer/CairoBackgroundRenderer.cc src/util/ArgParser.h src/util/ArgParser.cc + src/util/base64.h + src/util/base64.cc src/util/const.h src/util/const.cc src/util/ffw.h src/util/ffw.c src/util/namespace.h + src/util/string_formatter.h src/util/TmpFiles.h src/util/TmpFiles.cc src/util/unicode.h diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 2922a8e..9ff3b11 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -27,6 +27,7 @@ #include "Param.h" #include "Preprocessor.h" #include "util/util.h" +#include "util/string_formatter.h" #include "util/TmpFiles.h" /* diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index b3fa713..5c79e43 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -12,6 +12,7 @@ #include "HTMLRenderer.h" #include "util/namespace.h" +#include "util/base64.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/general.cc 
b/src/HTMLRenderer/general.cc index a8253bc..ab83699 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -18,6 +18,7 @@ #include "BackgroundRenderer/BackgroundRenderer.h" #include "util/namespace.h" #include "util/ffw.h" +#include "util/base64.h" namespace pdf2htmlEX { diff --git a/src/util/base64.cc b/src/util/base64.cc new file mode 100644 index 0000000..51b4a7e --- /dev/null +++ b/src/util/base64.cc @@ -0,0 +1,3 @@ +#include "base64.h" + +const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; diff --git a/src/util/base64.h b/src/util/base64.h new file mode 100644 index 0000000..efebac8 --- /dev/null +++ b/src/util/base64.h @@ -0,0 +1,62 @@ +/* + * Base64 Encoding + * + * by WangLu + * 2012.11.29 + */ + +#ifndef BASE64_H__ +#define BASE64_H__ + +#include + +class base64stream +{ +public: + + base64stream(std::istream & in) : in(&in) { } + base64stream(std::istream && in) : in(&in) { } + + std::ostream & dumpto(std::ostream & out) + { + unsigned char buf[3]; + while(in->read((char*)buf, 3)) + { + out << base64_encoding[(buf[0] & 0xfc)>>2] + << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)] + << base64_encoding[((buf[1] & 0x0f)<<2) | ((buf[2] & 0xc0)>>6)] + << base64_encoding[(buf[2] & 0x3f)]; + } + auto cnt = in->gcount(); + if(cnt > 0) + { + for(int i = cnt; i < 3; ++i) + buf[i] = 0; + + out << base64_encoding[(buf[0] & 0xfc)>>2] + << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)]; + + if(cnt > 1) + { + out << base64_encoding[(buf[1] & 0x0f)<<2]; + } + else + { + out << '='; + } + out << '='; + } + + return out; + } + +private: + std::istream * in; + static const char * base64_encoding; +}; + +static inline std::ostream & operator << (std::ostream & out, base64stream & bf) { return bf.dumpto(out); } +static inline std::ostream & operator << (std::ostream & out, base64stream && bf) { return bf.dumpto(out); } + + +#endif //BASE64_H__ diff --git 
a/src/util/string_formatter.h b/src/util/string_formatter.h new file mode 100644 index 0000000..64da233 --- /dev/null +++ b/src/util/string_formatter.h @@ -0,0 +1,53 @@ +/* + * Buffer reusing string formatter + * + * by WangLu + * 2012.11.29 + */ + +#ifndef STRING_FORMATTER_H__ +#define STRING_FORMATTER_H__ + +class string_formatter +{ +public: + class guarded_pointer + { + public: + guarded_pointer(string_formatter * sf) : sf(sf) { ++(sf->buf_cnt); } + ~guarded_pointer(void) { --(sf->buf_cnt); } + operator char* () { return &(sf->buf.front()); } + private: + string_formatter * sf; + }; + + string_formatter() : buf_cnt(0) { buf.reserve(L_tmpnam); } + /* + * Important: + * there is only one buffer, so new strings will replace old ones + */ + guarded_pointer operator () (const char * format, ...) { + assert((buf_cnt == 0) && "string_formatter: buffer is reused!"); + + va_list vlist; + va_start(vlist, format); + int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + if(l >= (int)buf.capacity()) + { + buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); + va_start(vlist, format); + l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + } + assert(l >= 0); // we should fail when vsnprintf fail + assert(l < (int)buf.capacity()); + return guarded_pointer(this); + } +private: + friend class guarded_pointer; + std::vector buf; + int buf_cnt; +}; + +#endif //STRING_FORMATTER_H__ diff --git a/src/util/util.cc b/src/util/util.cc index 606b33d..feb09ea 100644 --- a/src/util/util.cc +++ b/src/util/util.cc @@ -54,9 +54,6 @@ void _tm_multiply(double * tm_left, const double * tm_right) tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5]; } - -const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - void create_directories(string path) { if(path.empty()) return; diff --git a/src/util/util.h b/src/util/util.h index d10ef2e..e9011a9 100644 --- a/src/util/util.h 
+++ b/src/util/util.h @@ -92,96 +92,6 @@ public: } }; -class base64stream -{ -public: - - base64stream(std::istream & in) : in(&in) { } - base64stream(std::istream && in) : in(&in) { } - - std::ostream & dumpto(std::ostream & out) - { - unsigned char buf[3]; - while(in->read((char*)buf, 3)) - { - out << base64_encoding[(buf[0] & 0xfc)>>2] - << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)] - << base64_encoding[((buf[1] & 0x0f)<<2) | ((buf[2] & 0xc0)>>6)] - << base64_encoding[(buf[2] & 0x3f)]; - } - auto cnt = in->gcount(); - if(cnt > 0) - { - for(int i = cnt; i < 3; ++i) - buf[i] = 0; - - out << base64_encoding[(buf[0] & 0xfc)>>2] - << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)]; - - if(cnt > 1) - { - out << base64_encoding[(buf[1] & 0x0f)<<2]; - } - else - { - out << '='; - } - out << '='; - } - - return out; - } - -private: - std::istream * in; - static const char * base64_encoding; -}; - -static inline std::ostream & operator << (std::ostream & out, base64stream & bf) { return bf.dumpto(out); } -static inline std::ostream & operator << (std::ostream & out, base64stream && bf) { return bf.dumpto(out); } - -class string_formatter -{ -public: - class guarded_pointer - { - public: - guarded_pointer(string_formatter * sf) : sf(sf) { ++(sf->buf_cnt); } - ~guarded_pointer(void) { --(sf->buf_cnt); } - operator char* () { return &(sf->buf.front()); } - private: - string_formatter * sf; - }; - - string_formatter() : buf_cnt(0) { buf.reserve(L_tmpnam); } - /* - * Important: - * there is only one buffer, so new strings will replace old ones - */ - guarded_pointer operator () (const char * format, ...) 
{ - assert((buf_cnt == 0) && "string_formatter: buffer is reused!"); - - va_list vlist; - va_start(vlist, format); - int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); - va_end(vlist); - if(l >= (int)buf.capacity()) - { - buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); - va_start(vlist, format); - l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); - va_end(vlist); - } - assert(l >= 0); // we should fail when vsnprintf fail - assert(l < (int)buf.capacity()); - return guarded_pointer(this); - } -private: - friend class guarded_pointer; - std::vector buf; - int buf_cnt; -}; - void create_directories(std::string path); bool is_truetype_suffix(const std::string & suffix); From d179b50147a443cf350a5de26648cfd789386ce0 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 18:16:05 +0800 Subject: [PATCH 11/38] working on util.h --- CMakeLists.txt | 4 ++ src/HTMLRenderer/HTMLRenderer.h | 44 +++++++++++++++++ src/HTMLRenderer/LineBuffer.cc | 4 +- src/HTMLRenderer/draw.cc | 52 ++++++++++----------- src/HTMLRenderer/export.cc | 29 ++++++------ src/HTMLRenderer/general.cc | 10 ++-- src/HTMLRenderer/install.cc | 10 ++-- src/HTMLRenderer/link.cc | 15 +++--- src/HTMLRenderer/state.cc | 32 ++++++------- src/HTMLRenderer/text.cc | 4 +- src/pdf2htmlEX.cc | 1 + src/util/base64.cc | 4 ++ src/util/base64.h | 4 +- src/util/math.cc | 32 +++++++++++++ src/util/math.h | 33 +++++++++++++ src/util/path.cc | 65 ++++++++++++++++++++++++++ src/util/path.h | 23 +++++++++ src/util/string_formatter.h | 3 ++ src/util/util.cc | 83 --------------------------------- src/util/util.h | 68 --------------------------- 20 files changed, 293 insertions(+), 227 deletions(-) create mode 100644 src/util/math.cc create mode 100644 src/util/math.h create mode 100644 src/util/path.cc create mode 100644 src/util/path.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 66a2cd6..2badbab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,7 +171,11 @@ 
add_executable(pdf2htmlEX src/util/const.cc src/util/ffw.h src/util/ffw.c + src/util/math.h + src/util/math.cc src/util/namespace.h + src/util/path.h + src/util/path.cc src/util/string_formatter.h src/util/TmpFiles.h src/util/TmpFiles.cc diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 9ff3b11..d11b62a 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -62,6 +62,50 @@ namespace pdf2htmlEX { +// we may need more info of a font in the future +class FontInfo +{ +public: + long long id; + bool use_tounicode; + int em_size; + double ascent, descent; +}; + +class GfxRGB_hash +{ +public: + size_t operator () (const GfxRGB & rgb) const + { + return (colToByte(rgb.r) << 16) | (colToByte(rgb.g) << 8) | (colToByte(rgb.b)); + } +}; + +class GfxRGB_equal +{ +public: + bool operator ()(const GfxRGB & rgb1, const GfxRGB & rgb2) const + { + return ((rgb1.r == rgb2.r) && (rgb1.g == rgb2.g) && (rgb1.b == rgb1.b)); + } +}; + +class Matrix_less +{ +public: + bool operator () (const Matrix & m1, const Matrix & m2) const + { + // Note that we only care about the first 4 elements + for(int i = 0; i < 4; ++i) + { + if(m1.m[i] < m2.m[i] - EPS) + return true; + if(m1.m[i] > m2.m[i] + EPS) + return false; + } + return false; + } +}; class HTMLRenderer : public OutputDev { public: diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index ebbf7ad..8876ed0 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -81,8 +81,8 @@ void HTMLRenderer::LineBuffer::flush(void) ostream & out = renderer->html_fout; out << "
install_height(max_ascent) diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index 5132d2d..0185c07 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -14,7 +14,7 @@ #include #include "HTMLRenderer.h" -#include "util/util.h" +#include "util/math.h" #include "util/namespace.h" namespace pdf2htmlEX { @@ -33,36 +33,36 @@ static bool is_horizontal_line(GfxSubpath * path) { return ((path->getNumPoints() == 2) && (!path->getCurve(1)) - && (_equal(path->getY(0), path->getY(1)))); + && (equal(path->getY(0), path->getY(1)))); } static bool is_vertical_line(GfxSubpath * path) { return ((path->getNumPoints() == 2) && (!path->getCurve(1)) - && (_equal(path->getX(0), path->getX(1)))); + && (equal(path->getX(0), path->getX(1)))); } static bool is_rectangle(GfxSubpath * path) { if (!(((path->getNumPoints() != 4) && (path->isClosed())) || ((path->getNumPoints() == 5) - && _equal(path->getX(0), path->getX(4)) - && _equal(path->getY(0), path->getY(4))))) + && equal(path->getX(0), path->getX(4)) + && equal(path->getY(0), path->getY(4))))) return false; for(int i = 1; i < path->getNumPoints(); ++i) if(path->getCurve(i)) return false; - return (_equal(path->getY(0), path->getY(1)) - && _equal(path->getX(1), path->getX(2)) - && _equal(path->getY(2), path->getY(3)) - && _equal(path->getX(3), path->getX(0))) - || (_equal(path->getX(0), path->getX(1)) - && _equal(path->getY(1), path->getY(2)) - && _equal(path->getX(2), path->getX(3)) - && _equal(path->getY(3), path->getY(0))); + return (equal(path->getY(0), path->getY(1)) + && equal(path->getX(1), path->getX(2)) + && equal(path->getY(2), path->getY(3)) + && equal(path->getX(3), path->getX(0))) + || (equal(path->getX(0), path->getX(1)) + && equal(path->getY(1), path->getY(2)) + && equal(path->getX(2), path->getX(3)) + && equal(path->getY(3), path->getY(0))); } static void get_shading_bbox(GfxState * state, GfxShading * shading, @@ -105,7 +105,7 @@ static void get_shading_bbox(GfxState * state, GfxShading 
* shading, */ static double get_angle(double dx, double dy) { - double r = _hypot(dx, dy); + double r = hypot(dx, dy); /* * acos always returns [0, pi] @@ -208,10 +208,10 @@ void LinearGradient::dumpto (ostream & out) auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) { - out << "background-image:" << (*iter) << "linear-gradient(" << _round(angle) << "rad"; + out << "background-image:" << (*iter) << "linear-gradient(" << round(angle) << "rad"; for(auto iter2 = stops.begin(); iter2 != stops.end(); ++iter2) { - out << "," << (iter2->rgb) << " " << _round((iter2->pos) * 100) << "%"; + out << "," << (iter2->rgb) << " " << round((iter2->pos) * 100) << "%"; } out << ");"; } @@ -318,7 +318,7 @@ bool HTMLRenderer::css_do_path(GfxState *state, bool fill, bool test_only) GfxRGB * ps = fill ? nullptr : (&stroke_color); GfxRGB * pf = fill ? (&fill_color) : nullptr; - if(_equal(h, 0) || _equal(w, 0)) + if(equal(h, 0) || equal(w, 0)) { // orthogonal line @@ -351,7 +351,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co double new_tm[6]; memcpy(new_tm, tm, sizeof(new_tm)); - _tm_transform(new_tm, x, y); + tm_transform(new_tm, x, y); double scale = 1.0; { @@ -359,8 +359,8 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co double i1 = (new_tm[0] + new_tm[2]) / sqrt2; double i2 = (new_tm[1] + new_tm[3]) / sqrt2; - scale = _hypot(i1, i2); - if(_is_positive(scale)) + scale = hypot(i1, i2); + if(is_positive(scale)) { for(int i = 0; i < 4; ++i) new_tm[i] /= scale; @@ -383,8 +383,8 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co if(i > 0) html_fout << ' '; double lw = line_width_array[i] * scale; - html_fout << _round(lw); - if(_is_positive(lw)) html_fout << "px"; + html_fout << round(lw); + if(is_positive(lw)) html_fout << "px"; } html_fout << ";"; } @@ -407,10 +407,10 @@ void 
HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co style_function(style_function_data, html_fout); } - html_fout << "bottom:" << _round(y) << "px;" - << "left:" << _round(x) << "px;" - << "width:" << _round(w * scale) << "px;" - << "height:" << _round(h * scale) << "px;"; + html_fout << "bottom:" << round(y) << "px;" + << "left:" << round(x) << "px;" + << "width:" << round(w * scale) << "px;" + << "height:" << round(h * scale) << "px;"; html_fout << "\">
"; } diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index 5c79e43..c8e0ea8 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -13,6 +13,7 @@ #include "HTMLRenderer.h" #include "util/namespace.h" #include "util/base64.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -39,7 +40,7 @@ void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suff css_fout << ")format(\"" << fontfileformat << "\");}.f" << info.id << "{font-family:f" << info.id - << ";line-height:" << _round(info.ascent - info.descent) + << ";line-height:" << round(info.ascent - info.descent) << ";font-style:normal;font-weight:normal;}"; css_fout << endl; @@ -82,14 +83,14 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons else css_fout << "font-style:normal;"; - css_fout << "line-height:" << _round(info.ascent - info.descent) << ";"; + css_fout << "line-height:" << round(info.ascent - info.descent) << ";"; css_fout << "}" << endl; } void HTMLRenderer::export_font_size (long long fs_id, double font_size) { - css_fout << ".s" << fs_id << "{font-size:" << _round(font_size) << "px;}" << endl; + css_fout << ".s" << fs_id << "{font-size:" << round(font_size) << "px;}" << endl; } void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) @@ -100,7 +101,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) // we have already shifted the origin // TODO: recognize common matices - if(_tm_equal(tm, ID_MATRIX, 4)) + if(tm_equal(tm, ID_MATRIX, 4)) { auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) @@ -113,10 +114,10 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) { // PDF use a different coordinate system from Web css_fout << *iter << "transform:matrix(" - << _round(tm[0]) << ',' - << _round(-tm[1]) << ',' - << _round(-tm[2]) << ',' - << _round(tm[3]) << ','; 
+ << round(tm[0]) << ',' + << round(-tm[1]) << ',' + << round(-tm[2]) << ',' + << round(tm[3]) << ','; css_fout << "0,0);"; } @@ -126,12 +127,12 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) void HTMLRenderer::export_letter_space (long long ls_id, double letter_space) { - css_fout << ".l" << ls_id << "{letter-spacing:" << _round(letter_space) << "px;}" << endl; + css_fout << ".l" << ls_id << "{letter-spacing:" << round(letter_space) << "px;}" << endl; } void HTMLRenderer::export_word_space (long long ws_id, double word_space) { - css_fout << ".w" << ws_id << "{word-spacing:" << _round(word_space) << "px;}" << endl; + css_fout << ".w" << ws_id << "{word-spacing:" << round(word_space) << "px;}" << endl; } void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) @@ -142,19 +143,19 @@ void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) { if(ws_width > 0) - css_fout << "._" << ws_id << "{display:inline-block;width:" << _round(ws_width) << "px;}" << endl; + css_fout << "._" << ws_id << "{display:inline-block;width:" << round(ws_width) << "px;}" << endl; else - css_fout << "._" << ws_id << "{display:inline;margin-left:" << _round(ws_width) << "px;}" << endl; + css_fout << "._" << ws_id << "{display:inline;margin-left:" << round(ws_width) << "px;}" << endl; } void HTMLRenderer::export_rise (long long rise_id, double rise) { - css_fout << ".r" << rise_id << "{top:" << _round(-rise) << "px;}" << endl; + css_fout << ".r" << rise_id << "{top:" << round(-rise) << "px;}" << endl; } void HTMLRenderer::export_height (long long height_id, double height) { - css_fout << ".h" << height_id << "{height:" << _round(height) << "px;}" << endl; + css_fout << ".h" << height_id << "{height:" << round(height) << "px;}" << endl; } } diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index ab83699..2c8833d 100644 --- 
a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -19,6 +19,8 @@ #include "util/namespace.h" #include "util/ffw.h" #include "util/base64.h" +#include "util/math.h" +#include "util/path.h" namespace pdf2htmlEX { @@ -211,7 +213,7 @@ void HTMLRenderer::endPage() { for(int i = 0; i < 6; ++i) { if(i > 0) html_fout << ","; - html_fout << _round(default_ctm[i]); + html_fout << round(default_ctm[i]); } html_fout << "]"; @@ -233,17 +235,17 @@ void HTMLRenderer::pre_process(PDFDoc * doc) vector zoom_factors; - if(_is_positive(param->zoom)) + if(is_positive(param->zoom)) { zoom_factors.push_back(param->zoom); } - if(_is_positive(param->fit_width)) + if(is_positive(param->fit_width)) { zoom_factors.push_back((param->fit_width) / preprocessor.get_max_width()); } - if(_is_positive(param->fit_height)) + if(is_positive(param->fit_height)) { zoom_factors.push_back((param->fit_height) / preprocessor.get_max_height()); } diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index ee839cd..a51dbf7 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -16,7 +16,7 @@ #include "Param.h" #include "HTMLRenderer.h" #include "util/namespace.h" -#include "util/util.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -204,7 +204,7 @@ void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info) long long HTMLRenderer::install_font_size(double font_size) { auto iter = font_size_map.lower_bound(font_size - EPS); - if((iter != font_size_map.end()) && (_equal(iter->first, font_size))) + if((iter != font_size_map.end()) && (equal(iter->first, font_size))) return iter->second; long long new_fs_id = font_size_map.size(); @@ -219,7 +219,7 @@ long long HTMLRenderer::install_transform_matrix(const double * tm) memcpy(m.m, tm, sizeof(m.m)); auto iter = transform_matrix_map.lower_bound(m); - if((iter != transform_matrix_map.end()) && (_tm_equal(m.m, iter->first.m, 4))) + if((iter != transform_matrix_map.end()) && (tm_equal(m.m, 
iter->first.m, 4))) return iter->second; long long new_tm_id = transform_matrix_map.size(); @@ -231,7 +231,7 @@ long long HTMLRenderer::install_transform_matrix(const double * tm) long long HTMLRenderer::install_letter_space(double letter_space) { auto iter = letter_space_map.lower_bound(letter_space - EPS); - if((iter != letter_space_map.end()) && (_equal(iter->first, letter_space))) + if((iter != letter_space_map.end()) && (equal(iter->first, letter_space))) return iter->second; long long new_ls_id = letter_space_map.size(); @@ -243,7 +243,7 @@ long long HTMLRenderer::install_letter_space(double letter_space) long long HTMLRenderer::install_word_space(double word_space) { auto iter = word_space_map.lower_bound(word_space - EPS); - if((iter != word_space_map.end()) && (_equal(iter->first, word_space))) + if((iter != word_space_map.end()) && (equal(iter->first, word_space))) return iter->second; long long new_ws_id = word_space_map.size(); diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc index 6b1f9ff..68126dd 100644 --- a/src/HTMLRenderer/link.cc +++ b/src/HTMLRenderer/link.cc @@ -15,6 +15,7 @@ #include "HTMLRenderer.h" #include "util/namespace.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -211,9 +212,9 @@ void HTMLRenderer::processLink(AnnotLink * al) border_top_bottom_width, border_left_right_width); if(abs(border_top_bottom_width - border_left_right_width) < EPS) - html_fout << "border-width:" << _round(border_top_bottom_width) << "px;"; + html_fout << "border-width:" << round(border_top_bottom_width) << "px;"; else - html_fout << "border-width:" << _round(border_top_bottom_width) << "px " << _round(border_left_right_width) << "px;"; + html_fout << "border-width:" << round(border_top_bottom_width) << "px " << round(border_left_right_width) << "px;"; } auto style = border->getStyle(); switch(style) @@ -267,13 +268,13 @@ void HTMLRenderer::processLink(AnnotLink * al) html_fout << "border-style:none;"; } - _tm_transform(default_ctm, x, y); + 
tm_transform(default_ctm, x, y); html_fout << "position:absolute;" - << "left:" << _round(x) << "px;" - << "bottom:" << _round(y) << "px;" - << "width:" << _round(w) << "px;" - << "height:" << _round(h) << "px;"; + << "left:" << round(x) << "px;" + << "bottom:" << round(y) << "px;" + << "width:" << round(w) << "px;" + << "height:" << round(h) << "px;"; // fix for IE html_fout << "background-color:rgba(255,255,255,0.000001);"; diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 1474469..7cc9ee3 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -17,7 +17,7 @@ #include "HTMLRenderer.h" #include "util/namespace.h" -#include "util/util.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -104,7 +104,7 @@ void HTMLRenderer::check_state_change(GfxState * state) } double new_font_size = state->getFontSize(); - if(!_equal(cur_font_size, new_font_size)) + if(!equal(cur_font_size, new_font_size)) { need_rescale_font = true; cur_font_size = new_font_size; @@ -132,7 +132,7 @@ void HTMLRenderer::check_state_change(GfxState * state) new_ctm[5] = m1[1] * m2[4] + m1[3] * m2[5] + m1[5]; //new_ctm[4] = new_ctm[5] = 0; - if(!_tm_equal(new_ctm, cur_text_tm)) + if(!tm_equal(new_ctm, cur_text_tm)) { need_recheck_position = true; need_rescale_font = true; @@ -147,10 +147,10 @@ void HTMLRenderer::check_state_change(GfxState * state) double new_draw_text_tm[6]; memcpy(new_draw_text_tm, cur_text_tm, sizeof(new_draw_text_tm)); - double new_draw_text_scale = 1.0/text_scale_factor2 * _hypot(new_draw_text_tm[2], new_draw_text_tm[3]); + double new_draw_text_scale = 1.0/text_scale_factor2 * hypot(new_draw_text_tm[2], new_draw_text_tm[3]); double new_draw_font_size = cur_font_size; - if(_is_positive(new_draw_text_scale)) + if(is_positive(new_draw_text_scale)) { new_draw_font_size *= new_draw_text_scale; for(int i = 0; i < 4; ++i) @@ -161,19 +161,19 @@ void HTMLRenderer::check_state_change(GfxState * state) new_draw_text_scale = 1.0; } - 
if(!(_equal(new_draw_text_scale, draw_text_scale))) + if(!(equal(new_draw_text_scale, draw_text_scale))) { draw_text_scale_changed = true; draw_text_scale = new_draw_text_scale; } - if(!(_equal(new_draw_font_size, draw_font_size))) + if(!(equal(new_draw_font_size, draw_font_size))) { new_line_state = max(new_line_state, NLS_SPAN); draw_font_size = new_draw_font_size; cur_fs_id = install_font_size(draw_font_size); } - if(!(_tm_equal(new_draw_text_tm, draw_text_tm, 4))) + if(!(tm_equal(new_draw_text_tm, draw_text_tm, 4))) { new_line_state = max(new_line_state, NLS_DIV); memcpy(draw_text_tm, new_draw_text_tm, sizeof(draw_text_tm)); @@ -199,21 +199,21 @@ void HTMLRenderer::check_state_change(GfxState * state) */ bool merged = false; - if(_tm_equal(old_ctm, cur_text_tm, 4)) + if(tm_equal(old_ctm, cur_text_tm, 4)) { double dy = cur_ty - draw_ty; double tdx = old_ctm[4] - cur_text_tm[4] - cur_text_tm[2] * dy; double tdy = old_ctm[5] - cur_text_tm[5] - cur_text_tm[3] * dy; - if(_equal(cur_text_tm[0] * tdy, cur_text_tm[1] * tdx)) + if(equal(cur_text_tm[0] * tdy, cur_text_tm[1] * tdx)) { - if(_is_positive(cur_text_tm[0])) + if(is_positive(cur_text_tm[0])) { draw_tx += tdx / cur_text_tm[0]; draw_ty += dy; merged = true; } - else if (_is_positive(cur_text_tm[1])) + else if (is_positive(cur_text_tm[1])) { draw_tx += tdy / cur_text_tm[1]; draw_ty += dy; @@ -221,7 +221,7 @@ void HTMLRenderer::check_state_change(GfxState * state) } else { - if((_equal(tdx,0)) && (_equal(tdy,0))) + if((equal(tdx,0)) && (equal(tdy,0))) { // free draw_tx = cur_tx; @@ -246,7 +246,7 @@ void HTMLRenderer::check_state_change(GfxState * state) if(all_changed || letter_space_changed || draw_text_scale_changed) { double new_letter_space = state->getCharSpace(); - if(!_equal(cur_letter_space, new_letter_space)) + if(!equal(cur_letter_space, new_letter_space)) { new_line_state = max(new_line_state, NLS_SPAN); cur_letter_space = new_letter_space; @@ -259,7 +259,7 @@ void 
HTMLRenderer::check_state_change(GfxState * state) if(all_changed || word_space_changed || draw_text_scale_changed) { double new_word_space = state->getWordSpace(); - if(!_equal(cur_word_space, new_word_space)) + if(!equal(cur_word_space, new_word_space)) { new_line_state = max(new_line_state, NLS_SPAN); cur_word_space = new_word_space; @@ -294,7 +294,7 @@ void HTMLRenderer::check_state_change(GfxState * state) if(all_changed || rise_changed || draw_text_scale_changed) { double new_rise = state->getRise(); - if(!_equal(cur_rise, new_rise)) + if(!equal(cur_rise, new_rise)) { new_line_state = max(new_line_state, NLS_SPAN); cur_rise = new_rise; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 2a4dda9..2f40b82 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -19,6 +19,8 @@ #include "util/ffw.h" #include "util/namespace.h" #include "util/unicode.h" +#include "util/path.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -542,7 +544,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) while (len > 0) { auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); - if(!(_equal(ox, 0) && _equal(oy, 0))) + if(!(equal(ox, 0) && equal(oy, 0))) { cerr << "TODO: non-zero origins" << endl; } diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index f971243..df6ae7d 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -23,6 +23,7 @@ #include "pdf2htmlEX-config.h" #include "HTMLRenderer/HTMLRenderer.h" #include "util/ArgParser.h" +#include "util/path.h" using namespace std; using namespace pdf2htmlEX; diff --git a/src/util/base64.cc b/src/util/base64.cc index 51b4a7e..8e462bf 100644 --- a/src/util/base64.cc +++ b/src/util/base64.cc @@ -1,3 +1,7 @@ #include "base64.h" +namespace pdf2htmlEX { + const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +} diff --git a/src/util/base64.h b/src/util/base64.h index efebac8..3c53a84 100644 --- 
a/src/util/base64.h +++ b/src/util/base64.h @@ -10,6 +10,8 @@ #include +namespace pdf2htmlEX { + class base64stream { public: @@ -58,5 +60,5 @@ private: static inline std::ostream & operator << (std::ostream & out, base64stream & bf) { return bf.dumpto(out); } static inline std::ostream & operator << (std::ostream & out, base64stream && bf) { return bf.dumpto(out); } - +} //namespace pdf2htmlEX #endif //BASE64_H__ diff --git a/src/util/math.cc b/src/util/math.cc new file mode 100644 index 0000000..d23d48f --- /dev/null +++ b/src/util/math.cc @@ -0,0 +1,32 @@ +#include +#include "math.h" + +namespace pdf2htmlEX { + +void tm_transform(const double * tm, double & x, double & y, bool is_delta) +{ + double xx = x, yy = y; + x = tm[0] * xx + tm[2] * yy; + y = tm[1] * xx + tm[3] * yy; + if(!is_delta) + { + x += tm[4]; + y += tm[5]; + } +} + +void tm_multiply(double * tm_left, const double * tm_right) +{ + double old[4]; + memcpy(old, tm_left, sizeof(old)); + + tm_left[0] = old[0] * tm_right[0] + old[2] * tm_right[1]; + tm_left[1] = old[1] * tm_right[0] + old[3] * tm_right[1]; + tm_left[2] = old[0] * tm_right[2] + old[2] * tm_right[3]; + tm_left[3] = old[1] * tm_right[2] + old[3] * tm_right[3]; + tm_left[4] += old[0] * tm_right[4] + old[2] * tm_right[5]; + tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5]; +} + +} //namespace pdf2htmlEX + diff --git a/src/util/math.h b/src/util/math.h new file mode 100644 index 0000000..2bfb6bc --- /dev/null +++ b/src/util/math.h @@ -0,0 +1,33 @@ +/* + * Math functions + * + * by WangLu + * 2012.11.29 + */ + +#ifndef MATH_H__ +#define MATH_H__ + +#include + +#include "const.h" + +namespace pdf2htmlEX { + +static inline double round(double x) { return (std::fabs(x) > EPS) ? 
x : 0.0; } +static inline bool equal(double x, double y) { return std::abs(x-y) < EPS; } +static inline bool is_positive(double x) { return x > EPS; } +static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6) +{ + for(int i = 0; i < size; ++i) + if(!equal(tm1[i], tm2[i])) + return false; + return true; +} +static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); } + +void tm_transform(const double * tm, double & x, double & y, bool is_delta = false); +void tm_multiply(double * tm_left, const double * tm_right); + +} //namespace pdf2htmlEX +#endif //MATH_H__ diff --git a/src/util/path.cc b/src/util/path.cc new file mode 100644 index 0000000..c6f5804 --- /dev/null +++ b/src/util/path.cc @@ -0,0 +1,65 @@ +#include +#include + +#include "path.h" + +using std::string; + +namespace pdf2htmlEX { + +void create_directories(string path) +{ + if(path.empty()) return; + + size_t idx = path.rfind('/'); + if(idx != string::npos) + { + create_directories(path.substr(0, idx)); + } + + int r = mkdir(path.c_str(), S_IRWXU); + if(r != 0) + { + if(errno == EEXIST) + { + struct stat stat_buf; + if((stat(path.c_str(), &stat_buf) == 0) && S_ISDIR(stat_buf.st_mode)) + return; + } + + throw string("Cannot create directory: ") + path; + } +} + +bool is_truetype_suffix(const string & suffix) +{ + return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"); +} + +string get_filename (const string & path) +{ + size_t idx = path.rfind('/'); + if(idx == string::npos) + return path; + else if (idx == path.size() - 1) + return ""; + return path.substr(idx + 1); +} + +string get_suffix(const string & path) +{ + string fn = get_filename(path); + size_t idx = fn.rfind('.'); + if(idx == string::npos) + return ""; + else + { + string s = fn.substr(idx); + for(auto iter = s.begin(); iter != s.end(); ++iter) + *iter = tolower(*iter); + return s; + } +} + + +} //namespace pdf2htmlEX diff --git a/src/util/path.h b/src/util/path.h new file mode 100644 
index 0000000..ddc9a4a --- /dev/null +++ b/src/util/path.h @@ -0,0 +1,23 @@ +/* + * Function handling filenames and paths + * + * by WangLu + * 2012.11.29 + */ + +#ifndef PATH_H__ +#define PATH_H__ + +#include + +namespace pdf2htmlEX { + +void create_directories(std::string path); + +bool is_truetype_suffix(const std::string & suffix); + +std::string get_filename(const std::string & path); +std::string get_suffix(const std::string & path); + +} //namespace pdf2htmlEX +#endif //PATH_H__ diff --git a/src/util/string_formatter.h b/src/util/string_formatter.h index 64da233..a5dc08b 100644 --- a/src/util/string_formatter.h +++ b/src/util/string_formatter.h @@ -8,6 +8,8 @@ #ifndef STRING_FORMATTER_H__ #define STRING_FORMATTER_H__ +namespace pdf2htmlEX { + class string_formatter { public: @@ -50,4 +52,5 @@ private: int buf_cnt; }; +} //namespace pdf2htmlEX #endif //STRING_FORMATTER_H__ diff --git a/src/util/util.cc b/src/util/util.cc index feb09ea..034d108 100644 --- a/src/util/util.cc +++ b/src/util/util.cc @@ -15,10 +15,6 @@ #include #include -// for mkdir -#include -#include - #include "util.h" using std::cerr; @@ -29,85 +25,6 @@ using std::ostream; namespace pdf2htmlEX { -void _tm_transform(const double * tm, double & x, double & y, bool is_delta) -{ - double xx = x, yy = y; - x = tm[0] * xx + tm[2] * yy; - y = tm[1] * xx + tm[3] * yy; - if(!is_delta) - { - x += tm[4]; - y += tm[5]; - } -} - -void _tm_multiply(double * tm_left, const double * tm_right) -{ - double old[4]; - memcpy(old, tm_left, sizeof(old)); - - tm_left[0] = old[0] * tm_right[0] + old[2] * tm_right[1]; - tm_left[1] = old[1] * tm_right[0] + old[3] * tm_right[1]; - tm_left[2] = old[0] * tm_right[2] + old[2] * tm_right[3]; - tm_left[3] = old[1] * tm_right[2] + old[3] * tm_right[3]; - tm_left[4] += old[0] * tm_right[4] + old[2] * tm_right[5]; - tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5]; -} - -void create_directories(string path) -{ - if(path.empty()) return; - - size_t idx = 
path.rfind('/'); - if(idx != string::npos) - { - create_directories(path.substr(0, idx)); - } - - int r = mkdir(path.c_str(), S_IRWXU); - if(r != 0) - { - if(errno == EEXIST) - { - struct stat stat_buf; - if((stat(path.c_str(), &stat_buf) == 0) && S_ISDIR(stat_buf.st_mode)) - return; - } - - throw string("Cannot create directory: ") + path; - } -} - -bool is_truetype_suffix(const string & suffix) -{ - return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"); -} - -string get_filename (const string & path) -{ - size_t idx = path.rfind('/'); - if(idx == string::npos) - return path; - else if (idx == path.size() - 1) - return ""; - return path.substr(idx + 1); -} - -string get_suffix(const string & path) -{ - string fn = get_filename(path); - size_t idx = fn.rfind('.'); - if(idx == string::npos) - return ""; - else - { - string s = fn.substr(idx); - for(auto iter = s.begin(); iter != s.end(); ++iter) - *iter = tolower(*iter); - return s; - } -} - void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2, double border_width, diff --git a/src/util/util.h b/src/util/util.h index e9011a9..24e1248 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -17,8 +17,6 @@ #include #include -#include - #include "const.h" #ifndef nullptr @@ -27,77 +25,11 @@ namespace pdf2htmlEX { -static inline double _round(double x) { return (std::abs(x) > EPS) ? 
x : 0.0; } -static inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; } -static inline bool _is_positive(double x) { return x > EPS; } -static inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6) -{ - for(int i = 0; i < size; ++i) - if(!_equal(tm1[i], tm2[i])) - return false; - return true; -} -static inline double _hypot(double x, double y) { return std::sqrt(x*x+y*y); } - -void _tm_transform(const double * tm, double & x, double & y, bool is_delta = false); -void _tm_multiply(double * tm_left, const double * tm_right); - static inline long long hash_ref(const Ref * id) { return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen); } -class GfxRGB_hash -{ -public: - size_t operator () (const GfxRGB & rgb) const - { - return (colToByte(rgb.r) << 16) | (colToByte(rgb.g) << 8) | (colToByte(rgb.b)); - } -}; - -class GfxRGB_equal -{ -public: - bool operator ()(const GfxRGB & rgb1, const GfxRGB & rgb2) const - { - return ((rgb1.r == rgb2.r) && (rgb1.g == rgb2.g) && (rgb1.b == rgb1.b)); - } -}; - -// we may need more info of a font in the future -class FontInfo -{ -public: - long long id; - bool use_tounicode; - int em_size; - double ascent, descent; -}; - -class Matrix_less -{ -public: - bool operator () (const Matrix & m1, const Matrix & m2) const - { - // Note that we only care about the first 4 elements - for(int i = 0; i < 4; ++i) - { - if(m1.m[i] < m2.m[i] - EPS) - return true; - if(m1.m[i] > m2.m[i] + EPS) - return false; - } - return false; - } -}; - -void create_directories(std::string path); - -bool is_truetype_suffix(const std::string & suffix); - -std::string get_filename(const std::string & path); -std::string get_suffix(const std::string & path); /* * In PDF, edges of the rectangle are in the middle of the borders From 56e59baeb2302d3f8c1e6f984544265ad82a76f7 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 18:28:07 +0800 Subject: [PATCH 12/38] split util.h --- CMakeLists.txt | 4 ++-- 
src/HTMLRenderer/HTMLRenderer.h | 2 +- src/HTMLRenderer/LineBuffer.cc | 3 +++ src/HTMLRenderer/Preprocessor.cc | 3 ++- src/HTMLRenderer/draw.cc | 1 + src/HTMLRenderer/export.cc | 1 + src/HTMLRenderer/general.cc | 2 ++ src/HTMLRenderer/install.cc | 3 +++ src/HTMLRenderer/link.cc | 3 +++ src/HTMLRenderer/text.cc | 3 +++ src/util/const.h | 4 ++++ src/util/{util.cc => misc.cc} | 11 ++--------- src/util/{util.h => misc.h} | 13 +------------ src/util/namespace.h | 2 -- 14 files changed, 28 insertions(+), 27 deletions(-) rename src/util/{util.cc => misc.cc} (88%) rename src/util/{util.h => misc.h} (81%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2badbab..68b0fc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,8 @@ add_executable(pdf2htmlEX src/util/ffw.c src/util/math.h src/util/math.cc + src/util/misc.h + src/util/misc.cc src/util/namespace.h src/util/path.h src/util/path.cc @@ -181,8 +183,6 @@ add_executable(pdf2htmlEX src/util/TmpFiles.cc src/util/unicode.h src/util/unicode.cc - src/util/util.h - src/util/util.cc ) target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS}) diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index d11b62a..7dfc605 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -26,7 +26,7 @@ #include "Param.h" #include "Preprocessor.h" -#include "util/util.h" +#include "util/const.h" #include "util/string_formatter.h" #include "util/TmpFiles.h" diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index 8876ed0..00d4078 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -12,6 +12,7 @@ #include "HTMLRenderer.h" #include "util/namespace.h" #include "util/unicode.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -19,6 +20,8 @@ using std::min; using std::max; using std::vector; using std::ostream; +using std::cerr; +using std::endl; void HTMLRenderer::LineBuffer::reset(GfxState * state) { diff --git 
a/src/HTMLRenderer/Preprocessor.cc b/src/HTMLRenderer/Preprocessor.cc index 3214d99..1c77337 100644 --- a/src/HTMLRenderer/Preprocessor.cc +++ b/src/HTMLRenderer/Preprocessor.cc @@ -15,7 +15,8 @@ #include #include "Preprocessor.h" -#include "util/util.h" +#include "util/misc.h" +#include "util/const.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index 0185c07..9d21d46 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -14,6 +14,7 @@ #include #include "HTMLRenderer.h" +#include "util/misc.h" #include "util/math.h" #include "util/namespace.h" diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index c8e0ea8..f7f6b18 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -14,6 +14,7 @@ #include "util/namespace.h" #include "util/base64.h" #include "util/math.h" +#include "util/misc.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 2c8833d..fa5a1cb 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -31,6 +31,8 @@ using std::max; using std::min_element; using std::vector; using std::abs; +using std::cerr; +using std::endl; static void dummy(void *, enum ErrorCategory, int pos, char *) { diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index a51dbf7..511e1d4 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -17,10 +17,13 @@ #include "HTMLRenderer.h" #include "util/namespace.h" #include "util/math.h" +#include "util/misc.h" namespace pdf2htmlEX { using std::abs; +using std::cerr; +using std::endl; const FontInfo * HTMLRenderer::install_font(GfxFont * font) { diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc index 68126dd..fa78b9e 100644 --- a/src/HTMLRenderer/link.cc +++ b/src/HTMLRenderer/link.cc @@ -16,12 +16,15 @@ #include "HTMLRenderer.h" #include "util/namespace.h" #include "util/math.h" +#include "util/misc.h" namespace 
pdf2htmlEX { using std::ostringstream; using std::min; using std::max; +using std::cerr; +using std::endl; /* * The detailed rectangle area of the link destination diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 2f40b82..666be7e 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -21,6 +21,7 @@ #include "util/unicode.h" #include "util/path.h" #include "util/math.h" +#include "util/misc.h" namespace pdf2htmlEX { @@ -29,6 +30,8 @@ using std::min; using std::all_of; using std::floor; using std::swap; +using std::cerr; +using std::endl; string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) { diff --git a/src/util/const.h b/src/util/const.h index 1fde46f..ffd2357 100644 --- a/src/util/const.h +++ b/src/util/const.h @@ -13,6 +13,10 @@ namespace pdf2htmlEX { +#ifndef nullptr +#define nullptr (NULL) +#endif + static const double EPS = 1e-6; static const double DEFAULT_DPI = 72.0; extern const double ID_MATRIX[6]; diff --git a/src/util/util.cc b/src/util/misc.cc similarity index 88% rename from src/util/util.cc rename to src/util/misc.cc index 034d108..e2572c0 100644 --- a/src/util/util.cc +++ b/src/util/misc.cc @@ -6,16 +6,9 @@ * 2012.08.10 */ -#include -#include +#include -#include -#include -#include -#include -#include - -#include "util.h" +#include "misc.h" using std::cerr; using std::endl; diff --git a/src/util/util.h b/src/util/misc.h similarity index 81% rename from src/util/util.h rename to src/util/misc.h index 24e1248..11ae739 100644 --- a/src/util/util.h +++ b/src/util/misc.h @@ -9,19 +9,9 @@ #ifndef UTIL_H__ #define UTIL_H__ -#include #include -#include -#include -#include -#include -#include -#include "const.h" - -#ifndef nullptr -#define nullptr (NULL) -#endif +#include namespace pdf2htmlEX { @@ -30,7 +20,6 @@ static inline long long hash_ref(const Ref * id) return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen); } - /* * In PDF, edges of the rectangle are in the middle of the borders 
* In HTML, edges are completely outside the rectangle diff --git a/src/util/namespace.h b/src/util/namespace.h index a74f936..46dcd0f 100644 --- a/src/util/namespace.h +++ b/src/util/namespace.h @@ -12,8 +12,6 @@ using std::hex; using std::dec; using std::string; -using std::cout; -using std::cerr; using std::endl; using std::make_pair; using std::ifstream; From ba77c1cc6c331cf0e36453772a51c8d0cc35304c Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 19:38:57 +0800 Subject: [PATCH 13/38] refince code: base64 --- src/util/base64.cc | 40 +++++++++++++++++++++++++++++++++++++++- src/util/base64.h | 37 +++---------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/util/base64.cc b/src/util/base64.cc index 8e462bf..e5da7e3 100644 --- a/src/util/base64.cc +++ b/src/util/base64.cc @@ -2,6 +2,44 @@ namespace pdf2htmlEX { +using std::ostream; + +ostream & base64stream::dumpto(ostream & out) +{ + unsigned char buf[3]; + while(in->read((char*)buf, 3)) + { + out << base64_encoding[(buf[0] & 0xfc)>>2] + << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)] + << base64_encoding[((buf[1] & 0x0f)<<2) | ((buf[2] & 0xc0)>>6)] + << base64_encoding[(buf[2] & 0x3f)]; + } + auto cnt = in->gcount(); + if(cnt > 0) + { + for(int i = cnt; i < 3; ++i) + buf[i] = 0; + + out << base64_encoding[(buf[0] & 0xfc)>>2] + << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)]; + + if(cnt > 1) + { + out << base64_encoding[(buf[1] & 0x0f)<<2]; + } + else + { + out << '='; + } + out << '='; + } + + return out; +} + const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -} +ostream & operator << (ostream & out, base64stream & bf) { return bf.dumpto(out); } +ostream & operator << (ostream & out, base64stream && bf) { return bf.dumpto(out); } + +} //namespace pdf2htmlEX diff --git a/src/util/base64.h b/src/util/base64.h index 3c53a84..5bdb4fe 100644 --- a/src/util/base64.h 
+++ b/src/util/base64.h @@ -19,46 +19,15 @@ public: base64stream(std::istream & in) : in(&in) { } base64stream(std::istream && in) : in(&in) { } - std::ostream & dumpto(std::ostream & out) - { - unsigned char buf[3]; - while(in->read((char*)buf, 3)) - { - out << base64_encoding[(buf[0] & 0xfc)>>2] - << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)] - << base64_encoding[((buf[1] & 0x0f)<<2) | ((buf[2] & 0xc0)>>6)] - << base64_encoding[(buf[2] & 0x3f)]; - } - auto cnt = in->gcount(); - if(cnt > 0) - { - for(int i = cnt; i < 3; ++i) - buf[i] = 0; - - out << base64_encoding[(buf[0] & 0xfc)>>2] - << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)]; - - if(cnt > 1) - { - out << base64_encoding[(buf[1] & 0x0f)<<2]; - } - else - { - out << '='; - } - out << '='; - } - - return out; - } + std::ostream & dumpto(std::ostream & out); private: std::istream * in; static const char * base64_encoding; }; -static inline std::ostream & operator << (std::ostream & out, base64stream & bf) { return bf.dumpto(out); } -static inline std::ostream & operator << (std::ostream & out, base64stream && bf) { return bf.dumpto(out); } +std::ostream & operator << (std::ostream & out, base64stream & bf); +std::ostream & operator << (std::ostream & out, base64stream && bf); } //namespace pdf2htmlEX #endif //BASE64_H__ From 9d99fd9b82ee7fca11ce489443d0e534130192fb Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 19:43:19 +0800 Subject: [PATCH 14/38] clean code --- src/util/path.cc | 2 +- src/util/path.h | 2 +- src/util/string_formatter.h | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/util/path.cc b/src/util/path.cc index c6f5804..a485fce 100644 --- a/src/util/path.cc +++ b/src/util/path.cc @@ -7,7 +7,7 @@ using std::string; namespace pdf2htmlEX { -void create_directories(string path) +void create_directories(const string & path) { if(path.empty()) return; diff --git a/src/util/path.h b/src/util/path.h index ddc9a4a..4f82a8e 100644 --- 
a/src/util/path.h +++ b/src/util/path.h @@ -12,7 +12,7 @@ namespace pdf2htmlEX { -void create_directories(std::string path); +void create_directories(const std::string & path); bool is_truetype_suffix(const std::string & suffix); diff --git a/src/util/string_formatter.h b/src/util/string_formatter.h index a5dc08b..8d50df0 100644 --- a/src/util/string_formatter.h +++ b/src/util/string_formatter.h @@ -17,8 +17,9 @@ public: { public: guarded_pointer(string_formatter * sf) : sf(sf) { ++(sf->buf_cnt); } + guarded_pointer(const guarded_pointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); } ~guarded_pointer(void) { --(sf->buf_cnt); } - operator char* () { return &(sf->buf.front()); } + operator char* () const { return &(sf->buf.front()); } private: string_formatter * sf; }; From 195a6cc5814ef46baa07308d94aa3d309c183d92 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 20:39:30 +0800 Subject: [PATCH 15/38] refining --- src/HTMLRenderer/HTMLRenderer.h | 10 +++---- src/HTMLRenderer/general.cc | 11 ++++---- .../{string_formatter.h => StringFormatter.h} | 28 +++++++++---------- 3 files changed, 24 insertions(+), 25 deletions(-) rename src/util/{string_formatter.h => StringFormatter.h} (58%) diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 7dfc605..7d9371a 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -27,7 +27,7 @@ #include "Param.h" #include "Preprocessor.h" #include "util/const.h" -#include "util/string_formatter.h" +#include "util/StringFormatter.h" #include "util/TmpFiles.h" /* @@ -106,6 +106,7 @@ public: return false; } }; + class HTMLRenderer : public OutputDev { public: @@ -200,7 +201,7 @@ class HTMLRenderer : public OutputDev void post_process(); // set flags - void fix_stream (std::ostream & out); + void set_stream_flags (std::ostream & out); std::string dump_embedded_font (GfxFont * font, long long fn_id); void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool 
get_metric_only = false); @@ -451,11 +452,12 @@ class HTMLRenderer : public OutputDev int32_t * cur_mapping; char ** cur_mapping2; int * width_list; + Preprocessor preprocessor; TmpFiles tmp_files; // for string formatting - string_formatter str_fmt; + StringFormatter str_fmt; //////////////////////////////////////////////////// // styles & resources @@ -471,8 +473,6 @@ class HTMLRenderer : public OutputDev std::map rise_map; std::map height_map; - int image_count; - const Param * param; std::ofstream html_fout, css_fout; std::string html_path, css_path; diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index fa5a1cb..f6ea324 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -44,7 +44,6 @@ HTMLRenderer::HTMLRenderer(const Param * param) ,line_buf(this) ,preprocessor(param) ,tmp_files(*param) - ,image_count(0) ,param(param) { if(!(param->debug)) @@ -92,7 +91,7 @@ void HTMLRenderer::process(PDFDoc *doc) html_fout.open((char*)page_fn, ofstream::binary); if(!html_fout) throw string("Cannot open ") + (char*)page_fn + " for writing"; - fix_stream(html_fout); + set_stream_flags(html_fout); } if(param->process_nontext) @@ -291,7 +290,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc) css_fout.open(css_path, ofstream::binary); if(!css_fout) throw string("Cannot open ") + (char*)fn + " for writing"; - fix_stream(css_fout); + set_stream_flags(css_fout); } // if split-pages is specified, open & close the file in the process loop @@ -312,7 +311,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc) html_fout.open(html_path, ofstream::binary); if(!html_fout) throw string("Cannot open ") + (char*)fn + " for writing"; - fix_stream(html_fout); + set_stream_flags(html_fout); } } @@ -332,7 +331,7 @@ void HTMLRenderer::post_process() output.open((char*)fn, ofstream::binary); if(!output) throw string("Cannot open ") + (char*)fn + " for writing"; - fix_stream(output); + set_stream_flags(output); } // apply manifest @@ -390,7 +389,7 @@ void 
HTMLRenderer::post_process() } } -void HTMLRenderer::fix_stream (std::ostream & out) +void HTMLRenderer::set_stream_flags(std::ostream & out) { // we output all ID's in hex // browsers are not happy with scientific notations diff --git a/src/util/string_formatter.h b/src/util/StringFormatter.h similarity index 58% rename from src/util/string_formatter.h rename to src/util/StringFormatter.h index 8d50df0..96e0370 100644 --- a/src/util/string_formatter.h +++ b/src/util/StringFormatter.h @@ -5,32 +5,32 @@ * 2012.11.29 */ -#ifndef STRING_FORMATTER_H__ -#define STRING_FORMATTER_H__ +#ifndef STRINGFORMATTER_H__ +#define STRINGFORMATTER_H__ namespace pdf2htmlEX { -class string_formatter +class StringFormatter { public: - class guarded_pointer + class GuardedPointer { public: - guarded_pointer(string_formatter * sf) : sf(sf) { ++(sf->buf_cnt); } - guarded_pointer(const guarded_pointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); } - ~guarded_pointer(void) { --(sf->buf_cnt); } + GuardedPointer(StringFormatter * sf) : sf(sf) { ++(sf->buf_cnt); } + GuardedPointer(const GuardedPointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); } + ~GuardedPointer(void) { --(sf->buf_cnt); } operator char* () const { return &(sf->buf.front()); } private: - string_formatter * sf; + StringFormatter * sf; }; - string_formatter() : buf_cnt(0) { buf.reserve(L_tmpnam); } + StringFormatter() : buf_cnt(0) { buf.reserve(L_tmpnam); } /* * Important: * there is only one buffer, so new strings will replace old ones */ - guarded_pointer operator () (const char * format, ...) { - assert((buf_cnt == 0) && "string_formatter: buffer is reused!"); + GuardedPointer operator () (const char * format, ...) 
{ + assert((buf_cnt == 0) && "StringFormatter: buffer is reused!"); va_list vlist; va_start(vlist, format); @@ -45,13 +45,13 @@ public: } assert(l >= 0); // we should fail when vsnprintf fail assert(l < (int)buf.capacity()); - return guarded_pointer(this); + return GuardedPointer(this); } private: - friend class guarded_pointer; + friend class GuardedPointer; std::vector buf; int buf_cnt; }; } //namespace pdf2htmlEX -#endif //STRING_FORMATTER_H__ +#endif //STRINGFORMATTER_H__ From 96fd23b6e156fa789815c9bea7b7a17af4baff2a Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 20:40:47 +0800 Subject: [PATCH 16/38] .. --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68b0fc0..caa353b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,7 +143,6 @@ configure_file (${CMAKE_SOURCE_DIR}/pdf2htmlEX.1.in ${CMAKE_SOURCE_DIR}/pdf2html add_executable(pdf2htmlEX src/Param.h - src/config.h src/pdf2htmlEX-config.h src/pdf2htmlEX.cc src/HTMLRenderer/HTMLRenderer.h @@ -178,7 +177,7 @@ add_executable(pdf2htmlEX src/util/namespace.h src/util/path.h src/util/path.cc - src/util/string_formatter.h + src/util/StringFormatter.h src/util/TmpFiles.h src/util/TmpFiles.cc src/util/unicode.h From 02305d560b3d5681b164f2556fa3507e4b2c240a Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 21:20:26 +0800 Subject: [PATCH 17/38] .. 
--- src/util/TmpFiles.cc | 7 +++---- src/util/math.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/util/TmpFiles.cc b/src/util/TmpFiles.cc index 958ab6f..616ac28 100644 --- a/src/util/TmpFiles.cc +++ b/src/util/TmpFiles.cc @@ -16,10 +16,9 @@ using namespace std; namespace pdf2htmlEX { -TmpFiles::TmpFiles( const Param& param_ ) - : param( param_ ) -{ -} +TmpFiles::TmpFiles( const Param& param ) + : param( param ) +{ } TmpFiles::~TmpFiles() { diff --git a/src/util/math.h b/src/util/math.h index 2bfb6bc..9c9f5db 100644 --- a/src/util/math.h +++ b/src/util/math.h @@ -14,7 +14,7 @@ namespace pdf2htmlEX { -static inline double round(double x) { return (std::fabs(x) > EPS) ? x : 0.0; } +static inline double round(double x) { return (std::abs(x) > EPS) ? x : 0.0; } static inline bool equal(double x, double y) { return std::abs(x-y) < EPS; } static inline bool is_positive(double x) { return x > EPS; } static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6) From e534298e1b372e1ba9bd5e069e76f315dde309a7 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 21:38:23 +0800 Subject: [PATCH 18/38] .. --- src/HTMLRenderer/install.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index 511e1d4..ea3efa7 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -169,7 +169,6 @@ void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info) cerr << "Warning: workaround for font names in bad encodings." 
<< endl; } - GooString gfn(fontname.c_str()); GfxFontLoc * localfontloc = font->locateFont(xref, gFalse); if(param->embed_external_font) From 36a340ecf9edb5a2e8860e5bb6513e7ffafabbbd Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 21:38:53 +0800 Subject: [PATCH 19/38] auto-hint is now default --- pdf2htmlEX.1.in | 2 +- src/pdf2htmlEX.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 50a82dc..6ccb168 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -98,7 +98,7 @@ Specify a ratio greater than 1 would resolve this issue, however it might freeze For some versions of Firefox, however, there will be a problem when the font size is too large, in which case a smaller value should be specified here. .TP -.B --auto-hint <0|1> (Default: 0) +.B --auto-hint <0|1> (Default: 1) If set to 1, hints will be generated for the fonts using fontforge. This may be preceded by --external-hint-tool. diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index df6ae7d..720e541 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -79,7 +79,7 @@ void parse_options (int argc, char **argv) .add("veps", &param.v_eps, 1.0, "max tolerated vertical offset (in pixels)") .add("space-threshold", &param.space_threshold, (1.0/8), "distance no thiner than (threshold * em) will be considered as a space character") .add("font-size-multiplier", &param.font_size_multiplier, 4.0, "setting a value greater than 1 would increase the rendering accuracy") - .add("auto-hint", &param.auto_hint, 0, "Whether to generate hints for fonts") + .add("auto-hint", &param.auto_hint, 1, "Whether to generate hints for fonts") .add("tounicode", &param.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled") .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets") .add("stretch-narrow-glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding space") From 
6fb3421860ae2deb53cc35affb59f92c28314fda Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 29 Nov 2012 21:41:48 +0800 Subject: [PATCH 20/38] nope, hinting of fontforge is not reliable --- pdf2htmlEX.1.in | 2 +- src/pdf2htmlEX.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 6ccb168..50a82dc 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -98,7 +98,7 @@ Specify a ratio greater than 1 would resolve this issue, however it might freeze For some versions of Firefox, however, there will be a problem when the font size is too large, in which case a smaller value should be specified here. .TP -.B --auto-hint <0|1> (Default: 1) +.B --auto-hint <0|1> (Default: 0) If set to 1, hints will be generated for the fonts using fontforge. This may be preceded by --external-hint-tool. diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 720e541..df6ae7d 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -79,7 +79,7 @@ void parse_options (int argc, char **argv) .add("veps", &param.v_eps, 1.0, "max tolerated vertical offset (in pixels)") .add("space-threshold", &param.space_threshold, (1.0/8), "distance no thiner than (threshold * em) will be considered as a space character") .add("font-size-multiplier", &param.font_size_multiplier, 4.0, "setting a value greater than 1 would increase the rendering accuracy") - .add("auto-hint", &param.auto_hint, 1, "Whether to generate hints for fonts") + .add("auto-hint", &param.auto_hint, 0, "Whether to generate hints for fonts") .add("tounicode", &param.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled") .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets") .add("stretch-narrow-glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding space") From b4257e7f9f285c02c6226c7efbd7c1deaa8f873e Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 30 Nov 2012 17:18:12 +0800 Subject: [PATCH 21/38] fix #51 --- 
src/util/path.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/util/path.cc b/src/util/path.cc index a485fce..ce80a4f 100644 --- a/src/util/path.cc +++ b/src/util/path.cc @@ -1,3 +1,11 @@ +/* + * Functions manipulating filenames and paths + * + * by WangLu + * 2012.11.29 + */ + +#include #include #include From d1dfb33e43801c606b6c35083ea843a80bf50725 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 30 Nov 2012 17:33:27 +0800 Subject: [PATCH 22/38] .. --- src/HTMLRenderer/text.cc | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 666be7e..a3f1f80 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -380,7 +380,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo // in auto mode, just drop the tounicode map if(!retried) { - cerr << "ToUnicode CMap is not valid and got dropped" << endl; + cerr << "ToUnicode CMap is not valid and got dropped for font: " << hex << info.id << dec << endl; retried = true; codeset.clear(); info.use_tounicode = false; @@ -416,7 +416,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo ffw_reencode_raw(cur_mapping, max_key + 1, 1); - // we need the space chracter for offsets + // we need the space character for offsets if(!has_space) { int space_width; @@ -523,14 +523,6 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) char *p = s->getCString(); int len = s->getLength(); - //debug - { - if(strcmp(p, "ORTUG") == 0) - { - cerr << "DEBUG: " << (int)(state->getRender()) << endl; - } - } - double dx = 0; double dy = 0; double dxerr = 0; @@ -544,7 +536,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) CharCode code; Unicode *u = nullptr; - while (len > 0) { + while (len > 0) + { auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); if(!(equal(ox, 0) && equal(oy, 0))) From 
2e0b9c67217d79c2fce83797105aaf50f640f1a3 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 30 Nov 2012 17:59:36 +0800 Subject: [PATCH 23/38] should fix a span with only a space --- share/base.css | 5 ++--- src/HTMLRenderer/HTMLRenderer.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/share/base.css b/share/base.css index c04938a..fd42bd3 100644 --- a/share/base.css +++ b/share/base.css @@ -63,6 +63,8 @@ span { position:relative; vertical-align: baseline; + /* _ for spaces may need display:inline, which will override this */ + display:inline-block; } ._ { color:transparent; @@ -74,9 +76,6 @@ span { ::-moz-selection{ background: rgba(127,255,255,1); } -.i { - position:absolute; -} .j { display:none; } diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 7d9371a..4b035ac 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -40,7 +40,6 @@ * b - page Box * d - page Decoration * l - Line - * i - Image * j - Js data * p - Page * From 806fdc39dbda6412b4229179d58c241ceb295d02 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 30 Nov 2012 18:08:08 +0800 Subject: [PATCH 24/38] typo --- pdf2htmlEX.1.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 50a82dc..6aacec1 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -110,7 +110,7 @@ If this value is set to 1, the ToUnicode Map is always applied, if provided in P If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste. -If set to 0, pdf2htmlEX would try it best to balance the two methods above. +If set to 0, pdf2htmlEX would try its best to balance the two methods above. .TP .B --space-as-offset <0|1> (Default: 0) Treat space characters as offsets, which may increase the size of the output. 
From 30c24b788702793be62295950c8395f7a0026400 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 30 Nov 2012 18:08:45 +0800 Subject: [PATCH 25/38] typo --- pdf2htmlEX.1.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 6aacec1..23f60e5 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -118,7 +118,7 @@ Treat space characters as offsets, which may increase the size of the output. Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. .TP .B --stretch-narrow-glyph <0|1> (Default: 0) -If set to 1, glyphs narrower than described in PDF will be strecth; otherwise space will be padded to the right of the glyphs +If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs .TP .B --squeeze-wide-glyph <0|1> (Default: 1) If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. From d497c974908882e2c8975db648b0525e76f43ae3 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 30 Nov 2012 18:09:50 +0800 Subject: [PATCH 26/38] .. --- pdf2htmlEX.1.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 23f60e5..964fcd8 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -124,7 +124,7 @@ If set to 1, glyphs narrower than described in PDF will be stretched; otherwise If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. .TP .B --remove-unused-glyph <0|1> (Default: 1) -[Experimental] If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size. +If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size. .TP .B --font-suffix (Default: .ttf), --font-format (Default: truetype) Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. 
From 0f4be7f48d1fb39331ebca0c683743caac5759d1 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Sun, 2 Dec 2012 00:59:59 +0800 Subject: [PATCH 27/38] special thanks --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ad25446..3d1f3e6 100644 --- a/README.md +++ b/README.md @@ -161,4 +161,5 @@ pdf2htmlEX is inspired by the following projects: ### Special Thanks * Hongliang Tian +* Wanmin Liu From 5f919c8d3fdd32b0658e6e767ca2d2b4f13c1244 Mon Sep 17 00:00:00 2001 From: Herbert Jones Date: Wed, 5 Dec 2012 16:10:22 -0600 Subject: [PATCH 28/38] Fix problem with certain pdfs that transform the entire document upside down, including the fonts, which normally appear correctly in other renderers. Since CSS doesn't know how to handle negative font sizes, the document needs to be transformed correctly. --- src/HTMLRenderer/state.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 615cf36..87f2009 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -161,6 +161,15 @@ void HTMLRenderer::check_state_change(GfxState * state) new_draw_text_scale = 1.0; } + if(!_is_positive(new_draw_font_size)) + { + // Page is flipped and css can't handle it. 
+ new_draw_font_size = -new_draw_font_size; + + for(int i = 0; i < 4; ++i) + new_draw_text_tm[i] *= -1; + } + if(!(_equal(new_draw_text_scale, draw_text_scale))) { draw_text_scale_changed = true; From 59a571ac8a79cb875e2c9e135e745abc8824836e Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 7 Dec 2012 20:31:09 +0800 Subject: [PATCH 29/38] new param use-cropbox --- pdf2htmlEX.1.in | 5 ++++- src/BackgroundRenderer/SplashBackgroundRenderer.cc | 4 +++- src/HTMLRenderer/general.cc | 4 +++- src/Param.h | 1 + src/pdf2htmlEX.cc | 1 + 5 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 964fcd8..aa9cdbc 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -54,9 +54,12 @@ If multiple values are specified, the minimum one will be used. If none is specified, pages will be rendered as 72DPI. .TP -.B --hpdi , --vpdi (Default: 144) +.B --hdpi , --vdpi (Default: 144) Specify the horizontal and vertical DPI for images .TP +.B --use-cropbox <0|1> (Default: 0) +Use CropBox instead of MediaBox for output. 
+.TP .B --process-nontext <0|1> (Default: 1) Whether to process non-text objects (as images) .TP diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc index 808e22f..03be242 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -33,7 +33,9 @@ static GBool annot_cb(Annot *, void *) { void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno, const string & filename) { doc->displayPage(this, pageno, param->h_dpi, param->v_dpi, - 0, true, false, false, + 0, + (param->use_cropbox == 0), + false, false, nullptr, nullptr, &annot_cb, nullptr); getBitmap()->writeImgFile(splashFormatPng, diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index f6ea324..043bc93 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -105,7 +105,9 @@ void HTMLRenderer::process(PDFDoc *doc) doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, - 0, true, false, false, + 0, + (param->use_cropbox == 0), + false, false, nullptr, nullptr, nullptr, nullptr); if(param->split_pages) diff --git a/src/Param.h b/src/Param.h index ee7610e..5f9a7d3 100644 --- a/src/Param.h +++ b/src/Param.h @@ -28,6 +28,7 @@ struct Param double zoom; double fit_width, fit_height; double h_dpi, v_dpi; + int use_cropbox; int process_nontext; int single_html; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index df6ae7d..3a82ed5 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -67,6 +67,7 @@ void parse_options (int argc, char **argv) .add("fit-height", &param.fit_height, 0, "fit height", nullptr, true) .add("hdpi", &param.h_dpi, 144.0, "horizontal DPI for non-text") .add("vdpi", &param.v_dpi, 144.0, "vertical DPI for non-text") + .add("use-cropbox", &param.use_cropbox, 0, "use CropBox instead of MediaBox") .add("process-nontext", &param.process_nontext, 1, "process nontext objects") .add("single-html", 
&param.single_html, 1, "combine everything into one single HTML file") From 804f86b12797f8a3dbc6867dd66aba14b7834815 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Dec 2012 20:17:36 +0800 Subject: [PATCH 30/38] reorganize coce --- CMakeLists.txt | 9 +++---- ChangeLog | 5 ++++ debian/changelog | 6 +++++ src/HTMLRenderer/HTMLRenderer.h | 2 +- src/HTMLRenderer/export.cc | 2 +- src/HTMLRenderer/general.cc | 2 +- src/{HTMLRenderer => util}/Preprocessor.cc | 0 src/{HTMLRenderer => util}/Preprocessor.h | 0 src/util/StringFormatter.cc | 28 ++++++++++++++++++++++ src/util/StringFormatter.h | 18 +------------- src/util/{base64.cc => base64stream.cc} | 2 +- src/util/{base64.h => base64stream.h} | 6 ++--- 12 files changed, 52 insertions(+), 28 deletions(-) rename src/{HTMLRenderer => util}/Preprocessor.cc (100%) rename src/{HTMLRenderer => util}/Preprocessor.h (100%) create mode 100644 src/util/StringFormatter.cc rename src/util/{base64.cc => base64stream.cc} (97%) rename src/util/{base64.h => base64stream.h} (87%) diff --git a/CMakeLists.txt b/CMakeLists.txt index caa353b..903f11f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,8 +155,6 @@ add_executable(pdf2htmlEX src/HTMLRenderer/link.cc src/HTMLRenderer/state.cc src/HTMLRenderer/text.cc - src/HTMLRenderer/Preprocessor.h - src/HTMLRenderer/Preprocessor.cc src/BackgroundRenderer/BackgroundRenderer.h src/BackgroundRenderer/SplashBackgroundRenderer.h src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -164,8 +162,8 @@ add_executable(pdf2htmlEX src/BackgroundRenderer/CairoBackgroundRenderer.cc src/util/ArgParser.h src/util/ArgParser.cc - src/util/base64.h - src/util/base64.cc + src/util/base64stream.h + src/util/base64stream.cc src/util/const.h src/util/const.cc src/util/ffw.h @@ -177,7 +175,10 @@ add_executable(pdf2htmlEX src/util/namespace.h src/util/path.h src/util/path.cc + src/util/Preprocessor.h + src/util/Preprocessor.cc src/util/StringFormatter.h + src/util/StringFormatter.cc src/util/TmpFiles.h 
src/util/TmpFiles.cc src/util/unicode.h diff --git a/ChangeLog b/ChangeLog index 9f270de..8df58bf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ Latest v0.6 +* New parameter: --use-cropbox +* Progress indicator +* Create a glyph for ' ' when missing +* Code refining + v0.5 2012.10.06 diff --git a/debian/changelog b/debian/changelog index bf75d52..3e7697e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +pdf2htmlex (0.6-1~git201212111844rd76af-0ubuntu1) quantal; urgency=low + + * Package for quantal + + -- WANG Lu Tue, 11 Dec 2012 18:44:44 +0800 + pdf2htmlex (0.6-1~git201210070052rcb9a8-0ubuntu1) precise; urgency=low * New version diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 4b035ac..22ce5f1 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -25,7 +25,7 @@ #include #include "Param.h" -#include "Preprocessor.h" +#include "util/Preprocessor.h" #include "util/const.h" #include "util/StringFormatter.h" #include "util/TmpFiles.h" diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index f7f6b18..cb893bb 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -12,7 +12,7 @@ #include "HTMLRenderer.h" #include "util/namespace.h" -#include "util/base64.h" +#include "util/base64stream.h" #include "util/math.h" #include "util/misc.h" diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 043bc93..165c684 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -18,7 +18,7 @@ #include "BackgroundRenderer/BackgroundRenderer.h" #include "util/namespace.h" #include "util/ffw.h" -#include "util/base64.h" +#include "util/base64stream.h" #include "util/math.h" #include "util/path.h" diff --git a/src/HTMLRenderer/Preprocessor.cc b/src/util/Preprocessor.cc similarity index 100% rename from src/HTMLRenderer/Preprocessor.cc rename to src/util/Preprocessor.cc diff --git a/src/HTMLRenderer/Preprocessor.h 
b/src/util/Preprocessor.h similarity index 100% rename from src/HTMLRenderer/Preprocessor.h rename to src/util/Preprocessor.h diff --git a/src/util/StringFormatter.cc b/src/util/StringFormatter.cc new file mode 100644 index 0000000..0b183ac --- /dev/null +++ b/src/util/StringFormatter.cc @@ -0,0 +1,28 @@ +#include + +#include "StringFormatter.h" + +namespace pdf2htmlEX { + +StringFormatter::GuardedPointer StringFormatter::operator () (const char * format, ...) +{ + assert((buf_cnt == 0) && "StringFormatter: buffer is reused!"); + + va_list vlist; + va_start(vlist, format); + int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + if(l >= (int)buf.capacity()) + { + buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); + va_start(vlist, format); + l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + } + assert(l >= 0); // we should fail when vsnprintf fail + assert(l < (int)buf.capacity()); + return GuardedPointer(this); +} + +} //namespace pdf2htmlEX + diff --git a/src/util/StringFormatter.h b/src/util/StringFormatter.h index 96e0370..048a30f 100644 --- a/src/util/StringFormatter.h +++ b/src/util/StringFormatter.h @@ -29,24 +29,8 @@ public: * Important: * there is only one buffer, so new strings will replace old ones */ - GuardedPointer operator () (const char * format, ...) 
{ - assert((buf_cnt == 0) && "StringFormatter: buffer is reused!"); + GuardedPointer operator () (const char * format, ...); - va_list vlist; - va_start(vlist, format); - int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); - va_end(vlist); - if(l >= (int)buf.capacity()) - { - buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); - va_start(vlist, format); - l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); - va_end(vlist); - } - assert(l >= 0); // we should fail when vsnprintf fail - assert(l < (int)buf.capacity()); - return GuardedPointer(this); - } private: friend class GuardedPointer; std::vector buf; diff --git a/src/util/base64.cc b/src/util/base64stream.cc similarity index 97% rename from src/util/base64.cc rename to src/util/base64stream.cc index e5da7e3..7df00fe 100644 --- a/src/util/base64.cc +++ b/src/util/base64stream.cc @@ -1,4 +1,4 @@ -#include "base64.h" +#include "base64stream.h" namespace pdf2htmlEX { diff --git a/src/util/base64.h b/src/util/base64stream.h similarity index 87% rename from src/util/base64.h rename to src/util/base64stream.h index 5bdb4fe..46955cf 100644 --- a/src/util/base64.h +++ b/src/util/base64stream.h @@ -5,8 +5,8 @@ * 2012.11.29 */ -#ifndef BASE64_H__ -#define BASE64_H__ +#ifndef BASE64STREAM_H__ +#define BASE64STREAM_H__ #include @@ -30,4 +30,4 @@ std::ostream & operator << (std::ostream & out, base64stream & bf); std::ostream & operator << (std::ostream & out, base64stream && bf); } //namespace pdf2htmlEX -#endif //BASE64_H__ +#endif //BASE64STREAM_H__ From add1d1e28e906bad84792cae526d0516389de2f1 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Dec 2012 20:20:52 +0800 Subject: [PATCH 31/38] reorganize --- src/util/StringFormatter.cc | 2 ++ src/util/StringFormatter.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/util/StringFormatter.cc b/src/util/StringFormatter.cc index 0b183ac..b361c2d 100644 --- a/src/util/StringFormatter.cc +++ b/src/util/StringFormatter.cc @@ -1,4 +1,6 @@ 
#include +#include +#include #include "StringFormatter.h" diff --git a/src/util/StringFormatter.h b/src/util/StringFormatter.h index 048a30f..2d34126 100644 --- a/src/util/StringFormatter.h +++ b/src/util/StringFormatter.h @@ -8,6 +8,9 @@ #ifndef STRINGFORMATTER_H__ #define STRINGFORMATTER_H__ +#include +#include + namespace pdf2htmlEX { class StringFormatter From 5842d2279ddb1e65420b06c86ccd3a52500f6bde Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Dec 2012 20:48:01 +0800 Subject: [PATCH 32/38] separate TextLineBuffer --- CMakeLists.txt | 3 +- src/HTMLRenderer/HTMLRenderer.h | 70 +------------------ .../{LineBuffer.cc => TextLineBuffer.cc} | 25 +++---- src/HTMLRenderer/general.cc | 4 +- src/HTMLRenderer/state.cc | 9 +-- src/HTMLRenderer/text.cc | 9 +-- 6 files changed, 31 insertions(+), 89 deletions(-) rename src/HTMLRenderer/{LineBuffer.cc => TextLineBuffer.cc} (88%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 903f11f..6db0dc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,7 +151,8 @@ add_executable(pdf2htmlEX src/HTMLRenderer/general.cc src/HTMLRenderer/image.cc src/HTMLRenderer/install.cc - src/HTMLRenderer/LineBuffer.cc + src/HTMLRenderer/TextLineBuffer.h + src/HTMLRenderer/TextLineBuffer.cc src/HTMLRenderer/link.cc src/HTMLRenderer/state.cc src/HTMLRenderer/text.cc diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 22ce5f1..9b61c32 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -379,73 +379,9 @@ class HTMLRenderer : public OutputDev double draw_tx, draw_ty; // some metrics have to be determined after all elements in the lines have been seen - class LineBuffer { - public: - LineBuffer (HTMLRenderer * renderer) : renderer(renderer) { } - - class State { - public: - void begin(std::ostream & out, const State * prev_state); - void end(std::ostream & out) const; - void hash(void); - int diff(const State & s) const; - - enum { - FONT_ID, - FONT_SIZE_ID, - COLOR_ID, - 
LETTER_SPACE_ID, - WORD_SPACE_ID, - RISE_ID, - - ID_COUNT - }; - - long long ids[ID_COUNT]; - - double ascent; - double descent; - double draw_font_size; - - size_t start_idx; // index of the first Text using this state - // for optimzation - long long hash_value; - bool need_close; - - static const char * format_str; // class names for each id - }; - - - class Offset { - public: - size_t start_idx; // should put this idx before text[start_idx]; - double width; - }; - - void reset(GfxState * state); - void append_unicodes(const Unicode * u, int l); - void append_offset(double width); - void append_state(void); - void flush(void); - - private: - // retrieve state from renderer - void set_state(State & state); - - HTMLRenderer * renderer; - - double x, y; - long long tm_id; - - std::vector states; - std::vector offsets; - std::vector text; - - // for flush - std::vector stack; - - } line_buf; - friend class LineBuffer; + class TextLineBuffer; + friend class TextLineBuffer; + TextLineBuffer * text_line_buf; // for font reencoding int32_t * cur_mapping; diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc similarity index 88% rename from src/HTMLRenderer/LineBuffer.cc rename to src/HTMLRenderer/TextLineBuffer.cc index 00d4078..022ed4c 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -1,5 +1,5 @@ /* - * LineBuffer.cc + * TextLineBuffer.cc * * Generate and optimized HTML for one line * @@ -10,6 +10,7 @@ #include #include "HTMLRenderer.h" +#include "TextLineBuffer.h" #include "util/namespace.h" #include "util/unicode.h" #include "util/math.h" @@ -23,18 +24,18 @@ using std::ostream; using std::cerr; using std::endl; -void HTMLRenderer::LineBuffer::reset(GfxState * state) +void HTMLRenderer::TextLineBuffer::reset(GfxState * state) { state->transform(state->getCurX(), state->getCurY(), &x, &y); tm_id = renderer->cur_ttm_id; } -void HTMLRenderer::LineBuffer::append_unicodes(const Unicode * u, int l) +void 
HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l) { text.insert(text.end(), u, u+l); } -void HTMLRenderer::LineBuffer::append_offset(double width) +void HTMLRenderer::TextLineBuffer::append_offset(double width) { if((!offsets.empty()) && (offsets.back().start_idx == text.size())) offsets.back().width += width; @@ -42,7 +43,7 @@ void HTMLRenderer::LineBuffer::append_offset(double width) offsets.push_back(Offset({text.size(), width})); } -void HTMLRenderer::LineBuffer::append_state(void) +void HTMLRenderer::TextLineBuffer::append_state(void) { if(states.empty() || (states.back().start_idx != text.size())) { @@ -53,7 +54,7 @@ void HTMLRenderer::LineBuffer::append_state(void) set_state(states.back()); } -void HTMLRenderer::LineBuffer::flush(void) +void HTMLRenderer::TextLineBuffer::flush(void) { /* * Each Line is an independent absolute positioined block @@ -181,7 +182,7 @@ void HTMLRenderer::LineBuffer::flush(void) } -void HTMLRenderer::LineBuffer::set_state (State & state) +void HTMLRenderer::TextLineBuffer::set_state (State & state) { state.ids[State::FONT_ID] = renderer->cur_font_info->id; state.ids[State::FONT_SIZE_ID] = renderer->cur_fs_id; @@ -196,7 +197,7 @@ void HTMLRenderer::LineBuffer::set_state (State & state) state.draw_font_size = renderer->draw_font_size; } -void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state) +void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state) { bool first = true; for(int i = 0; i < ID_COUNT; ++i) @@ -229,13 +230,13 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s } } -void HTMLRenderer::LineBuffer::State::end(ostream & out) const +void HTMLRenderer::TextLineBuffer::State::end(ostream & out) const { if(need_close) out << ""; } -void HTMLRenderer::LineBuffer::State::hash(void) +void HTMLRenderer::TextLineBuffer::State::hash(void) { hash_value = 0; for(int i = 0; i < ID_COUNT; ++i) @@ -244,7 +245,7 @@ void 
HTMLRenderer::LineBuffer::State::hash(void) } } -int HTMLRenderer::LineBuffer::State::diff(const State & s) const +int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const { /* * A quick check based on hash_value @@ -260,5 +261,5 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const return d; } -const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr"; +const char * HTMLRenderer::TextLineBuffer::State::format_str = "fsclwr"; } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 165c684..1aa2d21 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -14,6 +14,7 @@ #include #include "HTMLRenderer.h" +#include "TextLineBuffer.h" #include "pdf2htmlEX-config.h" #include "BackgroundRenderer/BackgroundRenderer.h" #include "util/namespace.h" @@ -41,7 +42,7 @@ static void dummy(void *, enum ErrorCategory, int pos, char *) HTMLRenderer::HTMLRenderer(const Param * param) :OutputDev() ,line_opened(false) - ,line_buf(this) + ,text_line_buf(new TextLineBuffer(this)) ,preprocessor(param) ,tmp_files(*param) ,param(param) @@ -60,6 +61,7 @@ HTMLRenderer::HTMLRenderer(const Param * param) HTMLRenderer::~HTMLRenderer() { + delete text_line_buf; ffw_finalize(); delete [] cur_mapping; delete [] cur_mapping2; diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 83912bf..4115eef 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -16,6 +16,7 @@ #include #include "HTMLRenderer.h" +#include "TextLineBuffer.h" #include "util/namespace.h" #include "util/math.h" @@ -342,7 +343,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state) { close_text_line(); - line_buf.reset(state); + text_line_buf->reset(state); //resync position draw_ty = cur_ty; @@ -359,14 +360,14 @@ void HTMLRenderer::prepare_text_line(GfxState * state) } else { - line_buf.append_offset(target); + text_line_buf->append_offset(target); draw_tx += target / draw_text_scale; } } 
if(new_line_state != NLS_NONE) { - line_buf.append_state(); + text_line_buf->append_state(); } line_opened = true; @@ -377,7 +378,7 @@ void HTMLRenderer::close_text_line() if(line_opened) { line_opened = false; - line_buf.flush(); + text_line_buf->flush(); } } diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index a3f1f80..1ccf254 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -16,6 +16,7 @@ #include #include "HTMLRenderer.h" +#include "TextLineBuffer.h" #include "util/ffw.h" #include "util/namespace.h" #include "util/unicode.h" @@ -555,25 +556,25 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param->space_as_offset)) { // ignore horiz_scaling, as it's merged in CTM - line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); + text_line_buf->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } else { if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { - line_buf.append_unicodes(u, uLen); + text_line_buf->append_unicodes(u, uLen); } else { if(cur_font_info->use_tounicode) { Unicode uu = check_unicode(u, uLen, code, font); - line_buf.append_unicodes(&uu, 1); + text_line_buf->append_unicodes(&uu, 1); } else { Unicode uu = unicode_from_font(code, font); - line_buf.append_unicodes(&uu, 1); + text_line_buf->append_unicodes(&uu, 1); } } } From 495dabf96460ee51f68c17e523ded1de1f3c8d0c Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Dec 2012 20:48:10 +0800 Subject: [PATCH 33/38] separate TextLineBuffer --- src/HTMLRenderer/TextLineBuffer.h | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/HTMLRenderer/TextLineBuffer.h diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h new file mode 100644 index 0000000..491804a --- /dev/null +++ b/src/HTMLRenderer/TextLineBuffer.h @@ -0,0 +1,77 @@ +#ifndef 
TEXTLINEBUFFER_H__ +#define TEXTLINEBUFFER_H__ + +#include <iostream> +#include <vector> + +namespace pdf2htmlEX { + +class HTMLRenderer::TextLineBuffer +{ +public: + TextLineBuffer (HTMLRenderer * renderer) : renderer(renderer) { } + + class State { + public: + void begin(std::ostream & out, const State * prev_state); + void end(std::ostream & out) const; + void hash(void); + int diff(const State & s) const; + + enum { + FONT_ID, + FONT_SIZE_ID, + COLOR_ID, + LETTER_SPACE_ID, + WORD_SPACE_ID, + RISE_ID, + + ID_COUNT + }; + + long long ids[ID_COUNT]; + + double ascent; + double descent; + double draw_font_size; + + size_t start_idx; // index of the first Text using this state + // for optimization + long long hash_value; + bool need_close; + + static const char * format_str; // class names for each id + }; + + + class Offset { + public: + size_t start_idx; // should put this idx before text[start_idx]; + double width; + }; + + void reset(GfxState * state); + void append_unicodes(const Unicode * u, int l); + void append_offset(double width); + void append_state(void); + void flush(void); + +private: + // retrieve state from renderer + void set_state(State & state); + + HTMLRenderer * renderer; + + double x, y; + long long tm_id; + + std::vector<State> states; + std::vector<Offset> offsets; + std::vector<Unicode> text; + + // for flush + std::vector<State*> stack; +}; + +} // namespace pdf2htmlEX +#endif //TEXTLINEBUFFER_H__ From 2dbb226e0e238e94bbbe48cf118f0c571193a1a5 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Dec 2012 20:52:36 +0800 Subject: [PATCH 34/38] .. 
--- src/HTMLRenderer/TextLineBuffer.cc | 2 +- src/HTMLRenderer/TextLineBuffer.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index 022ed4c..a3ecc52 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -57,7 +57,7 @@ void HTMLRenderer::TextLineBuffer::append_state(void) void HTMLRenderer::TextLineBuffer::flush(void) { /* - * Each Line is an independent absolute positioined block + * Each Line is an independent absolute positioned block * so even we have a few states or offsets, we may omit them */ if(text.empty()) return; diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h index 491804a..e13ef20 100644 --- a/src/HTMLRenderer/TextLineBuffer.h +++ b/src/HTMLRenderer/TextLineBuffer.h @@ -6,6 +6,7 @@ namespace pdf2htmlEX { +class HTMLRenderer; class HTMLRenderer::TextLineBuffer { public: From 028fb085a6be8da93dcfa430919780df46a2a023 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 11 Dec 2012 21:59:20 +0800 Subject: [PATCH 35/38] TODO --- TODO | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/TODO b/TODO index db21a95..6cdbc74 100644 --- a/TODO +++ b/TODO @@ -1,14 +1,7 @@ -combine lines (unwarp) - -type 3 fonts word space/offset before the first letter (calendar pdf) -don't dump image when there is nothing - -Integrate splash/cairo -native support for image -native support for draw +minimum line width of css drawing ==Wait until someone asks== @@ -19,8 +12,14 @@ rename single-html -> embed-font/image/css ... 
merge sub/sup into one line precise link dest: zoom multiple charcode mapped to a same glyph +don't dump image when there is nothing ==Future== +Integrate splash/cairo +native support for image +native support for draw +type 3 fonts +combine lines (unwarp) argument auto-completion use absolute positioning for long whitespace color invert From 2dcc60d6deef87004712983b933e77cbea055c69 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 12 Dec 2012 16:34:48 +0800 Subject: [PATCH 36/38] Update README.md --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index 3d1f3e6..b103ed6 100644 --- a/README.md +++ b/README.md @@ -113,11 +113,6 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u * [I want more features](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-feature_commission) * [More](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ) - -**WINDOWS XP USERS: Please make sure ClearType is turned on** - -(Control Panel -> Display -> Appearance -> Effects -> "Use the following method to smooth edges of screen fonts" -> ClearType) - ## LICENSE GPLv2 & GPLv3 Dual licensed From f138adc339afebe3d538d0dd72fa57acf4526bd2 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 12 Dec 2012 16:54:48 +0800 Subject: [PATCH 37/38] Update README.md --- README.md | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index b103ed6..8faaea5 100644 --- a/README.md +++ b/README.md @@ -16,28 +16,38 @@ A beautiful demo is worth a thousand words: pdf2htmlEX renders PDF files in HTML, utilizing modern Web technologies, aims to provide an accuracy rendering, while keeping optimized for Web display. -It is optimized for modern web browsers.On Linux/Mac, the generated HTML pages could be as beautiful as PDF files. +It is optimized for modern web browsers. On Linux/Mac, the generated HTML pages could be as beautiful as PDF files. 
This program is designed for scientific papers with complicate formulas and figures, therefore precise rendering is the #1 concern. But of course general PDF files are also supported. +### Why HTML ? + +HTML, together with CSS and Javascript, is much more open and flexible than PDF. Almost everything can be customized. + - Embedding documents to web pages with consistent theme and behavior + - Cross references to other documents are much easier and intuitive + - More functions to the document with Javascript, e.g. access control, animation, statistics + +Readers can also benefit + - Read while downloading + - Plugin-free + ## Features -* Single HTML file output +* Optional single HTML file output * Precise rendering -* Text Selection -* Font embedding & reencoding for Web -* Proper styling (Color, Transformation...) +* Text preserved - you can select & copy & paste +* Proper styling + - Font - extracted and reencoded + - Color + - Transformation * Links -* Optimization for Web * [EXPERIMENTAL] Path drawing with CSS - * Orthogonal lines - * Rectangles - * Linear gradients - -### Objects rendered as images - -* Type 3 fonts -* Non-text object + - Orthogonal lines + - Rectangles + - Linear gradients +* Not fully supported, and rendered as images + - Type 3 fonts + - Non-text object ## Get started @@ -99,12 +109,6 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u man pdf2htmlEX -### For Geeks - -* Experimental and unsupported - - pdf2htmlEX --process-nontext 0 --css-draw 1 /path/to/foobar.pdf - ## FAQ * [Troubleshooting compilation errors](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-compile) From bb12f8d60e9fdd5495b7d5c1471e6cc265175f33 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 18 Dec 2012 22:00:16 +0800 Subject: [PATCH 38/38] .. 
--- debian/changelog | 7 +++++++ debian/control | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/debian/changelog b/debian/changelog index 3e7697e..6228965 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +pdf2htmlex (0.6-1~git201212182148rd76af-0ubuntu1) quantal; urgency=low + + * fix dependency of poppler for quantal + * + + -- WANG Lu Tue, 18 Dec 2012 21:48:35 +0800 + pdf2htmlex (0.6-1~git201212111844rd76af-0ubuntu1) quantal; urgency=low * Package for quantal diff --git a/debian/control b/debian/control index 9cd17c6..3aac4d6 100644 --- a/debian/control +++ b/debian/control @@ -8,6 +8,6 @@ Homepage: http://github.com/coolwanglu/pdf2htmlEX Package: pdf2htmlex Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends}, libpoppler27 (>= 0.20.3), libboost-filesystem-dev, libboost-program-options-dev, libpng12-0, libfontforge1 +Depends: ${shlibs:Depends}, ${misc:Depends}, libpoppler27 (>= 0.20.3) | libpoppler28, libboost-filesystem-dev, libboost-program-options-dev, libpng12-0, libfontforge1 Description: Converts PDF to HTML without losing format pdf2htmlEX converts PDF to HTML while retaining text, format & style as much as possible