diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d31641..6db0dc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build configuration (Debug, Release, project(pdf2htmlEX) cmake_minimum_required(VERSION 2.6.0 FATAL_ERROR) -include_directories(${CMAKE_SOURCE_DIR}/src/include) +include_directories(${CMAKE_SOURCE_DIR}/src) set(PDF2HTMLEX_VERSION "0.6") set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION}) @@ -138,36 +138,52 @@ if(NOT CXX0X_SUPPORT) endif() -configure_file (${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h.in ${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h) +configure_file (${CMAKE_SOURCE_DIR}/src/pdf2htmlEX-config.h.in ${CMAKE_SOURCE_DIR}/src/pdf2htmlEX-config.h) configure_file (${CMAKE_SOURCE_DIR}/pdf2htmlEX.1.in ${CMAKE_SOURCE_DIR}/pdf2htmlEX.1) add_executable(pdf2htmlEX + src/Param.h + src/pdf2htmlEX-config.h src/pdf2htmlEX.cc - src/include/HTMLRenderer.h - src/HTMLRenderer/general.cc - src/HTMLRenderer/state.cc - src/HTMLRenderer/install.cc - src/HTMLRenderer/export.cc - src/HTMLRenderer/text.cc - src/HTMLRenderer/image.cc + src/HTMLRenderer/HTMLRenderer.h src/HTMLRenderer/draw.cc + src/HTMLRenderer/export.cc + src/HTMLRenderer/general.cc + src/HTMLRenderer/image.cc + src/HTMLRenderer/install.cc + src/HTMLRenderer/TextLineBuffer.h + src/HTMLRenderer/TextLineBuffer.cc src/HTMLRenderer/link.cc - src/include/namespace.h - src/HTMLRenderer/LineBuffer.cc - src/include/ffw.h - src/ffw.c - src/include/BackgroundRenderer.h - src/include/SplashBackgroundRenderer.h - src/SplashBackgroundRenderer.cc - src/include/CairoBackgroundRenderer.h - src/CairoBackgroundRenderer.cc - src/include/Preprocessor.h - src/Preprocessor.cc - src/include/util.h - src/util.cc - src/include/ArgParser.h - src/ArgParser.cc - src/include/pdf2htmlEX-config.h + src/HTMLRenderer/state.cc + src/HTMLRenderer/text.cc + src/BackgroundRenderer/BackgroundRenderer.h + src/BackgroundRenderer/SplashBackgroundRenderer.h + 
src/BackgroundRenderer/SplashBackgroundRenderer.cc + src/BackgroundRenderer/CairoBackgroundRenderer.h + src/BackgroundRenderer/CairoBackgroundRenderer.cc + src/util/ArgParser.h + src/util/ArgParser.cc + src/util/base64stream.h + src/util/base64stream.cc + src/util/const.h + src/util/const.cc + src/util/ffw.h + src/util/ffw.c + src/util/math.h + src/util/math.cc + src/util/misc.h + src/util/misc.cc + src/util/namespace.h + src/util/path.h + src/util/path.cc + src/util/Preprocessor.h + src/util/Preprocessor.cc + src/util/StringFormatter.h + src/util/StringFormatter.cc + src/util/TmpFiles.h + src/util/TmpFiles.cc + src/util/unicode.h + src/util/unicode.cc ) target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS}) diff --git a/ChangeLog b/ChangeLog index 9f270de..8df58bf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ Latest v0.6 +* New parameter: --use-cropbox +* Progress indicator +* Create a glyph for ' ' when missing +* Code refining + v0.5 2012.10.06 diff --git a/README.md b/README.md index ad25446..8faaea5 100644 --- a/README.md +++ b/README.md @@ -16,28 +16,38 @@ A beautiful demo is worth a thousand words: pdf2htmlEX renders PDF files in HTML, utilizing modern Web technologies, aims to provide an accuracy rendering, while keeping optimized for Web display. -It is optimized for modern web browsers.On Linux/Mac, the generated HTML pages could be as beautiful as PDF files. +It is optimized for modern web browsers. On Linux/Mac, the generated HTML pages could be as beautiful as PDF files. This program is designed for scientific papers with complicate formulas and figures, therefore precise rendering is the #1 concern. But of course general PDF files are also supported. +### Why HTML ? + +HTML, together with CSS and Javascript, is much more open and flexible than PDF. Almost everything can be customized. 
+ - Embedding documents to web pages with consistent theme and behavior + - Cross references to other documents are much easier and intuitive + - More functions to the document with Javascript, e.g. access control, animation, statistics + +Readers can also benefit + - Read while downloading + - Plugin-free + ## Features -* Single HTML file output +* Optional single HTML file output * Precise rendering -* Text Selection -* Font embedding & reencoding for Web -* Proper styling (Color, Transformation...) +* Text preserved - you can select & copy & paste +* Proper styling + - Font - extracted and reencoded + - Color + - Transformation * Links -* Optimization for Web * [EXPERIMENTAL] Path drawing with CSS - * Orthogonal lines - * Rectangles - * Linear gradients - -### Objects rendered as images - -* Type 3 fonts -* Non-text object + - Orthogonal lines + - Rectangles + - Linear gradients +* Not fully supported, and rendered as images + - Type 3 fonts + - Non-text object ## Get started @@ -99,12 +109,6 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u man pdf2htmlEX -### For Geeks - -* Experimental and unsupported - - pdf2htmlEX --process-nontext 0 --css-draw 1 /path/to/foobar.pdf - ## FAQ * [Troubleshooting compilation errors](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-compile) @@ -113,11 +117,6 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u * [I want more features](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-feature_commission) * [More](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ) - -**WINDOWS XP USERS: Please make sure ClearType is turned on** - -(Control Panel -> Display -> Appearance -> Effects -> "Use the following method to smooth edges of screen fonts" -> ClearType) - ## LICENSE GPLv2 & GPLv3 Dual licensed @@ -161,4 +160,5 @@ pdf2htmlEX is inspired by the following projects: ### Special Thanks * Hongliang Tian +* Wanmin Liu diff --git a/TODO b/TODO index 
e1a81c7..18bb97b 100644 --- a/TODO +++ b/TODO @@ -1,30 +1,37 @@ +<<<<<<< HEAD manually locate font if fixed name word space/offset before the first letter (calendar pdf) don't dump image when there is nothing +======= +>>>>>>> master -Integrate splash/cairo -native support for image -native support for draw +word space/offset before the first letter (calendar pdf) -draw non-orthogonal lines with CSS +minimum line width of css drawing -position history stack (popstate) ==Wait until someone asks== +position history stack (popstate) +draw non-orthogonal lines with CSS try harder finding glyph names (using fontforge) for CID Type 0 rename single-html -> embed-font/image/css ... merge sub/sup into one line precise link dest: zoom multiple charcode mapped to a same glyph +don't dump image when there is nothing ==Future== +Integrate splash/cairo +native support for image +native support for draw +type 3 fonts +combine lines (unwarp) argument auto-completion use absolute positioning for long whitespace color invert detect duplicate base fonts when embedding disable selection if we know unicode is wrong -combine lines (unwarp) -Printing check if we can add information to the font, and let browsers show ligatures automatically +Printing diff --git a/debian/changelog b/debian/changelog index bf75d52..6228965 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,16 @@ +pdf2htmlex (0.6-1~git201212182148rd76af-0ubuntu1) quantal; urgency=low + + * fix dependency of poppler for quantal + * + + -- WANG Lu Tue, 18 Dec 2012 21:48:35 +0800 + +pdf2htmlex (0.6-1~git201212111844rd76af-0ubuntu1) quantal; urgency=low + + * Package for quantal + + -- WANG Lu Tue, 11 Dec 2012 18:44:44 +0800 + pdf2htmlex (0.6-1~git201210070052rcb9a8-0ubuntu1) precise; urgency=low * New version diff --git a/debian/control b/debian/control index 9cd17c6..3aac4d6 100644 --- a/debian/control +++ b/debian/control @@ -8,6 +8,6 @@ Homepage: http://github.com/coolwanglu/pdf2htmlEX Package: pdf2htmlex 
Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends}, libpoppler27 (>= 0.20.3), libboost-filesystem-dev, libboost-program-options-dev, libpng12-0, libfontforge1 +Depends: ${shlibs:Depends}, ${misc:Depends}, libpoppler27 (>= 0.20.3) | libpoppler28, libboost-filesystem-dev, libboost-program-options-dev, libpng12-0, libfontforge1 Description: Converts PDF to HTML without losing format pdf2htmlEX converts PDF to HTML while retaining text, format & style as much as possible diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 9775233..aa9cdbc 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -54,9 +54,12 @@ If multiple values are specified, the minimum one will be used. If none is specified, pages will be rendered as 72DPI. .TP -.B --hpdi , --vpdi (Default: 144) +.B --hdpi , --vdpi (Default: 144) Specify the horizontal and vertical DPI for images .TP +.B --use-cropbox <0|1> (Default: 0) +Use CropBox instead of MediaBox for output. +.TP .B --process-nontext <0|1> (Default: 1) Whether to process non-text objects (as images) .TP @@ -110,7 +113,7 @@ If this value is set to 1, the ToUnicode Map is always applied, if provided in P If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste. -If set to 0, pdf2htmlEX would try it best to balance the two methods above. +If set to 0, pdf2htmlEX would try its best to balance the two methods above. .TP .B --space-as-offset <0|1> (Default: 0) Treat space characters as offsets, which may increase the size of the output. @@ -118,13 +121,13 @@ Treat space characters as offsets, which may increase the size of the output. Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. 
.TP .B --stretch-narrow-glyph <0|1> (Default: 0) -If set to 1, glyphs narrower than described in PDF will be strecth; otherwise space will be padded to the right of the glyphs +If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs .TP -.B --squeeze_wide_glyph <0|1> (Default: 1) +.B --squeeze-wide-glyph <0|1> (Default: 1) If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. .TP .B --remove-unused-glyph <0|1> (Default: 1) -[Experimental] If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size. +If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size. .TP .B --font-suffix (Default: .ttf), --font-format (Default: truetype) Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. diff --git a/share/base.css b/share/base.css index c04938a..fd42bd3 100644 --- a/share/base.css +++ b/share/base.css @@ -63,6 +63,8 @@ span { position:relative; vertical-align: baseline; + /* _ for spaces may need display:inline, which will override this */ + display:inline-block; } ._ { color:transparent; @@ -74,9 +76,6 @@ span { ::-moz-selection{ background: rgba(127,255,255,1); } -.i { - position:absolute; -} .j { display:none; } diff --git a/src/include/BackgroundRenderer.h b/src/BackgroundRenderer/BackgroundRenderer.h similarity index 100% rename from src/include/BackgroundRenderer.h rename to src/BackgroundRenderer/BackgroundRenderer.h diff --git a/src/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc similarity index 100% rename from src/CairoBackgroundRenderer.cc rename to src/BackgroundRenderer/CairoBackgroundRenderer.cc diff --git a/src/include/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h similarity index 100% rename from src/include/CairoBackgroundRenderer.h rename to 
src/BackgroundRenderer/CairoBackgroundRenderer.h diff --git a/src/CairoOutputDev/CairoFontEngine.cc b/src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.cc similarity index 100% rename from src/CairoOutputDev/CairoFontEngine.cc rename to src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.cc diff --git a/src/CairoOutputDev/CairoFontEngine.h b/src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.h similarity index 100% rename from src/CairoOutputDev/CairoFontEngine.h rename to src/BackgroundRenderer/CairoOutputDev/CairoFontEngine.h diff --git a/src/CairoOutputDev/CairoOutputDev.cc b/src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.cc similarity index 100% rename from src/CairoOutputDev/CairoOutputDev.cc rename to src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.cc diff --git a/src/CairoOutputDev/CairoOutputDev.h b/src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.h similarity index 100% rename from src/CairoOutputDev/CairoOutputDev.h rename to src/BackgroundRenderer/CairoOutputDev/CairoOutputDev.h diff --git a/src/CairoOutputDev/CairoRescaleBox.cc b/src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.cc similarity index 100% rename from src/CairoOutputDev/CairoRescaleBox.cc rename to src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.cc diff --git a/src/CairoOutputDev/CairoRescaleBox.h b/src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.h similarity index 100% rename from src/CairoOutputDev/CairoRescaleBox.h rename to src/BackgroundRenderer/CairoOutputDev/CairoRescaleBox.h diff --git a/src/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc similarity index 83% rename from src/SplashBackgroundRenderer.cc rename to src/BackgroundRenderer/SplashBackgroundRenderer.cc index cf60bb2..03be242 100644 --- a/src/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -19,7 +19,8 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, double originX, double originY, 
CharCode code, int nBytes, Unicode *u, int uLen) { - if((state->getRender() & 3) == 3) + if(((state->getRender() & 3) == 3) + || ((state->getFont()) && (state->getFont()->getWMode()))) { SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen); } @@ -32,7 +33,9 @@ static GBool annot_cb(Annot *, void *) { void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno, const string & filename) { doc->displayPage(this, pageno, param->h_dpi, param->v_dpi, - 0, true, false, false, + 0, + (param->use_cropbox == 0), + false, false, nullptr, nullptr, &annot_cb, nullptr); getBitmap()->writeImgFile(splashFormatPng, diff --git a/src/include/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h similarity index 97% rename from src/include/SplashBackgroundRenderer.h rename to src/BackgroundRenderer/SplashBackgroundRenderer.h index 8ba0cd5..ebf6c74 100644 --- a/src/include/SplashBackgroundRenderer.h +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h @@ -15,8 +15,8 @@ #include #include -#include "HTMLRenderer.h" #include "Param.h" +#include "HTMLRenderer/HTMLRenderer.h" namespace pdf2htmlEX { diff --git a/src/include/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h similarity index 85% rename from src/include/HTMLRenderer.h rename to src/HTMLRenderer/HTMLRenderer.h index 54e7654..9b61c32 100644 --- a/src/include/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -25,8 +25,10 @@ #include #include "Param.h" -#include "util.h" -#include "Preprocessor.h" +#include "util/Preprocessor.h" +#include "util/const.h" +#include "util/StringFormatter.h" +#include "util/TmpFiles.h" /* * Naming Convention @@ -38,7 +40,6 @@ * b - page Box * d - page Decoration * l - Line - * i - Image * j - Js data * p - Page * @@ -60,6 +61,51 @@ namespace pdf2htmlEX { +// we may need more info of a font in the future +class FontInfo +{ +public: + long long id; + bool use_tounicode; + int em_size; + double ascent, descent; +}; + +class GfxRGB_hash 
+{ +public: + size_t operator () (const GfxRGB & rgb) const + { + return (colToByte(rgb.r) << 16) | (colToByte(rgb.g) << 8) | (colToByte(rgb.b)); + } +}; + +class GfxRGB_equal +{ +public: + bool operator ()(const GfxRGB & rgb1, const GfxRGB & rgb2) const + { + return ((rgb1.r == rgb2.r) && (rgb1.g == rgb2.g) && (rgb1.b == rgb2.b)); + } +}; + +class Matrix_less +{ +public: + bool operator () (const Matrix & m1, const Matrix & m2) const + { + // Note that we only care about the first 4 elements + for(int i = 0; i < 4; ++i) + { + if(m1.m[i] < m2.m[i] - EPS) + return true; + if(m1.m[i] > m2.m[i] + EPS) + return false; + } + return false; + } +}; + class HTMLRenderer : public OutputDev { public: @@ -154,10 +200,8 @@ class HTMLRenderer : public OutputDev void post_process(); // set flags - void fix_stream (std::ostream & out); + void set_stream_flags (std::ostream & out); - void add_tmp_file (const std::string & fn); - void clean_tmp_files (); std::string dump_embedded_font (GfxFont * font, long long fn_id); void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false); @@ -335,82 +379,20 @@ class HTMLRenderer : public OutputDev double draw_tx, draw_ty; // some metrics have to be determined after all elements in the lines have been seen - class LineBuffer { - public: - LineBuffer (HTMLRenderer * renderer) : renderer(renderer) { } - - class State { - public: - void begin(std::ostream & out, const State * prev_state); - void end(std::ostream & out) const; - void hash(void); - int diff(const State & s) const; - - enum { - FONT_ID, - FONT_SIZE_ID, - COLOR_ID, - LETTER_SPACE_ID, - WORD_SPACE_ID, - RISE_ID, - - ID_COUNT - }; - - long long ids[ID_COUNT]; - - double ascent; - double descent; - double draw_font_size; - - size_t start_idx; // index of the first Text using this state - // for optimzation - long long hash_value; - bool need_close; - - static const char * format_str; // class names for each id - }; - - - class Offset { - 
public: - size_t start_idx; // should put this idx before text[start_idx]; - double width; - }; - - void reset(GfxState * state); - void append_unicodes(const Unicode * u, int l); - void append_offset(double width); - void append_state(void); - void flush(void); - - private: - // retrieve state from renderer - void set_state(State & state); - - HTMLRenderer * renderer; - - double x, y; - long long tm_id; - - std::vector states; - std::vector offsets; - std::vector text; - - // for flush - std::vector stack; - - } line_buf; - friend class LineBuffer; + class TextLineBuffer; + friend class TextLineBuffer; + TextLineBuffer * text_line_buf; // for font reencoding int32_t * cur_mapping; char ** cur_mapping2; int * width_list; + Preprocessor preprocessor; + TmpFiles tmp_files; // for string formatting - string_formatter str_fmt; + StringFormatter str_fmt; //////////////////////////////////////////////////// // styles & resources @@ -426,12 +408,9 @@ class HTMLRenderer : public OutputDev std::map rise_map; std::map height_map; - int image_count; - const Param * param; std::ofstream html_fout, css_fout; std::string html_path, css_path; - std::set tmp_files; static const std::string MANIFEST_FILENAME; }; diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc similarity index 85% rename from src/HTMLRenderer/LineBuffer.cc rename to src/HTMLRenderer/TextLineBuffer.cc index acbb944..a3ecc52 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -1,5 +1,5 @@ /* - * LineBuffer.cc + * TextLineBuffer.cc * * Generate and optimized HTML for one line * @@ -10,7 +10,10 @@ #include #include "HTMLRenderer.h" -#include "namespace.h" +#include "TextLineBuffer.h" +#include "util/namespace.h" +#include "util/unicode.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -18,19 +21,21 @@ using std::min; using std::max; using std::vector; using std::ostream; +using std::cerr; +using std::endl; -void 
HTMLRenderer::LineBuffer::reset(GfxState * state) +void HTMLRenderer::TextLineBuffer::reset(GfxState * state) { state->transform(state->getCurX(), state->getCurY(), &x, &y); tm_id = renderer->cur_ttm_id; } -void HTMLRenderer::LineBuffer::append_unicodes(const Unicode * u, int l) +void HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l) { text.insert(text.end(), u, u+l); } -void HTMLRenderer::LineBuffer::append_offset(double width) +void HTMLRenderer::TextLineBuffer::append_offset(double width) { if((!offsets.empty()) && (offsets.back().start_idx == text.size())) offsets.back().width += width; @@ -38,7 +43,7 @@ void HTMLRenderer::LineBuffer::append_offset(double width) offsets.push_back(Offset({text.size(), width})); } -void HTMLRenderer::LineBuffer::append_state(void) +void HTMLRenderer::TextLineBuffer::append_state(void) { if(states.empty() || (states.back().start_idx != text.size())) { @@ -49,10 +54,10 @@ void HTMLRenderer::LineBuffer::append_state(void) set_state(states.back()); } -void HTMLRenderer::LineBuffer::flush(void) +void HTMLRenderer::TextLineBuffer::flush(void) { /* - * Each Line is an independent absolute positioined block + * Each Line is an independent absolute positioned block * so even we have a few states or offsets, we may omit them */ if(text.empty()) return; @@ -80,8 +85,8 @@ void HTMLRenderer::LineBuffer::flush(void) ostream & out = renderer->html_fout; out << "
install_height(max_ascent) @@ -177,7 +182,7 @@ void HTMLRenderer::LineBuffer::flush(void) } -void HTMLRenderer::LineBuffer::set_state (State & state) +void HTMLRenderer::TextLineBuffer::set_state (State & state) { state.ids[State::FONT_ID] = renderer->cur_font_info->id; state.ids[State::FONT_SIZE_ID] = renderer->cur_fs_id; @@ -192,7 +197,7 @@ void HTMLRenderer::LineBuffer::set_state (State & state) state.draw_font_size = renderer->draw_font_size; } -void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state) +void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state) { bool first = true; for(int i = 0; i < ID_COUNT; ++i) @@ -225,13 +230,13 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s } } -void HTMLRenderer::LineBuffer::State::end(ostream & out) const +void HTMLRenderer::TextLineBuffer::State::end(ostream & out) const { if(need_close) out << ""; } -void HTMLRenderer::LineBuffer::State::hash(void) +void HTMLRenderer::TextLineBuffer::State::hash(void) { hash_value = 0; for(int i = 0; i < ID_COUNT; ++i) @@ -240,7 +245,7 @@ void HTMLRenderer::LineBuffer::State::hash(void) } } -int HTMLRenderer::LineBuffer::State::diff(const State & s) const +int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const { /* * A quick check based on hash_value @@ -256,5 +261,5 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const return d; } -const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr"; +const char * HTMLRenderer::TextLineBuffer::State::format_str = "fsclwr"; } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h new file mode 100644 index 0000000..e13ef20 --- /dev/null +++ b/src/HTMLRenderer/TextLineBuffer.h @@ -0,0 +1,78 @@ +#ifndef TEXTLINEBUFFER_H__ +#define TEXTLINEBUFFER_H__ + +#include +#include + +namespace pdf2htmlEX { + +class HTMLRenderer; +class HTMLRenderer::TextLineBuffer +{ +public: 
+ TextLineBuffer (HTMLRenderer * renderer) : renderer(renderer) { } + + class State { + public: + void begin(std::ostream & out, const State * prev_state); + void end(std::ostream & out) const; + void hash(void); + int diff(const State & s) const; + + enum { + FONT_ID, + FONT_SIZE_ID, + COLOR_ID, + LETTER_SPACE_ID, + WORD_SPACE_ID, + RISE_ID, + + ID_COUNT + }; + + long long ids[ID_COUNT]; + + double ascent; + double descent; + double draw_font_size; + + size_t start_idx; // index of the first Text using this state + // for optimization + long long hash_value; + bool need_close; + + static const char * format_str; // class names for each id + }; + + + class Offset { + public: + size_t start_idx; // should put this idx before text[start_idx]; + double width; + }; + + void reset(GfxState * state); + void append_unicodes(const Unicode * u, int l); + void append_offset(double width); + void append_state(void); + void flush(void); + +private: + // retrieve state from renderer + void set_state(State & state); + + HTMLRenderer * renderer; + + double x, y; + long long tm_id; + + std::vector states; + std::vector offsets; + std::vector text; + + // for flush + std::vector stack; +}; + +} // namespace pdf2htmlEX +#endif //TEXTLINEBUFFER_H__ diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index e745fe4..9d21d46 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -14,8 +14,9 @@ #include #include "HTMLRenderer.h" -#include "util.h" -#include "namespace.h" +#include "util/misc.h" +#include "util/math.h" +#include "util/namespace.h" namespace pdf2htmlEX { @@ -33,36 +34,36 @@ static bool is_horizontal_line(GfxSubpath * path) { return ((path->getNumPoints() == 2) && (!path->getCurve(1)) - && (_equal(path->getY(0), path->getY(1)))); + && (equal(path->getY(0), path->getY(1)))); } static bool is_vertical_line(GfxSubpath * path) { return ((path->getNumPoints() == 2) && (!path->getCurve(1)) - && (_equal(path->getX(0), path->getX(1)))); + && 
(equal(path->getX(0), path->getX(1)))); } static bool is_rectangle(GfxSubpath * path) { if (!(((path->getNumPoints() != 4) && (path->isClosed())) || ((path->getNumPoints() == 5) - && _equal(path->getX(0), path->getX(4)) - && _equal(path->getY(0), path->getY(4))))) + && equal(path->getX(0), path->getX(4)) + && equal(path->getY(0), path->getY(4))))) return false; for(int i = 1; i < path->getNumPoints(); ++i) if(path->getCurve(i)) return false; - return (_equal(path->getY(0), path->getY(1)) - && _equal(path->getX(1), path->getX(2)) - && _equal(path->getY(2), path->getY(3)) - && _equal(path->getX(3), path->getX(0))) - || (_equal(path->getX(0), path->getX(1)) - && _equal(path->getY(1), path->getY(2)) - && _equal(path->getX(2), path->getX(3)) - && _equal(path->getY(3), path->getY(0))); + return (equal(path->getY(0), path->getY(1)) + && equal(path->getX(1), path->getX(2)) + && equal(path->getY(2), path->getY(3)) + && equal(path->getX(3), path->getX(0))) + || (equal(path->getX(0), path->getX(1)) + && equal(path->getY(1), path->getY(2)) + && equal(path->getX(2), path->getX(3)) + && equal(path->getY(3), path->getY(0))); } static void get_shading_bbox(GfxState * state, GfxShading * shading, @@ -105,7 +106,7 @@ static void get_shading_bbox(GfxState * state, GfxShading * shading, */ static double get_angle(double dx, double dy) { - double r = _hypot(dx, dy); + double r = hypot(dx, dy); /* * acos always returns [0, pi] @@ -208,10 +209,10 @@ void LinearGradient::dumpto (ostream & out) auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) { - out << "background-image:" << (*iter) << "linear-gradient(" << _round(angle) << "rad"; + out << "background-image:" << (*iter) << "linear-gradient(" << round(angle) << "rad"; for(auto iter2 = stops.begin(); iter2 != stops.end(); ++iter2) { - out << "," << (iter2->rgb) << " " << _round((iter2->pos) * 100) << "%"; + out << "," << (iter2->rgb) << " " << round((iter2->pos) * 
100) << "%"; } out << ");"; } @@ -318,7 +319,7 @@ bool HTMLRenderer::css_do_path(GfxState *state, bool fill, bool test_only) GfxRGB * ps = fill ? nullptr : (&stroke_color); GfxRGB * pf = fill ? (&fill_color) : nullptr; - if(_equal(h, 0) || _equal(w, 0)) + if(equal(h, 0) || equal(w, 0)) { // orthogonal line @@ -351,7 +352,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co double new_tm[6]; memcpy(new_tm, tm, sizeof(new_tm)); - _tm_transform(new_tm, x, y); + tm_transform(new_tm, x, y); double scale = 1.0; { @@ -359,8 +360,8 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co double i1 = (new_tm[0] + new_tm[2]) / sqrt2; double i2 = (new_tm[1] + new_tm[3]) / sqrt2; - scale = _hypot(i1, i2); - if(_is_positive(scale)) + scale = hypot(i1, i2); + if(is_positive(scale)) { for(int i = 0; i < 4; ++i) new_tm[i] /= scale; @@ -383,8 +384,8 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co if(i > 0) html_fout << ' '; double lw = line_width_array[i] * scale; - html_fout << _round(lw); - if(_is_positive(lw)) html_fout << "px"; + html_fout << round(lw); + if(is_positive(lw)) html_fout << "px"; } html_fout << ";"; } @@ -407,10 +408,10 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co style_function(style_function_data, html_fout); } - html_fout << "bottom:" << _round(y) << "px;" - << "left:" << _round(x) << "px;" - << "width:" << _round(w * scale) << "px;" - << "height:" << _round(h * scale) << "px;"; + html_fout << "bottom:" << round(y) << "px;" + << "left:" << round(x) << "px;" + << "width:" << round(w * scale) << "px;" + << "height:" << round(h * scale) << "px;"; html_fout << "\">
"; } diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index 6caec63..cb893bb 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -11,7 +11,10 @@ #include #include "HTMLRenderer.h" -#include "namespace.h" +#include "util/namespace.h" +#include "util/base64stream.h" +#include "util/math.h" +#include "util/misc.h" namespace pdf2htmlEX { @@ -38,7 +41,7 @@ void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suff css_fout << ")format(\"" << fontfileformat << "\");}.f" << info.id << "{font-family:f" << info.id - << ";line-height:" << _round(info.ascent - info.descent) + << ";line-height:" << round(info.ascent - info.descent) << ";font-style:normal;font-weight:normal;}"; css_fout << endl; @@ -81,14 +84,14 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons else css_fout << "font-style:normal;"; - css_fout << "line-height:" << _round(info.ascent - info.descent) << ";"; + css_fout << "line-height:" << round(info.ascent - info.descent) << ";"; css_fout << "}" << endl; } void HTMLRenderer::export_font_size (long long fs_id, double font_size) { - css_fout << ".s" << fs_id << "{font-size:" << _round(font_size) << "px;}" << endl; + css_fout << ".s" << fs_id << "{font-size:" << round(font_size) << "px;}" << endl; } void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) @@ -99,7 +102,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) // we have already shifted the origin // TODO: recognize common matices - if(_tm_equal(tm, id_matrix, 4)) + if(tm_equal(tm, ID_MATRIX, 4)) { auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) @@ -112,10 +115,10 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) { // PDF use a different coordinate system from Web css_fout << *iter << "transform:matrix(" - << _round(tm[0]) << ',' - << 
_round(-tm[1]) << ',' - << _round(-tm[2]) << ',' - << _round(tm[3]) << ','; + << round(tm[0]) << ',' + << round(-tm[1]) << ',' + << round(-tm[2]) << ',' + << round(tm[3]) << ','; css_fout << "0,0);"; } @@ -125,12 +128,12 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) void HTMLRenderer::export_letter_space (long long ls_id, double letter_space) { - css_fout << ".l" << ls_id << "{letter-spacing:" << _round(letter_space) << "px;}" << endl; + css_fout << ".l" << ls_id << "{letter-spacing:" << round(letter_space) << "px;}" << endl; } void HTMLRenderer::export_word_space (long long ws_id, double word_space) { - css_fout << ".w" << ws_id << "{word-spacing:" << _round(word_space) << "px;}" << endl; + css_fout << ".w" << ws_id << "{word-spacing:" << round(word_space) << "px;}" << endl; } void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) @@ -141,19 +144,19 @@ void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) { if(ws_width > 0) - css_fout << "._" << ws_id << "{display:inline-block;width:" << _round(ws_width) << "px;}" << endl; + css_fout << "._" << ws_id << "{display:inline-block;width:" << round(ws_width) << "px;}" << endl; else - css_fout << "._" << ws_id << "{display:inline;margin-left:" << _round(ws_width) << "px;}" << endl; + css_fout << "._" << ws_id << "{display:inline;margin-left:" << round(ws_width) << "px;}" << endl; } void HTMLRenderer::export_rise (long long rise_id, double rise) { - css_fout << ".r" << rise_id << "{top:" << _round(-rise) << "px;}" << endl; + css_fout << ".r" << rise_id << "{top:" << round(-rise) << "px;}" << endl; } void HTMLRenderer::export_height (long long height_id, double height) { - css_fout << ".h" << height_id << "{height:" << _round(height) << "px;}" << endl; + css_fout << ".h" << height_id << "{height:" << round(height) << "px;}" << endl; } } diff --git a/src/HTMLRenderer/general.cc 
b/src/HTMLRenderer/general.cc index 9080e0f..1aa2d21 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -14,10 +14,14 @@ #include #include "HTMLRenderer.h" -#include "BackgroundRenderer.h" -#include "namespace.h" -#include "ffw.h" +#include "TextLineBuffer.h" #include "pdf2htmlEX-config.h" +#include "BackgroundRenderer/BackgroundRenderer.h" +#include "util/namespace.h" +#include "util/ffw.h" +#include "util/base64stream.h" +#include "util/math.h" +#include "util/path.h" namespace pdf2htmlEX { @@ -28,6 +32,8 @@ using std::max; using std::min_element; using std::vector; using std::abs; +using std::cerr; +using std::endl; static void dummy(void *, enum ErrorCategory, int pos, char *) { @@ -36,9 +42,9 @@ static void dummy(void *, enum ErrorCategory, int pos, char *) HTMLRenderer::HTMLRenderer(const Param * param) :OutputDev() ,line_opened(false) - ,line_buf(this) + ,text_line_buf(new TextLineBuffer(this)) ,preprocessor(param) - ,image_count(0) + ,tmp_files(*param) ,param(param) { if(!(param->debug)) @@ -55,8 +61,8 @@ HTMLRenderer::HTMLRenderer(const Param * param) HTMLRenderer::~HTMLRenderer() { + delete text_line_buf; ffw_finalize(); - clean_tmp_files(); delete [] cur_mapping; delete [] cur_mapping2; delete [] width_list; @@ -76,7 +82,7 @@ void HTMLRenderer::process(PDFDoc *doc) bg_renderer->startDoc(doc); } - int page_count = (param->last_page - param->first_page); + int page_count = (param->last_page - param->first_page + 1); for(int i = param->first_page; i <= param->last_page ; ++i) { cerr << "Working: " << (i-param->first_page) << "/" << page_count << '\r' << flush; @@ -87,21 +93,23 @@ void HTMLRenderer::process(PDFDoc *doc) html_fout.open((char*)page_fn, ofstream::binary); if(!html_fout) throw string("Cannot open ") + (char*)page_fn + " for writing"; - fix_stream(html_fout); + set_stream_flags(html_fout); } if(param->process_nontext) { auto fn = str_fmt("%s/p%x.png", (param->single_html ? 
param->tmp_dir : param->dest_dir).c_str(), i); if(param->single_html) - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); bg_renderer->render_page(doc, i, (char*)fn); } doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, - 0, true, false, false, + 0, + (param->use_cropbox == 0), + false, false, nullptr, nullptr, nullptr, nullptr); if(param->split_pages) @@ -170,8 +178,8 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) cur_font_size = draw_font_size = 0; cur_fs_id = install_font_size(cur_font_size); - memcpy(cur_text_tm, id_matrix, sizeof(cur_text_tm)); - memcpy(draw_text_tm, id_matrix, sizeof(draw_text_tm)); + memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm)); + memcpy(draw_text_tm, ID_MATRIX, sizeof(draw_text_tm)); cur_ttm_id = install_transform_matrix(draw_text_tm); cur_letter_space = cur_word_space = 0; @@ -210,7 +218,7 @@ void HTMLRenderer::endPage() { for(int i = 0; i < 6; ++i) { if(i > 0) html_fout << ","; - html_fout << _round(default_ctm[i]); + html_fout << round(default_ctm[i]); } html_fout << "]"; @@ -232,17 +240,17 @@ void HTMLRenderer::pre_process(PDFDoc * doc) vector zoom_factors; - if(_is_positive(param->zoom)) + if(is_positive(param->zoom)) { zoom_factors.push_back(param->zoom); } - if(_is_positive(param->fit_width)) + if(is_positive(param->fit_width)) { zoom_factors.push_back((param->fit_width) / preprocessor.get_max_width()); } - if(_is_positive(param->fit_height)) + if(is_positive(param->fit_height)) { zoom_factors.push_back((param->fit_height) / preprocessor.get_max_height()); } @@ -280,13 +288,13 @@ void HTMLRenderer::pre_process(PDFDoc * doc) : str_fmt("%s/%s", param->dest_dir.c_str(), param->css_filename.c_str()); if(param->single_html && (!param->split_pages)) - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); css_path = (char*)fn, css_fout.open(css_path, ofstream::binary); if(!css_fout) throw string("Cannot open ") + (char*)fn + " for writing"; - fix_stream(css_fout); + 
set_stream_flags(css_fout); } // if split-pages is specified, open & close the file in the process loop @@ -301,13 +309,13 @@ void HTMLRenderer::pre_process(PDFDoc * doc) * Otherwise just generate it */ auto fn = str_fmt("%s/__pages", param->tmp_dir.c_str()); - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); html_path = (char*)fn; html_fout.open(html_path, ofstream::binary); if(!html_fout) throw string("Cannot open ") + (char*)fn + " for writing"; - fix_stream(html_fout); + set_stream_flags(html_fout); } } @@ -327,7 +335,7 @@ void HTMLRenderer::post_process() output.open((char*)fn, ofstream::binary); if(!output) throw string("Cannot open ") + (char*)fn + " for writing"; - fix_stream(output); + set_stream_flags(output); } // apply manifest @@ -385,40 +393,13 @@ void HTMLRenderer::post_process() } } -void HTMLRenderer::fix_stream (std::ostream & out) +void HTMLRenderer::set_stream_flags(std::ostream & out) { // we output all ID's in hex // browsers are not happy with scientific notations out << hex << fixed; } -void HTMLRenderer::add_tmp_file(const string & fn) -{ - if(!param->clean_tmp) - return; - - if(tmp_files.insert(fn).second && param->debug) - cerr << "Add new temporary file: " << fn << endl; -} - -void HTMLRenderer::clean_tmp_files() -{ - if(!param->clean_tmp) - return; - - for(auto iter = tmp_files.begin(); iter != tmp_files.end(); ++iter) - { - const string & fn = *iter; - remove(fn.c_str()); - if(param->debug) - cerr << "Remove temporary file: " << fn << endl; - } - - remove(param->tmp_dir.c_str()); - if(param->debug) - cerr << "Remove temporary directory: " << param->tmp_dir << endl; -} - void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy) { string fn = get_filename(path); diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc index cac7c2c..9c3da52 100644 --- a/src/HTMLRenderer/image.cc +++ b/src/HTMLRenderer/image.cc @@ -8,7 +8,7 @@ */ #include "HTMLRenderer.h" -#include "namespace.h" 
+#include "util/namespace.h" namespace pdf2htmlEX { diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index eea21d7..ea3efa7 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -15,12 +15,15 @@ #include "Param.h" #include "HTMLRenderer.h" -#include "namespace.h" -#include "util.h" +#include "util/namespace.h" +#include "util/math.h" +#include "util/misc.h" namespace pdf2htmlEX { using std::abs; +using std::cerr; +using std::endl; const FontInfo * HTMLRenderer::install_font(GfxFont * font) { @@ -203,7 +206,7 @@ void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info) long long HTMLRenderer::install_font_size(double font_size) { auto iter = font_size_map.lower_bound(font_size - EPS); - if((iter != font_size_map.end()) && (_equal(iter->first, font_size))) + if((iter != font_size_map.end()) && (equal(iter->first, font_size))) return iter->second; long long new_fs_id = font_size_map.size(); @@ -218,7 +221,7 @@ long long HTMLRenderer::install_transform_matrix(const double * tm) memcpy(m.m, tm, sizeof(m.m)); auto iter = transform_matrix_map.lower_bound(m); - if((iter != transform_matrix_map.end()) && (_tm_equal(m.m, iter->first.m, 4))) + if((iter != transform_matrix_map.end()) && (tm_equal(m.m, iter->first.m, 4))) return iter->second; long long new_tm_id = transform_matrix_map.size(); @@ -230,7 +233,7 @@ long long HTMLRenderer::install_transform_matrix(const double * tm) long long HTMLRenderer::install_letter_space(double letter_space) { auto iter = letter_space_map.lower_bound(letter_space - EPS); - if((iter != letter_space_map.end()) && (_equal(iter->first, letter_space))) + if((iter != letter_space_map.end()) && (equal(iter->first, letter_space))) return iter->second; long long new_ls_id = letter_space_map.size(); @@ -242,7 +245,7 @@ long long HTMLRenderer::install_letter_space(double letter_space) long long HTMLRenderer::install_word_space(double word_space) { auto iter = 
word_space_map.lower_bound(word_space - EPS); - if((iter != word_space_map.end()) && (_equal(iter->first, word_space))) + if((iter != word_space_map.end()) && (equal(iter->first, word_space))) return iter->second; long long new_ws_id = word_space_map.size(); diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc index 83cf6aa..fa78b9e 100644 --- a/src/HTMLRenderer/link.cc +++ b/src/HTMLRenderer/link.cc @@ -11,16 +11,20 @@ #include #include -#include #include -#include "namespace.h" +#include "HTMLRenderer.h" +#include "util/namespace.h" +#include "util/math.h" +#include "util/misc.h" namespace pdf2htmlEX { using std::ostringstream; using std::min; using std::max; +using std::cerr; +using std::endl; /* * The detailed rectangle area of the link destination @@ -211,9 +215,9 @@ void HTMLRenderer::processLink(AnnotLink * al) border_top_bottom_width, border_left_right_width); if(abs(border_top_bottom_width - border_left_right_width) < EPS) - html_fout << "border-width:" << _round(border_top_bottom_width) << "px;"; + html_fout << "border-width:" << round(border_top_bottom_width) << "px;"; else - html_fout << "border-width:" << _round(border_top_bottom_width) << "px " << _round(border_left_right_width) << "px;"; + html_fout << "border-width:" << round(border_top_bottom_width) << "px " << round(border_left_right_width) << "px;"; } auto style = border->getStyle(); switch(style) @@ -267,13 +271,13 @@ void HTMLRenderer::processLink(AnnotLink * al) html_fout << "border-style:none;"; } - _tm_transform(default_ctm, x, y); + tm_transform(default_ctm, x, y); html_fout << "position:absolute;" - << "left:" << _round(x) << "px;" - << "bottom:" << _round(y) << "px;" - << "width:" << _round(w) << "px;" - << "height:" << _round(h) << "px;"; + << "left:" << round(x) << "px;" + << "bottom:" << round(y) << "px;" + << "width:" << round(w) << "px;" + << "height:" << round(h) << "px;"; // fix for IE html_fout << "background-color:rgba(255,255,255,0.000001);"; diff --git 
a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index e5a1dac..4115eef 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -16,8 +16,9 @@ #include #include "HTMLRenderer.h" -#include "namespace.h" -#include "util.h" +#include "TextLineBuffer.h" +#include "util/namespace.h" +#include "util/math.h" namespace pdf2htmlEX { @@ -104,7 +105,7 @@ void HTMLRenderer::check_state_change(GfxState * state) } double new_font_size = state->getFontSize(); - if(!_equal(cur_font_size, new_font_size)) + if(!equal(cur_font_size, new_font_size)) { need_rescale_font = true; cur_font_size = new_font_size; @@ -132,7 +133,7 @@ void HTMLRenderer::check_state_change(GfxState * state) new_ctm[5] = m1[1] * m2[4] + m1[3] * m2[5] + m1[5]; //new_ctm[4] = new_ctm[5] = 0; - if(!_tm_equal(new_ctm, cur_text_tm)) + if(!tm_equal(new_ctm, cur_text_tm)) { need_recheck_position = true; need_rescale_font = true; @@ -147,10 +148,10 @@ void HTMLRenderer::check_state_change(GfxState * state) double new_draw_text_tm[6]; memcpy(new_draw_text_tm, cur_text_tm, sizeof(new_draw_text_tm)); - double new_draw_text_scale = 1.0/text_scale_factor2 * _hypot(new_draw_text_tm[2], new_draw_text_tm[3]); + double new_draw_text_scale = 1.0/text_scale_factor2 * hypot(new_draw_text_tm[2], new_draw_text_tm[3]); double new_draw_font_size = cur_font_size; - if(_is_positive(new_draw_text_scale)) + if(is_positive(new_draw_text_scale)) { new_draw_font_size *= new_draw_text_scale; for(int i = 0; i < 4; ++i) @@ -161,19 +162,28 @@ void HTMLRenderer::check_state_change(GfxState * state) new_draw_text_scale = 1.0; } - if(!(_equal(new_draw_text_scale, draw_text_scale))) + if(!is_positive(new_draw_font_size)) + { + // Page is flipped and css can't handle it. 
+ new_draw_font_size = -new_draw_font_size; + + for(int i = 0; i < 4; ++i) + new_draw_text_tm[i] *= -1; + } + + if(!(equal(new_draw_text_scale, draw_text_scale))) { draw_text_scale_changed = true; draw_text_scale = new_draw_text_scale; } - if(!(_equal(new_draw_font_size, draw_font_size))) + if(!(equal(new_draw_font_size, draw_font_size))) { new_line_state = max(new_line_state, NLS_SPAN); draw_font_size = new_draw_font_size; cur_fs_id = install_font_size(draw_font_size); } - if(!(_tm_equal(new_draw_text_tm, draw_text_tm, 4))) + if(!(tm_equal(new_draw_text_tm, draw_text_tm, 4))) { new_line_state = max(new_line_state, NLS_DIV); memcpy(draw_text_tm, new_draw_text_tm, sizeof(draw_text_tm)); @@ -199,21 +209,21 @@ void HTMLRenderer::check_state_change(GfxState * state) */ bool merged = false; - if(_tm_equal(old_ctm, cur_text_tm, 4)) + if(tm_equal(old_ctm, cur_text_tm, 4)) { double dy = cur_ty - draw_ty; double tdx = old_ctm[4] - cur_text_tm[4] - cur_text_tm[2] * dy; double tdy = old_ctm[5] - cur_text_tm[5] - cur_text_tm[3] * dy; - if(_equal(cur_text_tm[0] * tdy, cur_text_tm[1] * tdx)) + if(equal(cur_text_tm[0] * tdy, cur_text_tm[1] * tdx)) { - if(_is_positive(cur_text_tm[0])) + if(is_positive(cur_text_tm[0])) { draw_tx += tdx / cur_text_tm[0]; draw_ty += dy; merged = true; } - else if (_is_positive(cur_text_tm[1])) + else if (is_positive(cur_text_tm[1])) { draw_tx += tdy / cur_text_tm[1]; draw_ty += dy; @@ -221,7 +231,7 @@ void HTMLRenderer::check_state_change(GfxState * state) } else { - if((_equal(tdx,0)) && (_equal(tdy,0))) + if((equal(tdx,0)) && (equal(tdy,0))) { // free draw_tx = cur_tx; @@ -246,7 +256,7 @@ void HTMLRenderer::check_state_change(GfxState * state) if(all_changed || letter_space_changed || draw_text_scale_changed) { double new_letter_space = state->getCharSpace(); - if(!_equal(cur_letter_space, new_letter_space)) + if(!equal(cur_letter_space, new_letter_space)) { new_line_state = max(new_line_state, NLS_SPAN); cur_letter_space = new_letter_space; @@ 
-259,7 +269,7 @@ void HTMLRenderer::check_state_change(GfxState * state) if(all_changed || word_space_changed || draw_text_scale_changed) { double new_word_space = state->getWordSpace(); - if(!_equal(cur_word_space, new_word_space)) + if(!equal(cur_word_space, new_word_space)) { new_line_state = max(new_line_state, NLS_SPAN); cur_word_space = new_word_space; @@ -294,7 +304,7 @@ void HTMLRenderer::check_state_change(GfxState * state) if(all_changed || rise_changed || draw_text_scale_changed) { double new_rise = state->getRise(); - if(!_equal(cur_rise, new_rise)) + if(!equal(cur_rise, new_rise)) { new_line_state = max(new_line_state, NLS_SPAN); cur_rise = new_rise; @@ -333,7 +343,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state) { close_text_line(); - line_buf.reset(state); + text_line_buf->reset(state); //resync position draw_ty = cur_ty; @@ -350,14 +360,14 @@ void HTMLRenderer::prepare_text_line(GfxState * state) } else { - line_buf.append_offset(target); + text_line_buf->append_offset(target); draw_tx += target / draw_text_scale; } } if(new_line_state != NLS_NONE) { - line_buf.append_state(); + text_line_buf->append_state(); } line_opened = true; @@ -368,7 +378,7 @@ void HTMLRenderer::close_text_line() if(line_opened) { line_opened = false; - line_buf.flush(); + text_line_buf->flush(); } } diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 7e9b566..1ccf254 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -15,9 +15,14 @@ #include #include -#include "ffw.h" #include "HTMLRenderer.h" -#include "namespace.h" +#include "TextLineBuffer.h" +#include "util/ffw.h" +#include "util/namespace.h" +#include "util/unicode.h" +#include "util/path.h" +#include "util/math.h" +#include "util/misc.h" namespace pdf2htmlEX { @@ -26,6 +31,8 @@ using std::min; using std::all_of; using std::floor; using std::swap; +using std::cerr; +using std::endl; string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) { @@ -127,7 
+134,7 @@ string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) obj.streamReset(); filepath = (char*)str_fmt("%s/f%llx%s", param->tmp_dir.c_str(), fn_id, suffix.c_str()); - add_tmp_file(filepath); + tmp_files.add(filepath); ofstream outf(filepath, ofstream::binary); if(!outf) @@ -171,7 +178,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo if(param->debug) { auto fn = str_fmt("%s/__raw_font_%lld", param->tmp_dir.c_str(), info.id, param->font_suffix.c_str()); - add_tmp_file((char*)fn); + tmp_files.add((char*)fn); ofstream((char*)fn, ofstream::binary) << ifstream(filepath).rdbuf(); } @@ -374,7 +381,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo // in auto mode, just drop the tounicode map if(!retried) { - cerr << "ToUnicode CMap is not valid and got dropped" << endl; + cerr << "ToUnicode CMap is not valid and got dropped for font: " << hex << info.id << dec << endl; retried = true; codeset.clear(); info.use_tounicode = false; @@ -410,7 +417,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo ffw_reencode_raw(cur_mapping, max_key + 1, 1); - // we need the space chracter for offsets + // we need the space character for offsets if(!has_space) { int space_width; @@ -437,9 +444,9 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo * */ string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); - add_tmp_file(cur_tmp_fn); + tmp_files.add(cur_tmp_fn); string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); - add_tmp_file(other_tmp_fn); + tmp_files.add(other_tmp_fn); ffw_save(cur_tmp_fn.c_str()); @@ -482,7 +489,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo info.id, param->font_suffix.c_str()); if(param->single_html) - add_tmp_file(fn); + tmp_files.add(fn); 
ffw_load_font(cur_tmp_fn.c_str()); ffw_metric(&info.ascent, &info.descent); @@ -517,14 +524,6 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) char *p = s->getCString(); int len = s->getLength(); - //debug - { - if(strcmp(p, "ORTUG") == 0) - { - cerr << "DEBUG: " << (int)(state->getRender()) << endl; - } - } - double dx = 0; double dy = 0; double dxerr = 0; @@ -538,10 +537,11 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) CharCode code; Unicode *u = nullptr; - while (len > 0) { + while (len > 0) + { auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); - if(!(_equal(ox, 0) && _equal(oy, 0))) + if(!(equal(ox, 0) && equal(oy, 0))) { cerr << "TODO: non-zero origins" << endl; } @@ -556,25 +556,25 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param->space_as_offset)) { // ignore horiz_scaling, as it's merged in CTM - line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); + text_line_buf->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } else { if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { - line_buf.append_unicodes(u, uLen); + text_line_buf->append_unicodes(u, uLen); } else { if(cur_font_info->use_tounicode) { Unicode uu = check_unicode(u, uLen, code, font); - line_buf.append_unicodes(&uu, 1); + text_line_buf->append_unicodes(&uu, 1); } else { Unicode uu = unicode_from_font(code, font); - line_buf.append_unicodes(&uu, 1); + text_line_buf->append_unicodes(&uu, 1); } } } diff --git a/src/include/Param.h b/src/Param.h similarity index 98% rename from src/include/Param.h rename to src/Param.h index ee7610e..5f9a7d3 100644 --- a/src/include/Param.h +++ b/src/Param.h @@ -28,6 +28,7 @@ struct Param double zoom; double fit_width, fit_height; double h_dpi, v_dpi; + int use_cropbox; int process_nontext; int single_html; diff --git a/src/include/util.h 
b/src/include/util.h deleted file mode 100644 index bb09467..0000000 --- a/src/include/util.h +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Constants & Misc functions - * - * - * by WangLu - * 2012.08.10 - */ - - -#ifndef UTIL_H__ -#define UTIL_H__ - -#include -#include -#include -#include -#include -#include -#include - -#ifndef nullptr -#define nullptr (NULL) -#endif - -namespace pdf2htmlEX { - -static const double EPS = 1e-6; -extern const double id_matrix[6]; - -static const double DEFAULT_DPI = 72.0; - -extern const std::map BASE_14_FONT_CSS_FONT_MAP; -extern const std::map GB_ENCODED_FONT_NAME_MAP; -// map to embed files into html -// key: (suffix, if_embed_content) -// value: (prefix string, suffix string) -extern const std::map, std::pair > EMBED_STRING_MAP; - -static inline double _round(double x) { return (std::abs(x) > EPS) ? x : 0.0; } -static inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; } -static inline bool _is_positive(double x) { return x > EPS; } -static inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6) -{ - for(int i = 0; i < size; ++i) - if(!_equal(tm1[i], tm2[i])) - return false; - return true; -} -static inline double _hypot(double x, double y) { return std::sqrt(x*x+y*y); } - -void _tm_transform(const double * tm, double & x, double & y, bool is_delta = false); -void _tm_multiply(double * tm_left, const double * tm_right); - -static inline long long hash_ref(const Ref * id) -{ - return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen); -} - -/* - * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering - */ -bool isLegalUnicode(Unicode u); - -Unicode map_to_private(CharCode code); - -/* - * Try to determine the Unicode value directly from the information in the font - */ -Unicode unicode_from_font (CharCode code, GfxFont * font); - -/* - * We have to use a single Unicode value to reencode fonts - * if we got multi-unicode values, it might be expanded ligature, try to restore it - * 
if we cannot figure it out at the end, use a private mapping - */ -Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font); - -void outputUnicodes(std::ostream & out, const Unicode * u, int uLen); - -class GfxRGB_hash -{ -public: - size_t operator () (const GfxRGB & rgb) const - { - return (colToByte(rgb.r) << 16) | (colToByte(rgb.g) << 8) | (colToByte(rgb.b)); - } -}; - -class GfxRGB_equal -{ -public: - bool operator ()(const GfxRGB & rgb1, const GfxRGB & rgb2) const - { - return ((rgb1.r == rgb2.r) && (rgb1.g == rgb2.g) && (rgb1.b == rgb1.b)); - } -}; - -// we may need more info of a font in the future -class FontInfo -{ -public: - long long id; - bool use_tounicode; - int em_size; - double ascent, descent; -}; - -class Matrix_less -{ -public: - bool operator () (const Matrix & m1, const Matrix & m2) const - { - // Note that we only care about the first 4 elements - for(int i = 0; i < 4; ++i) - { - if(m1.m[i] < m2.m[i] - EPS) - return true; - if(m1.m[i] > m2.m[i] + EPS) - return false; - } - return false; - } -}; - -class base64stream -{ -public: - - base64stream(std::istream & in) : in(&in) { } - base64stream(std::istream && in) : in(&in) { } - - std::ostream & dumpto(std::ostream & out) - { - unsigned char buf[3]; - while(in->read((char*)buf, 3)) - { - out << base64_encoding[(buf[0] & 0xfc)>>2] - << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)] - << base64_encoding[((buf[1] & 0x0f)<<2) | ((buf[2] & 0xc0)>>6)] - << base64_encoding[(buf[2] & 0x3f)]; - } - auto cnt = in->gcount(); - if(cnt > 0) - { - for(int i = cnt; i < 3; ++i) - buf[i] = 0; - - out << base64_encoding[(buf[0] & 0xfc)>>2] - << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)]; - - if(cnt > 1) - { - out << base64_encoding[(buf[1] & 0x0f)<<2]; - } - else - { - out << '='; - } - out << '='; - } - - return out; - } - -private: - std::istream * in; - static const char * base64_encoding; -}; - -static inline std::ostream & operator << (std::ostream & out, 
base64stream & bf) { return bf.dumpto(out); } -static inline std::ostream & operator << (std::ostream & out, base64stream && bf) { return bf.dumpto(out); } - -class string_formatter -{ -public: - class guarded_pointer - { - public: - guarded_pointer(string_formatter * sf) : sf(sf) { ++(sf->buf_cnt); } - ~guarded_pointer(void) { --(sf->buf_cnt); } - operator char* () { return &(sf->buf.front()); } - private: - string_formatter * sf; - }; - - string_formatter() : buf_cnt(0) { buf.reserve(L_tmpnam); } - /* - * Important: - * there is only one buffer, so new strings will replace old ones - */ - guarded_pointer operator () (const char * format, ...) { - assert((buf_cnt == 0) && "string_formatter: buffer is reused!"); - - va_list vlist; - va_start(vlist, format); - int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); - va_end(vlist); - if(l >= (int)buf.capacity()) - { - buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); - va_start(vlist, format); - l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); - va_end(vlist); - } - assert(l >= 0); // we should fail when vsnprintf fail - assert(l < (int)buf.capacity()); - return guarded_pointer(this); - } -private: - friend class guarded_pointer; - std::vector buf; - int buf_cnt; -}; - -void create_directories(std::string path); - -bool is_truetype_suffix(const std::string & suffix); - -std::string get_filename(const std::string & path); -std::string get_suffix(const std::string & path); - -/* - * In PDF, edges of the rectangle are in the middle of the borders - * In HTML, edges are completely outside the rectangle - */ -void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2, - double border_width, - double & x, double & y, double & w, double & h, - double & border_top_bottom_width, - double & border_left_right_width); - -std::ostream & operator << (std::ostream & out, const GfxRGB & rgb); - -} // namespace util -#endif //UTIL_H__ diff --git a/src/include/pdf2htmlEX-config.h.in 
b/src/pdf2htmlEX-config.h.in similarity index 100% rename from src/include/pdf2htmlEX-config.h.in rename to src/pdf2htmlEX-config.h.in diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index f56b778..3a82ed5 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -19,10 +19,11 @@ #include #include -#include "HTMLRenderer.h" #include "Param.h" #include "pdf2htmlEX-config.h" -#include "ArgParser.h" +#include "HTMLRenderer/HTMLRenderer.h" +#include "util/ArgParser.h" +#include "util/path.h" using namespace std; using namespace pdf2htmlEX; @@ -66,6 +67,7 @@ void parse_options (int argc, char **argv) .add("fit-height", ¶m.fit_height, 0, "fit height", nullptr, true) .add("hdpi", ¶m.h_dpi, 144.0, "horizontal DPI for non-text") .add("vdpi", ¶m.v_dpi, 144.0, "vertical DPI for non-text") + .add("use-cropbox", ¶m.use_cropbox, 0, "use CropBox instead of MediaBox") .add("process-nontext", ¶m.process_nontext, 1, "process nontext objects") .add("single-html", ¶m.single_html, 1, "combine everything into one single HTML file") diff --git a/src/util.cc b/src/util.cc deleted file mode 100644 index a69654e..0000000 --- a/src/util.cc +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Misc functions - * - * - * by WangLu - * 2012.08.10 - */ - -#include -#include - -#include -#include -#include -#include -#include - -// for mkdir -#include -#include - -#include "util.h" - -using std::cerr; -using std::endl; -using std::string; -using std::map; -using std::ostream; - -namespace pdf2htmlEX { - -const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; - -const map BASE_14_FONT_CSS_FONT_MAP({ - { "Courier", "Courier,monospace" }, - { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" }, - { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" }, - { "Symbol", "Symbol,\"Standard Symbols L\"" }, - { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" }, -}); - -const map GB_ENCODED_FONT_NAME_MAP({ - {"\xCB\xCE\xCC\xE5", "SimSun"}, - {"\xBA\xDA\xCC\xE5", "SimHei"}, - 
{"\xBF\xAC\xCC\xE5_GB2312", "SimKai"}, - {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"}, - {"\xC1\xA5\xCA\xE9", "SimLi"}, -}); - -const std::map, std::pair > EMBED_STRING_MAP({ - {{".css", 0}, {""}}, - {{".css", 1}, {""}}, - {{".js", 0}, {""}}, - {{".js", 1}, {""}} -}); - -void _tm_transform(const double * tm, double & x, double & y, bool is_delta) -{ - double xx = x, yy = y; - x = tm[0] * xx + tm[2] * yy; - y = tm[1] * xx + tm[3] * yy; - if(!is_delta) - { - x += tm[4]; - y += tm[5]; - } -} - -void _tm_multiply(double * tm_left, const double * tm_right) -{ - double old[4]; - memcpy(old, tm_left, sizeof(old)); - - tm_left[0] = old[0] * tm_right[0] + old[2] * tm_right[1]; - tm_left[1] = old[1] * tm_right[0] + old[3] * tm_right[1]; - tm_left[2] = old[0] * tm_right[2] + old[2] * tm_right[3]; - tm_left[3] = old[1] * tm_right[2] + old[3] * tm_right[3]; - tm_left[4] += old[0] * tm_right[4] + old[2] * tm_right[5]; - tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5]; -} - -bool isLegalUnicode(Unicode u) -{ - /* - if((u == 9) || (u == 10) || (u == 13)) - return true; - */ - - if(u <= 31) - return false; - - if((u >= 127) && (u <= 159)) - return false; - - if((u >= 0xd800) && (u <= 0xdfff)) - return false; - - return true; -} - -Unicode map_to_private(CharCode code) -{ - Unicode private_mapping = (Unicode)(code + 0xE000); - if(private_mapping > 0xF8FF) - { - private_mapping = (Unicode)((private_mapping - 0xF8FF) + 0xF0000); - if(private_mapping > 0xFFFFD) - { - private_mapping = (Unicode)((private_mapping - 0xFFFFD) + 0x100000); - if(private_mapping > 0x10FFFD) - { - cerr << "Warning: all private use unicode are used" << endl; - } - } - } - return private_mapping; -} - -Unicode unicode_from_font (CharCode code, GfxFont * font) -{ - if(!font->isCIDFont()) - { - char * cname = dynamic_cast(font)->getCharName(code); - // may be untranslated ligature - if(cname) - { - Unicode ou = globalParams->mapNameToUnicode(cname); - - if(isLegalUnicode(ou)) - return ou; - } - } - - return 
map_to_private(code); -} - -Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) -{ - if(len == 0) - return map_to_private(code); - - if(len == 1) - { - if(isLegalUnicode(*u)) - return *u; - } - - return unicode_from_font(code, font); -} - -/* - * Copied from UTF.h / UTF8.h in poppler - */ -static int mapUTF8(Unicode u, char *buf, int bufSize) { - if (u <= 0x0000007f) { - if (bufSize < 1) { - return 0; - } - buf[0] = (char)u; - return 1; - } else if (u <= 0x000007ff) { - if (bufSize < 2) { - return 0; - } - buf[0] = (char)(0xc0 + (u >> 6)); - buf[1] = (char)(0x80 + (u & 0x3f)); - return 2; - } else if (u <= 0x0000ffff) { - if (bufSize < 3) { - return 0; - } - buf[0] = (char)(0xe0 + (u >> 12)); - buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[2] = (char)(0x80 + (u & 0x3f)); - return 3; - } else if (u <= 0x0010ffff) { - if (bufSize < 4) { - return 0; - } - buf[0] = (char)(0xf0 + (u >> 18)); - buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); - buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[3] = (char)(0x80 + (u & 0x3f)); - return 4; - } else { - return 0; - } -} - -void outputUnicodes(ostream & out, const Unicode * u, int uLen) -{ - for(int i = 0; i < uLen; ++i) - { - switch(u[i]) - { - case '&': - out << "&"; - break; - case '\"': - out << """; - break; - case '\'': - out << "'"; - break; - case '<': - out << "<"; - break; - case '>': - out << ">"; - break; - default: - { - char buf[4]; - auto n = mapUTF8(u[i], buf, 4); - out.write(buf, n); - } - } - } -} - -const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - -void create_directories(string path) -{ - if(path.empty()) return; - - size_t idx = path.rfind('/'); - if(idx != string::npos) - { - create_directories(path.substr(0, idx)); - } - - int r = mkdir(path.c_str(), S_IRWXU); - if(r != 0) - { - if(errno == EEXIST) - { - struct stat stat_buf; - if((stat(path.c_str(), &stat_buf) == 0) && S_ISDIR(stat_buf.st_mode)) - return; - } - - 
throw string("Cannot create directory: ") + path; - } -} - -bool is_truetype_suffix(const string & suffix) -{ - return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"); -} - -string get_filename (const string & path) -{ - size_t idx = path.rfind('/'); - if(idx == string::npos) - return path; - else if (idx == path.size() - 1) - return ""; - return path.substr(idx + 1); -} - -string get_suffix(const string & path) -{ - string fn = get_filename(path); - size_t idx = fn.rfind('.'); - if(idx == string::npos) - return ""; - else - { - string s = fn.substr(idx); - for(auto iter = s.begin(); iter != s.end(); ++iter) - *iter = tolower(*iter); - return s; - } -} - -void css_fix_rectangle_border_width(double x1, double y1, - double x2, double y2, - double border_width, - double & x, double & y, double & w, double & h, - double & border_top_bottom_width, - double & border_left_right_width) -{ - w = x2 - x1; - if(w > border_width) - { - w -= border_width; - border_left_right_width = border_width; - } - else - { - border_left_right_width = border_width + w/2; - w = 0; - } - x = x1 - border_width / 2; - - h = y2 - y1; - if(h > border_width) - { - h -= border_width; - border_top_bottom_width = border_width; - } - else - { - border_top_bottom_width = border_width + h/2; - h = 0; - } - y = y1 - border_width / 2; -} - -ostream & operator << (ostream & out, const GfxRGB & rgb) -{ - auto flags= out.flags(); - out << std::dec << "rgb(" - << (int)colToByte(rgb.r) << "," - << (int)colToByte(rgb.g) << "," - << (int)colToByte(rgb.b) << ")"; - out.flags(flags); - return out; -} - -} // namespace pdf2htmlEX diff --git a/src/ArgParser.cc b/src/util/ArgParser.cc similarity index 100% rename from src/ArgParser.cc rename to src/util/ArgParser.cc diff --git a/src/include/ArgParser.h b/src/util/ArgParser.h similarity index 100% rename from src/include/ArgParser.h rename to src/util/ArgParser.h diff --git a/src/Preprocessor.cc b/src/util/Preprocessor.cc similarity index 95% rename 
from src/Preprocessor.cc rename to src/util/Preprocessor.cc index b2a9677..1c77337 100644 --- a/src/Preprocessor.cc +++ b/src/util/Preprocessor.cc @@ -15,7 +15,8 @@ #include #include "Preprocessor.h" -#include "util.h" +#include "util/misc.h" +#include "util/const.h" namespace pdf2htmlEX { @@ -41,7 +42,7 @@ Preprocessor::~Preprocessor(void) void Preprocessor::process(PDFDoc * doc) { - int page_count = (param->last_page - param->first_page); + int page_count = (param->last_page - param->first_page + 1); for(int i = param->first_page; i <= param->last_page ; ++i) { cerr << "Preprocessing: " << (i-param->first_page) << "/" << page_count << '\r' << flush; diff --git a/src/include/Preprocessor.h b/src/util/Preprocessor.h similarity index 100% rename from src/include/Preprocessor.h rename to src/util/Preprocessor.h diff --git a/src/util/StringFormatter.cc b/src/util/StringFormatter.cc new file mode 100644 index 0000000..b361c2d --- /dev/null +++ b/src/util/StringFormatter.cc @@ -0,0 +1,30 @@ +#include +#include +#include + +#include "StringFormatter.h" + +namespace pdf2htmlEX { + +StringFormatter::GuardedPointer StringFormatter::operator () (const char * format, ...) 
+{ + assert((buf_cnt == 0) && "StringFormatter: buffer is reused!"); + + va_list vlist; + va_start(vlist, format); + int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + if(l >= (int)buf.capacity()) + { + buf.reserve(std::max((long)(l+1), (long)buf.capacity() * 2)); + va_start(vlist, format); + l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + } + assert(l >= 0); // we should fail when vsnprintf fail + assert(l < (int)buf.capacity()); + return GuardedPointer(this); +} + +} //namespace pdf2htmlEX + diff --git a/src/util/StringFormatter.h b/src/util/StringFormatter.h new file mode 100644 index 0000000..2d34126 --- /dev/null +++ b/src/util/StringFormatter.h @@ -0,0 +1,44 @@ +/* + * Buffer reusing string formatter + * + * by WangLu + * 2012.11.29 + */ + +#ifndef STRINGFORMATTER_H__ +#define STRINGFORMATTER_H__ + +#include +#include + +namespace pdf2htmlEX { + +class StringFormatter +{ +public: + class GuardedPointer + { + public: + GuardedPointer(StringFormatter * sf) : sf(sf) { ++(sf->buf_cnt); } + GuardedPointer(const GuardedPointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); } + ~GuardedPointer(void) { --(sf->buf_cnt); } + operator char* () const { return &(sf->buf.front()); } + private: + StringFormatter * sf; + }; + + StringFormatter() : buf_cnt(0) { buf.reserve(L_tmpnam); } + /* + * Important: + * there is only one buffer, so new strings will replace old ones + */ + GuardedPointer operator () (const char * format, ...); + +private: + friend class GuardedPointer; + std::vector buf; + int buf_cnt; +}; + +} //namespace pdf2htmlEX +#endif //STRINGFORMATTER_H__ diff --git a/src/util/TmpFiles.cc b/src/util/TmpFiles.cc new file mode 100644 index 0000000..616ac28 --- /dev/null +++ b/src/util/TmpFiles.cc @@ -0,0 +1,56 @@ +/* + * TmpFiles.cc + * + * Collect and clean-up temporary files + * + * implemented by WangLu + * split off by Filodej + */ + +#include +#include "TmpFiles.h" +#include "Param.h" + +using namespace std; 
+ +namespace pdf2htmlEX { + + +TmpFiles::TmpFiles( const Param& param ) + : param( param ) +{ } + +TmpFiles::~TmpFiles() +{ + clean(); +} + +void TmpFiles::add( const string & fn) +{ + if(!param.clean_tmp) + return; + + if(tmp_files.insert(fn).second && param.debug) + cerr << "Add new temporary file: " << fn << endl; +} + +void TmpFiles::clean() +{ + if(!param.clean_tmp) + return; + + for(auto iter = tmp_files.begin(); iter != tmp_files.end(); ++iter) + { + const string & fn = *iter; + remove(fn.c_str()); + if(param.debug) + cerr << "Remove temporary file: " << fn << endl; + } + + remove(param.tmp_dir.c_str()); + if(param.debug) + cerr << "Remove temporary directory: " << param.tmp_dir << endl; +} + + +} // namespace pdf2htmlEX diff --git a/src/util/TmpFiles.h b/src/util/TmpFiles.h new file mode 100644 index 0000000..f036593 --- /dev/null +++ b/src/util/TmpFiles.h @@ -0,0 +1,29 @@ +#ifndef TMPFILES_H__ +#define TMPFILES_H__ + +#include +#include +#include "Param.h" + +namespace pdf2htmlEX { + +class TmpFiles +{ +public: + explicit TmpFiles( const Param& param ); + ~TmpFiles(); + + void add( const std::string& fn); + +private: + void clean(); + +private: + const Param& param; + std::set tmp_files; + +}; + +} // namespace pdf2htmlEX + +#endif //TMPFILES_H__ diff --git a/src/util/base64stream.cc b/src/util/base64stream.cc new file mode 100644 index 0000000..7df00fe --- /dev/null +++ b/src/util/base64stream.cc @@ -0,0 +1,45 @@ +#include "base64stream.h" + +namespace pdf2htmlEX { + +using std::ostream; + +ostream & base64stream::dumpto(ostream & out) +{ + unsigned char buf[3]; + while(in->read((char*)buf, 3)) + { + out << base64_encoding[(buf[0] & 0xfc)>>2] + << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)] + << base64_encoding[((buf[1] & 0x0f)<<2) | ((buf[2] & 0xc0)>>6)] + << base64_encoding[(buf[2] & 0x3f)]; + } + auto cnt = in->gcount(); + if(cnt > 0) + { + for(int i = cnt; i < 3; ++i) + buf[i] = 0; + + out << base64_encoding[(buf[0] & 0xfc)>>2] + << 
base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)]; + + if(cnt > 1) + { + out << base64_encoding[(buf[1] & 0x0f)<<2]; + } + else + { + out << '='; + } + out << '='; + } + + return out; +} + +const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +ostream & operator << (ostream & out, base64stream & bf) { return bf.dumpto(out); } +ostream & operator << (ostream & out, base64stream && bf) { return bf.dumpto(out); } + +} //namespace pdf2htmlEX diff --git a/src/util/base64stream.h b/src/util/base64stream.h new file mode 100644 index 0000000..46955cf --- /dev/null +++ b/src/util/base64stream.h @@ -0,0 +1,33 @@ +/* + * Base64 Encoding + * + * by WangLu + * 2012.11.29 + */ + +#ifndef BASE64STREAM_H__ +#define BASE64STREAM_H__ + +#include + +namespace pdf2htmlEX { + +class base64stream +{ +public: + + base64stream(std::istream & in) : in(&in) { } + base64stream(std::istream && in) : in(&in) { } + + std::ostream & dumpto(std::ostream & out); + +private: + std::istream * in; + static const char * base64_encoding; +}; + +std::ostream & operator << (std::ostream & out, base64stream & bf); +std::ostream & operator << (std::ostream & out, base64stream && bf); + +} //namespace pdf2htmlEX +#endif //BASE64STREAM_H__ diff --git a/src/util/const.cc b/src/util/const.cc new file mode 100644 index 0000000..1a5d2d4 --- /dev/null +++ b/src/util/const.cc @@ -0,0 +1,39 @@ +/* + * Constants + * + * by WangLu + * 2012.11.29 + */ + +#include "const.h" + +namespace pdf2htmlEX { + +using std::map; +using std::string; + +const double ID_MATRIX[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + +const map BASE_14_FONT_CSS_FONT_MAP({ + { "Courier", "Courier,monospace" }, + { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" }, + { "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" }, + { "Symbol", "Symbol,\"Standard Symbols L\"" }, + { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" }, +}); + +const map 
GB_ENCODED_FONT_NAME_MAP({ + {"\xCB\xCE\xCC\xE5", "SimSun"}, + {"\xBA\xDA\xCC\xE5", "SimHei"}, + {"\xBF\xAC\xCC\xE5_GB2312", "SimKai"}, + {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"}, + {"\xC1\xA5\xCA\xE9", "SimLi"}, +}); + +const std::map, std::pair > EMBED_STRING_MAP({ + {{".css", 0}, {""}}, + {{".css", 1}, {""}}, + {{".js", 0}, {""}}, + {{".js", 1}, {""}} +}); +} //namespace pdf2htmlEX diff --git a/src/util/const.h b/src/util/const.h new file mode 100644 index 0000000..ffd2357 --- /dev/null +++ b/src/util/const.h @@ -0,0 +1,35 @@ +/* + * Constants + * + * by WangLu + * 2012.11.29 + */ + +#ifndef CONST_H__ +#define CONST_H__ + +#include +#include + +namespace pdf2htmlEX { + +#ifndef nullptr +#define nullptr (NULL) +#endif + +static const double EPS = 1e-6; +static const double DEFAULT_DPI = 72.0; +extern const double ID_MATRIX[6]; + +// PDF base 14 font name -> CSS font name +extern const std::map BASE_14_FONT_CSS_FONT_MAP; +// For GB encoded font names +extern const std::map GB_ENCODED_FONT_NAME_MAP; +// map to embed files into html +// key: (suffix, if_embed_content) +// value: (prefix string, suffix string) +extern const std::map, std::pair > EMBED_STRING_MAP; + +} // namespace pdf2htmlEX + +#endif //CONST_H__ diff --git a/src/ffw.c b/src/util/ffw.c similarity index 100% rename from src/ffw.c rename to src/util/ffw.c diff --git a/src/include/ffw.h b/src/util/ffw.h similarity index 100% rename from src/include/ffw.h rename to src/util/ffw.h diff --git a/src/util/math.cc b/src/util/math.cc new file mode 100644 index 0000000..d23d48f --- /dev/null +++ b/src/util/math.cc @@ -0,0 +1,32 @@ +#include +#include "math.h" + +namespace pdf2htmlEX { + +void tm_transform(const double * tm, double & x, double & y, bool is_delta) +{ + double xx = x, yy = y; + x = tm[0] * xx + tm[2] * yy; + y = tm[1] * xx + tm[3] * yy; + if(!is_delta) + { + x += tm[4]; + y += tm[5]; + } +} + +void tm_multiply(double * tm_left, const double * tm_right) +{ + double old[4]; + memcpy(old, tm_left, 
sizeof(old)); + + tm_left[0] = old[0] * tm_right[0] + old[2] * tm_right[1]; + tm_left[1] = old[1] * tm_right[0] + old[3] * tm_right[1]; + tm_left[2] = old[0] * tm_right[2] + old[2] * tm_right[3]; + tm_left[3] = old[1] * tm_right[2] + old[3] * tm_right[3]; + tm_left[4] += old[0] * tm_right[4] + old[2] * tm_right[5]; + tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5]; +} + +} //namespace pdf2htmlEX + diff --git a/src/util/math.h b/src/util/math.h new file mode 100644 index 0000000..9c9f5db --- /dev/null +++ b/src/util/math.h @@ -0,0 +1,33 @@ +/* + * Math functions + * + * by WangLu + * 2012.11.29 + */ + +#ifndef MATH_H__ +#define MATH_H__ + +#include + +#include "const.h" + +namespace pdf2htmlEX { + +static inline double round(double x) { return (std::abs(x) > EPS) ? x : 0.0; } +static inline bool equal(double x, double y) { return std::abs(x-y) < EPS; } +static inline bool is_positive(double x) { return x > EPS; } +static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6) +{ + for(int i = 0; i < size; ++i) + if(!equal(tm1[i], tm2[i])) + return false; + return true; +} +static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); } + +void tm_transform(const double * tm, double & x, double & y, bool is_delta = false); +void tm_multiply(double * tm_left, const double * tm_right); + +} //namespace pdf2htmlEX +#endif //MATH_H__ diff --git a/src/util/misc.cc b/src/util/misc.cc new file mode 100644 index 0000000..e2572c0 --- /dev/null +++ b/src/util/misc.cc @@ -0,0 +1,66 @@ +/* + * Misc functions + * + * + * by WangLu + * 2012.08.10 + */ + +#include + +#include "misc.h" + +using std::cerr; +using std::endl; +using std::string; +using std::map; +using std::ostream; + +namespace pdf2htmlEX { + +void css_fix_rectangle_border_width(double x1, double y1, + double x2, double y2, + double border_width, + double & x, double & y, double & w, double & h, + double & border_top_bottom_width, + double & border_left_right_width) +{ + w = 
x2 - x1; + if(w > border_width) + { + w -= border_width; + border_left_right_width = border_width; + } + else + { + border_left_right_width = border_width + w/2; + w = 0; + } + x = x1 - border_width / 2; + + h = y2 - y1; + if(h > border_width) + { + h -= border_width; + border_top_bottom_width = border_width; + } + else + { + border_top_bottom_width = border_width + h/2; + h = 0; + } + y = y1 - border_width / 2; +} + +ostream & operator << (ostream & out, const GfxRGB & rgb) +{ + auto flags= out.flags(); + out << std::dec << "rgb(" + << (int)colToByte(rgb.r) << "," + << (int)colToByte(rgb.g) << "," + << (int)colToByte(rgb.b) << ")"; + out.flags(flags); + return out; +} + +} // namespace pdf2htmlEX diff --git a/src/util/misc.h b/src/util/misc.h new file mode 100644 index 0000000..11ae739 --- /dev/null +++ b/src/util/misc.h @@ -0,0 +1,37 @@ +/* + * Help classes and Functions + * + * by WangLu + * 2012.08.10 + */ + + +#ifndef UTIL_H__ +#define UTIL_H__ + +#include + +#include + +namespace pdf2htmlEX { + +static inline long long hash_ref(const Ref * id) +{ + return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen); +} + +/* + * In PDF, edges of the rectangle are in the middle of the borders + * In HTML, edges are completely outside the rectangle + */ +void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2, + double border_width, + double & x, double & y, double & w, double & h, + double & border_top_bottom_width, + double & border_left_right_width); + +std::ostream & operator << (std::ostream & out, const GfxRGB & rgb); + +} // namespace pdf2htmlEX + +#endif //UTIL_H__ diff --git a/src/include/namespace.h b/src/util/namespace.h similarity index 89% rename from src/include/namespace.h rename to src/util/namespace.h index a74f936..46dcd0f 100644 --- a/src/include/namespace.h +++ b/src/util/namespace.h @@ -12,8 +12,6 @@ using std::hex; using std::dec; using std::string; -using std::cout; -using std::cerr; using std::endl; using 
std::make_pair; using std::ifstream; diff --git a/src/util/path.cc b/src/util/path.cc new file mode 100644 index 0000000..ce80a4f --- /dev/null +++ b/src/util/path.cc @@ -0,0 +1,73 @@ +/* + * Functions manipulating filenames and paths + * + * by WangLu + * 2012.11.29 + */ + +#include +#include +#include + +#include "path.h" + +using std::string; + +namespace pdf2htmlEX { + +void create_directories(const string & path) +{ + if(path.empty()) return; + + size_t idx = path.rfind('/'); + if(idx != string::npos) + { + create_directories(path.substr(0, idx)); + } + + int r = mkdir(path.c_str(), S_IRWXU); + if(r != 0) + { + if(errno == EEXIST) + { + struct stat stat_buf; + if((stat(path.c_str(), &stat_buf) == 0) && S_ISDIR(stat_buf.st_mode)) + return; + } + + throw string("Cannot create directory: ") + path; + } +} + +bool is_truetype_suffix(const string & suffix) +{ + return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"); +} + +string get_filename (const string & path) +{ + size_t idx = path.rfind('/'); + if(idx == string::npos) + return path; + else if (idx == path.size() - 1) + return ""; + return path.substr(idx + 1); +} + +string get_suffix(const string & path) +{ + string fn = get_filename(path); + size_t idx = fn.rfind('.'); + if(idx == string::npos) + return ""; + else + { + string s = fn.substr(idx); + for(auto iter = s.begin(); iter != s.end(); ++iter) + *iter = tolower(*iter); + return s; + } +} + + +} //namespace pdf2htmlEX diff --git a/src/util/path.h b/src/util/path.h new file mode 100644 index 0000000..4f82a8e --- /dev/null +++ b/src/util/path.h @@ -0,0 +1,23 @@ +/* + * Function handling filenames and paths + * + * by WangLu + * 2012.11.29 + */ + +#ifndef PATH_H__ +#define PATH_H__ + +#include + +namespace pdf2htmlEX { + +void create_directories(const std::string & path); + +bool is_truetype_suffix(const std::string & suffix); + +std::string get_filename(const std::string & path); +std::string get_suffix(const std::string & path); + +} 
//namespace pdf2htmlEX +#endif //PATH_H__ diff --git a/src/util/unicode.cc b/src/util/unicode.cc new file mode 100644 index 0000000..86c85f1 --- /dev/null +++ b/src/util/unicode.cc @@ -0,0 +1,157 @@ +/* + * Unicode manipulation functions + * + * by WangLu + * 2012.11.29 + */ + +#include + +#include "unicode.h" + +namespace pdf2htmlEX { + +using std::cerr; +using std::endl; +using std::ostream; + +bool isLegalUnicode(Unicode u) +{ + /* + if((u == 9) || (u == 10) || (u == 13)) + return true; + */ + + if(u <= 31) + return false; + + if((u >= 127) && (u <= 159)) + return false; + + if((u >= 0xd800) && (u <= 0xdfff)) + return false; + + return true; +} + +Unicode map_to_private(CharCode code) +{ + Unicode private_mapping = (Unicode)(code + 0xE000); + if(private_mapping > 0xF8FF) + { + private_mapping = (Unicode)((private_mapping - 0xF8FF) + 0xF0000); + if(private_mapping > 0xFFFFD) + { + private_mapping = (Unicode)((private_mapping - 0xFFFFD) + 0x100000); + if(private_mapping > 0x10FFFD) + { + cerr << "Warning: all private use unicode are used" << endl; + } + } + } + return private_mapping; +} + +Unicode unicode_from_font (CharCode code, GfxFont * font) +{ + if(!font->isCIDFont()) + { + char * cname = dynamic_cast(font)->getCharName(code); + // may be untranslated ligature + if(cname) + { + Unicode ou = globalParams->mapNameToUnicode(cname); + + if(isLegalUnicode(ou)) + return ou; + } + } + + return map_to_private(code); +} + +Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) +{ + if(len == 0) + return map_to_private(code); + + if(len == 1) + { + if(isLegalUnicode(*u)) + return *u; + } + + return unicode_from_font(code, font); +} + +/* + * Copied from UTF.h / UTF8.h in poppler + */ +static int mapUTF8(Unicode u, char *buf, int bufSize) { + if (u <= 0x0000007f) { + if (bufSize < 1) { + return 0; + } + buf[0] = (char)u; + return 1; + } else if (u <= 0x000007ff) { + if (bufSize < 2) { + return 0; + } + buf[0] = (char)(0xc0 + (u >> 6)); + buf[1] = 
(char)(0x80 + (u & 0x3f)); + return 2; + } else if (u <= 0x0000ffff) { + if (bufSize < 3) { + return 0; + } + buf[0] = (char)(0xe0 + (u >> 12)); + buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[2] = (char)(0x80 + (u & 0x3f)); + return 3; + } else if (u <= 0x0010ffff) { + if (bufSize < 4) { + return 0; + } + buf[0] = (char)(0xf0 + (u >> 18)); + buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); + buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[3] = (char)(0x80 + (u & 0x3f)); + return 4; + } else { + return 0; + } +} + +void outputUnicodes(ostream & out, const Unicode * u, int uLen) +{ + for(int i = 0; i < uLen; ++i) + { + switch(u[i]) + { + case '&': + out << "&"; + break; + case '\"': + out << """; + break; + case '\'': + out << "'"; + break; + case '<': + out << "<"; + break; + case '>': + out << ">"; + break; + default: + { + char buf[4]; + auto n = mapUTF8(u[i], buf, 4); + out.write(buf, n); + } + } + } +} + +} //namespace pdf2htmlEX diff --git a/src/util/unicode.h b/src/util/unicode.h new file mode 100644 index 0000000..9cc9dc6 --- /dev/null +++ b/src/util/unicode.h @@ -0,0 +1,41 @@ +/* + * Unicode manipulation functions + * + * by WangLu + * 2012.11.29 + */ + +#ifndef UNICODE_H__ +#define UNICODE_H__ + +#include + +#include +#include + +namespace pdf2htmlEX { + +/* + * Check if the unicode is valid for HTML + * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering + */ +bool isLegalUnicode(Unicode u); + +Unicode map_to_private(CharCode code); + +/* * Try to determine the Unicode value directly from the information in the font */ +Unicode unicode_from_font (CharCode code, GfxFont * font); + +/* + * We have to use a single Unicode value to reencode fonts + * if we got multi-unicode values, it might be expanded ligature, try to restore it + * if we cannot figure it out at the end, use a private mapping + */ +Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font); + +void outputUnicodes(std::ostream & out, const Unicode * u, int uLen); + + 
+} // namespace pdf2htmlEX + +#endif //UNICODE_H__