diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..0a4053e --- /dev/null +++ b/AUTHORS @@ -0,0 +1,14 @@ +Deepak +filodej +hasufell +Herbert Jones +Hongliang Tian +John Hewson +Lu Wang + +Packagers: +Arthur Titeica +Deepak Thukral +Jamie Ly +Lu Wang + diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fa6c52..0a6839d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 2.6.0 FATAL_ERROR) include_directories(${CMAKE_SOURCE_DIR}/src) -set(PDF2HTMLEX_VERSION "0.6") +set(PDF2HTMLEX_VERSION "0.7") set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION}) add_custom_target(dist COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD @@ -154,6 +154,7 @@ add_executable(pdf2htmlEX src/HTMLRenderer/TextLineBuffer.h src/HTMLRenderer/TextLineBuffer.cc src/HTMLRenderer/link.cc + src/HTMLRenderer/outline.cc src/HTMLRenderer/state.cc src/HTMLRenderer/text.cc src/BackgroundRenderer/BackgroundRenderer.h diff --git a/ChangeLog b/ChangeLog index 8a89b88..3a89c38 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,17 @@ -Latest v0.6 +Latest v0.7 +* Process outline +* Fix build with poppler +* Many code cleaning jobs [John Hewson] + +v0.6 +2013.01.26 + +* new option --no-drm [John Hewson] +* Travis CI integration [John Hewson] * Add a class for 'left' * Fixed a bug of hashing/finding GfxRGB -* new option -v, --version [Thanks to John Hewson] +* new option -v, --version [John Hewson] * Render Type 3 fonts as image * New parameter: --use-cropbox * Progress indicator diff --git a/README.md b/README.md index dac1653..f9716d2 100644 --- a/README.md +++ b/README.md @@ -43,42 +43,26 @@ Readers can also be benefitted - Color - Transformation * Links +* Outline * [EXPERIMENTAL] Path drawing with CSS - Orthogonal lines - Rectangles - Linear gradients -* Not fully supported, and rendered as images +* Not fully supported (Rendered as images) - Type 3 fonts - Non-text object ## Get started -### Ubuntu - 
-[PPA](https://launchpad.net/~coolwanglu/+archive/pdf2htmlex), which is not so up-to-date. - -### ArchLinux - -[AUR Package](https://aur.archlinux.org/packages.php?ID=62426), special thanks to Arthur Titeica - -### Gentoo - -Install through Overlay gentoo-zh, mrueg or sunrise, thanks to the packagers. - -### Mac - -[Homebrew Formula](https://github.com/jamiely/homebrew/blob/pdf2htmlex/Library/Formula/pdf2htmlex.rb), special thanks to Jamie Ly - -[Macports (local repo)](https://github.com/iapain/pdf2htmlEX-macport), special thanks to Deepak Thukral - -### Windows - -The code may be built with Cygwin. - -Or with MinGW with some modifications. - -More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u.ac.jp/~okumura/texwiki/?pdf2htmlEX) (in Japanese), special thanks to Haruhiko Okumura +### Install + +Thanks to all packagers! + * [Ubuntu PPA](https://launchpad.net/~coolwanglu/+archive/pdf2htmlex) by Lu Wang , not always up-to-date. + * [ArchLinux AUR](https://aur.archlinux.org/packages.php?ID=62426) by Arthur Titeica + * [Gentoo Overlay](http://gpo.zugaina.org/app-text/pdf2htmlex), gentoo-zh, mrueg or sunrise, by respective packagers. + * [Homebrew Formula](https://github.com/jamiely/homebrew/blob/pdf2htmlex/Library/Formula/pdf2htmlex.rb) by Jamie Ly + * [Macports (local repo)](https://github.com/iapain/pdf2htmlEX-macport) by Deepak Thukral ### Build from source @@ -96,6 +80,10 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u * git version is recommended to avoid annoying compilation issues * [Optional] **ttfautohint** * run pdf2htmlEX with **--external-hint-tool=ttfautohint** to enable it +* [For Windows] + * Cygwin + * or MinGW, with some modifications to pdf2htmlEX. 
See [pdf2htmlEX on TeX Wiki](http://oku.edu.mie-u.ac.jp/~okumura/texwiki/?pdf2htmlEX) (in Japanese), special thanks to Haruhiko Okumura + #### Compiling @@ -106,9 +94,7 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u ## Usage pdf2htmlEX /path/to/foobar.pdf - pdf2htmlEX --help - man pdf2htmlEX ## FAQ @@ -131,6 +117,16 @@ GPLv2 & GPLv3 Dual licensed ### [**Donate Now**](http://coolwanglu.github.com/pdf2htmlEX/donate.html) +## Contact + +* Mailing list + * Please read `man pdf2htmlEX` and [**FAQ**](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ) before sending emails. Or your message might be ignored. + * Please use the **latest master branch**. + +* Lu Wang + * Please use the mailing list above unless for personal enquiries. + * Accepting messages in **Chinese**, **English** or **Japanese**. + ## Acknowledge pdf2htmlEX is made possible thanks to the following projects: @@ -147,18 +143,6 @@ pdf2htmlEX is inspired by the following projects: * Crocodoc * Google Doc - -## Contact - -* Mailing list - * Please read [**FAQ**](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ) before sending emails. Or your message might be ignored. - * Please use the **latest master branch**. - -* Lu Wang - * Please use the mailing list above unless for personal enquiries. - * Accepting messages in **Chinese**, **English** or **Japanese**. 
- - ### Special Thanks * Hongliang Tian diff --git a/TODO b/TODO index 9c0e03e..9c3771a 100644 --- a/TODO +++ b/TODO @@ -1,6 +1,3 @@ -word space/offset before the first letter (calendar pdf) -add class for "left" - == Future: == Too difficult/complicated to implement: diff --git a/build_for_ppa.py b/build_for_ppa.py index ac3d4b1..90ac1e7 100755 --- a/build_for_ppa.py +++ b/build_for_ppa.py @@ -5,7 +5,7 @@ Dirty script for building package for PPA by WangLu 2011.01.13 -modified by pdf2htmlEX +modified for pdf2htmlEX 2012.08.28 """ diff --git a/debian/changelog b/debian/changelog index 6228965..cedab76 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,7 +1,24 @@ +pdf2htmlex (0.7-1~git201301292229r2595c-0ubuntu1) quantal; urgency=low + + * Fixed a CSS issue + + -- WANG Lu Tue, 29 Jan 2013 22:29:21 +0800 + +pdf2htmlex (0.7-1~git201301282229r2595c-0ubuntu1) quantal; urgency=low + + * Process PDF Outline + + -- WANG Lu Mon, 28 Jan 2013 22:29:35 +0800 + +pdf2htmlex (0.7-1~git201301261427r2595c-0ubuntu1) quantal; urgency=low + + * New version, see Changelog for changelog + + -- WANG Lu Sat, 26 Jan 2013 14:27:18 +0800 + pdf2htmlex (0.6-1~git201212182148rd76af-0ubuntu1) quantal; urgency=low * fix dependency of poppler for quantal - * -- WANG Lu Tue, 18 Dec 2012 21:48:35 +0800 diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 3850590..71b8033 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -1,4 +1,4 @@ -.TH pdf2htmlEX 1 "Aug 31, 2012" "pdf2htmlEX 0.1" +.TH pdf2htmlEX 1 "pdf2htmlEX @PDF2HTMLEX_VERSION@" .SH NAME .PP .nf @@ -22,59 +22,73 @@ Fonts are extracted form PDF and then embedded into HTML (Type 3 fonts are not s Other objects are rendered as images and also embedded. 
.SH OPTIONS -.TP -.B --help -Show all options -.TP -.B -v, --version -Show copyright and version -.TP -.B -o, --owner-password -Specify owner password -.TP -.B -u, --user-password -Specify user password -.TP -.B --no-drm <0|1> (Default: 0) -Override document DRM settings -.TP -.B --dest-dir (Default: .) -Specify destination folder -.TP -.B --data-dir (Default: @CMAKE_INSTALL_PREFIX@/share/pdf2htmlEX) -Specify the folder holding the manifest and other files + +.SS Pages + .TP .B -f, --first-page (Default: 1) Specify the first page to process + .TP .B -l, --last-page (Default: last page) Specify the last page to process -.TP + +.SS Dimensions + .B --zoom , --fit-width , --fit-height --zoom specifies the zoom factor directly; --fit-width/height specifies the maximum width/height of a page, the values are in pixels. If multiple values are specified, the minimum one will be used. If none is specified, pages will be rendered as 72DPI. -.TP -.B --hdpi , --vdpi (Default: 144) -Specify the horizontal and vertical DPI for images + .TP .B --use-cropbox <0|1> (Default: 0) Use CropBox instead of MediaBox for output. + .TP -.B --process-nontext <0|1> (Default: 1) -Whether to process non-text objects (as images) +.B --hdpi , --vdpi (Default: 144) +Specify the horizontal and vertical DPI for images + + +.SS Output Files + .TP .B --single-html <0|1> (Default: 1) Whether to embed everything into one HTML file. If switched off, there will be several files generated along with the HTML file including files for fonts, css, images. + +Note that the outline will always be embedded into the main HTML file no matter if this switch is on or not. +And only when this switch is off will there be a separate .outline file containing the outline. +You need to modify the manifest if you do not want the outline embedded. 
+ .TP .B --split-pages <0|1> (Default: 0) -If turned on, each page is saved in a separated files, also the generated css file will be store separatedly as if single-html=0 +If turned on, pages will be stored into separate files named as 0.page, 1.page, ... + +Also the css and outline will be stored into separate files, and there will be no .html generated. + +This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary. + +.TP +.B --dest-dir (Default: .) +Specify destination folder + +.TP +.B --css-filename (Default: ) +Specify the filename of the generated css file, if not embedded. + +If it's empty, the file name will be determined automatically. + +.TP +.B --outline-filename (Default: ) +Specify the filename of the generated outline file, if not embedded. + +If it's empty, the file name will be determined automatically. + +.SS Fonts + -The output files will be named as 0.page, 1.page, ... .TP .B --embed-base-font <0|1> (Default: 1) Whether to embed base 14 fonts. @@ -82,20 +96,55 @@ Whether to embed base 14 fonts. There are several base font defined in PDF standards, which are supposed to be provided by the PDF reader. If this switch is on, local matched font will be used and embedded; otherwise only font names are exported such that web browsers may try to find proper fonts themselves. + .TP .B --embed-external-font <0|1> (Default: 0) Similar as above but for non-base fonts. + +.TP +.B --font-suffix (Default: .ttf) +Specify the suffix of fonts extracted from the PDF file. + .TP .B --decompose-ligature <0|1> (Default: 0) Decompose ligatures. For example 'fi' -> 'f''i'. + +.TP +.B --remove-unused-glyph <0|1> (Default: 1) +If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size. + +.TP +.B --auto-hint <0|1> (Default: 0) +If set to 1, hints will be generated for the fonts using fontforge. 
+ +This may be preceded by --external-hint-tool. + +.TP +.B --external-hint-tool (Default: ) +If specified, the tool will be called in order to enhance hinting for fonts; this will precede --auto-hint. + +The tool will be called as '<tool> <in.suffix> <out.suffix>', where suffix will be the same as specified for --font-suffix. + +.TP +.B --stretch-narrow-glyph <0|1> (Default: 0) +If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs + +.TP +.B --squeeze-wide-glyph <0|1> (Default: 1) +If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. + +.SS Text + .TP .B --heps , --veps (Default: 1) Specify the maximum tolerable horizontal/vertical offset (in pixels). pdf2htmlEX would try to optimize the generated HTML file moving Text within this distance. + .TP .B --space-threshold (Default: 1.0/6) pdf2htmlEX would insert a whitespace character ' ' if the distance between two consecutive letters in the same line is wider than ratio * font_size. + .TP .B --font-size-multiplier (Default: 4.0) Many web browsers limit the minimum font size, and many would round the given font size, which results in incorrect rendering. @@ -103,11 +152,13 @@ Many web browsers limit the minimum font size, and many would round the given fo Specify a ratio greater than 1 would resolve this issue, however it might freeze some browsers. For some versions of Firefox, however, there will be a problem when the font size is too large, in which case a smaller value should be specified here. -.TP -.B --auto-hint <0|1> (Default: 0) -If set to 1, hints will be generated for the fonts using fontforge. -This may be preceded by --external-hint-tool. +.TP +.B --space-as-offset <0|1> (Default: 0) +Treat space characters as offsets, which may increase the size of the output. + +Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. 
+ .TP .B --tounicode <-1|0|1> (Default: 0) A ToUnicode map may be provided for each font in PDF which indicates the 'meaning' of the characters. However often there is better "ToUnicode" info in Type 0/1 fonts, and sometimes the ToUnicode map provided is wrong. @@ -117,40 +168,62 @@ If this value is set to 1, the ToUnicode Map is always applied, if provided in P If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste. If set to 0, pdf2htmlEX would try its best to balance the two methods above. -.TP -.B --space-as-offset <0|1> (Default: 0) -Treat space characters as offsets, which may increase the size of the output. -Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. -.TP -.B --stretch-narrow-glyph <0|1> (Default: 0) -If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs -.TP -.B --squeeze-wide-glyph <0|1> (Default: 1) -If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. -.TP -.B --remove-unused-glyph <0|1> (Default: 1) -If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size. -.TP -.B --font-suffix (Default: .ttf), --font-format (Default: truetype) -Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. -.TP -.B --external-hint-tool (Default: ) -If specified, the tool will be called in order to enhanced hinting for fonts, this will precede --auto-hint. +.SS PDF Protection -The tool will be called as ' ', where suffix will be the same as specified for --font-suffix. .TP -.B --css-filename (Default: ) -Specify the filename of the generated css file, if not embedded. +.B -o, --owner-password +Specify owner password -If it's empty, the file name will be determined automatically. 
.TP -.B --debug <0|1> (Default: 0) -Show debug information. +.B -u, --user-password +Specify user password + +.TP +.B --no-drm <0|1> (Default: 0) +Override document DRM settings + +.SS Misc. + .TP .B --clean-tmp <0|1> (Default: 1) If switched off, intermediate files won't be cleaned in the end. +.TP +.B --process-nontext <0|1> (Default: 1) +Whether to process non-text objects (as images) + +.TP +.B --data-dir (Default: @CMAKE_INSTALL_PREFIX@/share/pdf2htmlEX) +Specify the folder holding the manifest and other files (see below for the manifest file) + +.TP +.B --css-draw <0|1> (Default: 0) +Experimental and unsupported CSS drawing + +.TP +.B --debug <0|1> (Default: 0) +Print debug information. + +.SS Meta + +.TP +.B -v, --version +Print copyright and version info + +.TP +.B --help +Print usage information + +.SH MANIFEST and DATA-DIR +When split-pages is 0, the manifest file describes how the final html page should be generated. + +By default, pdf2htmlEX will use the manifest in the default data-dir (run `pdf2htmlEX -v` to check), which gives a simple demo of its syntax. + +You can modify the default one, or you can create a new one and specify the correct data-dir in the command line. + +When single-html is 1, all files referred to by the manifest must be located in the data-dir. + .SH EXAMPLE .TP .B pdf2htmlEX /path/to/file.pdf @@ -164,7 +237,7 @@ Convert file.pdf into out/file.html and leave font/image files separated. 
.SH COPYRIGHT .PP -Copyright 2012 Lu Wang +Copyright 2012,2013 Lu Wang pdf2htmlEX is GPLv2 & GPLv3 dual licensed diff --git a/share/base.css b/share/base.css index dec7559..d56463c 100644 --- a/share/base.css +++ b/share/base.css @@ -1,19 +1,63 @@ /* Base CSS */ /* Copyright 2012 Lu Wang */ -#pdf-main { /* PDF container */ +#pdf-outline { /* PDF Outline */ position:absolute; top:0; left:0; bottom:0; + width:193px; + overflow:auto; + margin:0px; + padding:0 0 0 7px; + background-color:#707070; + display:none; +} +#pdf-outline.opened { + display:block; +} +#pdf-outline ul { + margin-left:13px; + margin-right:3px; + padding-left:3px; +} +#pdf-outline li { + list-style-type:disc; + list-style-position:outside; +} +#pdf-outline a { + font-size:13px; + color:#e8e8e8; +} +#pdf-outline a:visited { + color:#e8e8e8; +} +#pdf-outline a:hover{ + color:#e8e8e8; +} +#pdf-outline a:active{ + color:#e8e8e8; +} +#pdf-main { /* PDF container */ + position:absolute; + top:0; + left:0px; + bottom:0; right:0; overflow:auto; - background-color:grey; + background-color:#808080; /* margin & border-width have to be 0, * otherwise pdf2htmlEX may not calculate the coordinates correctly */ margin:0; border-width:0; } +#pdf-outline.opened + #pdf-main { + left:200px; +} +/* + * The followings are base classes, which are meant to be override by PDF specific classes + * So do not increase the specificity + */ .d { /* page decoration */ position:relative; margin: 13px auto; diff --git a/share/manifest b/share/manifest index b1d3af7..7227a8c 100644 --- a/share/manifest +++ b/share/manifest @@ -1,4 +1,4 @@ -# manifest +# pdf2htmlEX manifest # by WangLu # 2012.09.12 # @@ -22,26 +22,52 @@ """ +# base CSS styles @base.css +# PDF specific CSS styles $css +# necessary Javascript codes @jquery.js @pdf2htmlEX.js +# entry point of pdf2htmlEX """ +""" +""" -
""" -$pages - +# The container of outline +# By default this is hidden, pdf2htmlEX.js will add the 'opened' class if it is not empty +# You can add a class 'opened' here if you want it always opened or you don't use pdf2htmlEX.js +# e.g. +#
+""" +
+""" +$outline """
+""" + +# The container of PDF pages +# check base.css for an example and requirements of its CSS styles +""" +
+""" +$pages +""" +
+""" + + +""" """ diff --git a/share/pdf2htmlEX.js b/share/pdf2htmlEX.js index 59a1ee4..b50e82d 100644 --- a/share/pdf2htmlEX.js +++ b/share/pdf2htmlEX.js @@ -30,7 +30,7 @@ var pdf2htmlEX = (function(){ ,ctm[1] * pos[0] + ctm[3] * pos[1] + ctm[5]]; }; var Page = function(page, container) { - if(page == undefined) return undefined; + if(page == undefined) return; this.p = $(page); this.n = parseInt(this.p.attr('data-page-no'), 16); @@ -94,8 +94,9 @@ var pdf2htmlEX = (function(){ } }); - pdf2htmlEX.Viewer = function(container_id) { + pdf2htmlEX.Viewer = function(container_id, outline_id) { this.container_id = container_id; + this.outline_id = outline_id; this.init_before_loading_content(); var _ = this; @@ -113,8 +114,14 @@ var pdf2htmlEX = (function(){ }, init_after_loading_content : function() { + this.outline = $('#'+this.outline_id); this.container = $('#'+this.container_id); + // need a better design + if(this.outline.children().length > 0) { + this.outline.addClass('opened'); + } + var new_pages = new Array(); var pl= $('.p', this.container); /* don't use for(..in..) 
*/ @@ -129,7 +136,10 @@ var pdf2htmlEX = (function(){ //this.zoom_fixer(); - this.container.on('click', '.a', this, this.annot_link_handler); + // used by outline/annot_link etc + // note that one is for the class 'a' and the other is for the tag 'a' + this.container.on('click', '.a', this, this.link_handler); + this.outline.on('click', 'a', this, this.link_handler); this.render(); }, @@ -228,18 +238,24 @@ var pdf2htmlEX = (function(){ get_containing_page : function(obj) { /* get the page obj containing obj */ - return this.pages[(new Page(obj.closest('.p')[0])).n]; + var p = obj.closest('.p')[0]; + return p && this.pages[(new Page(p)).n]; }, - annot_link_handler : function (e) { + link_handler : function (e) { var _ = e.data; var t = $(e.currentTarget); - var cur_page = _.get_containing_page(t); - if(cur_page == undefined) return; - var cur_pos = cur_page.position(); - //get the coordinates in default user system - cur_pos = transform(cur_page.ictm, [cur_pos[0], cur_page.height()-cur_pos[1]]); + var cur_pos = [0,0]; + + // cur_page might be undefined, e.g. from Outline + var cur_page = _.get_containing_page(t); + if(cur_page != undefined) + { + cur_pos = cur_page.position(); + //get the coordinates in default user system + cur_pos = transform(cur_page.ictm, [cur_pos[0], cur_page.height()-cur_pos[1]]); + } var detail_str = t.attr('data-dest-detail'); if(detail_str == undefined) return; @@ -281,9 +297,6 @@ var pdf2htmlEX = (function(){ upside_down = false; ok = true; break; - pos = [0,0]; - ok = true; - break; default: ok = false; break; diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 99f51eb..abd0bd7 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -147,7 +147,9 @@ class HTMLRenderer : public OutputDev virtual void setDefaultCTM(double *ctm); // Start a page. 
+ // UGLY: These 2 versions are for different versions of poppler virtual void startPage(int pageNum, GfxState *state); + virtual void startPage(int pageNum, GfxState *state, XRef * xref); // End a page. virtual void endPage(); @@ -210,12 +212,17 @@ class HTMLRenderer : public OutputDev void pre_process(PDFDoc * doc); void post_process(); - // set flags + void process_outline(); + void process_outline_items(GooList * items); + void set_stream_flags (std::ostream & out); std::string dump_embedded_font (GfxFont * font, long long fn_id); void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false); + // convert a LinkAction to a string that our Javascript code can understand + std::string get_linkaction_str(LinkAction *, std::string & detail); + //////////////////////////////////////////////////// // manage styles //////////////////////////////////////////////////// @@ -241,7 +248,7 @@ class HTMLRenderer : public OutputDev * remote font: to be retrieved from the web server * local font: to be substituted with a local (client side) font */ - void export_remote_font(const FontInfo & info, const std::string & suffix, const std::string & fontfileformat, GfxFont * font); + void export_remote_font(const FontInfo & info, const std::string & suffix, GfxFont * font); void export_remote_default_font(long long fn_id); void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont); @@ -300,6 +307,8 @@ class HTMLRenderer : public OutputDev XRef * xref; PDFDoc * cur_doc; + Catalog * cur_catalog; + double default_ctm[6]; // page info @@ -424,8 +433,11 @@ class HTMLRenderer : public OutputDev std::map left_map; const Param * param; - std::ofstream html_fout, css_fout; - std::string html_path, css_path; + + struct { + std::ofstream fs; + std::string path; + } f_outline, f_pages, f_css; static const std::string MANIFEST_FILENAME; }; diff --git 
a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index 4c17afe..7fefea5 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -83,7 +83,7 @@ void HTMLRenderer::TextLineBuffer::flush(void) max_ascent = max(max_ascent, s.ascent * s.draw_font_size); } - ostream & out = renderer->html_fout; + ostream & out = renderer->f_pages.fs; out << "
0) html_fout << ' '; + if(i > 0) f_pages.fs << ' '; double lw = line_width_array[i] * scale; - html_fout << round(lw); - if(is_positive(lw)) html_fout << "px"; + f_pages.fs << round(lw); + if(is_positive(lw)) f_pages.fs << "px"; } - html_fout << ";"; + f_pages.fs << ";"; } else { - html_fout << "border:none;"; + f_pages.fs << "border:none;"; } if(fill_color) { - html_fout << "background-color:" << (*fill_color) << ";"; + f_pages.fs << "background-color:" << (*fill_color) << ";"; } else { - html_fout << "background-color:transparent;"; + f_pages.fs << "background-color:transparent;"; } if(style_function) { - style_function(style_function_data, html_fout); + style_function(style_function_data, f_pages.fs); } - html_fout << "bottom:" << round(y) << "px;" + f_pages.fs << "bottom:" << round(y) << "px;" << "left:" << round(x) << "px;" << "width:" << round(w * scale) << "px;" << "height:" << round(h * scale) << "px;"; - html_fout << "\">
"; + f_pages.fs << "\">
"; } diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index ef69417..c5e2f7b 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -18,11 +18,45 @@ namespace pdf2htmlEX { -void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font) +using std::cerr; +using std::endl; + +void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, GfxFont * font) { - css_fout << "@font-face{" - << "font-family:f" << info.id << ";" - << "src:url("; + string mime_type, format; + if(suffix == ".ttf") + { + format = "truetype"; + mime_type = "application/x-font-ttf"; + } + else if(suffix == ".otf") + { + format = "opentype"; + mime_type = "application/x-font-otf"; + } + else if(suffix == ".woff") + { + format = "woff"; + mime_type = "application/font-woff"; + } + else if(suffix == ".eot") + { + format = "embedded-opentype"; + mime_type = "application/vnd.ms-fontobject"; + } + else if(suffix == ".svg") + { + format = "svg"; + mime_type = "image/svg+xml"; + } + else + { + cerr << "Warning: unknown font suffix: " << suffix << endl; + } + + f_css.fs << "@font-face{" + << "font-family:f" << info.id << ";" + << "src:url("; { auto fn = str_fmt("f%llx%s", info.id, suffix.c_str()); @@ -32,32 +66,32 @@ void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suff ifstream fin(path, ifstream::binary); if(!fin) throw "Cannot locate font file: " + path; - css_fout << "'data:font/" + fontfileformat + ";base64," << base64stream(fin) << "'"; + f_css.fs << "'data:font/" + mime_type + ";base64," << base64stream(fin) << "'"; } else { - css_fout << (char*)fn; + f_css.fs << (char*)fn; } } - css_fout << ")" - << "format(\"" << fontfileformat << "\");" - << "}" // end of @font-face - << ".f" << info.id << "{" - << "font-family:f" << info.id << ";" - << "line-height:" << round(info.ascent - info.descent) << ";" - << "font-style:normal;" - << 
"font-weight:normal;" - << "visibility:visible;" - << "}" // end of .f - << endl; + f_css.fs << ")" + << "format(\"" << format << "\");" + << "}" // end of @font-face + << ".f" << info.id << "{" + << "font-family:f" << info.id << ";" + << "line-height:" << round(info.ascent - info.descent) << ";" + << "font-style:normal;" + << "font-weight:normal;" + << "visibility:visible;" + << "}" // end of .f + << endl; } static string general_font_family(GfxFont * font) { - if(font -> isFixedWidth()) + if(font->isFixedWidth()) return "monospace"; - else if (font -> isSerif()) + else if (font->isSerif()) return "serif"; else return "sans-serif"; @@ -66,45 +100,45 @@ static string general_font_family(GfxFont * font) // TODO: this function is called when some font is unable to process, may use the name there as a hint void HTMLRenderer::export_remote_default_font(long long fn_id) { - css_fout << ".f" << fn_id << "{font-family:sans-serif;visibility:hidden;}" << endl; + f_css.fs << ".f" << fn_id << "{font-family:sans-serif;visibility:hidden;}" << endl; } void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont) { - css_fout << ".f" << info.id << "{"; - css_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";"; + f_css.fs << ".f" << info.id << "{"; + f_css.fs << "font-family:" << ((cssfont == "") ? 
(original_font_name + "," + general_font_family(font)) : cssfont) << ";"; string fn = original_font_name; for(auto iter = fn.begin(); iter != fn.end(); ++iter) *iter = tolower(*iter); if(font->isBold() || (fn.find("bold") != string::npos)) - css_fout << "font-weight:bold;"; + f_css.fs << "font-weight:bold;"; else - css_fout << "font-weight:normal;"; + f_css.fs << "font-weight:normal;"; if(fn.find("oblique") != string::npos) - css_fout << "font-style:oblique;"; + f_css.fs << "font-style:oblique;"; else if(font->isItalic() || (fn.find("italic") != string::npos)) - css_fout << "font-style:italic;"; + f_css.fs << "font-style:italic;"; else - css_fout << "font-style:normal;"; + f_css.fs << "font-style:normal;"; - css_fout << "line-height:" << round(info.ascent - info.descent) << ";"; + f_css.fs << "line-height:" << round(info.ascent - info.descent) << ";"; - css_fout << "visibility:visible;"; + f_css.fs << "visibility:visible;"; - css_fout << "}" << endl; + f_css.fs << "}" << endl; } void HTMLRenderer::export_font_size (long long fs_id, double font_size) { - css_fout << ".s" << fs_id << "{font-size:" << round(font_size) << "px;}" << endl; + f_css.fs << ".s" << fs_id << "{font-size:" << round(font_size) << "px;}" << endl; } void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) { - css_fout << ".t" << tm_id << "{"; + f_css.fs << ".t" << tm_id << "{"; // always ignore tm[4] and tm[5] because // we have already shifted the origin @@ -114,7 +148,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) { auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) - css_fout << *iter << "transform:none;"; + f_css.fs << *iter << "transform:none;"; } else { @@ -122,53 +156,53 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) { // PDF use a different coordinate 
system from Web - css_fout << *iter << "transform:matrix(" + f_css.fs << *iter << "transform:matrix(" << round(tm[0]) << ',' << round(-tm[1]) << ',' << round(-tm[2]) << ',' << round(tm[3]) << ','; - css_fout << "0,0);"; + f_css.fs << "0,0);"; } } - css_fout << "}" << endl; + f_css.fs << "}" << endl; } void HTMLRenderer::export_letter_space (long long ls_id, double letter_space) { - css_fout << ".l" << ls_id << "{letter-spacing:" << round(letter_space) << "px;}" << endl; + f_css.fs << ".l" << ls_id << "{letter-spacing:" << round(letter_space) << "px;}" << endl; } void HTMLRenderer::export_word_space (long long ws_id, double word_space) { - css_fout << ".w" << ws_id << "{word-spacing:" << round(word_space) << "px;}" << endl; + f_css.fs << ".w" << ws_id << "{word-spacing:" << round(word_space) << "px;}" << endl; } void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) { - css_fout << ".c" << color_id << "{color:" << (*rgb) << ";}" << endl; + f_css.fs << ".c" << color_id << "{color:" << (*rgb) << ";}" << endl; } void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) { if(ws_width > 0) - css_fout << "._" << ws_id << "{display:inline-block;width:" << round(ws_width) << "px;}" << endl; + f_css.fs << "._" << ws_id << "{display:inline-block;width:" << round(ws_width) << "px;}" << endl; else - css_fout << "._" << ws_id << "{display:inline;margin-left:" << round(ws_width) << "px;}" << endl; + f_css.fs << "._" << ws_id << "{display:inline;margin-left:" << round(ws_width) << "px;}" << endl; } void HTMLRenderer::export_rise (long long rise_id, double rise) { - css_fout << ".r" << rise_id << "{top:" << round(-rise) << "px;}" << endl; + f_css.fs << ".r" << rise_id << "{top:" << round(-rise) << "px;}" << endl; } void HTMLRenderer::export_height (long long height_id, double height) { - css_fout << ".h" << height_id << "{height:" << round(height) << "px;}" << endl; + f_css.fs << ".h" << height_id << "{height:" << round(height) << "px;}" << endl; } 
void HTMLRenderer::export_left (long long left_id, double left) { - css_fout << ".L" << left_id << "{left:" << round(left) << "px;}" << endl; + f_css.fs << ".L" << left_id << "{left:" << round(left) << "px;}" << endl; } } diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 1aa2d21..e67237d 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -13,6 +13,8 @@ #include #include +#include + #include "HTMLRenderer.h" #include "TextLineBuffer.h" #include "pdf2htmlEX-config.h" @@ -35,10 +37,6 @@ using std::abs; using std::cerr; using std::endl; -static void dummy(void *, enum ErrorCategory, int pos, char *) -{ -} - HTMLRenderer::HTMLRenderer(const Param * param) :OutputDev() ,line_opened(false) @@ -49,8 +47,8 @@ HTMLRenderer::HTMLRenderer(const Param * param) { if(!(param->debug)) { - //disable error function of poppler - setErrorCallback(&dummy, nullptr); + //disable error messages of poppler + globalParams->setErrQuiet(gTrue); } ffw_init(param->debug); @@ -71,10 +69,14 @@ HTMLRenderer::~HTMLRenderer() void HTMLRenderer::process(PDFDoc *doc) { cur_doc = doc; + cur_catalog = doc->getCatalog(); xref = doc->getXRef(); pre_process(doc); + /////////////////// + // Process pages + BackgroundRenderer * bg_renderer = nullptr; if(param->process_nontext) { @@ -90,10 +92,10 @@ void HTMLRenderer::process(PDFDoc *doc) if(param->split_pages) { auto page_fn = str_fmt("%s/%s%d.page", param->dest_dir.c_str(), param->output_filename.c_str(), i); - html_fout.open((char*)page_fn, ofstream::binary); - if(!html_fout) + f_pages.fs.open((char*)page_fn, ofstream::binary); + if(!f_pages.fs) throw string("Cannot open ") + (char*)page_fn + " for writing"; - set_stream_flags(html_fout); + set_stream_flags(f_pages.fs); } if(param->process_nontext) @@ -114,13 +116,17 @@ void HTMLRenderer::process(PDFDoc *doc) if(param->split_pages) { - html_fout.close(); + f_pages.fs.close(); } } if(page_count >= 0) cerr << "Working: " << page_count << "/" << 
page_count; cerr << endl; + //////////////////////// + // Process Outline + process_outline(); + post_process(); if(bg_renderer) @@ -135,6 +141,11 @@ void HTMLRenderer::setDefaultCTM(double *ctm) } void HTMLRenderer::startPage(int pageNum, GfxState *state) +{ + startPage(pageNum, state, nullptr); +} + +void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) { this->pageNum = pageNum; this->pageWidth = state->getPageWidth(); @@ -142,7 +153,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) assert((!line_opened) && "Open line in startPage detected!"); - html_fout + f_pages.fs << "
" @@ -151,7 +162,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) if(param->process_nontext) { - html_fout << "background-image:url("; + f_pages.fs << "background-image:url("; { if(param->single_html) @@ -160,18 +171,18 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) ifstream fin((char*)path, ifstream::binary); if(!fin) throw string("Cannot read background image ") + (char*)path; - html_fout << "'data:image/png;base64," << base64stream(fin) << "'"; + f_pages.fs << "'data:image/png;base64," << base64stream(fin) << "'"; } else { - html_fout << str_fmt("p%x.png", pageNum); + f_pages.fs << str_fmt("p%x.png", pageNum); } } - html_fout << ");background-position:0 0;background-size:" << pageWidth << "px " << pageHeight << "px;background-repeat:no-repeat;"; + f_pages.fs << ");background-position:0 0;background-size:" << pageWidth << "px " << pageHeight << "px;background-repeat:no-repeat;"; } - html_fout << "\">"; + f_pages.fs << "\">"; draw_text_scale = 1.0; cur_font_info = install_font(nullptr); @@ -206,26 +217,26 @@ void HTMLRenderer::endPage() { cur_doc->processLinks(this, pageNum); // close box - html_fout << "
"; + f_pages.fs << "
"; // dump info for js // TODO: create a function for this // BE CAREFUL WITH ESCAPES - html_fout << "
0) html_fout << ","; - html_fout << round(default_ctm[i]); + if(i > 0) f_pages.fs << ","; + f_pages.fs << round(default_ctm[i]); } - html_fout << "]"; + f_pages.fs << "]"; - html_fout << "}'>
"; + f_pages.fs << "}'>"; // close page - html_fout << "" << endl; + f_pages.fs << "" << endl; } void HTMLRenderer::pre_process(PDFDoc * doc) @@ -290,11 +301,32 @@ void HTMLRenderer::pre_process(PDFDoc * doc) if(param->single_html && (!param->split_pages)) tmp_files.add((char*)fn); - css_path = (char*)fn, - css_fout.open(css_path, ofstream::binary); - if(!css_fout) + f_css.path = (char*)fn; + f_css.fs.open(f_css.path, ofstream::binary); + if(!f_css.fs) throw string("Cannot open ") + (char*)fn + " for writing"; - set_stream_flags(css_fout); + set_stream_flags(f_css.fs); + } + + { + /* + * The logic for outline is similar to css + */ + + auto fn = (param->single_html && (!param->split_pages)) + ? str_fmt("%s/__outline", param->tmp_dir.c_str()) + : str_fmt("%s/%s", param->dest_dir.c_str(), param->outline_filename.c_str()); + + if(param->single_html && (!param->split_pages)) + tmp_files.add((char*)fn); + + f_outline.path = (char*)fn; + f_outline.fs.open(f_outline.path, ofstream::binary); + if(!f_outline.fs) + throw string("Cannot open") + (char*)fn + " for writing"; + + // might not be necessary + set_stream_flags(f_outline.fs); } // if split-pages is specified, open & close the file in the process loop @@ -303,7 +335,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc) { /* * If single-html - * we have to keep the html file (for page) into a temporary place + * we have to keep the html file for pages into a temporary place * because we'll have to embed css before it * * Otherwise just generate it @@ -311,21 +343,22 @@ void HTMLRenderer::pre_process(PDFDoc * doc) auto fn = str_fmt("%s/__pages", param->tmp_dir.c_str()); tmp_files.add((char*)fn); - html_path = (char*)fn; - html_fout.open(html_path, ofstream::binary); - if(!html_fout) + f_pages.path = (char*)fn; + f_pages.fs.open(f_pages.path, ofstream::binary); + if(!f_pages.fs) throw string("Cannot open ") + (char*)fn + " for writing"; - set_stream_flags(html_fout); + set_stream_flags(f_pages.fs); } } void 
HTMLRenderer::post_process() { // close files - html_fout.close(); - css_fout.close(); + f_outline.fs.close(); + f_pages.fs.close(); + f_css.fs.close(); - //only when split-page, do we have some work left to do + //only when split-page == 0, do we have some work left to do if(param->split_pages) return; @@ -359,7 +392,9 @@ void HTMLRenderer::post_process() continue; } - if(line.empty() || line[0] == '#') + if(line.empty() + || (line.find_first_not_of(' ') == string::npos) + || line[0] == '#') continue; @@ -373,14 +408,23 @@ void HTMLRenderer::post_process() { if(line == "$css") { - embed_file(output, css_path, ".css", false); + embed_file(output, f_css.path, ".css", false); } - else if (line == "$pages") + else if (line == "$outline") { - ifstream fin(html_path, ifstream::binary); + ifstream fin(f_outline.path, ifstream::binary); if(!fin) throw "Cannot open read the pages"; output << fin.rdbuf(); + output.clear(); // output will set fail big if fin is empty + } + else if (line == "$pages") + { + ifstream fin(f_pages.path, ifstream::binary); + if(!fin) + throw "Cannot open read the pages"; + output << fin.rdbuf(); + output.clear(); // output will set fail big if fin is empty } else { @@ -418,8 +462,9 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string & if(!fin) throw string("Cannot open file ") + path + " for embedding"; out << iter->second.first << endl - << fin.rdbuf() - << iter->second.second << endl; + << fin.rdbuf(); + out.clear(); // out will set fail big if fin is empty + out << iter->second.second << endl; } else { @@ -437,6 +482,7 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string & if(!out) throw string("Cannot open file ") + path + " for embedding"; out << fin.rdbuf(); + out.clear(); // out will set fail big if fin is empty } } } diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index 4dac8d2..a3f7818 100644 --- a/src/HTMLRenderer/install.cc +++ 
b/src/HTMLRenderer/install.cc @@ -110,7 +110,7 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info) if(path != "") { embed_font(path, font, info); - export_remote_font(info, param->font_suffix, param->font_format, font); + export_remote_font(info, param->font_suffix, font); } else { @@ -129,7 +129,7 @@ void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, Font if(localfontloc != nullptr) { embed_font(localfontloc->path->getCString(), font, info); - export_remote_font(info, param->font_suffix, param->font_format, font); + export_remote_font(info, param->font_suffix, font); delete localfontloc; return; } @@ -186,7 +186,7 @@ void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info) if(localfontloc != nullptr) { embed_font(string(localfontloc->path->getCString()), font, info); - export_remote_font(info, param->font_suffix, param->font_format, font); + export_remote_font(info, param->font_suffix, font); delete localfontloc; return; } @@ -281,7 +281,7 @@ long long HTMLRenderer::install_whitespace(double ws_width, double & actual_widt { // ws_width is already mulitpled by draw_scale auto iter = whitespace_map.lower_bound(ws_width - param->h_eps); - if((iter != whitespace_map.end()) && (abs(iter->first - ws_width) < param->h_eps)) + if((iter != whitespace_map.end()) && (abs(iter->first - ws_width) <= param->h_eps)) { actual_width = iter->first; return iter->second; @@ -297,7 +297,7 @@ long long HTMLRenderer::install_whitespace(double ws_width, double & actual_widt long long HTMLRenderer::install_rise(double rise) { auto iter = rise_map.lower_bound(rise - param->v_eps); - if((iter != rise_map.end()) && (abs(iter->first - rise) < param->v_eps)) + if((iter != rise_map.end()) && (abs(iter->first - rise) <= param->v_eps)) { return iter->second; } @@ -311,7 +311,7 @@ long long HTMLRenderer::install_rise(double rise) long long HTMLRenderer::install_height(double height) { auto iter = height_map.lower_bound(height - 
EPS); - if((iter != height_map.end()) && (abs(iter->first - height) < EPS)) + if((iter != height_map.end()) && (abs(iter->first - height) <= EPS)) { return iter->second; } @@ -324,7 +324,7 @@ long long HTMLRenderer::install_height(double height) long long HTMLRenderer::install_left(double left) { auto iter = left_map.lower_bound(left - param->h_eps); - if((iter != left_map.end()) && (abs(iter->first - left) < param->h_eps)) + if((iter != left_map.end()) && (abs(iter->first - left) <= param->h_eps)) { return iter->second; } diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc index fa78b9e..ec35f26 100644 --- a/src/HTMLRenderer/link.cc +++ b/src/HTMLRenderer/link.cc @@ -29,9 +29,27 @@ using std::endl; /* * The detailed rectangle area of the link destination * Will be parsed and performed by Javascript + * The string will be put into a HTML attribute, surrounded by single quotes + * So pay attention to the characters used here */ -static string get_dest_detail_str(int pageno, LinkDest * dest) +static string get_linkdest_detail_str(LinkDest * dest, Catalog * catalog, int & pageno) { + pageno = 0; + if(dest->isPageRef()) + { + auto pageref = dest->getPageRef(); + pageno = catalog->findPage(pageref.num, pageref.gen); + } + else + { + pageno = dest->getPageNum(); + } + + if(pageno <= 0) + { + return ""; + } + ostringstream sout; // dec sout << "[" << pageno; @@ -108,16 +126,11 @@ static string get_dest_detail_str(int pageno, LinkDest * dest) return sout.str(); } - -/* - * Based on pdftohtml from poppler - * TODO: CSS for link rectangles - * TODO: share rectangle draw with css-draw - */ -void HTMLRenderer::processLink(AnnotLink * al) + +string HTMLRenderer::get_linkaction_str(LinkAction * action, string & detail) { - std::string dest_str, dest_detail_str; - auto action = al->getAction(); + string dest_str; + detail = ""; if(action) { auto kind = action->getKind(); @@ -125,34 +138,21 @@ void HTMLRenderer::processLink(AnnotLink * al) { case actionGoTo: { - auto 
catalog = cur_doc->getCatalog(); auto * real_action = dynamic_cast(action); LinkDest * dest = nullptr; if(auto _ = real_action->getDest()) dest = _->copy(); else if (auto _ = real_action->getNamedDest()) - dest = catalog->findDest(_); + dest = cur_catalog->findDest(_); if(dest) { int pageno = 0; - if(dest->isPageRef()) - { - auto pageref = dest->getPageRef(); - pageno = catalog->findPage(pageref.num, pageref.gen); - } - else - { - pageno = dest->getPageNum(); - } - + detail = get_linkdest_detail_str(dest, cur_catalog, pageno); if(pageno > 0) { dest_str = (char*)str_fmt("#p%x", pageno); - dest_detail_str = get_dest_detail_str(pageno, dest); } - delete dest; - } } break; @@ -178,17 +178,30 @@ void HTMLRenderer::processLink(AnnotLink * al) } } - if(dest_str != "") + return dest_str; +} + +/* + * Based on pdftohtml from poppler + * TODO: CSS for link rectangles + * TODO: share rectangle draw with css-draw + */ +void HTMLRenderer::processLink(AnnotLink * al) +{ + string dest_detail_str; + string dest_str = get_linkaction_str(al->getAction(), dest_detail_str); + + if(!dest_str.empty()) { - html_fout << ""; + f_pages.fs << ">"; } - html_fout << "
getStyle(); switch(style) { case AnnotBorder::borderSolid: - html_fout << "border-style:solid;"; + f_pages.fs << "border-style:solid;"; break; case AnnotBorder::borderDashed: - html_fout << "border-style:dashed;"; + f_pages.fs << "border-style:dashed;"; break; case AnnotBorder::borderBeveled: - html_fout << "border-style:outset;"; + f_pages.fs << "border-style:outset;"; break; case AnnotBorder::borderInset: - html_fout << "border-style:inset;"; + f_pages.fs << "border-style:inset;"; break; case AnnotBorder::borderUnderlined: - html_fout << "border-style:none;border-bottom-style:solid;"; + f_pages.fs << "border-style:none;border-bottom-style:solid;"; break; default: cerr << "Warning:Unknown annotation border style: " << style << endl; - html_fout << "border-style:solid;"; + f_pages.fs << "border-style:solid;"; } @@ -257,36 +270,36 @@ void HTMLRenderer::processLink(AnnotLink * al) r = g = b = 0; } - html_fout << "border-color:rgb(" + f_pages.fs << "border-color:rgb(" << dec << (int)dblToByte(r) << "," << (int)dblToByte(g) << "," << (int)dblToByte(b) << hex << ");"; } else { - html_fout << "border-style:none;"; + f_pages.fs << "border-style:none;"; } } else { - html_fout << "border-style:none;"; + f_pages.fs << "border-style:none;"; } tm_transform(default_ctm, x, y); - html_fout << "position:absolute;" + f_pages.fs << "position:absolute;" << "left:" << round(x) << "px;" << "bottom:" << round(y) << "px;" << "width:" << round(w) << "px;" << "height:" << round(h) << "px;"; // fix for IE - html_fout << "background-color:rgba(255,255,255,0.000001);"; + f_pages.fs << "background-color:rgba(255,255,255,0.000001);"; - html_fout << "\">
"; + f_pages.fs << "\">"; if(dest_str != "") { - html_fout << "
"; + f_pages.fs << ""; } } diff --git a/src/HTMLRenderer/outline.cc b/src/HTMLRenderer/outline.cc new file mode 100644 index 0000000..b8cf6db --- /dev/null +++ b/src/HTMLRenderer/outline.cc @@ -0,0 +1,72 @@ +/* + * outline.cc + * + * Handling Outline items + * + * by WangLu + * 2013.01.28 + */ + +#include + +#include +#include + +#include "HTMLRenderer.h" +#include "util/namespace.h" +#include "util/unicode.h" + +namespace pdf2htmlEX { + +using std::ostream; + +void HTMLRenderer::process_outline_items(GooList * items) +{ + if((!items) || (items->getLength() == 0)) + return; + + f_outline.fs << "
    "; + + for(int i = 0; i < items->getLength(); ++i) + { + OutlineItem * item = (OutlineItem*)(items->get(i)); + + string detail; + string dest = get_linkaction_str(item->getAction(), detail); + + // we don't care dest is empty or not. + f_outline.fs << "
  • " + << ""; + + outputUnicodes(f_outline.fs, item->getTitle(), item->getTitleLength()); + + f_outline.fs << ""; + + // check kids + item->open(); + if(item->hasKids()) + { + process_outline_items(item->getKids()); + } + item->close(); + f_outline.fs << "
  • "; + } + + f_outline.fs << "
"; +} + +void HTMLRenderer::process_outline() +{ + Outline * outline = cur_doc->getOutline(); + if(!outline) + return; + + process_outline_items(outline->getItems()); +} + +}// namespace pdf2htmlEX diff --git a/src/Param.h b/src/Param.h index 2d22396..e8d6b90 100644 --- a/src/Param.h +++ b/src/Param.h @@ -15,63 +15,56 @@ namespace pdf2htmlEX { struct Param { - // PDF stuff - std::string owner_password, user_password; - std::string input_filename, output_filename; - int no_drm; - - // path - std::string dest_dir, tmp_dir, data_dir; - - // normal parameters + // pages int first_page, last_page; - + + // dimensions double zoom; double fit_width, fit_height; - double h_dpi, v_dpi; int use_cropbox; - - int process_nontext; + double h_dpi, v_dpi; + + // output files int single_html; int split_pages; + std::string dest_dir; + std::string css_filename; + std::string outline_filename; + + // fonts int embed_base_font; int embed_external_font; + std::string font_suffix; int decompose_ligature; - - // Advanced tweak - /* - * Position & Size - */ + int remove_unused_glyph; + int auto_hint; + std::string external_hint_tool; + int stretch_narrow_glyph; + int squeeze_wide_glyph; + + // text double h_eps, v_eps; double space_threshold; double font_size_multiplier; - - /* - * Font - */ - int auto_hint; - int tounicode; int space_as_offset; - int stretch_narrow_glyph; - int squeeze_wide_glyph; - int remove_unused_glyph; - - std::string font_suffix, font_format; - std::string external_hint_tool; - - /* - * Output - */ - std::string css_filename; - - /* - * Debug - */ - int debug; + int tounicode; + + // encryption + std::string owner_password, user_password; + int no_drm; + + // misc. 
int clean_tmp; - - // experimental + int process_nontext; + std::string data_dir; int css_draw; + int debug; + + // non-optional + std::string input_filename, output_filename; + + // not a paramater + std::string tmp_dir; }; } // namespace pdf2htmlEX diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 162ffea..bee540d 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -36,13 +36,8 @@ ArgParser argparser; void show_usage_and_exit(const char * dummy = nullptr) { - cerr << "Usage: pdf2htmlEX [Options] []" << endl; - cerr << endl; - cerr << "Options:" << endl; + cerr << "Usage: pdf2htmlEX [options] []" << endl; argparser.show_usage(cerr); - cerr << endl; - cerr << "Run 'man pdf2htmlEX' for detailed information" << endl; - cerr << endl; exit(EXIT_FAILURE); } @@ -53,60 +48,76 @@ void show_version_and_exit(const char * dummy = nullptr) cerr << "Libraries: "; cerr << "poppler " << POPPLER_VERSION << ", "; cerr << "libfontforge " << ffw_get_version() << endl; + cerr << "Default data-dir: " << PDF2HTMLEX_DATA_PATH << endl; exit(EXIT_SUCCESS); } void parse_options (int argc, char **argv) { + string deprecated_string; + argparser - .add("help,h", "show all options", &show_usage_and_exit) - .add("version,v", "show copyright and version info", &show_version_and_exit) - + // pages + .add("first-page,f", ¶m.first_page, 1, "first page to convert") + .add("last-page,l", ¶m.last_page, numeric_limits::max(), "last page to convert") + + // dimensions + .add("zoom", ¶m.zoom, 0, "zoom ratio", nullptr, true) + .add("fit-width", ¶m.fit_width, 0, "fit width to pixels", nullptr, true) + .add("fit-height", ¶m.fit_height, 0, "fit height to pixels", nullptr, true) + .add("use-cropbox", ¶m.use_cropbox, 0, "use CropBox instead of MediaBox") + .add("hdpi", ¶m.h_dpi, 144.0, "horizontal resolution for graphics in DPI") + .add("vdpi", ¶m.v_dpi, 144.0, "vertical resolution for graphics in DPI") + + // output files + .add("single-html", ¶m.single_html, 1, "generate a single HTML file") + 
.add("split-pages", ¶m.split_pages, 0, "split pages into separate files") + .add("dest-dir", ¶m.dest_dir, ".", "specify destination directory") + .add("css-filename", ¶m.css_filename, "", "filename of the generated css file") + .add("outline-filename", ¶m.outline_filename, "", "filename of the generated outline file") + + // fonts + .add("embed-base-font", ¶m.embed_base_font, 0, "embed local match for standard 14 fonts") + .add("embed-external-font", ¶m.embed_external_font, 0, "embed local match for external fonts") + .add("font-suffix", ¶m.font_suffix, ".ttf", "suffix for embedded font files (.ttf,.otf,.woff,.svg)") + .add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, such as \uFB01 -> fi") + .add("remove-unused-glyph", ¶m.remove_unused_glyph, 1, "remove unused glyphs in embedded fonts") + .add("auto-hint", ¶m.auto_hint, 0, "use fontforge autohint on fonts without hints") + .add("external-hint-tool", ¶m.external_hint_tool, "", "external tool for hinting fonts (overrides --auto-hint)") + .add("stretch-narrow-glyph", ¶m.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding them") + .add("squeeze-wide-glyph", ¶m.squeeze_wide_glyph, 1, "shrink wide glyphs instead of truncating them") + + // text + .add("heps", ¶m.h_eps, 1.0, "horizontal threshold for merging text, in pixels") + .add("veps", ¶m.v_eps, 1.0, "vertical threshold for merging text, in pixels") + .add("space-threshold", ¶m.space_threshold, (1.0/8), "word break threshold (threshold * em)") + .add("font-size-multiplier", ¶m.font_size_multiplier, 4.0, "a value greater than 1 increases the rendering accuracy") + .add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets") + .add("tounicode", ¶m.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)") + + // encryption .add("owner-password,o", ¶m.owner_password, "", "owner password (for encrypted files)", nullptr, true) .add("user-password,u", ¶m.user_password, "", "user password (for 
encrypted files)", nullptr, true) .add("no-drm", ¶m.no_drm, 0, "override document DRM settings") - - .add("dest-dir", ¶m.dest_dir, ".", "specify destination directory") + + // misc. + .add("clean-tmp", ¶m.clean_tmp, 1, "remove temporary files after conversion") + .add("process-nontext", ¶m.process_nontext, 1, "render graphics in addition to text") .add("data-dir", ¶m.data_dir, PDF2HTMLEX_DATA_PATH, "specify data directory") - - .add("first-page,f", ¶m.first_page, 1, "first page to process") - .add("last-page,l", ¶m.last_page, numeric_limits::max(), "last page to process") - - .add("zoom", ¶m.zoom, 0, "zoom ratio", nullptr, true) - .add("fit-width", ¶m.fit_width, 0, "fit width to pixels", nullptr, true) - .add("fit-height", ¶m.fit_height, 0, "fit height to pixels", nullptr, true) - .add("hdpi", ¶m.h_dpi, 144.0, "horizontal DPI for non-text") - .add("vdpi", ¶m.v_dpi, 144.0, "vertical DPI for non-text") - .add("use-cropbox", ¶m.use_cropbox, 0, "use CropBox instead of MediaBox") - - .add("process-nontext", ¶m.process_nontext, 1, "process nontext objects") - .add("single-html", ¶m.single_html, 1, "combine everything into one single HTML file") - .add("split-pages", ¶m.split_pages, 0, "split pages into separated files") - .add("embed-base-font", ¶m.embed_base_font, 0, "embed local matched font for base 14 fonts in the PDF file") - .add("embed-external-font", ¶m.embed_external_font, 0, "embed local matched font for external fonts in the PDF file") - .add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, for example 'fi' -> 'f''i'") - - .add("heps", ¶m.h_eps, 1.0, "max tolerated horizontal offset (in pixels)") - .add("veps", ¶m.v_eps, 1.0, "max tolerated vertical offset (in pixels)") - .add("space-threshold", ¶m.space_threshold, (1.0/8), "distance no thiner than (threshold * em) will be considered as a space character") - .add("font-size-multiplier", ¶m.font_size_multiplier, 4.0, "setting a value greater than 1 would increase the rendering accuracy") - 
.add("auto-hint", ¶m.auto_hint, 0, "Whether to generate hints for fonts") - .add("tounicode", ¶m.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled") - .add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets") - .add("stretch-narrow-glyph", ¶m.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding space") - .add("squeeze-wide-glyph", ¶m.squeeze_wide_glyph, 1, "squeeze wide glyphs instead of truncating") - .add("remove-unused-glyph", ¶m.remove_unused_glyph, 1, "remove unused glyphs in embedded fonts") - - .add("font-suffix", ¶m.font_suffix, ".ttf", "suffix for extracted font files") - .add("font-format", ¶m.font_format, "opentype", "format for extracted font files") - .add("external-hint-tool", ¶m.external_hint_tool, "", "external tool for hintting fonts.(overrides --auto-hint)") - .add("css-filename", ¶m.css_filename, "", "Specify the file name of the generated css file") - - .add("debug", ¶m.debug, 0, "output debug information") - .add("clean-tmp", ¶m.clean_tmp, 1, "clean temporary files after processing") - .add("css-draw", ¶m.css_draw, 0, "[Experimental and Unsupported] CSS Drawing") + .add("css-draw", ¶m.css_draw, 0, "[experimental and unsupported] CSS drawing") + .add("debug", ¶m.debug, 0, "print debugging information") + + // meta + .add("version,v", "print copyright and version info", &show_version_and_exit) + .add("help,h", "print usage information", &show_usage_and_exit) + .add("", ¶m.input_filename, "", "") .add("", ¶m.output_filename, "", "") + + // deprecated + .add("font-format", &deprecated_string, "", "", [] (const char*) { + cerr << "warning: --font-format is deprecated, @font-face format is inferred from --font-suffix" << endl; + }) ; try @@ -140,8 +151,7 @@ int main(int argc, char **argv) parse_options(argc, argv); if (param.input_filename == "") { - cerr << "Missing input filename" << endl; - exit(EXIT_FAILURE); + show_usage_and_exit(); } //prepare the directories 
@@ -202,7 +212,7 @@ int main(int argc, char **argv) param.first_page = min(max(param.first_page, 1), doc->getNumPages()); param.last_page = min(max(param.last_page, param.first_page), doc->getNumPages()); - if(param.output_filename == "") + if(param.output_filename.empty()) { const string s = get_filename(param.input_filename); @@ -223,7 +233,7 @@ int main(int argc, char **argv) } } - if(param.css_filename == "") + if(param.css_filename.empty()) { const string s = get_filename(param.input_filename); @@ -237,6 +247,21 @@ int main(int argc, char **argv) param.css_filename = s + ".css"; } } + if(param.outline_filename.empty()) + { + const string s = get_filename(param.input_filename); + + if(get_suffix(param.input_filename) == ".pdf") + { + param.outline_filename = s.substr(0, s.size() - 4) + ".outline"; + } + else + { + if(!param.split_pages) + param.outline_filename = s + ".outline"; + } + + } HTMLRenderer * htmlOut = new HTMLRenderer(&param); htmlOut->process(doc); diff --git a/src/util/ArgParser.cc b/src/util/ArgParser.cc index 04381e6..0edc25f 100644 --- a/src/util/ArgParser.cc +++ b/src/util/ArgParser.cc @@ -76,7 +76,7 @@ void ArgParser::parse(int argc, char ** argv) const int v = p->shortname; if(!(opt_map.insert(make_pair(v, p)).second)) { - cerr << "Warning: duplicated shortname '" << v << "' used by -" << (char)(p->shortname) << " and -" << (char)(opt_map[p->shortname]->shortname) << endl; + cerr << "Warning: duplicated shortname: " << v << endl; } } @@ -93,7 +93,7 @@ void ArgParser::parse(int argc, char ** argv) const } if(!(opt_map.insert(make_pair(v, p)).second)) { - cerr << "Warning: duplicated shortname '" << v << "' used by --" << (p->name) << " and --" << (opt_map[p->shortname]->name) << endl; + cerr << "Warning: duplicated long name: " << (p->name) << endl; } } } @@ -146,6 +146,10 @@ void ArgParser::show_usage(ostream & out) const } } +template<> const char * ArgParser::get_type_name (void) { return "int"; } +template<> const char * 
ArgParser::get_type_name (void) { return "fp"; } +template<> const char * ArgParser::get_type_name (void) { return "string"; } + ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * description, bool need_arg) : shortname(0), name(name), description(description), need_arg(need_arg) { @@ -159,11 +163,11 @@ ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * descriptio } else { - cerr << "Warning: argument '" << this->name << "' may not be parsed correctly" << endl; + cerr << "Warning: argument '" << this->name << "' cannnot be parsed as a short option" << endl; } } } -const int ArgParser::arg_col_width = 40; +const int ArgParser::arg_col_width = 31; } // namespace pdf2htmlEX diff --git a/src/util/ArgParser.h b/src/util/ArgParser.h index 432ec59..a6c58c4 100644 --- a/src/util/ArgParser.h +++ b/src/util/ArgParser.h @@ -39,7 +39,6 @@ void dump_value(std::ostream & out, const T & v) extern void dump_value(std::ostream & out, const std::string & v); - class ArgParser { public: @@ -48,22 +47,29 @@ class ArgParser typedef void (*ArgParserCallBack) (const char * arg); /* - * optname: name of the argment, should be provided as --optname - * description: if description is "", the argument won't be shown in show_usage() + * The 1st is for arg without arguments (i.e. flags), and the 2nd is for general args. + * optname: + * - if not nullptr, it should be the name of the arg, should be in the format of "[,]", e.g. 
"help,h" + * - if nullptr, it denotes an optional arg, and description will be ignored + * description: + * - if description is nullptr or "", the argument won't be shown in show_usage() */ - ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback = nullptr); - template - ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback = nullptr, bool dont_show_default = false); + ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback = nullptr, bool dont_show_default = false); void parse(int argc, char ** argv) const; void show_usage(std::ostream & out) const; private: + // type names helper + template + static const char * get_type_name(void) { return "unknown"; } + class ArgEntryBase { public: + /* name or description cannot be nullptr */ ArgEntryBase(const char * name, const char * description, bool need_arg); virtual ~ArgEntryBase() { } char shortname; @@ -101,15 +107,25 @@ class ArgParser template ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback, bool dont_show_default) { - // use "" in case nullptr is provided + // ArgEntry does not accept nullptr as optname nor description if((!optname) || (!optname[0])) + { + // when optname is nullptr or "", it's optional, and description is dropped optional_arg_entries.push_back(new ArgEntry("", location, default_value, callback, "", dont_show_default)); + } else - arg_entries.push_back(new ArgEntry(optname, location, default_value, callback, description, dont_show_default)); + { + arg_entries.push_back(new ArgEntry(optname, location, default_value, callback, (description ? 
description : ""), dont_show_default)); + } return *this; } +// Known types +template<> const char * ArgParser::get_type_name (void); +template<> const char * ArgParser::get_type_name (void); +template<> const char * ArgParser::get_type_name (void); + template ArgParser::ArgEntry::ArgEntry(const char * name, T * location, const Tv & default_value, ArgParserCallBack callback, const char * description, bool dont_show_default) : ArgEntryBase(name, description, (location != nullptr)) @@ -141,7 +157,7 @@ void ArgParser::ArgEntry::parse(const char * arg) const template void ArgParser::ArgEntry::show_usage(std::ostream & out) const { - if(description == "") + if(description.empty()) return; std::ostringstream sout; @@ -161,13 +177,7 @@ void ArgParser::ArgEntry::show_usage(std::ostream & out) const if(need_arg) { - sout << " "; - if(!dont_show_default) - { - sout << " (="; - dump_value(sout, default_value); - sout << ")"; - } + sout << " <" << get_type_name() << ">"; } std::string s = sout.str(); @@ -175,8 +185,17 @@ void ArgParser::ArgEntry::show_usage(std::ostream & out) const for(int i = s.size(); i < arg_col_width; ++i) out << ' '; - - out << " " << description << std::endl; + + out << " " << description; + + if(need_arg && !dont_show_default) + { + out << " (default: "; + dump_value(out, default_value); + out << ")"; + } + + out << std::endl; } } // namespace ArgParser diff --git a/src/util/math.h b/src/util/math.h index 9c9f5db..2966090 100644 --- a/src/util/math.h +++ b/src/util/math.h @@ -15,7 +15,7 @@ namespace pdf2htmlEX { static inline double round(double x) { return (std::abs(x) > EPS) ? 
x : 0.0; } -static inline bool equal(double x, double y) { return std::abs(x-y) < EPS; } +static inline bool equal(double x, double y) { return std::abs(x-y) <= EPS; } static inline bool is_positive(double x) { return x > EPS; } static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6) { diff --git a/src/util/unicode.h b/src/util/unicode.h index 9cc9dc6..6b527da 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -33,6 +33,9 @@ Unicode unicode_from_font (CharCode code, GfxFont * font); */ Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font); +/* + * Escape necessary characters, and map Unicode to UTF-8 + */ void outputUnicodes(std::ostream & out, const Unicode * u, int uLen); diff --git a/test/test.py b/test/test.py index 79a09f8..79ab79d 100755 --- a/test/test.py +++ b/test/test.py @@ -13,7 +13,7 @@ with open('out.html','w') as outf: if not f.lower().endswith('.pdf'): continue print f - if os.system('pdf2htmlEX --dest-dir html --auto-hint=1 --external-hint-tool="ttfautohint" "%s/%s"' % (DIR,f)) != 0: + if os.system('pdf2htmlEX -l 7 --dest-dir html --auto-hint=1 --external-hint-tool="ttfautohint" "%s/%s"' % (DIR,f)) != 0: print "error on ", f sys.exit(-1)