1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-05 01:28:39 +00:00

Merge branch 'master' of github.com:coolwanglu/pdf2htmlEX

This commit is contained in:
Lu Wang 2013-01-30 21:48:31 +08:00
commit b8e49c448b
26 changed files with 802 additions and 403 deletions

14
AUTHORS Normal file
View File

@ -0,0 +1,14 @@
Deepak <iapain@gmail.com>
filodej <philode@gmail.com>
hasufell <julian.ospald@googlemail.com>
Herbert Jones <herbert@mediafire.com>
Hongliang Tian <tatetian@gmail.com>
John Hewson <john@jahewson.com>
Lu Wang <coolwanglu@gmail.com>
Packagers:
Arthur Titeica <arthur.titeica@gmail.com>
Deepak Thukral <iapain@iapa.in>
Jamie Ly <me@jamie.ly>
Lu Wang <coolwanglu@gmail.com>

View File

@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 2.6.0 FATAL_ERROR)
include_directories(${CMAKE_SOURCE_DIR}/src)
set(PDF2HTMLEX_VERSION "0.6")
set(PDF2HTMLEX_VERSION "0.7")
set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION})
add_custom_target(dist
COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD
@ -154,6 +154,7 @@ add_executable(pdf2htmlEX
src/HTMLRenderer/TextLineBuffer.h
src/HTMLRenderer/TextLineBuffer.cc
src/HTMLRenderer/link.cc
src/HTMLRenderer/outline.cc
src/HTMLRenderer/state.cc
src/HTMLRenderer/text.cc
src/BackgroundRenderer/BackgroundRenderer.h

View File

@ -1,8 +1,17 @@
Latest v0.6
Latest v0.7
* Process outline
* Fix build with poppler
* Many code cleaning jobs [John Hewson]
v0.6
2013.01.26
* new option --no-drm [John Hewson]
* Travis CI integration [John Hewson]
* Add a class for 'left'
* Fixed a bug of hashing/finding GfxRGB
* new option -v, --version [Thanks to John Hewson]
* new option -v, --version [John Hewson]
* Render Type 3 fonts as image
* New parameter: --use-cropbox
* Progress indicator

View File

@ -43,42 +43,26 @@ Readers can also be benefitted
- Color
- Transformation
* Links
* Outline
* [EXPERIMENTAL] Path drawing with CSS
- Orthogonal lines
- Rectangles
- Linear gradients
* Not fully supported, and rendered as images
* Not fully supported (Rendered as images)
- Type 3 fonts
- Non-text object
## Get started
### Ubuntu
[PPA](https://launchpad.net/~coolwanglu/+archive/pdf2htmlex), which is not so up-to-date.
### ArchLinux
[AUR Package](https://aur.archlinux.org/packages.php?ID=62426), special thanks to Arthur Titeica <arthur.titeica@gmail.com>
### Gentoo
Install through Overlay gentoo-zh, mrueg or sunrise, thanks to the packagers.
### Mac
[Homebrew Formula](https://github.com/jamiely/homebrew/blob/pdf2htmlex/Library/Formula/pdf2htmlex.rb), special thanks to Jamie Ly <me@jamie.ly>
[Macports (local repo)](https://github.com/iapain/pdf2htmlEX-macport), special thanks to Deepak Thukral <iapain@iapa.in>
### Windows
The code may be built with Cygwin.
Or with MinGW with some modifications.
More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u.ac.jp/~okumura/texwiki/?pdf2htmlEX) (in Japanese), special thanks to Haruhiko Okumura
### Install
Thanks to all packagers!
* [Ubuntu PPA](https://launchpad.net/~coolwanglu/+archive/pdf2htmlex) by Lu Wang <coolwanglu@gmail.com>, not always up-to-date.
* [ArchLinux AUR](https://aur.archlinux.org/packages.php?ID=62426) by Arthur Titeica <arthur.titeica@gmail.com>
* [Gentoo Overlay](http://gpo.zugaina.org/app-text/pdf2htmlex), gentoo-zh, mrueg or sunrise, by respective packagers.
* [Homebrew Formula](https://github.com/jamiely/homebrew/blob/pdf2htmlex/Library/Formula/pdf2htmlex.rb) by Jamie Ly <me@jamie.ly>
* [Macports (local repo)](https://github.com/iapain/pdf2htmlEX-macport) by Deepak Thukral <iapain@iapa.in>
### Build from source
@ -96,6 +80,10 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u
* git version is recommended to avoid annoying compilation issues
* [Optional] **ttfautohint**
* run pdf2htmlEX with **--external-hint-tool=ttfautohint** to enable it
* [For Windows]
* Cygwin
* or MinGW, with some modifications to pdf2htmlEX. See [pdf2htmlEX on TeX Wiki](http://oku.edu.mie-u.ac.jp/~okumura/texwiki/?pdf2htmlEX) (in Japanese), special thanks to Haruhiko Okumura
#### Compiling
@ -106,9 +94,7 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u
## Usage
pdf2htmlEX /path/to/foobar.pdf
pdf2htmlEX --help
man pdf2htmlEX
## FAQ
@ -131,6 +117,16 @@ GPLv2 & GPLv3 Dual licensed
### [**Donate Now**](http://coolwanglu.github.com/pdf2htmlEX/donate.html)
## Contact
* Mailing list <pdf2htmlex@googlegroups.com>
* Please read `man pdf2htmlEX` and [**FAQ**](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ) before sending emails; otherwise your message might be ignored.
* Please use the **latest master branch**.
* Lu Wang <coolwanglu@gmail.com>
* Please use the mailing list above unless for personal enquiries.
* Accepting messages in **Chinese**, **English** or **Japanese**.
## Acknowledge
pdf2htmlEX is made possible thanks to the following projects:
@ -147,18 +143,6 @@ pdf2htmlEX is inspired by the following projects:
* Crocodoc
* Google Doc
## Contact
* Mailing list <pdf2htmlex@googlegroups.com>
* Please read [**FAQ**](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ) before sending emails. Or your message might be ignored.
* Please use the **latest master branch**.
* Lu Wang <coolwanglu@gmail.com>
* Please use the mailing list above unless for personal enquiries.
* Accepting messages in **Chinese**, **English** or **Japanese**.
### Special Thanks
* Hongliang Tian <tatetian@gmail.com>

3
TODO
View File

@ -1,6 +1,3 @@
word space/offset before the first letter (calendar pdf)
add class for "left"
== Future: ==
Too difficult/complicated to implement:

View File

@ -5,7 +5,7 @@ Dirty script for building package for PPA
by WangLu
2011.01.13
modified by pdf2htmlEX
modified for pdf2htmlEX
2012.08.28
"""

19
debian/changelog vendored
View File

@ -1,7 +1,24 @@
pdf2htmlex (0.7-1~git201301292229r2595c-0ubuntu1) quantal; urgency=low
* Fixed a CSS issue
-- WANG Lu <coolwanglu@gmail.com> Tue, 29 Jan 2013 22:29:21 +0800
pdf2htmlex (0.7-1~git201301282229r2595c-0ubuntu1) quantal; urgency=low
* Process PDF Outline
-- WANG Lu <coolwanglu@gmail.com> Mon, 28 Jan 2013 22:29:35 +0800
pdf2htmlex (0.7-1~git201301261427r2595c-0ubuntu1) quantal; urgency=low
* New version, see Changelog for changelog
-- WANG Lu <coolwanglu@gmail.com> Sat, 26 Jan 2013 14:27:18 +0800
pdf2htmlex (0.6-1~git201212182148rd76af-0ubuntu1) quantal; urgency=low
* fix dependency of poppler for quantal
*
-- WANG Lu <coolwanglu@gmail.com> Tue, 18 Dec 2012 21:48:35 +0800

View File

@ -1,4 +1,4 @@
.TH pdf2htmlEX 1 "Aug 31, 2012" "pdf2htmlEX 0.1"
.TH pdf2htmlEX 1 "pdf2htmlEX @PDF2HTMLEX_VERSION@"
.SH NAME
.PP
.nf
@ -22,59 +22,73 @@ Fonts are extracted form PDF and then embedded into HTML (Type 3 fonts are not s
Other objects are rendered as images and also embedded.
.SH OPTIONS
.TP
.B --help
Show all options
.TP
.B -v, --version
Show copyright and version
.TP
.B -o, --owner-password <password>
Specify owner password
.TP
.B -u, --user-password <password>
Specify user password
.TP
.B --no-drm <0|1> (Default: 0)
Override document DRM settings
.TP
.B --dest-dir <dir> (Default: .)
Specify destination folder
.TP
.B --data-dir <dir> (Default: @CMAKE_INSTALL_PREFIX@/share/pdf2htmlEX)
Specify the folder holding the manifest and other files
.SS Pages
.TP
.B -f, --first-page <num> (Default: 1)
Specify the first page to process
.TP
.B -l, --last-page <num> (Default: last page)
Specify the last page to process
.TP
.SS Dimensions
.B --zoom <ratio>, --fit-width <width>, --fit-height <height>
--zoom specifies the zoom factor directly; --fit-width/height specifies the maximum width/height of a page, the values are in pixels.
If multiple values are specified, the minimum one will be used.
If none is specified, pages will be rendered as 72DPI.
.TP
.B --hdpi <dpi>, --vdpi <dpi> (Default: 144)
Specify the horizontal and vertical DPI for images
.TP
.B --use-cropbox <0|1> (Default: 0)
Use CropBox instead of MediaBox for output.
.TP
.B --process-nontext <0|1> (Default: 1)
Whether to process non-text objects (as images)
.B --hdpi <dpi>, --vdpi <dpi> (Default: 144)
Specify the horizontal and vertical DPI for images
.SS Output Files
.TP
.B --single-html <0|1> (Default: 1)
Whether to embed everything into one HTML file.
If switched off, there will be several files generated along with the HTML file including files for fonts, css, images.
Note that the outline will always be embedded into the main HTML file no matter if this switch is on or not.
And only when this switch is off will there be a separate .outline file containing the outline.
You need to modify the manifest if you do not want outline embedded.
.TP
.B --split-pages <0|1> (Default: 0)
If turned on, each page is saved in a separated files, also the generated css file will be store separatedly as if single-html=0
If turned on, pages will be stored into separated files named as <output-filename>0.page, <output-filename>1.page, ...
Also the css and outline will be stored into separate files, and there will be no <output-filename>.html generated.
This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary.
.TP
.B --dest-dir <dir> (Default: .)
Specify destination folder
.TP
.B --css-filename <filename> (Default: <none>)
Specify the filename of the generated css file, if not embedded.
If it's empty, the file name will be determined automatically.
.TP
.B --outline-filename <filename> (Default: <none>)
Specify the filename of the generated outline file, if not embedded.
If it's empty, the file name will be determined automatically.
.SS Fonts
The output files will be named as <output-filename>0.page, <output-filename>1.page, ...
.TP
.B --embed-base-font <0|1> (Default: 1)
Whether to embed base 14 fonts.
@ -82,20 +96,55 @@ Whether to embed base 14 fonts.
There are several base font defined in PDF standards, which are supposed to be provided by the PDF reader.
If this switch is on, local matched font will be used and embedded; otherwise only font names are exported such that web browsers may try to find proper fonts themselves.
.TP
.B --embed-external-font <0|1> (Default: 0)
Similar as above but for non-base fonts.
.TP
.B --font-suffix <suffix> (Default: .ttf)
Specify the suffix of fonts extracted from the PDF file.
.TP
.B --decompose-ligature <0|1> (Default: 0)
Decompose ligatures. For example 'fi' -> 'f''i'.
.TP
.B --remove-unused-glyph <0|1> (Default: 1)
If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size.
.TP
.B --auto-hint <0|1> (Default: 0)
If set to 1, hints will be generated for the fonts using fontforge.
This may be preceded by --external-hint-tool.
.TP
.B --external-hint-tool <tool> (Default: <none>)
If specified, the tool will be called in order to enhance hinting for fonts; this will precede --auto-hint.
The tool will be called as '<tool> <in.suffix> <out.suffix>', where suffix will be the same as specified for --font-suffix.
.TP
.B --stretch-narrow-glyph <0|1> (Default: 0)
If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs
.TP
.B --squeeze-wide-glyph <0|1> (Default: 1)
If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated.
.SS Text
.TP
.B --heps <len>, --veps <len> (Default: 1)
Specify the maximum tolerable horizontal/vertical offset (in pixels).
pdf2htmlEX would try to optimize the generated HTML file by moving text within this distance.
.TP
.B --space-threshold <ratio> (Default: 1.0/6)
pdf2htmlEX would insert a whitespace character ' ' if the distance between two consecutive letters in the same line is wider than ratio * font_size.
.TP
.B --font-size-multiplier <ratio> (Default: 4.0)
Many web browsers limit the minimum font size, and many would round the given font size, which results in incorrect rendering.
@ -103,11 +152,13 @@ Many web browsers limit the minimum font size, and many would round the given fo
Specifying a ratio greater than 1 would resolve this issue, however it might freeze some browsers.
For some versions of Firefox, however, there will be a problem when the font size is too large, in which case a smaller value should be specified here.
.TP
.B --auto-hint <0|1> (Default: 0)
If set to 1, hints will be generated for the fonts using fontforge.
This may be preceded by --external-hint-tool.
.TP
.B --space-as-offset <0|1> (Default: 0)
Treat space characters as offsets, which may increase the size of the output.
Turn it on if space characters are not displayed correctly, or you want to remove positional spaces.
.TP
.B --tounicode <-1|0|1> (Default: 0)
A ToUnicode map may be provided for each font in PDF which indicates the 'meaning' of the characters. However often there is better "ToUnicode" info in Type 0/1 fonts, and sometimes the ToUnicode map provided is wrong.
@ -117,40 +168,62 @@ If this value is set to 1, the ToUnicode Map is always applied, if provided in P
If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste.
If set to 0, pdf2htmlEX would try its best to balance the two methods above.
.TP
.B --space-as-offset <0|1> (Default: 0)
Treat space characters as offsets, which may increase the size of the output.
Turn it on if space characters are not displayed correctly, or you want to remove positional spaces.
.TP
.B --stretch-narrow-glyph <0|1> (Default: 0)
If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs
.TP
.B --squeeze-wide-glyph <0|1> (Default: 1)
If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated.
.TP
.B --remove-unused-glyph <0|1> (Default: 1)
If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size.
.TP
.B --font-suffix <suffix> (Default: .ttf), --font-format <format> (Default: truetype)
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
.TP
.B --external-hint-tool <tool> (Default: <none>)
If specified, the tool will be called in order to enhance hinting for fonts; this will precede --auto-hint.
.SS PDF Protection
The tool will be called as '<tool> <in.suffix> <out.suffix>', where suffix will be the same as specified for --font-suffix.
.TP
.B --css-filename <filename> (Default: <none>)
Specify the filename of the generated css file, if not embedded.
.B -o, --owner-password <password>
Specify owner password
If it's empty, the file name will be determined automatically.
.TP
.B --debug <0|1> (Default: 0)
Show debug information.
.B -u, --user-password <password>
Specify user password
.TP
.B --no-drm <0|1> (Default: 0)
Override document DRM settings
.SS Misc.
.TP
.B --clean-tmp <0|1> (Default: 1)
If switched off, intermediate files won't be cleaned in the end.
.TP
.B --process-nontext <0|1> (Default: 1)
Whether to process non-text objects (as images)
.TP
.B --data-dir <dir> (Default: @CMAKE_INSTALL_PREFIX@/share/pdf2htmlEX)
Specify the folder holding the manifest and other files (see below for the manifest file)
.TP
.B --css-draw <0|1> (Default: 0)
Experimental and unsupported CSS drawing
.TP
.B --debug <0|1> (Default: 0)
Print debug information.
.SS Meta
.TP
.B -v, --version
Print copyright and version info
.TP
.B --help
Print usage information
.SH MANIFEST and DATA-DIR
When split-pages is 0, the manifest file describes how the final html page should be generated.
By default, pdf2htmlEX will use the manifest in the default data-dir (run `pdf2htmlEX -v` to check), which gives a simple demo of its syntax.
You can modify the default one, or you can create a new one and specify the correct data-dir in the command line.
When single-html is 1, all files referred to by the manifest must be located in the data-dir.
.SH EXAMPLE
.TP
.B pdf2htmlEX /path/to/file.pdf
@ -164,7 +237,7 @@ Convert file.pdf into out/file.html and leave font/image files separated.
.SH COPYRIGHT
.PP
Copyright 2012 Lu Wang <coolwanglu@gmail.com>
Copyright 2012,2013 Lu Wang <coolwanglu@gmail.com>
pdf2htmlEX is GPLv2 & GPLv3 dual licensed

View File

@ -1,19 +1,63 @@
/* Base CSS */
/* Copyright 2012 Lu Wang <coolwanglu@gmail.com> */
#pdf-main { /* PDF container */
#pdf-outline { /* PDF Outline */
position:absolute;
top:0;
left:0;
bottom:0;
width:193px;
overflow:auto;
margin:0px;
padding:0 0 0 7px;
background-color:#707070;
display:none;
}
#pdf-outline.opened {
display:block;
}
#pdf-outline ul {
margin-left:13px;
margin-right:3px;
padding-left:3px;
}
#pdf-outline li {
list-style-type:disc;
list-style-position:outside;
}
#pdf-outline a {
font-size:13px;
color:#e8e8e8;
}
#pdf-outline a:visited {
color:#e8e8e8;
}
#pdf-outline a:hover{
color:#e8e8e8;
}
#pdf-outline a:active{
color:#e8e8e8;
}
#pdf-main { /* PDF container */
position:absolute;
top:0;
left:0px;
bottom:0;
right:0;
overflow:auto;
background-color:grey;
background-color:#808080;
/* margin & border-width have to be 0,
* otherwise pdf2htmlEX may not calculate the coordinates correctly
*/
margin:0;
border-width:0;
}
#pdf-outline.opened + #pdf-main {
left:200px;
}
/*
* The following are base classes, which are meant to be overridden by PDF-specific classes
* So do not increase the specificity
*/
.d { /* page decoration */
position:relative;
margin: 13px auto;

View File

@ -1,4 +1,4 @@
# manifest
# pdf2htmlEX manifest
# by WangLu
# 2012.09.12
#
@ -22,26 +22,52 @@
<meta name="generator" content="pdf2htmlEX"/>
"""
# base CSS styles
@base.css
# PDF specific CSS styles
$css
# necessary Javascript codes
@jquery.js
@pdf2htmlEX.js
# entry point of pdf2htmlEX
"""
<script type="text/javascript">
new pdf2htmlEX.Viewer('pdf-main');
new pdf2htmlEX.Viewer('pdf-main', 'pdf-outline');
</script>
"""
"""
<title></title>
</head>
<body>
<div id="pdf-main">
"""
$pages
# The container of outline
# By default this is hidden, pdf2htmlEX.js will add the 'opened' class if it is not empty
# You can add a class 'opened' here if you want it always opened or you don't use pdf2htmlEX.js
# e.g.
# <div id="pdf-outline" class="opened">
"""
<div id="pdf-outline">
"""
$outline
"""
</div>
"""
# The container of PDF pages
# check base.css for an example and requirements of its CSS styles
"""
<div id="pdf-main">
"""
$pages
"""
</div>
"""
"""
</body>
</html>
"""

View File

@ -30,7 +30,7 @@ var pdf2htmlEX = (function(){
,ctm[1] * pos[0] + ctm[3] * pos[1] + ctm[5]];
};
var Page = function(page, container) {
if(page == undefined) return undefined;
if(page == undefined) return;
this.p = $(page);
this.n = parseInt(this.p.attr('data-page-no'), 16);
@ -94,8 +94,9 @@ var pdf2htmlEX = (function(){
}
});
pdf2htmlEX.Viewer = function(container_id) {
pdf2htmlEX.Viewer = function(container_id, outline_id) {
this.container_id = container_id;
this.outline_id = outline_id;
this.init_before_loading_content();
var _ = this;
@ -113,8 +114,14 @@ var pdf2htmlEX = (function(){
},
init_after_loading_content : function() {
this.outline = $('#'+this.outline_id);
this.container = $('#'+this.container_id);
// need a better design
if(this.outline.children().length > 0) {
this.outline.addClass('opened');
}
var new_pages = new Array();
var pl= $('.p', this.container);
/* don't use for(..in..) */
@ -129,7 +136,10 @@ var pdf2htmlEX = (function(){
//this.zoom_fixer();
this.container.on('click', '.a', this, this.annot_link_handler);
// used by outline/annot_link etc
// note that one is for the class 'a' and the other is for the tag 'a'
this.container.on('click', '.a', this, this.link_handler);
this.outline.on('click', 'a', this, this.link_handler);
this.render();
},
@ -228,18 +238,24 @@ var pdf2htmlEX = (function(){
get_containing_page : function(obj) {
/* get the page obj containing obj */
return this.pages[(new Page(obj.closest('.p')[0])).n];
var p = obj.closest('.p')[0];
return p && this.pages[(new Page(p)).n];
},
annot_link_handler : function (e) {
link_handler : function (e) {
var _ = e.data;
var t = $(e.currentTarget);
var cur_page = _.get_containing_page(t);
if(cur_page == undefined) return;
var cur_pos = cur_page.position();
//get the coordinates in default user system
cur_pos = transform(cur_page.ictm, [cur_pos[0], cur_page.height()-cur_pos[1]]);
var cur_pos = [0,0];
// cur_page might be undefined, e.g. from Outline
var cur_page = _.get_containing_page(t);
if(cur_page != undefined)
{
cur_pos = cur_page.position();
//get the coordinates in default user system
cur_pos = transform(cur_page.ictm, [cur_pos[0], cur_page.height()-cur_pos[1]]);
}
var detail_str = t.attr('data-dest-detail');
if(detail_str == undefined) return;
@ -281,9 +297,6 @@ var pdf2htmlEX = (function(){
upside_down = false;
ok = true;
break;
pos = [0,0];
ok = true;
break;
default:
ok = false;
break;

View File

@ -147,7 +147,9 @@ class HTMLRenderer : public OutputDev
virtual void setDefaultCTM(double *ctm);
// Start a page.
// UGLY: These 2 versions are for different versions of poppler
virtual void startPage(int pageNum, GfxState *state);
virtual void startPage(int pageNum, GfxState *state, XRef * xref);
// End a page.
virtual void endPage();
@ -210,12 +212,17 @@ class HTMLRenderer : public OutputDev
void pre_process(PDFDoc * doc);
void post_process();
// set flags
void process_outline();
void process_outline_items(GooList * items);
void set_stream_flags (std::ostream & out);
std::string dump_embedded_font (GfxFont * font, long long fn_id);
void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false);
// convert a LinkAction to a string that our Javascript code can understand
std::string get_linkaction_str(LinkAction *, std::string & detail);
////////////////////////////////////////////////////
// manage styles
////////////////////////////////////////////////////
@ -241,7 +248,7 @@ class HTMLRenderer : public OutputDev
* remote font: to be retrieved from the web server
* local font: to be substituted with a local (client side) font
*/
void export_remote_font(const FontInfo & info, const std::string & suffix, const std::string & fontfileformat, GfxFont * font);
void export_remote_font(const FontInfo & info, const std::string & suffix, GfxFont * font);
void export_remote_default_font(long long fn_id);
void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
@ -300,6 +307,8 @@ class HTMLRenderer : public OutputDev
XRef * xref;
PDFDoc * cur_doc;
Catalog * cur_catalog;
double default_ctm[6];
// page info
@ -424,8 +433,11 @@ class HTMLRenderer : public OutputDev
std::map<double, long long> left_map;
const Param * param;
std::ofstream html_fout, css_fout;
std::string html_path, css_path;
struct {
std::ofstream fs;
std::string path;
} f_outline, f_pages, f_css;
static const std::string MANIFEST_FILENAME;
};

View File

@ -83,7 +83,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
max_ascent = max<double>(max_ascent, s.ascent * s.draw_font_size);
}
ostream & out = renderer->html_fout;
ostream & out = renderer->f_pages.fs;
out << "<div style=\""
<< "bottom:" << round(y) << "px;"
<< "\""

View File

@ -372,48 +372,48 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
}
}
html_fout << "<div class=\"Cd t" << install_transform_matrix(new_tm) << "\" style=\"";
f_pages.fs << "<div class=\"Cd t" << install_transform_matrix(new_tm) << "\" style=\"";
if(line_color)
{
html_fout << "border-color:" << *line_color << ";";
f_pages.fs << "border-color:" << *line_color << ";";
html_fout << "border-width:";
f_pages.fs << "border-width:";
for(int i = 0; i < line_width_count; ++i)
{
if(i > 0) html_fout << ' ';
if(i > 0) f_pages.fs << ' ';
double lw = line_width_array[i] * scale;
html_fout << round(lw);
if(is_positive(lw)) html_fout << "px";
f_pages.fs << round(lw);
if(is_positive(lw)) f_pages.fs << "px";
}
html_fout << ";";
f_pages.fs << ";";
}
else
{
html_fout << "border:none;";
f_pages.fs << "border:none;";
}
if(fill_color)
{
html_fout << "background-color:" << (*fill_color) << ";";
f_pages.fs << "background-color:" << (*fill_color) << ";";
}
else
{
html_fout << "background-color:transparent;";
f_pages.fs << "background-color:transparent;";
}
if(style_function)
{
style_function(style_function_data, html_fout);
style_function(style_function_data, f_pages.fs);
}
html_fout << "bottom:" << round(y) << "px;"
f_pages.fs << "bottom:" << round(y) << "px;"
<< "left:" << round(x) << "px;"
<< "width:" << round(w * scale) << "px;"
<< "height:" << round(h * scale) << "px;";
html_fout << "\"></div>";
f_pages.fs << "\"></div>";
}

View File

@ -18,11 +18,45 @@
namespace pdf2htmlEX {
void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font)
using std::cerr;
using std::endl;
void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, GfxFont * font)
{
css_fout << "@font-face{"
<< "font-family:f" << info.id << ";"
<< "src:url(";
string mime_type, format;
if(suffix == ".ttf")
{
format = "truetype";
mime_type = "application/x-font-ttf";
}
else if(suffix == ".otf")
{
format = "opentype";
mime_type = "application/x-font-otf";
}
else if(suffix == ".woff")
{
format = "woff";
mime_type = "application/font-woff";
}
else if(suffix == ".eot")
{
format = "embedded-opentype";
mime_type = "application/vnd.ms-fontobject";
}
else if(suffix == ".svg")
{
format = "svg";
mime_type = "image/svg+xml";
}
else
{
cerr << "Warning: unknown font suffix: " << suffix << endl;
}
f_css.fs << "@font-face{"
<< "font-family:f" << info.id << ";"
<< "src:url(";
{
auto fn = str_fmt("f%llx%s", info.id, suffix.c_str());
@ -32,32 +66,32 @@ void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suff
ifstream fin(path, ifstream::binary);
if(!fin)
throw "Cannot locate font file: " + path;
css_fout << "'data:font/" + fontfileformat + ";base64," << base64stream(fin) << "'";
f_css.fs << "'data:font/" + mime_type + ";base64," << base64stream(fin) << "'";
}
else
{
css_fout << (char*)fn;
f_css.fs << (char*)fn;
}
}
css_fout << ")"
<< "format(\"" << fontfileformat << "\");"
<< "}" // end of @font-face
<< ".f" << info.id << "{"
<< "font-family:f" << info.id << ";"
<< "line-height:" << round(info.ascent - info.descent) << ";"
<< "font-style:normal;"
<< "font-weight:normal;"
<< "visibility:visible;"
<< "}" // end of .f
<< endl;
f_css.fs << ")"
<< "format(\"" << format << "\");"
<< "}" // end of @font-face
<< ".f" << info.id << "{"
<< "font-family:f" << info.id << ";"
<< "line-height:" << round(info.ascent - info.descent) << ";"
<< "font-style:normal;"
<< "font-weight:normal;"
<< "visibility:visible;"
<< "}" // end of .f
<< endl;
}
static string general_font_family(GfxFont * font)
{
if(font -> isFixedWidth())
if(font->isFixedWidth())
return "monospace";
else if (font -> isSerif())
else if (font->isSerif())
return "serif";
else
return "sans-serif";
@ -66,45 +100,45 @@ static string general_font_family(GfxFont * font)
// TODO: this function is called when some font is unable to process, may use the name there as a hint
void HTMLRenderer::export_remote_default_font(long long fn_id)
{
css_fout << ".f" << fn_id << "{font-family:sans-serif;visibility:hidden;}" << endl;
f_css.fs << ".f" << fn_id << "{font-family:sans-serif;visibility:hidden;}" << endl;
}
void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
{
css_fout << ".f" << info.id << "{";
css_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
f_css.fs << ".f" << info.id << "{";
f_css.fs << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
string fn = original_font_name;
for(auto iter = fn.begin(); iter != fn.end(); ++iter)
*iter = tolower(*iter);
if(font->isBold() || (fn.find("bold") != string::npos))
css_fout << "font-weight:bold;";
f_css.fs << "font-weight:bold;";
else
css_fout << "font-weight:normal;";
f_css.fs << "font-weight:normal;";
if(fn.find("oblique") != string::npos)
css_fout << "font-style:oblique;";
f_css.fs << "font-style:oblique;";
else if(font->isItalic() || (fn.find("italic") != string::npos))
css_fout << "font-style:italic;";
f_css.fs << "font-style:italic;";
else
css_fout << "font-style:normal;";
f_css.fs << "font-style:normal;";
css_fout << "line-height:" << round(info.ascent - info.descent) << ";";
f_css.fs << "line-height:" << round(info.ascent - info.descent) << ";";
css_fout << "visibility:visible;";
f_css.fs << "visibility:visible;";
css_fout << "}" << endl;
f_css.fs << "}" << endl;
}
void HTMLRenderer::export_font_size (long long fs_id, double font_size)
{
css_fout << ".s" << fs_id << "{font-size:" << round(font_size) << "px;}" << endl;
f_css.fs << ".s" << fs_id << "{font-size:" << round(font_size) << "px;}" << endl;
}
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
{
css_fout << ".t" << tm_id << "{";
f_css.fs << ".t" << tm_id << "{";
// always ignore tm[4] and tm[5] because
// we have already shifted the origin
@ -114,7 +148,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
{
auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"};
for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter)
css_fout << *iter << "transform:none;";
f_css.fs << *iter << "transform:none;";
}
else
{
@ -122,53 +156,53 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter)
{
// PDF use a different coordinate system from Web
css_fout << *iter << "transform:matrix("
f_css.fs << *iter << "transform:matrix("
<< round(tm[0]) << ','
<< round(-tm[1]) << ','
<< round(-tm[2]) << ','
<< round(tm[3]) << ',';
css_fout << "0,0);";
f_css.fs << "0,0);";
}
}
css_fout << "}" << endl;
f_css.fs << "}" << endl;
}
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
{
css_fout << ".l" << ls_id << "{letter-spacing:" << round(letter_space) << "px;}" << endl;
f_css.fs << ".l" << ls_id << "{letter-spacing:" << round(letter_space) << "px;}" << endl;
}
void HTMLRenderer::export_word_space (long long ws_id, double word_space)
{
css_fout << ".w" << ws_id << "{word-spacing:" << round(word_space) << "px;}" << endl;
f_css.fs << ".w" << ws_id << "{word-spacing:" << round(word_space) << "px;}" << endl;
}
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
{
css_fout << ".c" << color_id << "{color:" << (*rgb) << ";}" << endl;
f_css.fs << ".c" << color_id << "{color:" << (*rgb) << ";}" << endl;
}
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
{
if(ws_width > 0)
css_fout << "._" << ws_id << "{display:inline-block;width:" << round(ws_width) << "px;}" << endl;
f_css.fs << "._" << ws_id << "{display:inline-block;width:" << round(ws_width) << "px;}" << endl;
else
css_fout << "._" << ws_id << "{display:inline;margin-left:" << round(ws_width) << "px;}" << endl;
f_css.fs << "._" << ws_id << "{display:inline;margin-left:" << round(ws_width) << "px;}" << endl;
}
void HTMLRenderer::export_rise (long long rise_id, double rise)
{
css_fout << ".r" << rise_id << "{top:" << round(-rise) << "px;}" << endl;
f_css.fs << ".r" << rise_id << "{top:" << round(-rise) << "px;}" << endl;
}
void HTMLRenderer::export_height (long long height_id, double height)
{
css_fout << ".h" << height_id << "{height:" << round(height) << "px;}" << endl;
f_css.fs << ".h" << height_id << "{height:" << round(height) << "px;}" << endl;
}
void HTMLRenderer::export_left (long long left_id, double left)
{
css_fout << ".L" << left_id << "{left:" << round(left) << "px;}" << endl;
f_css.fs << ".L" << left_id << "{left:" << round(left) << "px;}" << endl;
}
}

View File

@ -13,6 +13,8 @@
#include <algorithm>
#include <vector>
#include <GlobalParams.h>
#include "HTMLRenderer.h"
#include "TextLineBuffer.h"
#include "pdf2htmlEX-config.h"
@ -35,10 +37,6 @@ using std::abs;
using std::cerr;
using std::endl;
/*
 * Error callback that does nothing: every argument is ignored,
 * so installing it as the error handler discards all reports.
 */
static void dummy(void *, enum ErrorCategory, int pos, char *)
{
}
HTMLRenderer::HTMLRenderer(const Param * param)
:OutputDev()
,line_opened(false)
@ -49,8 +47,8 @@ HTMLRenderer::HTMLRenderer(const Param * param)
{
if(!(param->debug))
{
//disable error function of poppler
setErrorCallback(&dummy, nullptr);
//disable error messages of poppler
globalParams->setErrQuiet(gTrue);
}
ffw_init(param->debug);
@ -71,10 +69,14 @@ HTMLRenderer::~HTMLRenderer()
void HTMLRenderer::process(PDFDoc *doc)
{
cur_doc = doc;
cur_catalog = doc->getCatalog();
xref = doc->getXRef();
pre_process(doc);
///////////////////
// Process pages
BackgroundRenderer * bg_renderer = nullptr;
if(param->process_nontext)
{
@ -90,10 +92,10 @@ void HTMLRenderer::process(PDFDoc *doc)
if(param->split_pages)
{
auto page_fn = str_fmt("%s/%s%d.page", param->dest_dir.c_str(), param->output_filename.c_str(), i);
html_fout.open((char*)page_fn, ofstream::binary);
if(!html_fout)
f_pages.fs.open((char*)page_fn, ofstream::binary);
if(!f_pages.fs)
throw string("Cannot open ") + (char*)page_fn + " for writing";
set_stream_flags(html_fout);
set_stream_flags(f_pages.fs);
}
if(param->process_nontext)
@ -114,13 +116,17 @@ void HTMLRenderer::process(PDFDoc *doc)
if(param->split_pages)
{
html_fout.close();
f_pages.fs.close();
}
}
if(page_count >= 0)
cerr << "Working: " << page_count << "/" << page_count;
cerr << endl;
////////////////////////
// Process Outline
process_outline();
post_process();
if(bg_renderer)
@ -135,6 +141,11 @@ void HTMLRenderer::setDefaultCTM(double *ctm)
}
void HTMLRenderer::startPage(int pageNum, GfxState *state)
{
startPage(pageNum, state, nullptr);
}
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
{
this->pageNum = pageNum;
this->pageWidth = state->getPageWidth();
@ -142,7 +153,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
assert((!line_opened) && "Open line in startPage detected!");
html_fout
f_pages.fs
<< "<div class=\"d\" style=\"width:"
<< (pageWidth) << "px;height:"
<< (pageHeight) << "px;\">"
@ -151,7 +162,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
if(param->process_nontext)
{
html_fout << "background-image:url(";
f_pages.fs << "background-image:url(";
{
if(param->single_html)
@ -160,18 +171,18 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
ifstream fin((char*)path, ifstream::binary);
if(!fin)
throw string("Cannot read background image ") + (char*)path;
html_fout << "'data:image/png;base64," << base64stream(fin) << "'";
f_pages.fs << "'data:image/png;base64," << base64stream(fin) << "'";
}
else
{
html_fout << str_fmt("p%x.png", pageNum);
f_pages.fs << str_fmt("p%x.png", pageNum);
}
}
html_fout << ");background-position:0 0;background-size:" << pageWidth << "px " << pageHeight << "px;background-repeat:no-repeat;";
f_pages.fs << ");background-position:0 0;background-size:" << pageWidth << "px " << pageHeight << "px;background-repeat:no-repeat;";
}
html_fout << "\">";
f_pages.fs << "\">";
draw_text_scale = 1.0;
cur_font_info = install_font(nullptr);
@ -206,26 +217,26 @@ void HTMLRenderer::endPage() {
cur_doc->processLinks(this, pageNum);
// close box
html_fout << "</div>";
f_pages.fs << "</div>";
// dump info for js
// TODO: create a function for this
// BE CAREFUL WITH ESCAPES
html_fout << "<div class=\"j\" data-data='{";
f_pages.fs << "<div class=\"j\" data-data='{";
//default CTM
html_fout << "\"ctm\":[";
f_pages.fs << "\"ctm\":[";
for(int i = 0; i < 6; ++i)
{
if(i > 0) html_fout << ",";
html_fout << round(default_ctm[i]);
if(i > 0) f_pages.fs << ",";
f_pages.fs << round(default_ctm[i]);
}
html_fout << "]";
f_pages.fs << "]";
html_fout << "}'></div>";
f_pages.fs << "}'></div>";
// close page
html_fout << "</div></div>" << endl;
f_pages.fs << "</div></div>" << endl;
}
void HTMLRenderer::pre_process(PDFDoc * doc)
@ -290,11 +301,32 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
if(param->single_html && (!param->split_pages))
tmp_files.add((char*)fn);
css_path = (char*)fn,
css_fout.open(css_path, ofstream::binary);
if(!css_fout)
f_css.path = (char*)fn;
f_css.fs.open(f_css.path, ofstream::binary);
if(!f_css.fs)
throw string("Cannot open ") + (char*)fn + " for writing";
set_stream_flags(css_fout);
set_stream_flags(f_css.fs);
}
{
    /*
     * The outline file is routed the same way as the css file:
     *  - single_html without split_pages: write to "__outline" in tmp_dir and
     *    record the path in tmp_files; the content is read back from
     *    f_outline.path when the final HTML is assembled
     *  - otherwise: write to dest_dir using outline_filename
     */
    const bool outline_is_temporary = param->single_html && (!param->split_pages);
    auto fn = outline_is_temporary
        ? str_fmt("%s/__outline", param->tmp_dir.c_str())
        : str_fmt("%s/%s", param->dest_dir.c_str(), param->outline_filename.c_str());

    if(outline_is_temporary)
        tmp_files.add((char*)fn);

    f_outline.path = (char*)fn;
    f_outline.fs.open(f_outline.path, ofstream::binary);
    if(!f_outline.fs)
    {
        // keep the space after "open" so the path does not run into the message
        throw string("Cannot open ") + (char*)fn + " for writing";
    }

    // might not be necessary
    set_stream_flags(f_outline.fs);
}
// if split-pages is specified, open & close the file in the process loop
@ -303,7 +335,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
{
/*
* If single-html
* we have to keep the html file (for page) into a temporary place
* we have to keep the html file for pages into a temporary place
* because we'll have to embed css before it
*
* Otherwise just generate it
@ -311,21 +343,22 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
auto fn = str_fmt("%s/__pages", param->tmp_dir.c_str());
tmp_files.add((char*)fn);
html_path = (char*)fn;
html_fout.open(html_path, ofstream::binary);
if(!html_fout)
f_pages.path = (char*)fn;
f_pages.fs.open(f_pages.path, ofstream::binary);
if(!f_pages.fs)
throw string("Cannot open ") + (char*)fn + " for writing";
set_stream_flags(html_fout);
set_stream_flags(f_pages.fs);
}
}
void HTMLRenderer::post_process()
{
// close files
html_fout.close();
css_fout.close();
f_outline.fs.close();
f_pages.fs.close();
f_css.fs.close();
//only when split-page, do we have some work left to do
//only when split-page == 0, do we have some work left to do
if(param->split_pages)
return;
@ -359,7 +392,9 @@ void HTMLRenderer::post_process()
continue;
}
if(line.empty() || line[0] == '#')
if(line.empty()
|| (line.find_first_not_of(' ') == string::npos)
|| line[0] == '#')
continue;
@ -373,14 +408,23 @@ void HTMLRenderer::post_process()
{
if(line == "$css")
{
embed_file(output, css_path, ".css", false);
embed_file(output, f_css.path, ".css", false);
}
else if (line == "$pages")
else if (line == "$outline")
{
ifstream fin(html_path, ifstream::binary);
ifstream fin(f_outline.path, ifstream::binary);
if(!fin)
throw "Cannot open read the pages";
output << fin.rdbuf();
output.clear(); // output will have failbit set if fin is empty
}
else if (line == "$pages")
{
ifstream fin(f_pages.path, ifstream::binary);
if(!fin)
throw "Cannot open read the pages";
output << fin.rdbuf();
output.clear(); // output will have failbit set if fin is empty
}
else
{
@ -418,8 +462,9 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string &
if(!fin)
throw string("Cannot open file ") + path + " for embedding";
out << iter->second.first << endl
<< fin.rdbuf()
<< iter->second.second << endl;
<< fin.rdbuf();
out.clear(); // out will have failbit set if fin is empty
out << iter->second.second << endl;
}
else
{
@ -437,6 +482,7 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string &
if(!out)
throw string("Cannot open file ") + path + " for embedding";
out << fin.rdbuf();
out.clear(); // out will have failbit set if fin is empty
}
}
}

View File

@ -110,7 +110,7 @@ void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info)
if(path != "")
{
embed_font(path, font, info);
export_remote_font(info, param->font_suffix, param->font_format, font);
export_remote_font(info, param->font_suffix, font);
}
else
{
@ -129,7 +129,7 @@ void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, Font
if(localfontloc != nullptr)
{
embed_font(localfontloc->path->getCString(), font, info);
export_remote_font(info, param->font_suffix, param->font_format, font);
export_remote_font(info, param->font_suffix, font);
delete localfontloc;
return;
}
@ -186,7 +186,7 @@ void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info)
if(localfontloc != nullptr)
{
embed_font(string(localfontloc->path->getCString()), font, info);
export_remote_font(info, param->font_suffix, param->font_format, font);
export_remote_font(info, param->font_suffix, font);
delete localfontloc;
return;
}
@ -281,7 +281,7 @@ long long HTMLRenderer::install_whitespace(double ws_width, double & actual_widt
{
// ws_width is already multiplied by draw_scale
auto iter = whitespace_map.lower_bound(ws_width - param->h_eps);
if((iter != whitespace_map.end()) && (abs(iter->first - ws_width) < param->h_eps))
if((iter != whitespace_map.end()) && (abs(iter->first - ws_width) <= param->h_eps))
{
actual_width = iter->first;
return iter->second;
@ -297,7 +297,7 @@ long long HTMLRenderer::install_whitespace(double ws_width, double & actual_widt
long long HTMLRenderer::install_rise(double rise)
{
auto iter = rise_map.lower_bound(rise - param->v_eps);
if((iter != rise_map.end()) && (abs(iter->first - rise) < param->v_eps))
if((iter != rise_map.end()) && (abs(iter->first - rise) <= param->v_eps))
{
return iter->second;
}
@ -311,7 +311,7 @@ long long HTMLRenderer::install_rise(double rise)
long long HTMLRenderer::install_height(double height)
{
auto iter = height_map.lower_bound(height - EPS);
if((iter != height_map.end()) && (abs(iter->first - height) < EPS))
if((iter != height_map.end()) && (abs(iter->first - height) <= EPS))
{
return iter->second;
}
@ -324,7 +324,7 @@ long long HTMLRenderer::install_height(double height)
long long HTMLRenderer::install_left(double left)
{
auto iter = left_map.lower_bound(left - param->h_eps);
if((iter != left_map.end()) && (abs(iter->first - left) < param->h_eps))
if((iter != left_map.end()) && (abs(iter->first - left) <= param->h_eps))
{
return iter->second;
}

View File

@ -29,9 +29,27 @@ using std::endl;
/*
* The detailed rectangle area of the link destination
* Will be parsed and performed by Javascript
* The string will be put into a HTML attribute, surrounded by single quotes
* So pay attention to the characters used here
*/
static string get_dest_detail_str(int pageno, LinkDest * dest)
static string get_linkdest_detail_str(LinkDest * dest, Catalog * catalog, int & pageno)
{
pageno = 0;
if(dest->isPageRef())
{
auto pageref = dest->getPageRef();
pageno = catalog->findPage(pageref.num, pageref.gen);
}
else
{
pageno = dest->getPageNum();
}
if(pageno <= 0)
{
return "";
}
ostringstream sout;
// dec
sout << "[" << pageno;
@ -108,16 +126,11 @@ static string get_dest_detail_str(int pageno, LinkDest * dest)
return sout.str();
}
/*
* Based on pdftohtml from poppler
* TODO: CSS for link rectangles
* TODO: share rectangle draw with css-draw
*/
void HTMLRenderer::processLink(AnnotLink * al)
string HTMLRenderer::get_linkaction_str(LinkAction * action, string & detail)
{
std::string dest_str, dest_detail_str;
auto action = al->getAction();
string dest_str;
detail = "";
if(action)
{
auto kind = action->getKind();
@ -125,34 +138,21 @@ void HTMLRenderer::processLink(AnnotLink * al)
{
case actionGoTo:
{
auto catalog = cur_doc->getCatalog();
auto * real_action = dynamic_cast<LinkGoTo*>(action);
LinkDest * dest = nullptr;
if(auto _ = real_action->getDest())
dest = _->copy();
else if (auto _ = real_action->getNamedDest())
dest = catalog->findDest(_);
dest = cur_catalog->findDest(_);
if(dest)
{
int pageno = 0;
if(dest->isPageRef())
{
auto pageref = dest->getPageRef();
pageno = catalog->findPage(pageref.num, pageref.gen);
}
else
{
pageno = dest->getPageNum();
}
detail = get_linkdest_detail_str(dest, cur_catalog, pageno);
if(pageno > 0)
{
dest_str = (char*)str_fmt("#p%x", pageno);
dest_detail_str = get_dest_detail_str(pageno, dest);
}
delete dest;
}
}
break;
@ -178,17 +178,30 @@ void HTMLRenderer::processLink(AnnotLink * al)
}
}
if(dest_str != "")
return dest_str;
}
/*
* Based on pdftohtml from poppler
* TODO: CSS for link rectangles
* TODO: share rectangle draw with css-draw
*/
void HTMLRenderer::processLink(AnnotLink * al)
{
string dest_detail_str;
string dest_str = get_linkaction_str(al->getAction(), dest_detail_str);
if(!dest_str.empty())
{
html_fout << "<a class=\"a\" href=\"" << dest_str << "\"";
f_pages.fs << "<a class=\"a\" href=\"" << dest_str << "\"";
if(dest_detail_str != "")
html_fout << " data-dest-detail='" << dest_detail_str << "'";
if(!dest_detail_str.empty())
f_pages.fs << " data-dest-detail='" << dest_detail_str << "'";
html_fout << ">";
f_pages.fs << ">";
}
html_fout << "<div class=\"Cd t"
f_pages.fs << "<div class=\"Cd t"
<< install_transform_matrix(default_ctm)
<< "\" style=\"";
@ -215,31 +228,31 @@ void HTMLRenderer::processLink(AnnotLink * al)
border_top_bottom_width, border_left_right_width);
if(abs(border_top_bottom_width - border_left_right_width) < EPS)
html_fout << "border-width:" << round(border_top_bottom_width) << "px;";
f_pages.fs << "border-width:" << round(border_top_bottom_width) << "px;";
else
html_fout << "border-width:" << round(border_top_bottom_width) << "px " << round(border_left_right_width) << "px;";
f_pages.fs << "border-width:" << round(border_top_bottom_width) << "px " << round(border_left_right_width) << "px;";
}
auto style = border->getStyle();
switch(style)
{
case AnnotBorder::borderSolid:
html_fout << "border-style:solid;";
f_pages.fs << "border-style:solid;";
break;
case AnnotBorder::borderDashed:
html_fout << "border-style:dashed;";
f_pages.fs << "border-style:dashed;";
break;
case AnnotBorder::borderBeveled:
html_fout << "border-style:outset;";
f_pages.fs << "border-style:outset;";
break;
case AnnotBorder::borderInset:
html_fout << "border-style:inset;";
f_pages.fs << "border-style:inset;";
break;
case AnnotBorder::borderUnderlined:
html_fout << "border-style:none;border-bottom-style:solid;";
f_pages.fs << "border-style:none;border-bottom-style:solid;";
break;
default:
cerr << "Warning:Unknown annotation border style: " << style << endl;
html_fout << "border-style:solid;";
f_pages.fs << "border-style:solid;";
}
@ -257,36 +270,36 @@ void HTMLRenderer::processLink(AnnotLink * al)
r = g = b = 0;
}
html_fout << "border-color:rgb("
f_pages.fs << "border-color:rgb("
<< dec << (int)dblToByte(r) << "," << (int)dblToByte(g) << "," << (int)dblToByte(b) << hex
<< ");";
}
else
{
html_fout << "border-style:none;";
f_pages.fs << "border-style:none;";
}
}
else
{
html_fout << "border-style:none;";
f_pages.fs << "border-style:none;";
}
tm_transform(default_ctm, x, y);
html_fout << "position:absolute;"
f_pages.fs << "position:absolute;"
<< "left:" << round(x) << "px;"
<< "bottom:" << round(y) << "px;"
<< "width:" << round(w) << "px;"
<< "height:" << round(h) << "px;";
// fix for IE
html_fout << "background-color:rgba(255,255,255,0.000001);";
f_pages.fs << "background-color:rgba(255,255,255,0.000001);";
html_fout << "\"></div>";
f_pages.fs << "\"></div>";
if(dest_str != "")
{
html_fout << "</a>";
f_pages.fs << "</a>";
}
}

View File

@ -0,0 +1,72 @@
/*
* outline.cc
*
* Handling Outline items
*
* by WangLu
* 2013.01.28
*/
#include <iostream>
#include <Outline.h>
#include <goo/GooList.h>
#include "HTMLRenderer.h"
#include "util/namespace.h"
#include "util/unicode.h"
namespace pdf2htmlEX {
using std::ostream;
/*
 * Write one nesting level of the outline into f_outline.fs as a <ul> list,
 * one <li> per item, recursing into the children of each item.
 * Nothing is written when `items` is null or empty.
 */
void HTMLRenderer::process_outline_items(GooList * items)
{
    if(items == nullptr)
        return;
    if(items->getLength() == 0)
        return;

    auto & out = f_outline.fs;

    out << "<ul>";
    for(int idx = 0; idx < items->getLength(); ++idx)
    {
        auto * entry = static_cast<OutlineItem*>(items->get(idx));

        // the href is written even when no destination could be resolved
        string dest_detail;
        const string href = get_linkaction_str(entry->getAction(), dest_detail);

        out << "<li>" << "<a href=\"" << href << "\"";
        if(!dest_detail.empty())
            out << " data-dest-detail='" << dest_detail << "'";
        out << ">";

        outputUnicodes(out, entry->getTitle(), entry->getTitleLength());
        out << "</a>";

        // descend into the children, if any, between open() and close()
        entry->open();
        if(entry->hasKids())
            process_outline_items(entry->getKids());
        entry->close();

        out << "</li>";
    }
    out << "</ul>";
}
/*
 * Entry point for outline export: fetch the outline of the current
 * document and write its top-level items. Does nothing when the
 * document has no outline.
 */
void HTMLRenderer::process_outline()
{
    if(Outline * outline = cur_doc->getOutline())
        process_outline_items(outline->getItems());
}
}// namespace pdf2htmlEX

View File

@ -15,63 +15,56 @@ namespace pdf2htmlEX {
/*
 * Plain container for all conversion settings.
 * Fields are filled from the command line (each one is bound to an option
 * through ArgParser::add) and then read by the renderer through `param->...`.
 */
struct Param
{
// PDF stuff
std::string owner_password, user_password;
std::string input_filename, output_filename;
int no_drm;
// path
std::string dest_dir, tmp_dir, data_dir;
// normal parameters
// pages
int first_page, last_page;
// dimensions
double zoom;
double fit_width, fit_height;
double h_dpi, v_dpi;
int use_cropbox;
int process_nontext;
double h_dpi, v_dpi;
// output files
int single_html;
int split_pages;
std::string dest_dir;
std::string css_filename;
std::string outline_filename;
// fonts
int embed_base_font;
int embed_external_font;
std::string font_suffix;
int decompose_ligature;
// Advanced tweak
/*
* Position & Size
*/
int remove_unused_glyph;
int auto_hint;
std::string external_hint_tool;
int stretch_narrow_glyph;
int squeeze_wide_glyph;
// text
double h_eps, v_eps;
double space_threshold;
double font_size_multiplier;
/*
* Font
*/
int auto_hint;
int tounicode;
int space_as_offset;
int stretch_narrow_glyph;
int squeeze_wide_glyph;
int remove_unused_glyph;
std::string font_suffix, font_format;
std::string external_hint_tool;
/*
* Output
*/
std::string css_filename;
/*
* Debug
*/
int debug;
int tounicode;
// encryption
std::string owner_password, user_password;
int no_drm;
// misc.
int clean_tmp;
// experimental
int process_nontext;
std::string data_dir;
int css_draw;
int debug;
// non-optional
std::string input_filename, output_filename;
// not a parameter
std::string tmp_dir;
};
} // namespace pdf2htmlEX

View File

@ -36,13 +36,8 @@ ArgParser argparser;
/*
 * Print the usage line and the option list to stderr,
 * then terminate the process with EXIT_FAILURE.
 * dummy: unused; present so the function matches the option callback signature
 */
void show_usage_and_exit(const char * dummy = nullptr)
{
cerr << "Usage: pdf2htmlEX [Options] <input.pdf> [<output.html>]" << endl;
cerr << endl;
cerr << "Options:" << endl;
cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << endl;
argparser.show_usage(cerr);
cerr << endl;
cerr << "Run 'man pdf2htmlEX' for detailed information" << endl;
cerr << endl;
exit(EXIT_FAILURE);
}
@ -53,60 +48,76 @@ void show_version_and_exit(const char * dummy = nullptr)
cerr << "Libraries: ";
cerr << "poppler " << POPPLER_VERSION << ", ";
cerr << "libfontforge " << ffw_get_version() << endl;
cerr << "Default data-dir: " << PDF2HTMLEX_DATA_PATH << endl;
exit(EXIT_SUCCESS);
}
void parse_options (int argc, char **argv)
{
string deprecated_string;
argparser
.add("help,h", "show all options", &show_usage_and_exit)
.add("version,v", "show copyright and version info", &show_version_and_exit)
// pages
.add("first-page,f", &param.first_page, 1, "first page to convert")
.add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to convert")
// dimensions
.add("zoom", &param.zoom, 0, "zoom ratio", nullptr, true)
.add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", nullptr, true)
.add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", nullptr, true)
.add("use-cropbox", &param.use_cropbox, 0, "use CropBox instead of MediaBox")
.add("hdpi", &param.h_dpi, 144.0, "horizontal resolution for graphics in DPI")
.add("vdpi", &param.v_dpi, 144.0, "vertical resolution for graphics in DPI")
// output files
.add("single-html", &param.single_html, 1, "generate a single HTML file")
.add("split-pages", &param.split_pages, 0, "split pages into separate files")
.add("dest-dir", &param.dest_dir, ".", "specify destination directory")
.add("css-filename", &param.css_filename, "", "filename of the generated css file")
.add("outline-filename", &param.outline_filename, "", "filename of the generated outline file")
// fonts
.add("embed-base-font", &param.embed_base_font, 0, "embed local match for standard 14 fonts")
.add("embed-external-font", &param.embed_external_font, 0, "embed local match for external fonts")
.add("font-suffix", &param.font_suffix, ".ttf", "suffix for embedded font files (.ttf,.otf,.woff,.svg)")
.add("decompose-ligature", &param.decompose_ligature, 0, "decompose ligatures, such as \uFB01 -> fi")
.add("remove-unused-glyph", &param.remove_unused_glyph, 1, "remove unused glyphs in embedded fonts")
.add("auto-hint", &param.auto_hint, 0, "use fontforge autohint on fonts without hints")
.add("external-hint-tool", &param.external_hint_tool, "", "external tool for hinting fonts (overrides --auto-hint)")
.add("stretch-narrow-glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding them")
.add("squeeze-wide-glyph", &param.squeeze_wide_glyph, 1, "shrink wide glyphs instead of truncating them")
// text
.add("heps", &param.h_eps, 1.0, "horizontal threshold for merging text, in pixels")
.add("veps", &param.v_eps, 1.0, "vertical threshold for merging text, in pixels")
.add("space-threshold", &param.space_threshold, (1.0/8), "word break threshold (threshold * em)")
.add("font-size-multiplier", &param.font_size_multiplier, 4.0, "a value greater than 1 increases the rendering accuracy")
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
.add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
// encryption
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", nullptr, true)
.add("user-password,u", &param.user_password, "", "user password (for encrypted files)", nullptr, true)
.add("no-drm", &param.no_drm, 0, "override document DRM settings")
.add("dest-dir", &param.dest_dir, ".", "specify destination directory")
// misc.
.add("clean-tmp", &param.clean_tmp, 1, "remove temporary files after conversion")
.add("process-nontext", &param.process_nontext, 1, "render graphics in addition to text")
.add("data-dir", &param.data_dir, PDF2HTMLEX_DATA_PATH, "specify data directory")
.add("first-page,f", &param.first_page, 1, "first page to process")
.add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to process")
.add("zoom", &param.zoom, 0, "zoom ratio", nullptr, true)
.add("fit-width", &param.fit_width, 0, "fit width to <arg> pixels", nullptr, true)
.add("fit-height", &param.fit_height, 0, "fit height to <arg> pixels", nullptr, true)
.add("hdpi", &param.h_dpi, 144.0, "horizontal DPI for non-text")
.add("vdpi", &param.v_dpi, 144.0, "vertical DPI for non-text")
.add("use-cropbox", &param.use_cropbox, 0, "use CropBox instead of MediaBox")
.add("process-nontext", &param.process_nontext, 1, "process nontext objects")
.add("single-html", &param.single_html, 1, "combine everything into one single HTML file")
.add("split-pages", &param.split_pages, 0, "split pages into separated files")
.add("embed-base-font", &param.embed_base_font, 0, "embed local matched font for base 14 fonts in the PDF file")
.add("embed-external-font", &param.embed_external_font, 0, "embed local matched font for external fonts in the PDF file")
.add("decompose-ligature", &param.decompose_ligature, 0, "decompose ligatures, for example 'fi' -> 'f''i'")
.add("heps", &param.h_eps, 1.0, "max tolerated horizontal offset (in pixels)")
.add("veps", &param.v_eps, 1.0, "max tolerated vertical offset (in pixels)")
.add("space-threshold", &param.space_threshold, (1.0/8), "distance no thiner than (threshold * em) will be considered as a space character")
.add("font-size-multiplier", &param.font_size_multiplier, 4.0, "setting a value greater than 1 would increase the rendering accuracy")
.add("auto-hint", &param.auto_hint, 0, "Whether to generate hints for fonts")
.add("tounicode", &param.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
.add("stretch-narrow-glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding space")
.add("squeeze-wide-glyph", &param.squeeze_wide_glyph, 1, "squeeze wide glyphs instead of truncating")
.add("remove-unused-glyph", &param.remove_unused_glyph, 1, "remove unused glyphs in embedded fonts")
.add("font-suffix", &param.font_suffix, ".ttf", "suffix for extracted font files")
.add("font-format", &param.font_format, "opentype", "format for extracted font files")
.add("external-hint-tool", &param.external_hint_tool, "", "external tool for hintting fonts.(overrides --auto-hint)")
.add("css-filename", &param.css_filename, "", "Specify the file name of the generated css file")
.add("debug", &param.debug, 0, "output debug information")
.add("clean-tmp", &param.clean_tmp, 1, "clean temporary files after processing")
.add("css-draw", &param.css_draw, 0, "[Experimental and Unsupported] CSS Drawing")
.add("css-draw", &param.css_draw, 0, "[experimental and unsupported] CSS drawing")
.add("debug", &param.debug, 0, "print debugging information")
// meta
.add("version,v", "print copyright and version info", &show_version_and_exit)
.add("help,h", "print usage information", &show_usage_and_exit)
.add("", &param.input_filename, "", "")
.add("", &param.output_filename, "", "")
// deprecated
.add("font-format", &deprecated_string, "", "", [] (const char*) {
cerr << "warning: --font-format is deprecated, @font-face format is inferred from --font-suffix" << endl;
})
;
try
@ -140,8 +151,7 @@ int main(int argc, char **argv)
parse_options(argc, argv);
if (param.input_filename == "")
{
cerr << "Missing input filename" << endl;
exit(EXIT_FAILURE);
show_usage_and_exit();
}
//prepare the directories
@ -202,7 +212,7 @@ int main(int argc, char **argv)
param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages());
param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages());
if(param.output_filename == "")
if(param.output_filename.empty())
{
const string s = get_filename(param.input_filename);
@ -223,7 +233,7 @@ int main(int argc, char **argv)
}
}
if(param.css_filename == "")
if(param.css_filename.empty())
{
const string s = get_filename(param.input_filename);
@ -237,6 +247,21 @@ int main(int argc, char **argv)
param.css_filename = s + ".css";
}
}
if(param.outline_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.outline_filename = s.substr(0, s.size() - 4) + ".outline";
}
else
{
if(!param.split_pages)
param.outline_filename = s + ".outline";
}
}
HTMLRenderer * htmlOut = new HTMLRenderer(&param);
htmlOut->process(doc);

View File

@ -76,7 +76,7 @@ void ArgParser::parse(int argc, char ** argv) const
int v = p->shortname;
if(!(opt_map.insert(make_pair(v, p)).second))
{
cerr << "Warning: duplicated shortname '" << v << "' used by -" << (char)(p->shortname) << " and -" << (char)(opt_map[p->shortname]->shortname) << endl;
cerr << "Warning: duplicated shortname: " << v << endl;
}
}
@ -93,7 +93,7 @@ void ArgParser::parse(int argc, char ** argv) const
}
if(!(opt_map.insert(make_pair(v, p)).second))
{
cerr << "Warning: duplicated shortname '" << v << "' used by --" << (p->name) << " and --" << (opt_map[p->shortname]->name) << endl;
cerr << "Warning: duplicated long name: " << (p->name) << endl;
}
}
}
@ -146,6 +146,10 @@ void ArgParser::show_usage(ostream & out) const
}
}
/*
 * Type names for the known option value types.
 * Any other type falls back to the primary template.
 */
template<>
const char * ArgParser::get_type_name<int>()
{
    return "int";
}

template<>
const char * ArgParser::get_type_name<double>()
{
    return "fp";
}

template<>
const char * ArgParser::get_type_name<string>()
{
    return "string";
}
ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * description, bool need_arg)
: shortname(0), name(name), description(description), need_arg(need_arg)
{
@ -159,11 +163,11 @@ ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * descriptio
}
else
{
cerr << "Warning: argument '" << this->name << "' may not be parsed correctly" << endl;
cerr << "Warning: argument '" << this->name << "' cannnot be parsed as a short option" << endl;
}
}
}
const int ArgParser::arg_col_width = 40;
const int ArgParser::arg_col_width = 31;
} // namespace pdf2htmlEX

View File

@ -39,7 +39,6 @@ void dump_value(std::ostream & out, const T & v)
extern void dump_value(std::ostream & out, const std::string & v);
class ArgParser
{
public:
@ -48,22 +47,29 @@ class ArgParser
typedef void (*ArgParserCallBack) (const char * arg);
/*
* optname: name of the argument, should be provided as --optname
* description: if description is "", the argument won't be shown in show_usage()
* The 1st is for arg without arguments (i.e. flags), and the 2nd is for general args.
* optname:
* - if not nullptr, it should be the name of the arg, should be in the format of "<long name>[,<short char>]", e.g. "help,h"
* - if nullptr, it denotes an optional arg, and description will be ignored
* description:
* - if description is nullptr or "", the argument won't be shown in show_usage()
*/
ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback = nullptr);
template <class T, class Tv>
ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback = nullptr, bool dont_show_default = false);
ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback = nullptr, bool dont_show_default = false);
void parse(int argc, char ** argv) const;
void show_usage(std::ostream & out) const;
private:
// type names helper
template<class>
static const char * get_type_name(void) { return "unknown"; }
class ArgEntryBase
{
public:
/* name or description cannot be nullptr */
ArgEntryBase(const char * name, const char * description, bool need_arg);
virtual ~ArgEntryBase() { }
char shortname;
@ -101,15 +107,25 @@ class ArgParser
/*
 * Register an option whose parsed value is stored at `location`.
 * optname: option name; when it is nullptr or "", the entry is stored in
 *          optional_arg_entries instead of arg_entries
 * location: where the value is written
 * default_value: default for the option
 * description: help text for the option
 * callback: optional function associated with the option
 * dont_show_default: when true, the default is left out of the usage text
 * Returns *this so that calls can be chained.
 */
template<class T, class Tv>
ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback, bool dont_show_default)
{
// use "" in case nullptr is provided
// ArgEntry does not accept nullptr as optname nor description
if((!optname) || (!optname[0]))
{
// when optname is nullptr or "", it's optional, and description is dropped
optional_arg_entries.push_back(new ArgEntry<T, Tv>("", location, default_value, callback, "", dont_show_default));
}
else
arg_entries.push_back(new ArgEntry<T, Tv>(optname, location, default_value, callback, description, dont_show_default));
{
arg_entries.push_back(new ArgEntry<T, Tv>(optname, location, default_value, callback, (description ? description : ""), dont_show_default));
}
return *this;
}
// Known types
template<> const char * ArgParser::get_type_name<int> (void);
template<> const char * ArgParser::get_type_name<double> (void);
template<> const char * ArgParser::get_type_name<std::string> (void);
template<class T, class Tv>
ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, ArgParserCallBack callback, const char * description, bool dont_show_default)
: ArgEntryBase(name, description, (location != nullptr))
@ -141,7 +157,7 @@ void ArgParser::ArgEntry<T, Tv>::parse(const char * arg) const
template<class T, class Tv>
void ArgParser::ArgEntry<T, Tv>::show_usage(std::ostream & out) const
{
if(description == "")
if(description.empty())
return;
std::ostringstream sout;
@ -161,13 +177,7 @@ void ArgParser::ArgEntry<T, Tv>::show_usage(std::ostream & out) const
if(need_arg)
{
sout << " <arg>";
if(!dont_show_default)
{
sout << " (=";
dump_value(sout, default_value);
sout << ")";
}
sout << " <" << get_type_name<T>() << ">";
}
std::string s = sout.str();
@ -175,8 +185,17 @@ void ArgParser::ArgEntry<T, Tv>::show_usage(std::ostream & out) const
for(int i = s.size(); i < arg_col_width; ++i)
out << ' ';
out << " " << description << std::endl;
out << " " << description;
if(need_arg && !dont_show_default)
{
out << " (default: ";
dump_value(out, default_value);
out << ")";
}
out << std::endl;
}
} // namespace ArgParser

View File

@ -15,7 +15,7 @@
namespace pdf2htmlEX {
/*
 * Snap values that are within EPS of zero to exactly 0.0;
 * every other value is returned unchanged (this is not rounding to an integer).
 */
static inline double round(double x)
{
    if(std::abs(x) > EPS)
        return x;
    return 0.0;
}
// Approximate equality of two doubles: compares |x - y| against the EPS tolerance.
static inline bool equal(double x, double y) { return std::abs(x-y) < EPS; }
static inline bool equal(double x, double y) { return std::abs(x-y) <= EPS; }
/*
 * True only when x exceeds the EPS tolerance,
 * so values that are merely within EPS above zero do not count as positive.
 */
static inline bool is_positive(double x)
{
    return EPS < x;
}
static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6)
{

View File

@ -33,6 +33,9 @@ Unicode unicode_from_font (CharCode code, GfxFont * font);
*/
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
/*
* Escape necessary characters, and map Unicode to UTF-8
*/
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);

View File

@ -13,7 +13,7 @@ with open('out.html','w') as outf:
if not f.lower().endswith('.pdf'):
continue
print f
if os.system('pdf2htmlEX --dest-dir html --auto-hint=1 --external-hint-tool="ttfautohint" "%s/%s"' % (DIR,f)) != 0:
if os.system('pdf2htmlEX -l 7 --dest-dir html --auto-hint=1 --external-hint-tool="ttfautohint" "%s/%s"' % (DIR,f)) != 0:
print "error on ", f
sys.exit(-1)