1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

Merge branch 'master' into devv

Conflicts:
	TODO
This commit is contained in:
Lu Wang 2013-01-19 14:53:08 +08:00
commit 81ec1a5618
58 changed files with 1204 additions and 876 deletions

View File

@ -5,7 +5,7 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build configuration (Debug, Release,
project(pdf2htmlEX) project(pdf2htmlEX)
cmake_minimum_required(VERSION 2.6.0 FATAL_ERROR) cmake_minimum_required(VERSION 2.6.0 FATAL_ERROR)
include_directories(${CMAKE_SOURCE_DIR}/src/include) include_directories(${CMAKE_SOURCE_DIR}/src)
set(PDF2HTMLEX_VERSION "0.6") set(PDF2HTMLEX_VERSION "0.6")
set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION}) set(ARCHIVE_NAME pdf2htmlex-${PDF2HTMLEX_VERSION})
@ -138,36 +138,52 @@ if(NOT CXX0X_SUPPORT)
endif() endif()
configure_file (${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h.in ${CMAKE_SOURCE_DIR}/src/include/pdf2htmlEX-config.h) configure_file (${CMAKE_SOURCE_DIR}/src/pdf2htmlEX-config.h.in ${CMAKE_SOURCE_DIR}/src/pdf2htmlEX-config.h)
configure_file (${CMAKE_SOURCE_DIR}/pdf2htmlEX.1.in ${CMAKE_SOURCE_DIR}/pdf2htmlEX.1) configure_file (${CMAKE_SOURCE_DIR}/pdf2htmlEX.1.in ${CMAKE_SOURCE_DIR}/pdf2htmlEX.1)
add_executable(pdf2htmlEX add_executable(pdf2htmlEX
src/Param.h
src/pdf2htmlEX-config.h
src/pdf2htmlEX.cc src/pdf2htmlEX.cc
src/include/HTMLRenderer.h src/HTMLRenderer/HTMLRenderer.h
src/HTMLRenderer/general.cc
src/HTMLRenderer/state.cc
src/HTMLRenderer/install.cc
src/HTMLRenderer/export.cc
src/HTMLRenderer/text.cc
src/HTMLRenderer/image.cc
src/HTMLRenderer/draw.cc src/HTMLRenderer/draw.cc
src/HTMLRenderer/export.cc
src/HTMLRenderer/general.cc
src/HTMLRenderer/image.cc
src/HTMLRenderer/install.cc
src/HTMLRenderer/TextLineBuffer.h
src/HTMLRenderer/TextLineBuffer.cc
src/HTMLRenderer/link.cc src/HTMLRenderer/link.cc
src/include/namespace.h src/HTMLRenderer/state.cc
src/HTMLRenderer/LineBuffer.cc src/HTMLRenderer/text.cc
src/include/ffw.h src/BackgroundRenderer/BackgroundRenderer.h
src/ffw.c src/BackgroundRenderer/SplashBackgroundRenderer.h
src/include/BackgroundRenderer.h src/BackgroundRenderer/SplashBackgroundRenderer.cc
src/include/SplashBackgroundRenderer.h src/BackgroundRenderer/CairoBackgroundRenderer.h
src/SplashBackgroundRenderer.cc src/BackgroundRenderer/CairoBackgroundRenderer.cc
src/include/CairoBackgroundRenderer.h src/util/ArgParser.h
src/CairoBackgroundRenderer.cc src/util/ArgParser.cc
src/include/Preprocessor.h src/util/base64stream.h
src/Preprocessor.cc src/util/base64stream.cc
src/include/util.h src/util/const.h
src/util.cc src/util/const.cc
src/include/ArgParser.h src/util/ffw.h
src/ArgParser.cc src/util/ffw.c
src/include/pdf2htmlEX-config.h src/util/math.h
src/util/math.cc
src/util/misc.h
src/util/misc.cc
src/util/namespace.h
src/util/path.h
src/util/path.cc
src/util/Preprocessor.h
src/util/Preprocessor.cc
src/util/StringFormatter.h
src/util/StringFormatter.cc
src/util/TmpFiles.h
src/util/TmpFiles.cc
src/util/unicode.h
src/util/unicode.cc
) )
target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS}) target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS})

View File

@ -1,5 +1,10 @@
Latest v0.6 Latest v0.6
* New parameter: --use-cropbox
* Progress indicator
* Create a glyph for ' ' when missing
* Code refining
v0.5 v0.5
2012.10.06 2012.10.06

View File

@ -20,24 +20,34 @@ It is optimized for modern web browsers.On Linux/Mac, the generated HTML pages c
This program is designed for scientific papers with complicate formulas and figures, therefore precise rendering is the #1 concern. But of course general PDF files are also supported. This program is designed for scientific papers with complicate formulas and figures, therefore precise rendering is the #1 concern. But of course general PDF files are also supported.
### Why HTML ?
HTML, together with CSS and Javascript, is much more open and flexible than PDF. Almost everything can be customized.
- Embedding documents to web pages with consistent theme and behavior
- Cross-references to other documents are much easier and more intuitive
- More functions to the document with Javascript, e.g. access control, animation, statistics
Readers benefit as well:
- Read while downloading
- Plugin-free
## Features ## Features
* Single HTML file output * Optional single HTML file output
* Precise rendering * Precise rendering
* Text Selection * Text perserved - you can select & copy & paste
* Font embedding & reencoding for Web * Proper styling
* Proper styling (Color, Transformation...) - Font - extracted and reencoded
- Color
- Transformation
* Links * Links
* Optimization for Web
* [EXPERIMENTAL] Path drawing with CSS * [EXPERIMENTAL] Path drawing with CSS
* Orthogonal lines - Orthogonal lines
* Rectangles - Rectangles
* Linear gradients - Linear gradients
* Not fully supported, and rendered as images
### Objects rendered as images - Type 3 fonts
- Non-text object
* Type 3 fonts
* Non-text object
## Get started ## Get started
@ -99,12 +109,6 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u
man pdf2htmlEX man pdf2htmlEX
### For Geeks
* Experimental and unsupported
pdf2htmlEX --process-nontext 0 --css-draw 1 /path/to/foobar.pdf
## FAQ ## FAQ
* [Troubleshooting compilation errors](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-compile) * [Troubleshooting compilation errors](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-compile)
@ -113,11 +117,6 @@ More info can be found on [the pdf2htmlEX page in TeX Wiki](http://oku.edu.mie-u
* [I want more features](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-feature_commission) * [I want more features](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-feature_commission)
* [More](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ) * [More](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ)
**WINDOWS XP USERS: Please make sure ClearType is turned on**
(Control Panel -> Display -> Appearance -> Effects -> "Use the following method to smooth edges of screen fonts" -> ClearType)
## LICENSE ## LICENSE
GPLv2 & GPLv3 Dual licensed GPLv2 & GPLv3 Dual licensed
@ -161,4 +160,5 @@ pdf2htmlEX is inspired by the following projects:
### Special Thanks ### Special Thanks
* Hongliang Tian <tatetian@gmail.com> * Hongliang Tian <tatetian@gmail.com>
* Wanmin Liu <wanminliu@gmail.com>

21
TODO
View File

@ -1,30 +1,37 @@
<<<<<<< HEAD
manually locate font if fixed name manually locate font if fixed name
word space/offset before the first letter (calendar pdf) word space/offset before the first letter (calendar pdf)
don't dump image when there is nothing don't dump image when there is nothing
=======
>>>>>>> master
Integrate splash/cairo word space/offset before the first letter (calendar pdf)
native support for image
native support for draw
draw non-orthogonal lines with CSS minimum line width of css drawing
position history stack (popstate)
==Wait until someone asks== ==Wait until someone asks==
position history stack (popstate)
draw non-orthogonal lines with CSS
try harder finding glyph names (using fontforge) for CID Type 0 try harder finding glyph names (using fontforge) for CID Type 0
rename single-html -> embed-font/image/css ... rename single-html -> embed-font/image/css ...
merge sub/sup into one line merge sub/sup into one line
precise link dest: zoom precise link dest: zoom
multiple charcode mapped to a same glyph multiple charcode mapped to a same glyph
don't dump image when there is nothing
==Future== ==Future==
Integrate splash/cairo
native support for image
native support for draw
type 3 fonts
combine lines (unwarp)
argument auto-completion argument auto-completion
use absolute positioning for long whitespace use absolute positioning for long whitespace
color invert color invert
detect duplicate base fonts when embedding detect duplicate base fonts when embedding
disable selection if we know unicode is wrong disable selection if we know unicode is wrong
combine lines (unwarp)
Printing
check if we can add information to the font, and let browsers show ligatures automatically check if we can add information to the font, and let browsers show ligatures automatically
Printing

13
debian/changelog vendored
View File

@ -1,3 +1,16 @@
pdf2htmlex (0.6-1~git201212182148rd76af-0ubuntu1) quantal; urgency=low
* fix dependency of poppler for quantal
*
-- WANG Lu <coolwanglu@gmail.com> Tue, 18 Dec 2012 21:48:35 +0800
pdf2htmlex (0.6-1~git201212111844rd76af-0ubuntu1) quantal; urgency=low
* Package for quantal
-- WANG Lu <coolwanglu@gmail.com> Tue, 11 Dec 2012 18:44:44 +0800
pdf2htmlex (0.6-1~git201210070052rcb9a8-0ubuntu1) precise; urgency=low pdf2htmlex (0.6-1~git201210070052rcb9a8-0ubuntu1) precise; urgency=low
* New version * New version

2
debian/control vendored
View File

@ -8,6 +8,6 @@ Homepage: http://github.com/coolwanglu/pdf2htmlEX
Package: pdf2htmlex Package: pdf2htmlex
Architecture: any Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}, libpoppler27 (>= 0.20.3), libboost-filesystem-dev, libboost-program-options-dev, libpng12-0, libfontforge1 Depends: ${shlibs:Depends}, ${misc:Depends}, libpoppler27 (>= 0.20.3) | libpoppler28, libboost-filesystem-dev, libboost-program-options-dev, libpng12-0, libfontforge1
Description: Converts PDF to HTML without losing format Description: Converts PDF to HTML without losing format
pdf2htmlEX converts PDF to HTML while retaining text, format & style as much as possible pdf2htmlEX converts PDF to HTML while retaining text, format & style as much as possible

View File

@ -54,9 +54,12 @@ If multiple values are specified, the minimum one will be used.
If none is specified, pages will be rendered as 72DPI. If none is specified, pages will be rendered as 72DPI.
.TP .TP
.B --hpdi <dpi>, --vpdi <dpi> (Default: 144) .B --hdpi <dpi>, --vdpi <dpi> (Default: 144)
Specify the horizontal and vertical DPI for images Specify the horizontal and vertical DPI for images
.TP .TP
.B --use-cropbox <0|1> (Default: 0)
Use CropBox instead of MediaBox for output.
.TP
.B --process-nontext <0|1> (Default: 1) .B --process-nontext <0|1> (Default: 1)
Whether to process non-text objects (as images) Whether to process non-text objects (as images)
.TP .TP
@ -110,7 +113,7 @@ If this value is set to 1, the ToUnicode Map is always applied, if provided in P
If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste. If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste.
If set to 0, pdf2htmlEX would try it best to balance the two methods above. If set to 0, pdf2htmlEX would try its best to balance the two methods above.
.TP .TP
.B --space-as-offset <0|1> (Default: 0) .B --space-as-offset <0|1> (Default: 0)
Treat space characters as offsets, which may increase the size of the output. Treat space characters as offsets, which may increase the size of the output.
@ -118,13 +121,13 @@ Treat space characters as offsets, which may increase the size of the output.
Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. Turn it on if space characters are not displayed correctly, or you want to remove positional spaces.
.TP .TP
.B --stretch-narrow-glyph <0|1> (Default: 0) .B --stretch-narrow-glyph <0|1> (Default: 0)
If set to 1, glyphs narrower than described in PDF will be strecth; otherwise space will be padded to the right of the glyphs If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs
.TP .TP
.B --squeeze_wide_glyph <0|1> (Default: 1) .B --squeeze-wide-glyph <0|1> (Default: 1)
If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated.
.TP .TP
.B --remove-unused-glyph <0|1> (Default: 1) .B --remove-unused-glyph <0|1> (Default: 1)
[Experimental] If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size. If set to 1, remove unused glyphs in embedded fonts in order to reduce the file size.
.TP .TP
.B --font-suffix <suffix> (Default: .ttf), --font-format <format> (Default: truetype) .B --font-suffix <suffix> (Default: .ttf), --font-format <format> (Default: truetype)
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.

View File

@ -63,6 +63,8 @@
span { span {
position:relative; position:relative;
vertical-align: baseline; vertical-align: baseline;
/* _<id> for spaces may need display:inline, which will override this */
display:inline-block;
} }
._ { ._ {
color:transparent; color:transparent;
@ -74,9 +76,6 @@ span {
::-moz-selection{ ::-moz-selection{
background: rgba(127,255,255,1); background: rgba(127,255,255,1);
} }
.i {
position:absolute;
}
.j { .j {
display:none; display:none;
} }

View File

@ -19,7 +19,8 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
double originX, double originY, double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen) CharCode code, int nBytes, Unicode *u, int uLen)
{ {
if((state->getRender() & 3) == 3) if(((state->getRender() & 3) == 3)
|| ((state->getFont()) && (state->getFont()->getWMode())))
{ {
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen); SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code, nBytes, u, uLen);
} }
@ -32,7 +33,9 @@ static GBool annot_cb(Annot *, void *) {
void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno, const string & filename) void SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno, const string & filename)
{ {
doc->displayPage(this, pageno, param->h_dpi, param->v_dpi, doc->displayPage(this, pageno, param->h_dpi, param->v_dpi,
0, true, false, false, 0,
(param->use_cropbox == 0),
false, false,
nullptr, nullptr, &annot_cb, nullptr); nullptr, nullptr, &annot_cb, nullptr);
getBitmap()->writeImgFile(splashFormatPng, getBitmap()->writeImgFile(splashFormatPng,

View File

@ -15,8 +15,8 @@
#include <splash/SplashBitmap.h> #include <splash/SplashBitmap.h>
#include <SplashOutputDev.h> #include <SplashOutputDev.h>
#include "HTMLRenderer.h"
#include "Param.h" #include "Param.h"
#include "HTMLRenderer/HTMLRenderer.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {

View File

@ -25,8 +25,10 @@
#include <Annot.h> #include <Annot.h>
#include "Param.h" #include "Param.h"
#include "util.h" #include "util/Preprocessor.h"
#include "Preprocessor.h" #include "util/const.h"
#include "util/StringFormatter.h"
#include "util/TmpFiles.h"
/* /*
* Naming Convention * Naming Convention
@ -38,7 +40,6 @@
* b - page Box * b - page Box
* d - page Decoration * d - page Decoration
* l - Line * l - Line
* i - Image
* j - Js data * j - Js data
* p - Page * p - Page
* *
@ -60,6 +61,51 @@
namespace pdf2htmlEX { namespace pdf2htmlEX {
// we may need more info of a font in the future
// Plain record describing one font known to the renderer.
class FontInfo
{
public:
// Numeric identifier of the font; the CSS exporters emit it as the class name ".f<id>"
// and as the font-family name "f<id>".
long long id;
// NOTE(review): presumably records whether the PDF's ToUnicode map is used for this font - confirm at the assignment site.
bool use_tounicode;
// NOTE(review): presumably the font's em size in font units - confirm at the assignment site.
int em_size;
// Vertical metrics; the CSS exporters write (ascent - descent) as the font's line-height.
double ascent, descent;
};
// Hash functor for GfxRGB values.
// Each channel is reduced to a byte with colToByte() and the three bytes are
// packed as 0xRRGGBB.
class GfxRGB_hash
{
public:
    size_t operator () (const GfxRGB & rgb) const
    {
        // Keep the exact types colToByte() returns so the shifts are
        // evaluated the same way as in a single combined expression.
        const auto red   = colToByte(rgb.r);
        const auto green = colToByte(rgb.g);
        const auto blue  = colToByte(rgb.b);

        return (red << 16) | (green << 8) | (blue);
    }
};
class GfxRGB_equal
{
public:
bool operator ()(const GfxRGB & rgb1, const GfxRGB & rgb2) const
{
return ((rgb1.r == rgb2.r) && (rgb1.g == rgb2.g) && (rgb1.b == rgb1.b));
}
};
// Ordering functor for Matrix values.
// Compares the first four elements in index order with a tolerance of EPS;
// the first element that differs by more than EPS decides the result.
// Matrices whose first four elements are all within EPS are not "less".
class Matrix_less
{
public:
    bool operator () (const Matrix & m1, const Matrix & m2) const
    {
        // Note that we only care about the first 4 elements
        for(int idx = 0; idx != 4; ++idx)
        {
            const bool clearly_below = (m1.m[idx] < m2.m[idx] - EPS);
            const bool clearly_above = (m1.m[idx] > m2.m[idx] + EPS);

            // Decided as soon as one element is outside the tolerance band.
            if(clearly_below || clearly_above)
                return clearly_below;
        }
        return false;
    }
};
class HTMLRenderer : public OutputDev class HTMLRenderer : public OutputDev
{ {
public: public:
@ -154,10 +200,8 @@ class HTMLRenderer : public OutputDev
void post_process(); void post_process();
// set flags // set flags
void fix_stream (std::ostream & out); void set_stream_flags (std::ostream & out);
void add_tmp_file (const std::string & fn);
void clean_tmp_files ();
std::string dump_embedded_font (GfxFont * font, long long fn_id); std::string dump_embedded_font (GfxFont * font, long long fn_id);
void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false); void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false);
@ -335,82 +379,20 @@ class HTMLRenderer : public OutputDev
double draw_tx, draw_ty; double draw_tx, draw_ty;
// some metrics have to be determined after all elements in the lines have been seen // some metrics have to be determined after all elements in the lines have been seen
class LineBuffer { class TextLineBuffer;
public: friend class TextLineBuffer;
LineBuffer (HTMLRenderer * renderer) : renderer(renderer) { } TextLineBuffer * text_line_buf;
class State {
public:
void begin(std::ostream & out, const State * prev_state);
void end(std::ostream & out) const;
void hash(void);
int diff(const State & s) const;
enum {
FONT_ID,
FONT_SIZE_ID,
COLOR_ID,
LETTER_SPACE_ID,
WORD_SPACE_ID,
RISE_ID,
ID_COUNT
};
long long ids[ID_COUNT];
double ascent;
double descent;
double draw_font_size;
size_t start_idx; // index of the first Text using this state
// for optimzation
long long hash_value;
bool need_close;
static const char * format_str; // class names for each id
};
class Offset {
public:
size_t start_idx; // should put this idx before text[start_idx];
double width;
};
void reset(GfxState * state);
void append_unicodes(const Unicode * u, int l);
void append_offset(double width);
void append_state(void);
void flush(void);
private:
// retrieve state from renderer
void set_state(State & state);
HTMLRenderer * renderer;
double x, y;
long long tm_id;
std::vector<State> states;
std::vector<Offset> offsets;
std::vector<Unicode> text;
// for flush
std::vector<State*> stack;
} line_buf;
friend class LineBuffer;
// for font reencoding // for font reencoding
int32_t * cur_mapping; int32_t * cur_mapping;
char ** cur_mapping2; char ** cur_mapping2;
int * width_list; int * width_list;
Preprocessor preprocessor; Preprocessor preprocessor;
TmpFiles tmp_files;
// for string formatting // for string formatting
string_formatter str_fmt; StringFormatter str_fmt;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// styles & resources // styles & resources
@ -426,12 +408,9 @@ class HTMLRenderer : public OutputDev
std::map<double, long long> rise_map; std::map<double, long long> rise_map;
std::map<double, long long> height_map; std::map<double, long long> height_map;
int image_count;
const Param * param; const Param * param;
std::ofstream html_fout, css_fout; std::ofstream html_fout, css_fout;
std::string html_path, css_path; std::string html_path, css_path;
std::set<std::string> tmp_files;
static const std::string MANIFEST_FILENAME; static const std::string MANIFEST_FILENAME;
}; };

View File

@ -1,5 +1,5 @@
/* /*
* LineBuffer.cc * TextLineBuffer.cc
* *
* Generate and optimized HTML for one line * Generate and optimized HTML for one line
* *
@ -10,7 +10,10 @@
#include <vector> #include <vector>
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "namespace.h" #include "TextLineBuffer.h"
#include "util/namespace.h"
#include "util/unicode.h"
#include "util/math.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -18,19 +21,21 @@ using std::min;
using std::max; using std::max;
using std::vector; using std::vector;
using std::ostream; using std::ostream;
using std::cerr;
using std::endl;
void HTMLRenderer::LineBuffer::reset(GfxState * state) void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
{ {
state->transform(state->getCurX(), state->getCurY(), &x, &y); state->transform(state->getCurX(), state->getCurY(), &x, &y);
tm_id = renderer->cur_ttm_id; tm_id = renderer->cur_ttm_id;
} }
void HTMLRenderer::LineBuffer::append_unicodes(const Unicode * u, int l) void HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l)
{ {
text.insert(text.end(), u, u+l); text.insert(text.end(), u, u+l);
} }
void HTMLRenderer::LineBuffer::append_offset(double width) void HTMLRenderer::TextLineBuffer::append_offset(double width)
{ {
if((!offsets.empty()) && (offsets.back().start_idx == text.size())) if((!offsets.empty()) && (offsets.back().start_idx == text.size()))
offsets.back().width += width; offsets.back().width += width;
@ -38,7 +43,7 @@ void HTMLRenderer::LineBuffer::append_offset(double width)
offsets.push_back(Offset({text.size(), width})); offsets.push_back(Offset({text.size(), width}));
} }
void HTMLRenderer::LineBuffer::append_state(void) void HTMLRenderer::TextLineBuffer::append_state(void)
{ {
if(states.empty() || (states.back().start_idx != text.size())) if(states.empty() || (states.back().start_idx != text.size()))
{ {
@ -49,10 +54,10 @@ void HTMLRenderer::LineBuffer::append_state(void)
set_state(states.back()); set_state(states.back());
} }
void HTMLRenderer::LineBuffer::flush(void) void HTMLRenderer::TextLineBuffer::flush(void)
{ {
/* /*
* Each Line is an independent absolute positioined block * Each Line is an independent absolute positioned block
* so even we have a few states or offsets, we may omit them * so even we have a few states or offsets, we may omit them
*/ */
if(text.empty()) return; if(text.empty()) return;
@ -80,8 +85,8 @@ void HTMLRenderer::LineBuffer::flush(void)
ostream & out = renderer->html_fout; ostream & out = renderer->html_fout;
out << "<div style=\"left:" out << "<div style=\"left:"
<< _round(x) << "px;bottom:" << round(x) << "px;bottom:"
<< _round(y) << "px;" << round(y) << "px;"
<< "\"" << "\""
<< " class=\"l t" << tm_id << " class=\"l t" << tm_id
<< " h" << renderer->install_height(max_ascent) << " h" << renderer->install_height(max_ascent)
@ -177,7 +182,7 @@ void HTMLRenderer::LineBuffer::flush(void)
} }
void HTMLRenderer::LineBuffer::set_state (State & state) void HTMLRenderer::TextLineBuffer::set_state (State & state)
{ {
state.ids[State::FONT_ID] = renderer->cur_font_info->id; state.ids[State::FONT_ID] = renderer->cur_font_info->id;
state.ids[State::FONT_SIZE_ID] = renderer->cur_fs_id; state.ids[State::FONT_SIZE_ID] = renderer->cur_fs_id;
@ -192,7 +197,7 @@ void HTMLRenderer::LineBuffer::set_state (State & state)
state.draw_font_size = renderer->draw_font_size; state.draw_font_size = renderer->draw_font_size;
} }
void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state) void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
{ {
bool first = true; bool first = true;
for(int i = 0; i < ID_COUNT; ++i) for(int i = 0; i < ID_COUNT; ++i)
@ -225,13 +230,13 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s
} }
} }
void HTMLRenderer::LineBuffer::State::end(ostream & out) const void HTMLRenderer::TextLineBuffer::State::end(ostream & out) const
{ {
if(need_close) if(need_close)
out << "</span>"; out << "</span>";
} }
void HTMLRenderer::LineBuffer::State::hash(void) void HTMLRenderer::TextLineBuffer::State::hash(void)
{ {
hash_value = 0; hash_value = 0;
for(int i = 0; i < ID_COUNT; ++i) for(int i = 0; i < ID_COUNT; ++i)
@ -240,7 +245,7 @@ void HTMLRenderer::LineBuffer::State::hash(void)
} }
} }
int HTMLRenderer::LineBuffer::State::diff(const State & s) const int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
{ {
/* /*
* A quick check based on hash_value * A quick check based on hash_value
@ -256,5 +261,5 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const
return d; return d;
} }
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr"; const char * HTMLRenderer::TextLineBuffer::State::format_str = "fsclwr";
} //namespace pdf2htmlEX } //namespace pdf2htmlEX

View File

@ -0,0 +1,78 @@
#ifndef TEXTLINEBUFFER_H__
#define TEXTLINEBUFFER_H__
#include <iostream>
#include <vector>
namespace pdf2htmlEX {
class HTMLRenderer;
// Buffers the text, style states and horizontal offsets of one text line and
// writes the line out as HTML when flushed.
// NOTE(review): this defines a nested class of HTMLRenderer, so the full
// HTMLRenderer definition (and the GfxState / Unicode types) must already be
// visible where this header is included - confirm the include order in each user.
class HTMLRenderer::TextLineBuffer
{
public:
// Remembers the owning renderer; current styles are read from it later.
TextLineBuffer (HTMLRenderer * renderer) : renderer(renderer) { }
// One run of text sharing the same set of style ids.
class State {
public:
// Writes the opening markup for this state to out; prev_state is the previously opened state, if any.
void begin(std::ostream & out, const State * prev_state);
// Writes "</span>" to out when need_close is set.
void end(std::ostream & out) const;
// Recomputes hash_value from ids.
void hash(void);
// Returns a difference measure between this state and s; hash_value is used for a quick check first.
int diff(const State & s) const;
// Indices into ids[]; ID_COUNT is the number of tracked ids.
enum {
FONT_ID,
FONT_SIZE_ID,
COLOR_ID,
LETTER_SPACE_ID,
WORD_SPACE_ID,
RISE_ID,
ID_COUNT
};
// Style ids copied from the renderer, indexed by the enum above.
long long ids[ID_COUNT];
double ascent;
double descent;
double draw_font_size;
size_t start_idx; // index of the first Text using this state
// for optimization
long long hash_value;
// Whether end() has to emit a closing tag for this state.
bool need_close;
static const char * format_str; // class names for each id
};
// A horizontal gap to be inserted into the line.
class Offset {
public:
size_t start_idx; // should put this idx before text[start_idx];
double width;
};
// Records the transformed current position of state as the line origin and the renderer's current transform-matrix id.
void reset(GfxState * state);
// Appends the l code points starting at u to the buffered text.
void append_unicodes(const Unicode * u, int l);
// Adds an offset at the current end of the text; consecutive offsets at the same position are merged by summing their widths.
void append_offset(double width);
// Captures the renderer's current style as the state for text appended from here on.
void append_state(void);
// Writes the buffered line to the renderer's HTML output; does nothing when no text is buffered.
void flush(void);
private:
// retrieve state from renderer
void set_state(State & state);
HTMLRenderer * renderer;
// Line origin, set by reset().
double x, y;
// Transform-matrix id, set by reset().
long long tm_id;
std::vector<State> states;
std::vector<Offset> offsets;
std::vector<Unicode> text;
// for flush
std::vector<State*> stack;
};
} // namespace pdf2htmlEX
#endif //TEXTLINEBUFFER_H__

View File

@ -14,8 +14,9 @@
#include <iostream> #include <iostream>
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "util.h" #include "util/misc.h"
#include "namespace.h" #include "util/math.h"
#include "util/namespace.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -33,36 +34,36 @@ static bool is_horizontal_line(GfxSubpath * path)
{ {
return ((path->getNumPoints() == 2) return ((path->getNumPoints() == 2)
&& (!path->getCurve(1)) && (!path->getCurve(1))
&& (_equal(path->getY(0), path->getY(1)))); && (equal(path->getY(0), path->getY(1))));
} }
static bool is_vertical_line(GfxSubpath * path) static bool is_vertical_line(GfxSubpath * path)
{ {
return ((path->getNumPoints() == 2) return ((path->getNumPoints() == 2)
&& (!path->getCurve(1)) && (!path->getCurve(1))
&& (_equal(path->getX(0), path->getX(1)))); && (equal(path->getX(0), path->getX(1))));
} }
static bool is_rectangle(GfxSubpath * path) static bool is_rectangle(GfxSubpath * path)
{ {
if (!(((path->getNumPoints() != 4) && (path->isClosed())) if (!(((path->getNumPoints() != 4) && (path->isClosed()))
|| ((path->getNumPoints() == 5) || ((path->getNumPoints() == 5)
&& _equal(path->getX(0), path->getX(4)) && equal(path->getX(0), path->getX(4))
&& _equal(path->getY(0), path->getY(4))))) && equal(path->getY(0), path->getY(4)))))
return false; return false;
for(int i = 1; i < path->getNumPoints(); ++i) for(int i = 1; i < path->getNumPoints(); ++i)
if(path->getCurve(i)) if(path->getCurve(i))
return false; return false;
return (_equal(path->getY(0), path->getY(1)) return (equal(path->getY(0), path->getY(1))
&& _equal(path->getX(1), path->getX(2)) && equal(path->getX(1), path->getX(2))
&& _equal(path->getY(2), path->getY(3)) && equal(path->getY(2), path->getY(3))
&& _equal(path->getX(3), path->getX(0))) && equal(path->getX(3), path->getX(0)))
|| (_equal(path->getX(0), path->getX(1)) || (equal(path->getX(0), path->getX(1))
&& _equal(path->getY(1), path->getY(2)) && equal(path->getY(1), path->getY(2))
&& _equal(path->getX(2), path->getX(3)) && equal(path->getX(2), path->getX(3))
&& _equal(path->getY(3), path->getY(0))); && equal(path->getY(3), path->getY(0)));
} }
static void get_shading_bbox(GfxState * state, GfxShading * shading, static void get_shading_bbox(GfxState * state, GfxShading * shading,
@ -105,7 +106,7 @@ static void get_shading_bbox(GfxState * state, GfxShading * shading,
*/ */
static double get_angle(double dx, double dy) static double get_angle(double dx, double dy)
{ {
double r = _hypot(dx, dy); double r = hypot(dx, dy);
/* /*
* acos always returns [0, pi] * acos always returns [0, pi]
@ -208,10 +209,10 @@ void LinearGradient::dumpto (ostream & out)
auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"};
for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter)
{ {
out << "background-image:" << (*iter) << "linear-gradient(" << _round(angle) << "rad"; out << "background-image:" << (*iter) << "linear-gradient(" << round(angle) << "rad";
for(auto iter2 = stops.begin(); iter2 != stops.end(); ++iter2) for(auto iter2 = stops.begin(); iter2 != stops.end(); ++iter2)
{ {
out << "," << (iter2->rgb) << " " << _round((iter2->pos) * 100) << "%"; out << "," << (iter2->rgb) << " " << round((iter2->pos) * 100) << "%";
} }
out << ");"; out << ");";
} }
@ -318,7 +319,7 @@ bool HTMLRenderer::css_do_path(GfxState *state, bool fill, bool test_only)
GfxRGB * ps = fill ? nullptr : (&stroke_color); GfxRGB * ps = fill ? nullptr : (&stroke_color);
GfxRGB * pf = fill ? (&fill_color) : nullptr; GfxRGB * pf = fill ? (&fill_color) : nullptr;
if(_equal(h, 0) || _equal(w, 0)) if(equal(h, 0) || equal(w, 0))
{ {
// orthogonal line // orthogonal line
@ -351,7 +352,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
double new_tm[6]; double new_tm[6];
memcpy(new_tm, tm, sizeof(new_tm)); memcpy(new_tm, tm, sizeof(new_tm));
_tm_transform(new_tm, x, y); tm_transform(new_tm, x, y);
double scale = 1.0; double scale = 1.0;
{ {
@ -359,8 +360,8 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
double i1 = (new_tm[0] + new_tm[2]) / sqrt2; double i1 = (new_tm[0] + new_tm[2]) / sqrt2;
double i2 = (new_tm[1] + new_tm[3]) / sqrt2; double i2 = (new_tm[1] + new_tm[3]) / sqrt2;
scale = _hypot(i1, i2); scale = hypot(i1, i2);
if(_is_positive(scale)) if(is_positive(scale))
{ {
for(int i = 0; i < 4; ++i) for(int i = 0; i < 4; ++i)
new_tm[i] /= scale; new_tm[i] /= scale;
@ -383,8 +384,8 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
if(i > 0) html_fout << ' '; if(i > 0) html_fout << ' ';
double lw = line_width_array[i] * scale; double lw = line_width_array[i] * scale;
html_fout << _round(lw); html_fout << round(lw);
if(_is_positive(lw)) html_fout << "px"; if(is_positive(lw)) html_fout << "px";
} }
html_fout << ";"; html_fout << ";";
} }
@ -407,10 +408,10 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
style_function(style_function_data, html_fout); style_function(style_function_data, html_fout);
} }
html_fout << "bottom:" << _round(y) << "px;" html_fout << "bottom:" << round(y) << "px;"
<< "left:" << _round(x) << "px;" << "left:" << round(x) << "px;"
<< "width:" << _round(w * scale) << "px;" << "width:" << round(w * scale) << "px;"
<< "height:" << _round(h * scale) << "px;"; << "height:" << round(h * scale) << "px;";
html_fout << "\"></div>"; html_fout << "\"></div>";
} }

View File

@ -11,7 +11,10 @@
#include <cctype> #include <cctype>
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "namespace.h" #include "util/namespace.h"
#include "util/base64stream.h"
#include "util/math.h"
#include "util/misc.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -38,7 +41,7 @@ void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suff
css_fout << ")format(\"" << fontfileformat css_fout << ")format(\"" << fontfileformat
<< "\");}.f" << info.id << "\");}.f" << info.id
<< "{font-family:f" << info.id << "{font-family:f" << info.id
<< ";line-height:" << _round(info.ascent - info.descent) << ";line-height:" << round(info.ascent - info.descent)
<< ";font-style:normal;font-weight:normal;}"; << ";font-style:normal;font-weight:normal;}";
css_fout << endl; css_fout << endl;
@ -81,14 +84,14 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons
else else
css_fout << "font-style:normal;"; css_fout << "font-style:normal;";
css_fout << "line-height:" << _round(info.ascent - info.descent) << ";"; css_fout << "line-height:" << round(info.ascent - info.descent) << ";";
css_fout << "}" << endl; css_fout << "}" << endl;
} }
void HTMLRenderer::export_font_size (long long fs_id, double font_size) void HTMLRenderer::export_font_size (long long fs_id, double font_size)
{ {
css_fout << ".s" << fs_id << "{font-size:" << _round(font_size) << "px;}" << endl; css_fout << ".s" << fs_id << "{font-size:" << round(font_size) << "px;}" << endl;
} }
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
@ -99,7 +102,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
// we have already shifted the origin // we have already shifted the origin
// TODO: recognize common matrices // TODO: recognize common matrices
if(_tm_equal(tm, id_matrix, 4)) if(tm_equal(tm, ID_MATRIX, 4))
{ {
auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"}; auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"};
for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter)
@ -112,10 +115,10 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
{ {
// PDF use a different coordinate system from Web // PDF use a different coordinate system from Web
css_fout << *iter << "transform:matrix(" css_fout << *iter << "transform:matrix("
<< _round(tm[0]) << ',' << round(tm[0]) << ','
<< _round(-tm[1]) << ',' << round(-tm[1]) << ','
<< _round(-tm[2]) << ',' << round(-tm[2]) << ','
<< _round(tm[3]) << ','; << round(tm[3]) << ',';
css_fout << "0,0);"; css_fout << "0,0);";
} }
@ -125,12 +128,12 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space) void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
{ {
css_fout << ".l" << ls_id << "{letter-spacing:" << _round(letter_space) << "px;}" << endl; css_fout << ".l" << ls_id << "{letter-spacing:" << round(letter_space) << "px;}" << endl;
} }
void HTMLRenderer::export_word_space (long long ws_id, double word_space) void HTMLRenderer::export_word_space (long long ws_id, double word_space)
{ {
css_fout << ".w" << ws_id << "{word-spacing:" << _round(word_space) << "px;}" << endl; css_fout << ".w" << ws_id << "{word-spacing:" << round(word_space) << "px;}" << endl;
} }
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
@ -141,19 +144,19 @@ void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
{ {
if(ws_width > 0) if(ws_width > 0)
css_fout << "._" << ws_id << "{display:inline-block;width:" << _round(ws_width) << "px;}" << endl; css_fout << "._" << ws_id << "{display:inline-block;width:" << round(ws_width) << "px;}" << endl;
else else
css_fout << "._" << ws_id << "{display:inline;margin-left:" << _round(ws_width) << "px;}" << endl; css_fout << "._" << ws_id << "{display:inline;margin-left:" << round(ws_width) << "px;}" << endl;
} }
void HTMLRenderer::export_rise (long long rise_id, double rise) void HTMLRenderer::export_rise (long long rise_id, double rise)
{ {
css_fout << ".r" << rise_id << "{top:" << _round(-rise) << "px;}" << endl; css_fout << ".r" << rise_id << "{top:" << round(-rise) << "px;}" << endl;
} }
void HTMLRenderer::export_height (long long height_id, double height) void HTMLRenderer::export_height (long long height_id, double height)
{ {
css_fout << ".h" << height_id << "{height:" << _round(height) << "px;}" << endl; css_fout << ".h" << height_id << "{height:" << round(height) << "px;}" << endl;
} }
} }

View File

@ -14,10 +14,14 @@
#include <vector> #include <vector>
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "BackgroundRenderer.h" #include "TextLineBuffer.h"
#include "namespace.h"
#include "ffw.h"
#include "pdf2htmlEX-config.h" #include "pdf2htmlEX-config.h"
#include "BackgroundRenderer/BackgroundRenderer.h"
#include "util/namespace.h"
#include "util/ffw.h"
#include "util/base64stream.h"
#include "util/math.h"
#include "util/path.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -28,6 +32,8 @@ using std::max;
using std::min_element; using std::min_element;
using std::vector; using std::vector;
using std::abs; using std::abs;
using std::cerr;
using std::endl;
static void dummy(void *, enum ErrorCategory, int pos, char *) static void dummy(void *, enum ErrorCategory, int pos, char *)
{ {
@ -36,9 +42,9 @@ static void dummy(void *, enum ErrorCategory, int pos, char *)
HTMLRenderer::HTMLRenderer(const Param * param) HTMLRenderer::HTMLRenderer(const Param * param)
:OutputDev() :OutputDev()
,line_opened(false) ,line_opened(false)
,line_buf(this) ,text_line_buf(new TextLineBuffer(this))
,preprocessor(param) ,preprocessor(param)
,image_count(0) ,tmp_files(*param)
,param(param) ,param(param)
{ {
if(!(param->debug)) if(!(param->debug))
@ -55,8 +61,8 @@ HTMLRenderer::HTMLRenderer(const Param * param)
HTMLRenderer::~HTMLRenderer() HTMLRenderer::~HTMLRenderer()
{ {
delete text_line_buf;
ffw_finalize(); ffw_finalize();
clean_tmp_files();
delete [] cur_mapping; delete [] cur_mapping;
delete [] cur_mapping2; delete [] cur_mapping2;
delete [] width_list; delete [] width_list;
@ -76,7 +82,7 @@ void HTMLRenderer::process(PDFDoc *doc)
bg_renderer->startDoc(doc); bg_renderer->startDoc(doc);
} }
int page_count = (param->last_page - param->first_page); int page_count = (param->last_page - param->first_page + 1);
for(int i = param->first_page; i <= param->last_page ; ++i) for(int i = param->first_page; i <= param->last_page ; ++i)
{ {
cerr << "Working: " << (i-param->first_page) << "/" << page_count << '\r' << flush; cerr << "Working: " << (i-param->first_page) << "/" << page_count << '\r' << flush;
@ -87,21 +93,23 @@ void HTMLRenderer::process(PDFDoc *doc)
html_fout.open((char*)page_fn, ofstream::binary); html_fout.open((char*)page_fn, ofstream::binary);
if(!html_fout) if(!html_fout)
throw string("Cannot open ") + (char*)page_fn + " for writing"; throw string("Cannot open ") + (char*)page_fn + " for writing";
fix_stream(html_fout); set_stream_flags(html_fout);
} }
if(param->process_nontext) if(param->process_nontext)
{ {
auto fn = str_fmt("%s/p%x.png", (param->single_html ? param->tmp_dir : param->dest_dir).c_str(), i); auto fn = str_fmt("%s/p%x.png", (param->single_html ? param->tmp_dir : param->dest_dir).c_str(), i);
if(param->single_html) if(param->single_html)
add_tmp_file((char*)fn); tmp_files.add((char*)fn);
bg_renderer->render_page(doc, i, (char*)fn); bg_renderer->render_page(doc, i, (char*)fn);
} }
doc->displayPage(this, i, doc->displayPage(this, i,
text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
0, true, false, false, 0,
(param->use_cropbox == 0),
false, false,
nullptr, nullptr, nullptr, nullptr); nullptr, nullptr, nullptr, nullptr);
if(param->split_pages) if(param->split_pages)
@ -170,8 +178,8 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
cur_font_size = draw_font_size = 0; cur_font_size = draw_font_size = 0;
cur_fs_id = install_font_size(cur_font_size); cur_fs_id = install_font_size(cur_font_size);
memcpy(cur_text_tm, id_matrix, sizeof(cur_text_tm)); memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm));
memcpy(draw_text_tm, id_matrix, sizeof(draw_text_tm)); memcpy(draw_text_tm, ID_MATRIX, sizeof(draw_text_tm));
cur_ttm_id = install_transform_matrix(draw_text_tm); cur_ttm_id = install_transform_matrix(draw_text_tm);
cur_letter_space = cur_word_space = 0; cur_letter_space = cur_word_space = 0;
@ -210,7 +218,7 @@ void HTMLRenderer::endPage() {
for(int i = 0; i < 6; ++i) for(int i = 0; i < 6; ++i)
{ {
if(i > 0) html_fout << ","; if(i > 0) html_fout << ",";
html_fout << _round(default_ctm[i]); html_fout << round(default_ctm[i]);
} }
html_fout << "]"; html_fout << "]";
@ -232,17 +240,17 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
vector<double> zoom_factors; vector<double> zoom_factors;
if(_is_positive(param->zoom)) if(is_positive(param->zoom))
{ {
zoom_factors.push_back(param->zoom); zoom_factors.push_back(param->zoom);
} }
if(_is_positive(param->fit_width)) if(is_positive(param->fit_width))
{ {
zoom_factors.push_back((param->fit_width) / preprocessor.get_max_width()); zoom_factors.push_back((param->fit_width) / preprocessor.get_max_width());
} }
if(_is_positive(param->fit_height)) if(is_positive(param->fit_height))
{ {
zoom_factors.push_back((param->fit_height) / preprocessor.get_max_height()); zoom_factors.push_back((param->fit_height) / preprocessor.get_max_height());
} }
@ -280,13 +288,13 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
: str_fmt("%s/%s", param->dest_dir.c_str(), param->css_filename.c_str()); : str_fmt("%s/%s", param->dest_dir.c_str(), param->css_filename.c_str());
if(param->single_html && (!param->split_pages)) if(param->single_html && (!param->split_pages))
add_tmp_file((char*)fn); tmp_files.add((char*)fn);
css_path = (char*)fn, css_path = (char*)fn,
css_fout.open(css_path, ofstream::binary); css_fout.open(css_path, ofstream::binary);
if(!css_fout) if(!css_fout)
throw string("Cannot open ") + (char*)fn + " for writing"; throw string("Cannot open ") + (char*)fn + " for writing";
fix_stream(css_fout); set_stream_flags(css_fout);
} }
// if split-pages is specified, open & close the file in the process loop // if split-pages is specified, open & close the file in the process loop
@ -301,13 +309,13 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
* Otherwise just generate it * Otherwise just generate it
*/ */
auto fn = str_fmt("%s/__pages", param->tmp_dir.c_str()); auto fn = str_fmt("%s/__pages", param->tmp_dir.c_str());
add_tmp_file((char*)fn); tmp_files.add((char*)fn);
html_path = (char*)fn; html_path = (char*)fn;
html_fout.open(html_path, ofstream::binary); html_fout.open(html_path, ofstream::binary);
if(!html_fout) if(!html_fout)
throw string("Cannot open ") + (char*)fn + " for writing"; throw string("Cannot open ") + (char*)fn + " for writing";
fix_stream(html_fout); set_stream_flags(html_fout);
} }
} }
@ -327,7 +335,7 @@ void HTMLRenderer::post_process()
output.open((char*)fn, ofstream::binary); output.open((char*)fn, ofstream::binary);
if(!output) if(!output)
throw string("Cannot open ") + (char*)fn + " for writing"; throw string("Cannot open ") + (char*)fn + " for writing";
fix_stream(output); set_stream_flags(output);
} }
// apply manifest // apply manifest
@ -385,40 +393,13 @@ void HTMLRenderer::post_process()
} }
} }
void HTMLRenderer::fix_stream (std::ostream & out) void HTMLRenderer::set_stream_flags(std::ostream & out)
{ {
// we output all ID's in hex // we output all ID's in hex
// browsers are not happy with scientific notations // browsers are not happy with scientific notations
out << hex << fixed; out << hex << fixed;
} }
/*
 * Remember fn as a temporary file to be deleted later.
 * Nothing is recorded when param->clean_tmp is off.
 * In debug mode a message is printed the first time a given name is added.
 */
void HTMLRenderer::add_tmp_file(const string & fn)
{
    if(param->clean_tmp)
    {
        // insert() reports whether the name was newly added
        const bool newly_added = tmp_files.insert(fn).second;
        if(newly_added && param->debug)
            cerr << "Add new temporary file: " << fn << endl;
    }
}
/*
 * Delete every recorded temporary file, then the temporary directory itself.
 * Nothing is done when param->clean_tmp is off.
 * The results of remove() are not checked; in debug mode a message is
 * printed for every attempted removal.
 */
void HTMLRenderer::clean_tmp_files()
{
    if(param->clean_tmp)
    {
        auto cur = tmp_files.begin();
        const auto last = tmp_files.end();
        while(cur != last)
        {
            remove(cur->c_str());
            if(param->debug)
                cerr << "Remove temporary file: " << *cur << endl;
            ++cur;
        }

        // the directory is removed last, after the files recorded in it
        remove(param->tmp_dir.c_str());
        if(param->debug)
            cerr << "Remove temporary directory: " << param->tmp_dir << endl;
    }
}
void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy) void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy)
{ {
string fn = get_filename(path); string fn = get_filename(path);

View File

@ -8,7 +8,7 @@
*/ */
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "namespace.h" #include "util/namespace.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {

View File

@ -15,12 +15,15 @@
#include "Param.h" #include "Param.h"
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "namespace.h" #include "util/namespace.h"
#include "util.h" #include "util/math.h"
#include "util/misc.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
using std::abs; using std::abs;
using std::cerr;
using std::endl;
const FontInfo * HTMLRenderer::install_font(GfxFont * font) const FontInfo * HTMLRenderer::install_font(GfxFont * font)
{ {
@ -203,7 +206,7 @@ void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info)
long long HTMLRenderer::install_font_size(double font_size) long long HTMLRenderer::install_font_size(double font_size)
{ {
auto iter = font_size_map.lower_bound(font_size - EPS); auto iter = font_size_map.lower_bound(font_size - EPS);
if((iter != font_size_map.end()) && (_equal(iter->first, font_size))) if((iter != font_size_map.end()) && (equal(iter->first, font_size)))
return iter->second; return iter->second;
long long new_fs_id = font_size_map.size(); long long new_fs_id = font_size_map.size();
@ -218,7 +221,7 @@ long long HTMLRenderer::install_transform_matrix(const double * tm)
memcpy(m.m, tm, sizeof(m.m)); memcpy(m.m, tm, sizeof(m.m));
auto iter = transform_matrix_map.lower_bound(m); auto iter = transform_matrix_map.lower_bound(m);
if((iter != transform_matrix_map.end()) && (_tm_equal(m.m, iter->first.m, 4))) if((iter != transform_matrix_map.end()) && (tm_equal(m.m, iter->first.m, 4)))
return iter->second; return iter->second;
long long new_tm_id = transform_matrix_map.size(); long long new_tm_id = transform_matrix_map.size();
@ -230,7 +233,7 @@ long long HTMLRenderer::install_transform_matrix(const double * tm)
long long HTMLRenderer::install_letter_space(double letter_space) long long HTMLRenderer::install_letter_space(double letter_space)
{ {
auto iter = letter_space_map.lower_bound(letter_space - EPS); auto iter = letter_space_map.lower_bound(letter_space - EPS);
if((iter != letter_space_map.end()) && (_equal(iter->first, letter_space))) if((iter != letter_space_map.end()) && (equal(iter->first, letter_space)))
return iter->second; return iter->second;
long long new_ls_id = letter_space_map.size(); long long new_ls_id = letter_space_map.size();
@ -242,7 +245,7 @@ long long HTMLRenderer::install_letter_space(double letter_space)
long long HTMLRenderer::install_word_space(double word_space) long long HTMLRenderer::install_word_space(double word_space)
{ {
auto iter = word_space_map.lower_bound(word_space - EPS); auto iter = word_space_map.lower_bound(word_space - EPS);
if((iter != word_space_map.end()) && (_equal(iter->first, word_space))) if((iter != word_space_map.end()) && (equal(iter->first, word_space)))
return iter->second; return iter->second;
long long new_ws_id = word_space_map.size(); long long new_ws_id = word_space_map.size();

View File

@ -11,16 +11,20 @@
#include <sstream> #include <sstream>
#include <algorithm> #include <algorithm>
#include <HTMLRenderer.h>
#include <Link.h> #include <Link.h>
#include "namespace.h" #include "HTMLRenderer.h"
#include "util/namespace.h"
#include "util/math.h"
#include "util/misc.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
using std::ostringstream; using std::ostringstream;
using std::min; using std::min;
using std::max; using std::max;
using std::cerr;
using std::endl;
/* /*
* The detailed rectangle area of the link destination * The detailed rectangle area of the link destination
@ -211,9 +215,9 @@ void HTMLRenderer::processLink(AnnotLink * al)
border_top_bottom_width, border_left_right_width); border_top_bottom_width, border_left_right_width);
if(abs(border_top_bottom_width - border_left_right_width) < EPS) if(abs(border_top_bottom_width - border_left_right_width) < EPS)
html_fout << "border-width:" << _round(border_top_bottom_width) << "px;"; html_fout << "border-width:" << round(border_top_bottom_width) << "px;";
else else
html_fout << "border-width:" << _round(border_top_bottom_width) << "px " << _round(border_left_right_width) << "px;"; html_fout << "border-width:" << round(border_top_bottom_width) << "px " << round(border_left_right_width) << "px;";
} }
auto style = border->getStyle(); auto style = border->getStyle();
switch(style) switch(style)
@ -267,13 +271,13 @@ void HTMLRenderer::processLink(AnnotLink * al)
html_fout << "border-style:none;"; html_fout << "border-style:none;";
} }
_tm_transform(default_ctm, x, y); tm_transform(default_ctm, x, y);
html_fout << "position:absolute;" html_fout << "position:absolute;"
<< "left:" << _round(x) << "px;" << "left:" << round(x) << "px;"
<< "bottom:" << _round(y) << "px;" << "bottom:" << round(y) << "px;"
<< "width:" << _round(w) << "px;" << "width:" << round(w) << "px;"
<< "height:" << _round(h) << "px;"; << "height:" << round(h) << "px;";
// fix for IE // fix for IE
html_fout << "background-color:rgba(255,255,255,0.000001);"; html_fout << "background-color:rgba(255,255,255,0.000001);";

View File

@ -16,8 +16,9 @@
#include <algorithm> #include <algorithm>
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "namespace.h" #include "TextLineBuffer.h"
#include "util.h" #include "util/namespace.h"
#include "util/math.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -104,7 +105,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
} }
double new_font_size = state->getFontSize(); double new_font_size = state->getFontSize();
if(!_equal(cur_font_size, new_font_size)) if(!equal(cur_font_size, new_font_size))
{ {
need_rescale_font = true; need_rescale_font = true;
cur_font_size = new_font_size; cur_font_size = new_font_size;
@ -132,7 +133,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
new_ctm[5] = m1[1] * m2[4] + m1[3] * m2[5] + m1[5]; new_ctm[5] = m1[1] * m2[4] + m1[3] * m2[5] + m1[5];
//new_ctm[4] = new_ctm[5] = 0; //new_ctm[4] = new_ctm[5] = 0;
if(!_tm_equal(new_ctm, cur_text_tm)) if(!tm_equal(new_ctm, cur_text_tm))
{ {
need_recheck_position = true; need_recheck_position = true;
need_rescale_font = true; need_rescale_font = true;
@ -147,10 +148,10 @@ void HTMLRenderer::check_state_change(GfxState * state)
double new_draw_text_tm[6]; double new_draw_text_tm[6];
memcpy(new_draw_text_tm, cur_text_tm, sizeof(new_draw_text_tm)); memcpy(new_draw_text_tm, cur_text_tm, sizeof(new_draw_text_tm));
double new_draw_text_scale = 1.0/text_scale_factor2 * _hypot(new_draw_text_tm[2], new_draw_text_tm[3]); double new_draw_text_scale = 1.0/text_scale_factor2 * hypot(new_draw_text_tm[2], new_draw_text_tm[3]);
double new_draw_font_size = cur_font_size; double new_draw_font_size = cur_font_size;
if(_is_positive(new_draw_text_scale)) if(is_positive(new_draw_text_scale))
{ {
new_draw_font_size *= new_draw_text_scale; new_draw_font_size *= new_draw_text_scale;
for(int i = 0; i < 4; ++i) for(int i = 0; i < 4; ++i)
@ -161,19 +162,28 @@ void HTMLRenderer::check_state_change(GfxState * state)
new_draw_text_scale = 1.0; new_draw_text_scale = 1.0;
} }
if(!(_equal(new_draw_text_scale, draw_text_scale))) if(!is_positive(new_draw_font_size))
{
// Page is flipped and css can't handle it.
new_draw_font_size = -new_draw_font_size;
for(int i = 0; i < 4; ++i)
new_draw_text_tm[i] *= -1;
}
if(!(equal(new_draw_text_scale, draw_text_scale)))
{ {
draw_text_scale_changed = true; draw_text_scale_changed = true;
draw_text_scale = new_draw_text_scale; draw_text_scale = new_draw_text_scale;
} }
if(!(_equal(new_draw_font_size, draw_font_size))) if(!(equal(new_draw_font_size, draw_font_size)))
{ {
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN); new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
draw_font_size = new_draw_font_size; draw_font_size = new_draw_font_size;
cur_fs_id = install_font_size(draw_font_size); cur_fs_id = install_font_size(draw_font_size);
} }
if(!(_tm_equal(new_draw_text_tm, draw_text_tm, 4))) if(!(tm_equal(new_draw_text_tm, draw_text_tm, 4)))
{ {
new_line_state = max<NewLineState>(new_line_state, NLS_DIV); new_line_state = max<NewLineState>(new_line_state, NLS_DIV);
memcpy(draw_text_tm, new_draw_text_tm, sizeof(draw_text_tm)); memcpy(draw_text_tm, new_draw_text_tm, sizeof(draw_text_tm));
@ -199,21 +209,21 @@ void HTMLRenderer::check_state_change(GfxState * state)
*/ */
bool merged = false; bool merged = false;
if(_tm_equal(old_ctm, cur_text_tm, 4)) if(tm_equal(old_ctm, cur_text_tm, 4))
{ {
double dy = cur_ty - draw_ty; double dy = cur_ty - draw_ty;
double tdx = old_ctm[4] - cur_text_tm[4] - cur_text_tm[2] * dy; double tdx = old_ctm[4] - cur_text_tm[4] - cur_text_tm[2] * dy;
double tdy = old_ctm[5] - cur_text_tm[5] - cur_text_tm[3] * dy; double tdy = old_ctm[5] - cur_text_tm[5] - cur_text_tm[3] * dy;
if(_equal(cur_text_tm[0] * tdy, cur_text_tm[1] * tdx)) if(equal(cur_text_tm[0] * tdy, cur_text_tm[1] * tdx))
{ {
if(_is_positive(cur_text_tm[0])) if(is_positive(cur_text_tm[0]))
{ {
draw_tx += tdx / cur_text_tm[0]; draw_tx += tdx / cur_text_tm[0];
draw_ty += dy; draw_ty += dy;
merged = true; merged = true;
} }
else if (_is_positive(cur_text_tm[1])) else if (is_positive(cur_text_tm[1]))
{ {
draw_tx += tdy / cur_text_tm[1]; draw_tx += tdy / cur_text_tm[1];
draw_ty += dy; draw_ty += dy;
@ -221,7 +231,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
} }
else else
{ {
if((_equal(tdx,0)) && (_equal(tdy,0))) if((equal(tdx,0)) && (equal(tdy,0)))
{ {
// free // free
draw_tx = cur_tx; draw_tx = cur_tx;
@ -246,7 +256,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(all_changed || letter_space_changed || draw_text_scale_changed) if(all_changed || letter_space_changed || draw_text_scale_changed)
{ {
double new_letter_space = state->getCharSpace(); double new_letter_space = state->getCharSpace();
if(!_equal(cur_letter_space, new_letter_space)) if(!equal(cur_letter_space, new_letter_space))
{ {
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN); new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_letter_space = new_letter_space; cur_letter_space = new_letter_space;
@ -259,7 +269,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(all_changed || word_space_changed || draw_text_scale_changed) if(all_changed || word_space_changed || draw_text_scale_changed)
{ {
double new_word_space = state->getWordSpace(); double new_word_space = state->getWordSpace();
if(!_equal(cur_word_space, new_word_space)) if(!equal(cur_word_space, new_word_space))
{ {
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN); new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_word_space = new_word_space; cur_word_space = new_word_space;
@ -294,7 +304,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(all_changed || rise_changed || draw_text_scale_changed) if(all_changed || rise_changed || draw_text_scale_changed)
{ {
double new_rise = state->getRise(); double new_rise = state->getRise();
if(!_equal(cur_rise, new_rise)) if(!equal(cur_rise, new_rise))
{ {
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN); new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
cur_rise = new_rise; cur_rise = new_rise;
@ -333,7 +343,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
{ {
close_text_line(); close_text_line();
line_buf.reset(state); text_line_buf->reset(state);
//resync position //resync position
draw_ty = cur_ty; draw_ty = cur_ty;
@ -350,14 +360,14 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
} }
else else
{ {
line_buf.append_offset(target); text_line_buf->append_offset(target);
draw_tx += target / draw_text_scale; draw_tx += target / draw_text_scale;
} }
} }
if(new_line_state != NLS_NONE) if(new_line_state != NLS_NONE)
{ {
line_buf.append_state(); text_line_buf->append_state();
} }
line_opened = true; line_opened = true;
@ -368,7 +378,7 @@ void HTMLRenderer::close_text_line()
if(line_opened) if(line_opened)
{ {
line_opened = false; line_opened = false;
line_buf.flush(); text_line_buf->flush();
} }
} }

View File

@ -15,9 +15,14 @@
#include <CharCodeToUnicode.h> #include <CharCodeToUnicode.h>
#include <fofi/FoFiTrueType.h> #include <fofi/FoFiTrueType.h>
#include "ffw.h"
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "namespace.h" #include "TextLineBuffer.h"
#include "util/ffw.h"
#include "util/namespace.h"
#include "util/unicode.h"
#include "util/path.h"
#include "util/math.h"
#include "util/misc.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -26,6 +31,8 @@ using std::min;
using std::all_of; using std::all_of;
using std::floor; using std::floor;
using std::swap; using std::swap;
using std::cerr;
using std::endl;
string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
{ {
@ -127,7 +134,7 @@ string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
obj.streamReset(); obj.streamReset();
filepath = (char*)str_fmt("%s/f%llx%s", param->tmp_dir.c_str(), fn_id, suffix.c_str()); filepath = (char*)str_fmt("%s/f%llx%s", param->tmp_dir.c_str(), fn_id, suffix.c_str());
add_tmp_file(filepath); tmp_files.add(filepath);
ofstream outf(filepath, ofstream::binary); ofstream outf(filepath, ofstream::binary);
if(!outf) if(!outf)
@ -171,7 +178,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
if(param->debug) if(param->debug)
{ {
auto fn = str_fmt("%s/__raw_font_%lld", param->tmp_dir.c_str(), info.id, param->font_suffix.c_str()); auto fn = str_fmt("%s/__raw_font_%lld", param->tmp_dir.c_str(), info.id, param->font_suffix.c_str());
add_tmp_file((char*)fn); tmp_files.add((char*)fn);
ofstream((char*)fn, ofstream::binary) << ifstream(filepath).rdbuf(); ofstream((char*)fn, ofstream::binary) << ifstream(filepath).rdbuf();
} }
@ -374,7 +381,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
// in auto mode, just drop the tounicode map // in auto mode, just drop the tounicode map
if(!retried) if(!retried)
{ {
cerr << "ToUnicode CMap is not valid and got dropped" << endl; cerr << "ToUnicode CMap is not valid and got dropped for font: " << hex << info.id << dec << endl;
retried = true; retried = true;
codeset.clear(); codeset.clear();
info.use_tounicode = false; info.use_tounicode = false;
@ -410,7 +417,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
ffw_reencode_raw(cur_mapping, max_key + 1, 1); ffw_reencode_raw(cur_mapping, max_key + 1, 1);
// we need the space chracter for offsets // we need the space character for offsets
if(!has_space) if(!has_space)
{ {
int space_width; int space_width;
@ -437,9 +444,9 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
* *
*/ */
string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1%s", param->tmp_dir.c_str(), param->font_suffix.c_str());
add_tmp_file(cur_tmp_fn); tmp_files.add(cur_tmp_fn);
string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2%s", param->tmp_dir.c_str(), param->font_suffix.c_str()); string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2%s", param->tmp_dir.c_str(), param->font_suffix.c_str());
add_tmp_file(other_tmp_fn); tmp_files.add(other_tmp_fn);
ffw_save(cur_tmp_fn.c_str()); ffw_save(cur_tmp_fn.c_str());
@ -482,7 +489,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
info.id, param->font_suffix.c_str()); info.id, param->font_suffix.c_str());
if(param->single_html) if(param->single_html)
add_tmp_file(fn); tmp_files.add(fn);
ffw_load_font(cur_tmp_fn.c_str()); ffw_load_font(cur_tmp_fn.c_str());
ffw_metric(&info.ascent, &info.descent); ffw_metric(&info.ascent, &info.descent);
@ -517,14 +524,6 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
char *p = s->getCString(); char *p = s->getCString();
int len = s->getLength(); int len = s->getLength();
//debug
{
if(strcmp(p, "ORTUG") == 0)
{
cerr << "DEBUG: " << (int)(state->getRender()) << endl;
}
}
double dx = 0; double dx = 0;
double dy = 0; double dy = 0;
double dxerr = 0; double dxerr = 0;
@ -538,10 +537,11 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
CharCode code; CharCode code;
Unicode *u = nullptr; Unicode *u = nullptr;
while (len > 0) { while (len > 0)
{
auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy);
if(!(_equal(ox, 0) && _equal(oy, 0))) if(!(equal(ox, 0) && equal(oy, 0)))
{ {
cerr << "TODO: non-zero origins" << endl; cerr << "TODO: non-zero origins" << endl;
} }
@ -556,25 +556,25 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
if(is_space && (param->space_as_offset)) if(is_space && (param->space_as_offset))
{ {
// ignore horiz_scaling, as it's merged in CTM // ignore horiz_scaling, as it's merged in CTM
line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); text_line_buf->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
} }
else else
{ {
if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
{ {
line_buf.append_unicodes(u, uLen); text_line_buf->append_unicodes(u, uLen);
} }
else else
{ {
if(cur_font_info->use_tounicode) if(cur_font_info->use_tounicode)
{ {
Unicode uu = check_unicode(u, uLen, code, font); Unicode uu = check_unicode(u, uLen, code, font);
line_buf.append_unicodes(&uu, 1); text_line_buf->append_unicodes(&uu, 1);
} }
else else
{ {
Unicode uu = unicode_from_font(code, font); Unicode uu = unicode_from_font(code, font);
line_buf.append_unicodes(&uu, 1); text_line_buf->append_unicodes(&uu, 1);
} }
} }
} }

View File

@ -28,6 +28,7 @@ struct Param
double zoom; double zoom;
double fit_width, fit_height; double fit_width, fit_height;
double h_dpi, v_dpi; double h_dpi, v_dpi;
int use_cropbox;
int process_nontext; int process_nontext;
int single_html; int single_html;

View File

@ -1,235 +0,0 @@
/*
* Constants & Misc functions
*
*
* by WangLu
* 2012.08.10
*/
#ifndef UTIL_H__
#define UTIL_H__
#include <cstdio>
#include <iostream>
#include <algorithm>
#include <cmath>
#include <vector>
#include <string>
#include <map>
#ifndef nullptr
#define nullptr (NULL)
#endif
namespace pdf2htmlEX {
static const double EPS = 1e-6;
extern const double id_matrix[6];
static const double DEFAULT_DPI = 72.0;
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
// map to embed files into html
// key: (suffix, if_embed_content)
// value: (prefix string, suffix string)
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
// Snap values whose magnitude does not exceed EPS to exactly 0.0;
// anything larger is returned unchanged.
static inline double _round(double x)
{
    if(std::abs(x) > EPS)
        return x;
    return 0.0;
}
// Tolerant comparison: true when x and y differ by less than EPS.
static inline bool _equal(double x, double y)
{
    const double diff = std::abs(x - y);
    return diff < EPS;
}
// True only when x is larger than EPS, i.e. positive beyond the tolerance.
static inline bool _is_positive(double x)
{
    return EPS < x;
}
// Compare the first `size` entries of two transform matrices using the
// tolerant _equal(); size defaults to all 6 entries.
// Returns true when every compared pair matches (also when size <= 0).
static inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6)
{
    int idx = 0;
    // advance while the entries agree; stop at the first mismatch
    while((idx < size) && _equal(tm1[idx], tm2[idx]))
        ++idx;
    return idx >= size;
}
// Euclidean length of the vector (x, y), computed as sqrt(x*x + y*y).
static inline double _hypot(double x, double y)
{
    const double sum_of_squares = x*x+y*y;
    return std::sqrt(sum_of_squares);
}
void _tm_transform(const double * tm, double & x, double & y, bool is_delta = false);
void _tm_multiply(double * tm_left, const double * tm_right);
// Pack an object reference into one integer key:
// num is shifted left by the bit width of gen, then gen is OR-ed in.
static inline long long hash_ref(const Ref * id)
{
    long long key = (long long)(id->num);
    key <<= (sizeof(id->gen)*8);
    key |= (id->gen);
    return key;
}
/*
 * Check whether a code point may be emitted into HTML
 * (control characters and surrogates are rejected)
 * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
 */
bool isLegalUnicode(Unicode u);
/*
 * Map a character code into the Unicode private use ranges, starting at U+E000
 */
Unicode map_to_private(CharCode code);
/*
 * Try to determine the Unicode value directly from the information in the font
 */
Unicode unicode_from_font (CharCode code, GfxFont * font);
/*
 * We have to use a single Unicode value to reencode fonts
 * if we got multi-unicode values, it might be expanded ligature, try to restore it
 * if we cannot figure it out at the end, use a private mapping
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
/*
 * Write uLen code points to out as UTF-8,
 * replacing & " ' < > with the corresponding HTML entities
 */
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
// Hash functor for GfxRGB keys: combines the colToByte() value of each channel
// as (r << 16) | (g << 8) | b
class GfxRGB_hash
{
public:
    size_t operator () (const GfxRGB & rgb) const
    {
        const auto red_part   = colToByte(rgb.r) << 16;
        const auto green_part = colToByte(rgb.g) << 8;
        const auto blue_part  = colToByte(rgb.b);
        return red_part | green_part | blue_part;
    }
};
class GfxRGB_equal
{
public:
bool operator ()(const GfxRGB & rgb1, const GfxRGB & rgb2) const
{
return ((rgb1.r == rgb2.r) && (rgb1.g == rgb2.g) && (rgb1.b == rgb1.b));
}
};
// we may need more info of a font in the future
// Plain record of per-font data; the fields are filled in elsewhere (not visible here)
class FontInfo
{
public:
    // font identifier -- presumably the value produced by hash_ref(); TODO confirm
    long long id;
    // presumably whether the font's ToUnicode information is used -- TODO confirm
    bool use_tounicode;
    // presumably the number of font units per em -- TODO confirm
    int em_size;
    // vertical metrics of the font; units are not visible from here
    double ascent, descent;
};
// Ordering functor for Matrix keys: lexicographic comparison with an EPS tolerance.
// NOTE(review): entries within EPS of each other are treated as equivalent, which is
// not transitive in general -- presumably acceptable for the keys used; confirm.
class Matrix_less
{
public:
    bool operator () (const Matrix & m1, const Matrix & m2) const
    {
        // Note that we only care about the first 4 elements
        int i = 0;
        while(i < 4)
        {
            const bool clearly_smaller = m1.m[i] < m2.m[i] - EPS;
            const bool clearly_larger  = m1.m[i] > m2.m[i] + EPS;
            // the first entry that differs by more than EPS decides the order
            if(clearly_smaller || clearly_larger)
                return clearly_smaller;
            ++i;
        }
        // all four entries are within tolerance: neither is less
        return false;
    }
};
/*
 * Wraps an input stream so that its remaining content
 * can be written to an output stream as Base64 text
 */
class base64stream
{
public:
    // only the address of the stream is kept, the caller must keep it alive
    base64stream(std::istream & in) : in(&in) { }
    base64stream(std::istream && in) : in(&in) { }

    // Read the input to its end and write the Base64 encoding to out
    std::ostream & dumpto(std::ostream & out)
    {
        unsigned char chunk[3];
        // complete groups: 3 input bytes -> 4 output characters
        while(in->read((char*)chunk, 3))
        {
            const int s0 = (chunk[0] & 0xfc)>>2;
            const int s1 = ((chunk[0] & 0x03)<<4) | ((chunk[1] & 0xf0)>>4);
            const int s2 = ((chunk[1] & 0x0f)<<2) | ((chunk[2] & 0xc0)>>6);
            const int s3 = (chunk[2] & 0x3f);
            out << base64_encoding[s0]
                << base64_encoding[s1]
                << base64_encoding[s2]
                << base64_encoding[s3];
        }

        // the failed read may still have delivered 1 or 2 trailing bytes
        const auto remaining = in->gcount();
        if(remaining <= 0)
            return out;

        // zero the missing bytes so the bit arithmetic below is well defined
        for(int pos = remaining; pos < 3; ++pos)
            chunk[pos] = 0;

        out << base64_encoding[(chunk[0] & 0xfc)>>2]
            << base64_encoding[((chunk[0] & 0x03)<<4) | ((chunk[1] & 0xf0)>>4)];
        // third character: real data for 2 trailing bytes, padding for 1
        out << ((remaining > 1) ? base64_encoding[(chunk[1] & 0x0f)<<2] : '=');
        // the fourth character of a partial group is always padding
        out << '=';
        return out;
    }
private:
    std::istream * in;
    static const char * base64_encoding;
};
// Stream insertion simply forwards to dumpto(), for both lvalue and temporary wrappers
static inline std::ostream & operator << (std::ostream & out, base64stream & bf)
{
    return bf.dumpto(out);
}
static inline std::ostream & operator << (std::ostream & out, base64stream && bf)
{
    return bf.dumpto(out);
}
/*
 * printf-style formatter that renders into one reusable, growing buffer.
 *
 * operator() returns a guarded_pointer that converts to char*; while any
 * guarded_pointer is alive the buffer counts as "in use", and formatting
 * again asserts.
 */
class string_formatter
{
public:
    class guarded_pointer
    {
    public:
        guarded_pointer(string_formatter * sf) : sf(sf) { ++(sf->buf_cnt); }
        // Copies must be counted too: every copy runs the destructor, so an
        // uncounted copy would drive buf_cnt below zero.
        guarded_pointer(const guarded_pointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); }
        ~guarded_pointer(void) { --(sf->buf_cnt); }
        operator char* () { return &(sf->buf.front()); }
    private:
        string_formatter * sf;
    };

    // The buffer is sized (not merely reserved): front() and writes through it
    // are only defined for elements that actually exist in the vector.
    string_formatter() : buf_cnt(0) { buf.resize(L_tmpnam); }
    /*
     * Important:
     * there is only one buffer, so new strings will replace old ones
     */
    guarded_pointer operator () (const char * format, ...) {
        assert((buf_cnt == 0) && "string_formatter: buffer is reused!");

        va_list vlist;
        va_start(vlist, format);
        int l = vsnprintf(&buf.front(), buf.size(), format, vlist);
        va_end(vlist);
        if(l >= (int)buf.size())
        {
            // output was truncated: grow to at least l+1 (text + terminator)
            // and format again with a fresh va_list
            buf.resize(std::max<long>((long)(l+1), (long)buf.size() * 2));
            va_start(vlist, format);
            l = vsnprintf(&buf.front(), buf.size(), format, vlist);
            va_end(vlist);
        }
        assert(l >= 0); // we should fail when vsnprintf fail
        assert(l < (int)buf.size());
        return guarded_pointer(this);
    }
private:
    friend class guarded_pointer;
    std::vector<char> buf;
    // number of live guarded_pointer objects referring to buf
    int buf_cnt;
};
// Create the directory `path` together with all missing parent directories
void create_directories(std::string path);
// True for the suffixes ".ttf", ".ttc" and ".otf"
bool is_truetype_suffix(const std::string & suffix);
// The part of `path` after the last '/'
std::string get_filename(const std::string & path);
// The lower-cased extension of the file name, including the leading '.', or "" if there is none
std::string get_suffix(const std::string & path);
/*
 * In PDF, edges of the rectangle are in the middle of the borders
 * In HTML, edges are completely outside the rectangle
 *
 * Inputs:  the corners (x1, y1) - (x2, y2) and the border width
 * Outputs: position (x, y), size (w, h) and the two border widths to use in CSS
 */
void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2,
double border_width,
double & x, double & y, double & w, double & h,
double & border_top_bottom_width,
double & border_left_right_width);
// Write the colour as "rgb(r,g,b)" with decimal channel values
std::ostream & operator << (std::ostream & out, const GfxRGB & rgb);
} // namespace pdf2htmlEX
#endif //UTIL_H__

View File

@ -19,10 +19,11 @@
#include <PDFDocFactory.h> #include <PDFDocFactory.h>
#include <GlobalParams.h> #include <GlobalParams.h>
#include "HTMLRenderer.h"
#include "Param.h" #include "Param.h"
#include "pdf2htmlEX-config.h" #include "pdf2htmlEX-config.h"
#include "ArgParser.h" #include "HTMLRenderer/HTMLRenderer.h"
#include "util/ArgParser.h"
#include "util/path.h"
using namespace std; using namespace std;
using namespace pdf2htmlEX; using namespace pdf2htmlEX;
@ -66,6 +67,7 @@ void parse_options (int argc, char **argv)
.add("fit-height", &param.fit_height, 0, "fit height", nullptr, true) .add("fit-height", &param.fit_height, 0, "fit height", nullptr, true)
.add("hdpi", &param.h_dpi, 144.0, "horizontal DPI for non-text") .add("hdpi", &param.h_dpi, 144.0, "horizontal DPI for non-text")
.add("vdpi", &param.v_dpi, 144.0, "vertical DPI for non-text") .add("vdpi", &param.v_dpi, 144.0, "vertical DPI for non-text")
.add("use-cropbox", &param.use_cropbox, 0, "use CropBox instead of MediaBox")
.add("process-nontext", &param.process_nontext, 1, "process nontext objects") .add("process-nontext", &param.process_nontext, 1, "process nontext objects")
.add("single-html", &param.single_html, 1, "combine everything into one single HTML file") .add("single-html", &param.single_html, 1, "combine everything into one single HTML file")

View File

@ -1,322 +0,0 @@
/*
* Misc functions
*
*
* by WangLu
* 2012.08.10
*/
#include <errno.h>
#include <cctype>
#include <GfxState.h>
#include <GfxFont.h>
#include <CharTypes.h>
#include <GlobalParams.h>
#include <Object.h>
// for mkdir
#include <sys/stat.h>
#include <sys/types.h>
#include "util.h"
using std::cerr;
using std::endl;
using std::string;
using std::map;
using std::ostream;
namespace pdf2htmlEX {
// Identity transformation matrix: unit scale, no rotation/shear, no translation
const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
// PDF base 14 font family name -> CSS font-family list used when the font is not embedded.
// The Times entry used to name the non-existent family "Time New Roman",
// so that fallback could never match an installed font.
const map<string, string> BASE_14_FONT_CSS_FONT_MAP({
    { "Courier", "Courier,monospace" },
    { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    { "Times", "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol", "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
});
// Font names given as raw GB-encoded byte strings -> ASCII font names
const map<string, string> GB_ENCODED_FONT_NAME_MAP({
{"\xCB\xCE\xCC\xE5", "SimSun"},
{"\xBA\xDA\xCC\xE5", "SimHei"},
{"\xBF\xAC\xCC\xE5_GB2312", "SimKai"},
{"\xB7\xC2\xCB\xCE_GB2312", "SimFang"},
{"\xC1\xA5\xCA\xE9", "SimLi"},
});
// HTML snippets used to pull a file into the page
// key: (file suffix, embed content?) -- 0 references the file by URL, 1 inlines its content
// value: (text written before, text written after) the URL or the content
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({
{{".css", 0}, {"<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>"}},
{{".css", 1}, {"<style type=\"text/css\">", "</style>"}},
{{".js", 0}, {"<script type=\"text/javascript\" src=\"", "\"></script>"}},
{{".js", 1}, {"<script type=\"text/javascript\">", "</script>"}}
});
/*
 * Transform (x, y) in place by the 6-element matrix tm.
 * The linear part tm[0..3] is always applied; the translation tm[4], tm[5]
 * is skipped when is_delta is true.
 */
void _tm_transform(const double * tm, double & x, double & y, bool is_delta)
{
    // keep the inputs, both outputs depend on both of them
    const double in_x = x;
    const double in_y = y;

    x = tm[0] * in_x + tm[2] * in_y;
    y = tm[1] * in_x + tm[3] * in_y;

    if(is_delta)
        return;

    x += tm[4];
    y += tm[5];
}
/*
 * Combine tm_right into tm_left, storing the result in tm_left.
 * Both arguments are 6-element matrices.
 */
void _tm_multiply(double * tm_left, const double * tm_right)
{
    // snapshot of the linear part of tm_left, which is overwritten below
    const double a = tm_left[0];
    const double b = tm_left[1];
    const double c = tm_left[2];
    const double d = tm_left[3];

    tm_left[0] = a * tm_right[0] + c * tm_right[1];
    tm_left[1] = b * tm_right[0] + d * tm_right[1];
    tm_left[2] = a * tm_right[2] + c * tm_right[3];
    tm_left[3] = b * tm_right[2] + d * tm_right[3];
    // the translation accumulates on top of the existing one
    tm_left[4] += a * tm_right[4] + c * tm_right[5];
    tm_left[5] += b * tm_right[4] + d * tm_right[5];
}
/*
 * A code point is rejected when it is <= 31, within 127..159,
 * or within the surrogate range 0xD800..0xDFFF.
 * Note that TAB (9), LF (10) and CR (13) are rejected as well.
 */
bool isLegalUnicode(Unicode u)
{
    const bool low_control    = (u <= 31);
    const bool del_or_high    = (u >= 127) && (u <= 159);
    const bool surrogate_half = (u >= 0xd800) && (u <= 0xdfff);
    return !(low_control || del_or_high || surrogate_half);
}
/*
 * Map a character code to a private use code point.
 * Codes are placed from U+E000 upwards; values past U+F8FF continue
 * above 0xF0000, and values past 0xFFFFD continue above 0x100000.
 * A warning is printed when even that range (up to 0x10FFFD) is exceeded.
 */
Unicode map_to_private(CharCode code)
{
    Unicode mapped = (Unicode)(code + 0xE000);
    if(mapped <= 0xF8FF)
        return mapped;

    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
    if(mapped <= 0xFFFFD)
        return mapped;

    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
    if(mapped > 0x10FFFD)
        cerr << "Warning: all private use unicode are used" << endl;

    return mapped;
}
/*
 * For a non-CID font, look up the glyph name of `code` and translate the name
 * to Unicode; the result is used only when it is legal for HTML.
 * In every other case the code is mapped to a private use code point.
 * NOTE(review): the dynamic_cast result is dereferenced unchecked --
 * presumably every non-CID font is a Gfx8BitFont; confirm.
 */
Unicode unicode_from_font (CharCode code, GfxFont * font)
{
    if(font->isCIDFont())
        return map_to_private(code);

    char * glyph_name = dynamic_cast<Gfx8BitFont*>(font)->getCharName(code);
    // may be untranslated ligature
    if(glyph_name)
    {
        Unicode from_name = globalParams->mapNameToUnicode(glyph_name);
        if(isLegalUnicode(from_name))
            return from_name;
    }

    return map_to_private(code);
}
/*
 * Reduce the Unicode sequence of one character to a single value:
 *  - no value at all        -> private mapping of the code
 *  - exactly one legal value -> that value
 *  - anything else          -> ask the font (unicode_from_font)
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
{
    if(len == 0)
        return map_to_private(code);

    const bool single_legal_value = (len == 1) && isLegalUnicode(*u);
    return single_legal_value ? *u : unicode_from_font(code, font);
}
/*
 * Copied from UTF.h / UTF8.h in poppler
 *
 * Encode u as UTF-8 into buf.
 * Returns the number of bytes written (1..4), or 0 when u is above 0x10FFFF
 * or buf (bufSize bytes) is too small; nothing is written in that case.
 */
static int mapUTF8(Unicode u, char *buf, int bufSize) {
    // how many bytes does this code point need?
    int needed;
    if (u <= 0x0000007f) {
        needed = 1;
    } else if (u <= 0x000007ff) {
        needed = 2;
    } else if (u <= 0x0000ffff) {
        needed = 3;
    } else if (u <= 0x0010ffff) {
        needed = 4;
    } else {
        return 0;
    }

    if (bufSize < needed) {
        return 0;
    }

    switch (needed) {
    case 1:
        buf[0] = (char)u;
        break;
    case 2:
        buf[0] = (char)(0xc0 + (u >> 6));
        buf[1] = (char)(0x80 + (u & 0x3f));
        break;
    case 3:
        buf[0] = (char)(0xe0 + (u >> 12));
        buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
        buf[2] = (char)(0x80 + (u & 0x3f));
        break;
    default:
        buf[0] = (char)(0xf0 + (u >> 18));
        buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
        buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
        buf[3] = (char)(0x80 + (u & 0x3f));
        break;
    }
    return needed;
}
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
for(int i = 0; i < uLen; ++i)
{
switch(u[i])
{
case '&':
out << "&amp;";
break;
case '\"':
out << "&quot;";
break;
case '\'':
out << "&apos;";
break;
case '<':
out << "&lt;";
break;
case '>':
out << "&gt;";
break;
default:
{
char buf[4];
auto n = mapUTF8(u[i], buf, 4);
out.write(buf, n);
}
}
}
}
// Base64 alphabet: the character at index i encodes the 6-bit value i
const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/*
 * Create `path` and every missing parent directory (mode S_IRWXU).
 * An already existing directory is accepted silently.
 * Throws a std::string describing the path when creation fails.
 */
void create_directories(string path)
{
    if(path.empty())
        return;

    // make sure the parent chain exists before creating the leaf
    const size_t last_slash = path.rfind('/');
    if(last_slash != string::npos)
        create_directories(path.substr(0, last_slash));

    if(mkdir(path.c_str(), S_IRWXU) == 0)
        return;

    // mkdir failed: the only tolerated reason is "already there, and a directory"
    if(errno == EEXIST)
    {
        struct stat info;
        const bool is_directory = (stat(path.c_str(), &info) == 0) && S_ISDIR(info.st_mode);
        if(is_directory)
            return;
    }

    throw string("Cannot create directory: ") + path;
}
// True when suffix is exactly ".ttf", ".ttc" or ".otf" (case sensitive, dot included)
bool is_truetype_suffix(const string & suffix)
{
    static const char * const accepted[] = { ".ttf", ".ttc", ".otf" };
    for(int i = 0; i < 3; ++i)
    {
        if(suffix == accepted[i])
            return true;
    }
    return false;
}
/*
 * Return the part of `path` after the last '/'.
 * A path without '/' is returned unchanged, a path ending in '/' yields "".
 */
string get_filename (const string & path)
{
    size_t idx = path.rfind('/');
    if(idx == string::npos)
        return path;
    else if (idx == path.size() - 1)
        return "";
    return path.substr(idx + 1);
}
/*
 * Return the lower-cased extension of the file name in `path`,
 * including the leading '.', or "" when the file name contains no '.'.
 */
string get_suffix(const string & path)
{
    string fn = get_filename(path);
    size_t idx = fn.rfind('.');
    if(idx == string::npos)
        return "";

    string s = fn.substr(idx);
    for(auto iter = s.begin(); iter != s.end(); ++iter)
    {
        // tolower() is only defined for values representable as unsigned char
        // (or EOF); a plain char can be negative for non-ASCII bytes, e.g. in
        // GB-encoded names, so convert before the call.
        *iter = (char)tolower((unsigned char)(*iter));
    }
    return s;
}
/*
 * One axis of css_fix_rectangle_border_width:
 * given the edges [low, high] and the border width, compute the CSS position,
 * the content extent and the CSS border width for that axis.
 */
static void css_fix_one_axis(double low, double high, double border_width,
                             double & pos, double & extent, double & css_border)
{
    extent = high - low;
    if(extent > border_width)
    {
        // enough room: the border takes its share out of the content extent
        extent -= border_width;
        css_border = border_width;
    }
    else
    {
        // the box is thinner than the border: it collapses to borders only
        css_border = border_width + extent/2;
        extent = 0;
    }
    pos = low - border_width / 2;
}

/*
 * Convert a rectangle whose edges run through the middle of its border
 * into position, size and border widths where the edges enclose the border.
 */
void css_fix_rectangle_border_width(double x1, double y1,
                                    double x2, double y2,
                                    double border_width,
                                    double & x, double & y, double & w, double & h,
                                    double & border_top_bottom_width,
                                    double & border_left_right_width)
{
    css_fix_one_axis(x1, x2, border_width, x, w, border_left_right_width);
    css_fix_one_axis(y1, y2, border_width, y, h, border_top_bottom_width);
}
/*
 * Print a colour as "rgb(r,g,b)" using decimal channel values from colToByte().
 * The format flags of the stream are restored before returning.
 */
ostream & operator << (ostream & out, const GfxRGB & rgb)
{
    const auto saved_flags = out.flags();

    out << std::dec;
    out << "rgb(" << (int)colToByte(rgb.r);
    out << "," << (int)colToByte(rgb.g);
    out << "," << (int)colToByte(rgb.b);
    out << ")";

    out.flags(saved_flags);
    return out;
}
} // namespace pdf2htmlEX

View File

@ -15,7 +15,8 @@
#include <GfxFont.h> #include <GfxFont.h>
#include "Preprocessor.h" #include "Preprocessor.h"
#include "util.h" #include "util/misc.h"
#include "util/const.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -41,7 +42,7 @@ Preprocessor::~Preprocessor(void)
void Preprocessor::process(PDFDoc * doc) void Preprocessor::process(PDFDoc * doc)
{ {
int page_count = (param->last_page - param->first_page); int page_count = (param->last_page - param->first_page + 1);
for(int i = param->first_page; i <= param->last_page ; ++i) for(int i = param->first_page; i <= param->last_page ; ++i)
{ {
cerr << "Preprocessing: " << (i-param->first_page) << "/" << page_count << '\r' << flush; cerr << "Preprocessing: " << (i-param->first_page) << "/" << page_count << '\r' << flush;

View File

@ -0,0 +1,30 @@
#include <cstdarg>
#include <algorithm>
#include <cassert>
#include "StringFormatter.h"
namespace pdf2htmlEX {
/*
 * Format the arguments printf-style into the shared buffer and return a
 * guard that converts to char*.
 * The buffer grows when the text does not fit; asserts when a previously
 * returned guard is still alive (the buffer would be overwritten).
 */
StringFormatter::GuardedPointer StringFormatter::operator () (const char * format, ...)
{
    assert((buf_cnt == 0) && "StringFormatter: buffer is reused!");

    // The buffer may only have been reserve()d, i.e. have capacity but no
    // elements; front() and writes through it are undefined on an empty
    // vector. Turn the reserved capacity into real elements first (this does
    // not reallocate).
    if(buf.size() < buf.capacity())
        buf.resize(buf.capacity());

    va_list vlist;
    va_start(vlist, format);
    int l = vsnprintf(&buf.front(), buf.size(), format, vlist);
    va_end(vlist);
    if(l >= (int)buf.size())
    {
        // truncated: grow to at least l+1 (text + terminator) and format
        // again with a fresh va_list
        buf.resize(std::max<long>((long)(l+1), (long)buf.size() * 2));
        va_start(vlist, format);
        l = vsnprintf(&buf.front(), buf.size(), format, vlist);
        va_end(vlist);
    }
    assert(l >= 0); // we should fail when vsnprintf fail
    assert(l < (int)buf.size());
    return GuardedPointer(this);
}
} //namespace pdf2htmlEX

View File

@ -0,0 +1,44 @@
/*
* Buffer reusing string formatter
*
* by WangLu
* 2012.11.29
*/
#ifndef STRINGFORMATTER_H__
#define STRINGFORMATTER_H__
#include <vector>
#include <cstdio>
namespace pdf2htmlEX {
/*
 * printf-style formatter that renders into one reusable, growing buffer.
 * buf_cnt counts the live GuardedPointer objects, so that reuse of the
 * buffer while a result is still referenced can be detected.
 */
class StringFormatter
{
public:
    // Handle to the formatted text; converts to char* and marks the buffer
    // as in use for as long as the handle (or a copy of it) exists.
    class GuardedPointer
    {
    public:
        GuardedPointer(StringFormatter * sf) : sf(sf) { ++(sf->buf_cnt); }
        GuardedPointer(const GuardedPointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); }
        ~GuardedPointer(void) { --(sf->buf_cnt); }
        operator char* () const { return &(sf->buf.front()); }
    private:
        StringFormatter * sf;
    };

    // Size the buffer instead of only reserving it: front() is undefined on
    // an empty vector, and reserved-but-unsized storage holds no elements.
    StringFormatter() : buf_cnt(0) { buf.resize(L_tmpnam); }
    /*
     * Important:
     * there is only one buffer, so new strings will replace old ones
     */
    GuardedPointer operator () (const char * format, ...);
private:
    friend class GuardedPointer;
    std::vector<char> buf;
    // number of live GuardedPointer objects
    int buf_cnt;
};
} //namespace pdf2htmlEX
#endif //STRINGFORMATTER_H__

56
src/util/TmpFiles.cc Normal file
View File

@ -0,0 +1,56 @@
/*
* TmpFiles.cc
*
* Collect and clean-up temporary files
*
* implemented by WangLu
* split off by Filodej <philodej@gmail.com>
*/
#include <iostream>
#include "TmpFiles.h"
#include "Param.h"
using namespace std;
namespace pdf2htmlEX {
// Only a reference to param is kept; the caller must keep it alive
// for as long as this object exists
TmpFiles::TmpFiles( const Param& param )
: param( param )
{ }
// Removes the registered files (see clean()) when the collection is destroyed
TmpFiles::~TmpFiles()
{
clean();
}
/*
 * Register fn for removal.
 * Nothing is recorded when param.clean_tmp is off; a message is printed in
 * debug mode only when the file name was not registered before.
 */
void TmpFiles::add( const string & fn)
{
    if(param.clean_tmp)
    {
        const bool newly_added = tmp_files.insert(fn).second;
        if(newly_added && param.debug)
            cerr << "Add new temporary file: " << fn << endl;
    }
}
/*
 * Remove every registered file and then param.tmp_dir itself.
 * Does nothing when param.clean_tmp is off. The results of remove() are not
 * checked; the debug messages are printed whether or not removal succeeded.
 */
void TmpFiles::clean()
{
    if(param.clean_tmp)
    {
        auto it = tmp_files.begin();
        const auto last = tmp_files.end();
        while(it != last)
        {
            remove(it->c_str());
            if(param.debug)
                cerr << "Remove temporary file: " << *it << endl;
            ++it;
        }

        remove(param.tmp_dir.c_str());
        if(param.debug)
            cerr << "Remove temporary directory: " << param.tmp_dir << endl;
    }
}
} // namespace pdf2htmlEX

29
src/util/TmpFiles.h Normal file
View File

@ -0,0 +1,29 @@
#ifndef TMPFILES_H__
#define TMPFILES_H__
#include <string>
#include <set>
#include "Param.h"
namespace pdf2htmlEX {
// Collects the names of temporary files and removes them on destruction.
// NOTE(review): the class is implicitly copyable and every copy would run
// clean() in its destructor -- presumably it is never copied; confirm.
class TmpFiles
{
public:
// param is stored by reference and must outlive this object
explicit TmpFiles( const Param& param );
// removes the registered files
~TmpFiles();
// register fn for removal
void add( const std::string& fn);
private:
// remove all registered files and the temporary directory
void clean();
private:
const Param& param;
// names registered through add(); a set, so each name is stored once
std::set<std::string> tmp_files;
};
} // namespace pdf2htmlEX
#endif //TMPFILES_H__

45
src/util/base64stream.cc Normal file
View File

@ -0,0 +1,45 @@
#include "base64stream.h"
namespace pdf2htmlEX {
using std::ostream;
/*
 * Read the wrapped input stream to its end and write its Base64 encoding to out.
 * A trailing group of 1 or 2 bytes is padded with '='.
 */
ostream & base64stream::dumpto(ostream & out)
{
    unsigned char chunk[3];
    // complete groups: 3 input bytes -> 4 output characters
    while(in->read((char*)chunk, 3))
    {
        const int s0 = (chunk[0] & 0xfc)>>2;
        const int s1 = ((chunk[0] & 0x03)<<4) | ((chunk[1] & 0xf0)>>4);
        const int s2 = ((chunk[1] & 0x0f)<<2) | ((chunk[2] & 0xc0)>>6);
        const int s3 = (chunk[2] & 0x3f);
        out << base64_encoding[s0]
            << base64_encoding[s1]
            << base64_encoding[s2]
            << base64_encoding[s3];
    }

    // the failed read may still have delivered 1 or 2 trailing bytes
    const auto remaining = in->gcount();
    if(remaining <= 0)
        return out;

    // zero the missing bytes so the bit arithmetic below is well defined
    for(int pos = remaining; pos < 3; ++pos)
        chunk[pos] = 0;

    out << base64_encoding[(chunk[0] & 0xfc)>>2]
        << base64_encoding[((chunk[0] & 0x03)<<4) | ((chunk[1] & 0xf0)>>4)];
    // third character: real data for 2 trailing bytes, padding for 1
    out << ((remaining > 1) ? base64_encoding[(chunk[1] & 0x0f)<<2] : '=');
    // the fourth character of a partial group is always padding
    out << '=';
    return out;
}
// Base64 alphabet: the character at index i encodes the 6-bit value i
const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// Stream insertion forwards to dumpto(), for both lvalue and temporary wrappers
ostream & operator << (ostream & out, base64stream & bf) { return bf.dumpto(out); }
ostream & operator << (ostream & out, base64stream && bf) { return bf.dumpto(out); }
} //namespace pdf2htmlEX

33
src/util/base64stream.h Normal file
View File

@ -0,0 +1,33 @@
/*
* Base64 Encoding
*
* by WangLu
* 2012.11.29
*/
#ifndef BASE64STREAM_H__
#define BASE64STREAM_H__
#include <iostream>
namespace pdf2htmlEX {
// Wraps an input stream so that its content can be written
// to an output stream as Base64 text
class base64stream
{
public:
// only the address of the stream is stored, the caller must keep the stream alive
base64stream(std::istream & in) : in(&in) { }
base64stream(std::istream && in) : in(&in) { }
// read the input to its end and write the Base64 encoding to out
std::ostream & dumpto(std::ostream & out);
private:
std::istream * in;
// the 64-character encoding alphabet
static const char * base64_encoding;
};
// both operators forward to dumpto()
std::ostream & operator << (std::ostream & out, base64stream & bf);
std::ostream & operator << (std::ostream & out, base64stream && bf);
} //namespace pdf2htmlEX
#endif //BASE64STREAM_H__

39
src/util/const.cc Normal file
View File

@ -0,0 +1,39 @@
/*
* Constants
*
* by WangLu
* 2012.11.29
*/
#include "const.h"
namespace pdf2htmlEX {
using std::map;
using std::string;
// Identity transformation matrix: unit scale, no rotation/shear, no translation
const double ID_MATRIX[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
// PDF base 14 font family name -> CSS font-family list.
// The Times entry used to name the non-existent family "Time New Roman",
// so that fallback could never match an installed font.
const map<string, string> BASE_14_FONT_CSS_FONT_MAP({
    { "Courier", "Courier,monospace" },
    { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    { "Times", "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol", "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
});
// Font names given as raw GB-encoded byte strings -> ASCII font names
const map<string, string> GB_ENCODED_FONT_NAME_MAP({
{"\xCB\xCE\xCC\xE5", "SimSun"},
{"\xBA\xDA\xCC\xE5", "SimHei"},
{"\xBF\xAC\xCC\xE5_GB2312", "SimKai"},
{"\xB7\xC2\xCB\xCE_GB2312", "SimFang"},
{"\xC1\xA5\xCA\xE9", "SimLi"},
});
// HTML snippets used to pull a file into the page
// key: (file suffix, embed content?) -- 0 references the file by URL, 1 inlines its content
// value: (text written before, text written after) the URL or the content
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({
{{".css", 0}, {"<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>"}},
{{".css", 1}, {"<style type=\"text/css\">", "</style>"}},
{{".js", 0}, {"<script type=\"text/javascript\" src=\"", "\"></script>"}},
{{".js", 1}, {"<script type=\"text/javascript\">", "</script>"}}
});
} //namespace pdf2htmlEX

35
src/util/const.h Normal file
View File

@ -0,0 +1,35 @@
/*
* Constants
*
* by WangLu
* 2012.11.29
*/
#ifndef CONST_H__
#define CONST_H__
#include <map>
#include <string>
namespace pdf2htmlEX {
// Fallback so that `nullptr` can be spelled on compilers that lack the keyword.
// NOTE(review): `nullptr` is a keyword rather than a macro, so this #ifndef is
// presumably always true and the macro also replaces the real keyword on
// C++11 compilers -- confirm this is intended.
#ifndef nullptr
#define nullptr (NULL)
#endif
// Tolerance for floating point comparisons
static const double EPS = 1e-6;
// presumably the resolution of PDF user space (72 units per inch) -- TODO confirm
static const double DEFAULT_DPI = 72.0;
// 6-element transformation matrix; the name suggests the identity transform
extern const double ID_MATRIX[6];
// PDF base 14 font name -> CSS font name
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
// For GB encoded font names
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
// map to embed files into html
// key: (suffix, if_embed_content)
// value: (prefix string, suffix string)
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
} // namespace pdf2htmlEX
#endif //CONST_H__

32
src/util/math.cc Normal file
View File

@ -0,0 +1,32 @@
#include <cstring>
#include "math.h"
namespace pdf2htmlEX {
/*
 * Transform (x, y) in place by the 6-element matrix tm.
 * The linear part tm[0..3] is always applied; the translation tm[4], tm[5]
 * is skipped when is_delta is true.
 */
void tm_transform(const double * tm, double & x, double & y, bool is_delta)
{
    // keep the inputs, both outputs depend on both of them
    const double in_x = x;
    const double in_y = y;

    x = tm[0] * in_x + tm[2] * in_y;
    y = tm[1] * in_x + tm[3] * in_y;

    if(is_delta)
        return;

    x += tm[4];
    y += tm[5];
}
/*
 * Combine tm_right into tm_left, storing the result in tm_left.
 * Both arguments are 6-element matrices.
 */
void tm_multiply(double * tm_left, const double * tm_right)
{
    // snapshot of the linear part of tm_left, which is overwritten below
    const double a = tm_left[0];
    const double b = tm_left[1];
    const double c = tm_left[2];
    const double d = tm_left[3];

    tm_left[0] = a * tm_right[0] + c * tm_right[1];
    tm_left[1] = b * tm_right[0] + d * tm_right[1];
    tm_left[2] = a * tm_right[2] + c * tm_right[3];
    tm_left[3] = b * tm_right[2] + d * tm_right[3];
    // the translation accumulates on top of the existing one
    tm_left[4] += a * tm_right[4] + c * tm_right[5];
    tm_left[5] += b * tm_right[4] + d * tm_right[5];
}
} //namespace pdf2htmlEX

33
src/util/math.h Normal file
View File

@ -0,0 +1,33 @@
/*
* Math functions
*
* by WangLu
* 2012.11.29
*/
#ifndef MATH_H__
#define MATH_H__
#include <cmath>
#include "const.h"
namespace pdf2htmlEX {
// Flush values whose magnitude does not exceed EPS to exactly 0.0;
// anything larger is returned unchanged (this is NOT rounding to an integer).
// NOTE(review): shares its name with the C library round() -- callers presumably
// qualify or rely on namespace lookup; confirm there is no ambiguity.
static inline double round(double x)
{
    if(std::abs(x) > EPS)
        return x;
    return 0.0;
}
// Approximate equality: true when x and y differ by less than EPS
static inline bool equal(double x, double y)
{
    const double diff = x - y;
    return std::abs(diff) < EPS;
}
// True only when x lies above the EPS tolerance
static inline bool is_positive(double x)
{
    return EPS < x;
}
// Compare the first `size` entries of two matrices using equal();
// returns true when all of them match (trivially true for size <= 0)
static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6)
{
    int idx = 0;
    while((idx < size) && equal(tm1[idx], tm2[idx]))
        ++idx;
    // the scan only reaches the end when no entry differed
    return idx >= size;
}
// Euclidean length of the vector (x, y), computed directly as sqrt(x*x + y*y).
// NOTE(review): shares its name with the C library hypot() -- confirm that
// unqualified calls resolve to the intended function.
static inline double hypot(double x, double y)
{
    const double sum_of_squares = x*x+y*y;
    return std::sqrt(sum_of_squares);
}
// Apply the 6-element matrix tm to (x, y) in place;
// when is_delta is true the translation part tm[4], tm[5] is not added
void tm_transform(const double * tm, double & x, double & y, bool is_delta = false);
// Combine tm_right into tm_left in place (tm_left is overwritten with the result)
void tm_multiply(double * tm_left, const double * tm_right);
} //namespace pdf2htmlEX
#endif //MATH_H__

66
src/util/misc.cc Normal file
View File

@ -0,0 +1,66 @@
/*
* Misc functions
*
*
* by WangLu
* 2012.08.10
*/
#include <map>
#include "misc.h"
using std::cerr;
using std::endl;
using std::string;
using std::map;
using std::ostream;
namespace pdf2htmlEX {
/*
 * One axis of css_fix_rectangle_border_width:
 * given the edges [low, high] and the border width, compute the CSS position,
 * the content extent and the CSS border width for that axis.
 */
static void css_fix_one_axis(double low, double high, double border_width,
                             double & pos, double & extent, double & css_border)
{
    extent = high - low;
    if(extent > border_width)
    {
        // enough room: the border takes its share out of the content extent
        extent -= border_width;
        css_border = border_width;
    }
    else
    {
        // the box is thinner than the border: it collapses to borders only
        css_border = border_width + extent/2;
        extent = 0;
    }
    pos = low - border_width / 2;
}

/*
 * Convert a rectangle whose edges run through the middle of its border
 * into position, size and border widths where the edges enclose the border.
 */
void css_fix_rectangle_border_width(double x1, double y1,
                                    double x2, double y2,
                                    double border_width,
                                    double & x, double & y, double & w, double & h,
                                    double & border_top_bottom_width,
                                    double & border_left_right_width)
{
    css_fix_one_axis(x1, x2, border_width, x, w, border_left_right_width);
    css_fix_one_axis(y1, y2, border_width, y, h, border_top_bottom_width);
}
/*
 * Print a colour as "rgb(r,g,b)" using decimal channel values from colToByte().
 * The format flags of the stream are restored before returning.
 */
ostream & operator << (ostream & out, const GfxRGB & rgb)
{
    const auto saved_flags = out.flags();

    out << std::dec;
    out << "rgb(" << (int)colToByte(rgb.r);
    out << "," << (int)colToByte(rgb.g);
    out << "," << (int)colToByte(rgb.b);
    out << ")";

    out.flags(saved_flags);
    return out;
}
} // namespace pdf2htmlEX

37
src/util/misc.h Normal file
View File

@ -0,0 +1,37 @@
/*
* Help classes and Functions
*
* by WangLu
* 2012.08.10
*/
#ifndef UTIL_H__
#define UTIL_H__
#include <iostream>
#include <GfxState.h>
namespace pdf2htmlEX {
// Pack an object reference into a single integer:
// num is shifted above the bits occupied by gen, then gen is OR-ed in.
// NOTE(review): a negative gen would sign-extend over the num bits --
// presumably gen is never negative here; confirm.
static inline long long hash_ref(const Ref * id)
{
    const long long shifted_num = ((long long)(id->num)) << (sizeof(id->gen)*8);
    return shifted_num | (id->gen);
}
/*
 * In PDF, edges of the rectangle are in the middle of the borders
 * In HTML, edges are completely outside the rectangle
 *
 * Inputs:  the corners (x1, y1) - (x2, y2) and the border width
 * Outputs: position (x, y), size (w, h) and the two border widths to use in CSS
 */
void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2,
double border_width,
double & x, double & y, double & w, double & h,
double & border_top_bottom_width,
double & border_left_right_width);
// Write the colour as "rgb(r,g,b)" with decimal channel values
std::ostream & operator << (std::ostream & out, const GfxRGB & rgb);
} // namespace pdf2htmlEX
#endif //UTIL_H__

View File

@ -12,8 +12,6 @@
using std::hex; using std::hex;
using std::dec; using std::dec;
using std::string; using std::string;
using std::cout;
using std::cerr;
using std::endl; using std::endl;
using std::make_pair; using std::make_pair;
using std::ifstream; using std::ifstream;

73
src/util/path.cc Normal file
View File

@ -0,0 +1,73 @@
/*
* Functions manipulating filenames and paths
*
* by WangLu
* 2012.11.29
*/
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "path.h"
using std::string;
namespace pdf2htmlEX {
/*
 * Create `path` and every missing parent directory (mode S_IRWXU).
 * An already existing directory is accepted silently.
 * Throws a std::string describing the path when creation fails.
 */
void create_directories(const string & path)
{
    if(path.empty())
        return;

    // make sure the parent chain exists before creating the leaf
    const size_t last_slash = path.rfind('/');
    if(last_slash != string::npos)
        create_directories(path.substr(0, last_slash));

    if(mkdir(path.c_str(), S_IRWXU) == 0)
        return;

    // mkdir failed: the only tolerated reason is "already there, and a directory"
    if(errno == EEXIST)
    {
        struct stat info;
        const bool is_directory = (stat(path.c_str(), &info) == 0) && S_ISDIR(info.st_mode);
        if(is_directory)
            return;
    }

    throw string("Cannot create directory: ") + path;
}
// True when suffix is exactly ".ttf", ".ttc" or ".otf" (case sensitive, dot included)
bool is_truetype_suffix(const string & suffix)
{
    static const char * const accepted[] = { ".ttf", ".ttc", ".otf" };
    for(int i = 0; i < 3; ++i)
    {
        if(suffix == accepted[i])
            return true;
    }
    return false;
}
/*
 * Return the part of `path` after the last '/'.
 * A path without '/' is returned unchanged, a path ending in '/' yields "".
 */
string get_filename (const string & path)
{
    size_t idx = path.rfind('/');
    if(idx == string::npos)
        return path;
    else if (idx == path.size() - 1)
        return "";
    return path.substr(idx + 1);
}
/*
 * Return the lower-cased extension of the file name in `path`,
 * including the leading '.', or "" when the file name contains no '.'.
 */
string get_suffix(const string & path)
{
    string fn = get_filename(path);
    size_t idx = fn.rfind('.');
    if(idx == string::npos)
        return "";

    string s = fn.substr(idx);
    for(auto iter = s.begin(); iter != s.end(); ++iter)
    {
        // tolower() is only defined for values representable as unsigned char
        // (or EOF); a plain char can be negative for non-ASCII bytes, e.g. in
        // GB-encoded names, so convert before the call.
        *iter = (char)tolower((unsigned char)(*iter));
    }
    return s;
}
} //namespace pdf2htmlEX

23
src/util/path.h Normal file
View File

@ -0,0 +1,23 @@
/*
* Function handling filenames and paths
*
* by WangLu
* 2012.11.29
*/
#ifndef PATH_H__
#define PATH_H__
#include <string>
namespace pdf2htmlEX {
// Create the directory `path` together with all missing parent directories
void create_directories(const std::string & path);
// True for the suffixes ".ttf", ".ttc" and ".otf"
bool is_truetype_suffix(const std::string & suffix);
// The part of `path` after the last '/'
std::string get_filename(const std::string & path);
// The lower-cased extension of the file name, including the leading '.', or "" if there is none
std::string get_suffix(const std::string & path);
} //namespace pdf2htmlEX
#endif //PATH_H__

157
src/util/unicode.cc Normal file
View File

@ -0,0 +1,157 @@
/*
* Unicode manipulation functions
*
* by WangLu
* 2012.11.29
*/
#include <GlobalParams.h>
#include "unicode.h"
namespace pdf2htmlEX {
using std::cerr;
using std::endl;
using std::ostream;
/*
 * A code point is rejected when it is <= 31, within 127..159,
 * or within the surrogate range 0xD800..0xDFFF.
 * Note that TAB (9), LF (10) and CR (13) are rejected as well.
 */
bool isLegalUnicode(Unicode u)
{
    const bool low_control    = (u <= 31);
    const bool del_or_high    = (u >= 127) && (u <= 159);
    const bool surrogate_half = (u >= 0xd800) && (u <= 0xdfff);
    return !(low_control || del_or_high || surrogate_half);
}
/*
 * Map a character code to a private use code point.
 * Codes are placed from U+E000 upwards; values past U+F8FF continue
 * above 0xF0000, and values past 0xFFFFD continue above 0x100000.
 * A warning is printed when even that range (up to 0x10FFFD) is exceeded.
 */
Unicode map_to_private(CharCode code)
{
    Unicode mapped = (Unicode)(code + 0xE000);
    if(mapped <= 0xF8FF)
        return mapped;

    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
    if(mapped <= 0xFFFFD)
        return mapped;

    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
    if(mapped > 0x10FFFD)
        cerr << "Warning: all private use unicode are used" << endl;

    return mapped;
}
/*
 * For a non-CID font, look up the glyph name of `code` and translate the name
 * to Unicode; the result is used only when it is legal for HTML.
 * In every other case the code is mapped to a private use code point.
 * NOTE(review): the dynamic_cast result is dereferenced unchecked --
 * presumably every non-CID font is a Gfx8BitFont; confirm.
 */
Unicode unicode_from_font (CharCode code, GfxFont * font)
{
    if(font->isCIDFont())
        return map_to_private(code);

    char * glyph_name = dynamic_cast<Gfx8BitFont*>(font)->getCharName(code);
    // may be untranslated ligature
    if(glyph_name)
    {
        Unicode from_name = globalParams->mapNameToUnicode(glyph_name);
        if(isLegalUnicode(from_name))
            return from_name;
    }

    return map_to_private(code);
}
/*
 * Reduce the Unicode sequence of one character to a single value:
 *  - no value at all        -> private mapping of the code
 *  - exactly one legal value -> that value
 *  - anything else          -> ask the font (unicode_from_font)
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
{
    if(len == 0)
        return map_to_private(code);

    const bool single_legal_value = (len == 1) && isLegalUnicode(*u);
    return single_legal_value ? *u : unicode_from_font(code, font);
}
/*
 * Copied from UTF.h / UTF8.h in poppler
 *
 * Encode u as UTF-8 into buf.
 * Returns the number of bytes written (1..4), or 0 when u is above 0x10FFFF
 * or buf (bufSize bytes) is too small; nothing is written in that case.
 */
static int mapUTF8(Unicode u, char *buf, int bufSize) {
    // how many bytes does this code point need?
    int needed;
    if (u <= 0x0000007f) {
        needed = 1;
    } else if (u <= 0x000007ff) {
        needed = 2;
    } else if (u <= 0x0000ffff) {
        needed = 3;
    } else if (u <= 0x0010ffff) {
        needed = 4;
    } else {
        return 0;
    }

    if (bufSize < needed) {
        return 0;
    }

    switch (needed) {
    case 1:
        buf[0] = (char)u;
        break;
    case 2:
        buf[0] = (char)(0xc0 + (u >> 6));
        buf[1] = (char)(0x80 + (u & 0x3f));
        break;
    case 3:
        buf[0] = (char)(0xe0 + (u >> 12));
        buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
        buf[2] = (char)(0x80 + (u & 0x3f));
        break;
    default:
        buf[0] = (char)(0xf0 + (u >> 18));
        buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
        buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
        buf[3] = (char)(0x80 + (u & 0x3f));
        break;
    }
    return needed;
}
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
for(int i = 0; i < uLen; ++i)
{
switch(u[i])
{
case '&':
out << "&amp;";
break;
case '\"':
out << "&quot;";
break;
case '\'':
out << "&apos;";
break;
case '<':
out << "&lt;";
break;
case '>':
out << "&gt;";
break;
default:
{
char buf[4];
auto n = mapUTF8(u[i], buf, 4);
out.write(buf, n);
}
}
}
}
} //namespace pdf2htmlEX

41
src/util/unicode.h Normal file
View File

@ -0,0 +1,41 @@
/*
* Unicode manipulation functions
*
* by WangLu
* 2012.11.29
*/
#ifndef UNICODE_H__
#define UNICODE_H__
#include <iostream>
#include <GfxFont.h>
#include <CharTypes.h>
namespace pdf2htmlEX {
/*
 * Check if the unicode is valid for HTML
 * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
 */
bool isLegalUnicode(Unicode u);
/*
 * Map a character code into the Unicode private use ranges, starting at U+E000
 */
Unicode map_to_private(CharCode code);
/*
 * Try to determine the Unicode value directly from the information in the font
 */
Unicode unicode_from_font (CharCode code, GfxFont * font);
/*
 * We have to use a single Unicode value to reencode fonts
 * if we got multi-unicode values, it might be expanded ligature, try to restore it
 * if we cannot figure it out at the end, use a private mapping
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
/*
 * Write uLen code points to out as UTF-8,
 * replacing & " ' < > with the corresponding HTML entities
 */
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
} // namespace pdf2htmlEX
#endif //UNICODE_H__