// pdf2htmlEX.cc
//
// Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>

# include <cstdio>
# include <cstdlib>
# include <cstddef>
# include <cstring>
# include <ctime>
# include <string>
# include <limits>
# include <iostream>
# include <sstream>
# include <memory>
# include <errno.h>

# include <getopt.h>

# include <poppler-config.h>
# include <goo/GooString.h>

# include <Object.h>
# include <PDFDoc.h>
# include <PDFDocFactory.h>
# include <GlobalParams.h>

# include "pdf2htmlEX-config.h"

# if ENABLE_SVG
# include <cairo.h>
# endif

# include "ArgParser.h"
# include "Param.h"
# include "HTMLRenderer/HTMLRenderer.h"

# include "util/path.h"
# include "util/ffw.h"

2012-08-04 18:03:53 +00:00
using namespace std ;
2012-09-11 13:52:46 +00:00
using namespace pdf2htmlEX ;
2012-08-04 18:03:53 +00:00
Param param ;
2012-09-10 09:01:15 +00:00
ArgParser argparser ;
2012-08-04 18:03:53 +00:00
2013-11-22 20:16:21 +00:00
# ifdef _WIN32
2013-11-22 21:12:59 +00:00
# include <iomanip>
# include <libgen.h>
2013-12-05 20:20:53 +00:00
# include <direct.h>
2013-11-22 18:32:40 +00:00
# endif
// Option callback registered for the removed `--font-suffix` flag.
//
// Points the user at the replacement option on stderr and terminates the
// process with EXIT_FAILURE; this function never returns.
//
// dummy: unused, present only so the function matches the callback signature.
void deprecated_font_suffix(const char * dummy = nullptr)
{
    (void)dummy;
    std::cerr << "--font-suffix is deprecated. Use `--font-format` instead." << std::endl;
    std::exit(EXIT_FAILURE);
}
2013-01-23 15:02:11 +00:00
void show_usage_and_exit ( const char * dummy = nullptr )
2013-01-23 12:29:59 +00:00
{
2013-01-28 22:16:38 +00:00
cerr < < " Usage: pdf2htmlEX [options] <input.pdf> [<output.html>] " < < endl ;
2012-09-10 09:01:15 +00:00
argparser . show_usage ( cerr ) ;
exit ( EXIT_FAILURE ) ;
2012-08-04 18:03:53 +00:00
}
2013-01-23 12:29:59 +00:00
void show_version_and_exit ( const char * dummy = nullptr )
{
2013-04-30 11:51:11 +00:00
cerr < < " pdf2htmlEX version " < < PDF2HTMLEX_VERSION < < endl ;
2013-09-18 12:24:48 +00:00
cerr < < " Copyright 2012,2013 Lu Wang <coolwanglu@gmail.com> and other contributers " < < endl ;
2013-09-18 12:30:52 +00:00
cerr < < " Libraries: " < < endl ;
cerr < < " poppler " < < POPPLER_VERSION < < endl ;
cerr < < " libfontforge " < < ffw_get_version ( ) < < endl ;
2013-09-18 12:24:48 +00:00
# if ENABLE_SVG
2013-09-18 12:30:52 +00:00
cerr < < " cairo " < < cairo_version_string ( ) < < endl ;
2013-09-18 12:24:48 +00:00
# endif
2013-11-22 21:12:59 +00:00
cerr < < " Default data-dir: " < < param . data_dir < < endl ;
2013-09-18 12:24:48 +00:00
cerr < < " Supported image format: " ;
# ifdef ENABLE_LIBPNG
cerr < < " png " ;
# endif
# ifdef ENABLE_LIBJPEG
cerr < < " jpg " ;
# endif
# if ENABLE_SVG
cerr < < " svg " ;
# endif
cerr < < endl ;
cerr < < endl ;
2013-01-25 13:13:27 +00:00
exit ( EXIT_SUCCESS ) ;
2013-01-23 12:29:59 +00:00
}
2013-05-26 23:43:26 +00:00
void embed_parser ( const char * str )
{
while ( true )
{
switch ( * str )
{
case ' \0 ' : return ; break ;
case ' c ' : param . embed_css = 0 ; break ;
case ' C ' : param . embed_css = 1 ; break ;
case ' f ' : param . embed_font = 0 ; break ;
case ' F ' : param . embed_font = 1 ; break ;
case ' i ' : param . embed_image = 0 ; break ;
case ' I ' : param . embed_image = 1 ; break ;
case ' j ' : param . embed_javascript = 0 ; break ;
case ' J ' : param . embed_javascript = 1 ; break ;
case ' o ' : param . embed_outline = 0 ; break ;
case ' O ' : param . embed_outline = 1 ; break ;
default :
cerr < < " Unknown character ` " < < ( * str ) < < " ` for --embed " < < endl ;
break ;
}
+ + str ;
}
}
2013-11-22 21:12:59 +00:00
void prepare_directories ( )
{
std : : string tmp_dir = param . basetmp_dir + " /pdf2htmlEX-XXXXXX " ;
# ifndef _WIN32
errno = 0 ;
auto_ptr < char > pBuf ( new char [ tmp_dir . size ( ) + 1 ] ) ;
strcpy ( pBuf . get ( ) , tmp_dir . c_str ( ) ) ;
auto p = mkdtemp ( pBuf . get ( ) ) ;
if ( p = = nullptr )
{
const char * errmsg = strerror ( errno ) ;
if ( ! errmsg )
{
errmsg = " unknown error " ;
}
cerr < < " Cannot create temp directory: " < < errmsg < < endl ;
exit ( EXIT_FAILURE ) ;
}
param . tmp_dir = pBuf . get ( ) ;
# else
srand ( ( unsigned ) time ( 0 ) ) ;
int rand_value = ( int ) ( ( rand ( ) / ( ( double ) RAND_MAX + 1.0 ) ) * 1e6 ) ;
stringstream ss ;
ss < < setw ( 6 ) < < rand_value ;
tmp_dir . erase ( tmp_dir . size ( ) - 6 ) ;
param . tmp_dir = tmp_dir + ss . str ( ) ;
2013-12-05 20:20:53 +00:00
if ( mkdir ( param . tmp_dir . c_str ( ) ) ) {
cerr < < " Cannot create temp directory ( " < < param . tmp_dir < < " ): " < < strerror ( errno ) < < endl ;
exit ( EXIT_FAILURE ) ;
}
2013-11-22 21:12:59 +00:00
# endif
}
2012-09-10 09:01:15 +00:00
void parse_options ( int argc , char * * argv )
2012-08-04 18:03:53 +00:00
{
2012-09-10 09:01:15 +00:00
argparser
2013-01-28 22:54:42 +00:00
// pages
2013-01-28 22:45:12 +00:00
. add ( " first-page,f " , & param . first_page , 1 , " first page to convert " )
. add ( " last-page,l " , & param . last_page , numeric_limits < int > : : max ( ) , " last page to convert " )
2013-11-22 20:16:21 +00:00
2013-01-28 22:54:42 +00:00
// dimensions
2013-05-26 23:43:26 +00:00
. add ( " zoom " , & param . zoom , 0 , " zoom ratio " , true )
2013-11-22 20:16:21 +00:00
. add ( " fit-width " , & param . fit_width , 0 , " fit width to <fp> pixels " , true )
2013-05-26 23:43:26 +00:00
. add ( " fit-height " , & param . fit_height , 0 , " fit height to <fp> pixels " , true )
2013-03-07 01:37:27 +00:00
. add ( " use-cropbox " , & param . use_cropbox , 1 , " use CropBox instead of MediaBox " )
2013-01-28 22:45:12 +00:00
. add ( " hdpi " , & param . h_dpi , 144.0 , " horizontal resolution for graphics in DPI " )
. add ( " vdpi " , & param . v_dpi , 144.0 , " vertical resolution for graphics in DPI " )
2013-11-22 20:16:21 +00:00
2013-01-28 22:54:42 +00:00
// output files
2013-05-26 23:43:26 +00:00
. add ( " embed " , " specify which elements should be embedded into output " , embed_parser , true )
. add ( " embed-css " , & param . embed_css , 1 , " embed CSS files into output " )
. add ( " embed-font " , & param . embed_font , 1 , " embed font files into output " )
. add ( " embed-image " , & param . embed_image , 1 , " embed image files into output " )
. add ( " embed-javascript " , & param . embed_javascript , 1 , " embed JavaScript files into output " )
. add ( " embed-outline " , & param . embed_outline , 1 , " embed outlines into output " )
2013-12-14 22:38:02 +00:00
. add ( " tmp-file-size-limit " , & param . max_size , - 1 , " Limit the temporary file output size, in KB (-1 for no limit). This is only an estimate, the output may be bigger " )
2013-01-28 22:45:12 +00:00
. add ( " split-pages " , & param . split_pages , 0 , " split pages into separate files " )
2013-01-28 22:54:42 +00:00
. add ( " dest-dir " , & param . dest_dir , " . " , " specify destination directory " )
. add ( " css-filename " , & param . css_filename , " " , " filename of the generated css file " )
2013-05-02 08:09:42 +00:00
. add ( " page-filename " , & param . page_filename , " " , " filename template for splitted pages " )
2013-01-28 22:54:42 +00:00
. add ( " outline-filename " , & param . outline_filename , " " , " filename of the generated outline file " )
2013-01-30 18:18:18 +00:00
. add ( " process-nontext " , & param . process_nontext , 1 , " render graphics in addition to text " )
. add ( " process-outline " , & param . process_outline , 1 , " show outline in HTML " )
2013-04-30 11:07:55 +00:00
. add ( " printing " , & param . printing , 1 , " enable printing support " )
2013-03-08 17:45:13 +00:00
. add ( " fallback " , & param . fallback , 0 , " output in fallback mode " )
2013-11-22 20:16:21 +00:00
2013-01-29 10:38:39 +00:00
// fonts
2013-04-30 07:58:26 +00:00
. add ( " embed-external-font " , & param . embed_external_font , 1 , " embed local match for external fonts " )
2013-11-08 05:53:42 +00:00
. add ( " font-format " , & param . font_format , " woff " , " suffix for embedded font files (ttf,otf,woff,svg) " )
2013-01-28 22:45:12 +00:00
. add ( " decompose-ligature " , & param . decompose_ligature , 0 , " decompose ligatures, such as \uFB01 -> fi " )
2013-01-28 22:54:42 +00:00
. add ( " auto-hint " , & param . auto_hint , 0 , " use fontforge autohint on fonts without hints " )
. add ( " external-hint-tool " , & param . external_hint_tool , " " , " external tool for hinting fonts (overrides --auto-hint) " )
. add ( " stretch-narrow-glyph " , & param . stretch_narrow_glyph , 0 , " stretch narrow glyphs instead of padding them " )
. add ( " squeeze-wide-glyph " , & param . squeeze_wide_glyph , 1 , " shrink wide glyphs instead of truncating them " )
2013-07-02 00:04:20 +00:00
. add ( " override-fstype " , & param . override_fstype , 0 , " clear the fstype bits in TTF/OTF fonts " )
2013-09-21 05:56:57 +00:00
. add ( " process-type3 " , & param . process_type3 , 0 , " convert Type 3 fonts for web (experimental) " )
2013-11-22 20:16:21 +00:00
2013-01-28 22:54:42 +00:00
// text
2013-01-28 22:45:12 +00:00
. add ( " heps " , & param . h_eps , 1.0 , " horizontal threshold for merging text, in pixels " )
. add ( " veps " , & param . v_eps , 1.0 , " vertical threshold for merging text, in pixels " )
. add ( " space-threshold " , & param . space_threshold , ( 1.0 / 8 ) , " word break threshold (threshold * em) " )
. add ( " font-size-multiplier " , & param . font_size_multiplier , 4.0 , " a value greater than 1 increases the rendering accuracy " )
2013-04-04 14:10:25 +00:00
. add ( " space-as-offset " , & param . space_as_offset , 0 , " treat space characters as offsets " )
2013-01-28 22:54:42 +00:00
. add ( " tounicode " , & param . tounicode , 0 , " how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore) " )
2013-05-06 03:08:29 +00:00
. add ( " optimize-text " , & param . optimize_text , 0 , " try to reduce the number of HTML elements used for text " )
2013-09-18 10:01:56 +00:00
// background image
. add ( " bg-format " , & param . bg_format , " png " , " specify background image format " )
2013-11-22 20:16:21 +00:00
2013-01-28 22:54:42 +00:00
// encryption
2013-05-26 23:43:26 +00:00
. add ( " owner-password,o " , & param . owner_password , " " , " owner password (for encrypted files) " , true )
. add ( " user-password,u " , & param . user_password , " " , " user password (for encrypted files) " , true )
2013-01-28 22:54:42 +00:00
. add ( " no-drm " , & param . no_drm , 0 , " override document DRM settings " )
2013-11-22 20:16:21 +00:00
2013-01-28 22:54:42 +00:00
// misc.
2013-01-28 22:45:12 +00:00
. add ( " clean-tmp " , & param . clean_tmp , 1 , " remove temporary files after conversion " )
2013-12-06 13:29:53 +00:00
. add ( " tmp-dir " , & param . basetmp_dir , param . basetmp_dir , " specify the location of tempory directory. " )
2013-11-22 21:12:59 +00:00
. add ( " data-dir " , & param . data_dir , param . data_dir , " specify data directory " )
2013-02-28 16:00:06 +00:00
// TODO: css drawings are hidden on print, for annot links, need to fix it for other drawings
// .add("css-draw", ¶m.css_draw, 0, "[experimental and unsupported] CSS drawing")
2013-01-28 22:54:42 +00:00
. add ( " debug " , & param . debug , 0 , " print debugging information " )
2013-11-22 20:16:21 +00:00
2013-01-28 22:54:42 +00:00
// meta
. add ( " version,v " , " print copyright and version info " , & show_version_and_exit )
. add ( " help,h " , " print usage information " , & show_usage_and_exit )
2013-05-03 19:29:10 +00:00
2013-09-18 12:30:52 +00:00
// deprecated
. add ( " font-suffix " , " " , & deprecated_font_suffix )
2012-09-10 14:22:01 +00:00
. add ( " " , & param . input_filename , " " , " " )
. add ( " " , & param . output_filename , " " , " " )
2012-08-04 18:03:53 +00:00
;
2012-09-10 09:01:15 +00:00
try
{
argparser . parse ( argc , argv ) ;
2012-08-04 18:03:53 +00:00
}
2012-09-10 14:44:19 +00:00
catch ( const char * s )
{
// if s == "", getopt_long would have printed the error message
if ( s & & s [ 0 ] )
{
cerr < < " Error when parsing the arguments: " < < endl ;
cerr < < s < < endl ;
}
exit ( EXIT_FAILURE ) ;
}
2012-09-10 09:01:15 +00:00
catch ( const std : : string & s )
{
2012-09-10 14:44:19 +00:00
// if s == "", getopt_long would have printed the error message
if ( s ! = " " )
{
cerr < < " Error when parsing the arguments: " < < endl ;
cerr < < s < < endl ;
}
2012-09-10 09:01:15 +00:00
exit ( EXIT_FAILURE ) ;
2012-08-04 18:03:53 +00:00
}
}
2013-09-18 10:01:56 +00:00
void check_param ( )
2012-08-04 18:03:53 +00:00
{
2012-09-10 09:01:15 +00:00
if ( param . input_filename = = " " )
2012-08-04 18:03:53 +00:00
{
2013-01-28 22:16:38 +00:00
show_usage_and_exit ( ) ;
2012-08-04 18:03:53 +00:00
}
2013-09-18 10:01:56 +00:00
if ( param . output_filename . empty ( ) )
{
const string s = get_filename ( param . input_filename ) ;
if ( get_suffix ( param . input_filename ) = = " .pdf " )
{
param . output_filename = s . substr ( 0 , s . size ( ) - 4 ) + " .html " ;
}
else
{
param . output_filename = s + " .html " ;
}
}
if ( param . page_filename . empty ( ) )
{
const string s = get_filename ( param . input_filename ) ;
if ( get_suffix ( param . input_filename ) = = " .pdf " )
{
param . page_filename = s . substr ( 0 , s . size ( ) - 4 ) + " %d.page " ;
}
else
{
param . page_filename = s + " %d.page " ;
}
sanitize_filename ( param . page_filename ) ;
}
else
{
// Need to make sure we have a page number placeholder in the filename
if ( ! sanitize_filename ( param . page_filename ) )
{
// Inject the placeholder just before the file extension
const string suffix = get_suffix ( param . page_filename ) ;
param . page_filename = param . page_filename . substr ( 0 , param . page_filename . size ( ) - suffix . size ( ) ) + " %d " + suffix ;
sanitize_filename ( param . page_filename ) ;
}
}
if ( param . css_filename . empty ( ) )
{
const string s = get_filename ( param . input_filename ) ;
if ( get_suffix ( param . input_filename ) = = " .pdf " )
{
param . css_filename = s . substr ( 0 , s . size ( ) - 4 ) + " .css " ;
}
else
{
if ( ! param . split_pages )
param . css_filename = s + " .css " ;
}
}
if ( param . outline_filename . empty ( ) )
{
const string s = get_filename ( param . input_filename ) ;
if ( get_suffix ( param . input_filename ) = = " .pdf " )
{
param . outline_filename = s . substr ( 0 , s . size ( ) - 4 ) + " .outline " ;
}
else
{
if ( ! param . split_pages )
param . outline_filename = s + " .outline " ;
}
}
2013-09-18 12:24:48 +00:00
if ( false ) { }
# ifdef ENABLE_LIBPNG
else if ( param . bg_format = = " png " ) { }
# endif
# ifdef ENABLE_LIBJPEG
else if ( param . bg_format = = " jpg " ) { }
# endif
2013-09-18 16:17:56 +00:00
# if ENABLE_SVG
2013-09-18 12:24:48 +00:00
else if ( param . bg_format = = " svg " ) { }
2013-09-18 10:01:56 +00:00
# endif
else
{
2013-09-18 12:24:48 +00:00
cerr < < " Image format not supported: " < < param . bg_format < < endl ;
2013-09-18 10:01:56 +00:00
exit ( EXIT_FAILURE ) ;
}
2013-09-18 21:56:57 +00:00
2013-09-21 05:56:57 +00:00
# if not ENABLE_SVG
if ( param . process_type3 )
{
cerr < < " process-type3 is enabled, however SVG support is not built in this version of pdf2htmlEX. " < < endl ;
exit ( EXIT_FAILURE ) ;
}
# endif
2013-10-03 07:19:41 +00:00
if ( ( param . font_format = = " ttf " ) & & ( param . external_hint_tool = = " " ) )
{
cerr < < " Warning: No hint tool is specified for truetype fonts, the result may be rendered poorly in some circumstances. " < < endl ;
}
2013-09-18 10:01:56 +00:00
}
int main ( int argc , char * * argv )
{
2013-12-06 13:29:53 +00:00
// We need to adjust these directories before parsing the options.
2013-11-22 21:12:59 +00:00
# ifndef _WIN32
param . basetmp_dir = " /tmp " ;
2013-12-05 20:20:53 +00:00
param . data_dir = PDF2HTMLEX_DATA_PATH ;
2013-11-22 21:12:59 +00:00
# else
2013-11-22 20:16:21 +00:00
{
// Under Windows, the default data_dir is under /data in the pdf2htmlEX directory
stringstream ss ;
ss < < dirname ( argv [ 0 ] ) < < " /data " ;
2013-11-22 21:12:59 +00:00
param . data_dir = ss . str ( ) ;
// Under Windows, the temp path is not under /tmp, find it.
char temppath [ MAX_PATH ] ;
: : GetTempPath ( MAX_PATH , temppath ) ;
param . basetmp_dir = temppath ;
2013-11-22 20:16:21 +00:00
}
# endif
2013-09-18 10:01:56 +00:00
parse_options ( argc , argv ) ;
check_param ( ) ;
2012-08-14 10:12:58 +00:00
//prepare the directories
2013-11-22 21:12:59 +00:00
prepare_directories ( ) ;
2012-09-09 18:22:49 +00:00
2013-11-22 21:12:59 +00:00
if ( param . debug ) {
2012-09-09 17:23:28 +00:00
cerr < < " temporary dir: " < < ( param . tmp_dir ) < < endl ;
2013-11-22 21:12:59 +00:00
}
2012-09-09 17:23:28 +00:00
2012-08-14 10:12:58 +00:00
try
{
create_directories ( param . dest_dir ) ;
}
2012-09-09 17:23:28 +00:00
catch ( const string & s )
2012-08-14 10:12:58 +00:00
{
2012-09-09 17:23:28 +00:00
cerr < < s < < endl ;
2012-09-10 09:01:15 +00:00
exit ( EXIT_FAILURE ) ;
2012-08-14 10:12:58 +00:00
}
2012-09-10 09:01:15 +00:00
bool finished = false ;
2012-08-04 18:03:53 +00:00
// read config file
globalParams = new GlobalParams ( ) ;
// open PDF file
2013-04-06 09:01:05 +00:00
PDFDoc * doc = nullptr ;
2012-09-10 09:01:15 +00:00
try
{
{
GooString * ownerPW = ( param . owner_password = = " " ) ? ( nullptr ) : ( new GooString ( param . owner_password . c_str ( ) ) ) ;
GooString * userPW = ( param . user_password = = " " ) ? ( nullptr ) : ( new GooString ( param . user_password . c_str ( ) ) ) ;
GooString fileName ( param . input_filename . c_str ( ) ) ;
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
doc = PDFDocFactory ( ) . createPDFDoc ( fileName , ownerPW , userPW ) ;
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
delete userPW ;
delete ownerPW ;
}
2012-08-04 18:03:53 +00:00
2013-11-22 20:16:21 +00:00
if ( ! doc - > isOk ( ) )
2012-09-10 09:01:15 +00:00
throw " Cannot read the file " ;
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
// check for copy permission
2013-11-22 20:16:21 +00:00
if ( ! doc - > okToCopy ( ) )
2013-09-18 10:01:56 +00:00
{
if ( param . no_drm = = 0 )
2013-01-25 00:56:49 +00:00
throw " Copying of text from this document is not allowed. " ;
cerr < < " Document has copy-protection bit set. " < < endl ;
2012-09-10 09:01:15 +00:00
}
2012-08-04 18:03:53 +00:00
2012-10-02 18:19:40 +00:00
param . first_page = min < int > ( max < int > ( param . first_page , 1 ) , doc - > getNumPages ( ) ) ;
param . last_page = min < int > ( max < int > ( param . last_page , param . first_page ) , doc - > getNumPages ( ) ) ;
2012-09-09 17:18:09 +00:00
2012-08-04 18:03:53 +00:00
2013-04-06 09:01:05 +00:00
unique_ptr < HTMLRenderer > ( new HTMLRenderer ( param ) ) - > process ( doc ) ;
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
finished = true ;
}
2012-09-10 14:44:19 +00:00
catch ( const char * s )
{
cerr < < " Error: " < < s < < endl ;
}
2012-09-10 09:01:15 +00:00
catch ( const string & s )
{
cerr < < " Error: " < < s < < endl ;
}
2012-08-04 18:03:53 +00:00
// clean up
2013-04-06 09:01:05 +00:00
delete doc ;
delete globalParams ;
2012-08-04 18:03:53 +00:00
// check for memory leaks
Object : : memCheck ( stderr ) ;
gMemReport ( stderr ) ;
2012-09-10 09:01:15 +00:00
exit ( finished ? ( EXIT_SUCCESS ) : ( EXIT_FAILURE ) ) ;
2012-09-09 19:30:54 +00:00
return 0 ;
2012-08-04 18:03:53 +00:00
}