2012-08-04 18:03:53 +00:00
// pdftohtmlEX.cc
//
2012-08-28 10:27:45 +00:00
// Copyright (C) 2012 Lu Wang coolwanglu<at>gmail.com
2012-08-04 18:03:53 +00:00
# include <cstdio>
# include <cstdlib>
# include <cstddef>
# include <cstring>
# include <ctime>
# include <string>
# include <limits>
2012-08-12 10:53:22 +00:00
# include <iostream>
2012-08-04 18:03:53 +00:00
# include <boost/program_options.hpp>
2012-08-15 07:29:35 +00:00
# include <boost/filesystem.hpp>
2012-08-04 18:03:53 +00:00
2012-08-12 10:53:22 +00:00
# include <goo/GooString.h>
2012-08-13 14:20:38 +00:00
# include <Object.h>
# include <PDFDoc.h>
# include <PDFDocFactory.h>
# include <GlobalParams.h>
2012-08-04 18:03:53 +00:00
2012-08-13 14:20:38 +00:00
# include "HTMLRenderer.h"
2012-08-04 18:03:53 +00:00
# include "Param.h"
2012-08-14 05:56:41 +00:00
# include "config.h"
2012-08-04 18:03:53 +00:00
namespace po = boost : : program_options ;
using namespace std ;
2012-08-14 10:12:58 +00:00
using namespace boost : : filesystem ;
2012-08-04 18:03:53 +00:00
Param param ;
// variables
PDFDoc * doc = nullptr ;
GooString * fileName = nullptr ;
GooString * ownerPW , * userPW ;
HTMLRenderer * htmlOut = nullptr ;
int finished = - 1 ;
po : : options_description opt_visible ( " Options " ) , opt_hidden , opt_all ;
po : : positional_options_description opt_positional ;
void show_usage ( void )
{
2012-08-14 05:56:41 +00:00
cerr < < " pdftohtmlEX version " < < PDF2HTMLEX_VERSION < < endl ;
2012-08-04 18:03:53 +00:00
cerr < < endl ;
cerr < < " Copyright 2012 Lu Wang (coolwanglu<at>gmail.com) " < < endl ;
cerr < < endl ;
2012-08-13 02:26:10 +00:00
cerr < < " Usage: pdf2htmlEX [Options] <PDF-file> " < < endl ;
2012-08-04 18:03:53 +00:00
cerr < < endl ;
cerr < < opt_visible < < endl ;
}
/**
 * Register all command-line options, bind them to the fields of the global
 * `param`, and parse the given argument vector.
 *
 * The first two positional arguments are mapped to the hidden options
 * "inputfilename" and "outputfilename".
 *
 * @param argc  argument count as received by main()
 * @param argv  argument vector as received by main()
 * @return the map of parsed options (used by the caller to test for
 *         "help" / "version")
 *
 * If parsing fails, the parser's message (when available) and the usage
 * text are written to standard error and the process exits with status -1.
 */
po::variables_map parse_options(int argc, char **argv)
{
    opt_visible.add_options()
        ("help", "show all options")
        ("version,v", "show copyright and version info")

        ("owner-password,o", po::value<string>(&param.owner_password)->default_value(""), "owner password (for encrypted files)")
        ("user-password,u", po::value<string>(&param.user_password)->default_value(""), "user password (for encrypted files)")

        ("dest-dir", po::value<string>(&param.dest_dir)->default_value("."), "destination directory")
        // The appended component must be relative: a rooted right-hand side
        // ("/pdf2htmlEX") replaces the temp directory instead of extending it
        // under std::filesystem-style append semantics.
        ("tmp-dir", po::value<string>(&param.tmp_dir)->default_value((temp_directory_path() / "pdf2htmlEX").string()), "temporary directory")

        ("first-page,f", po::value<int>(&param.first_page)->default_value(1), "first page to process")
        ("last-page,l", po::value<int>(&param.last_page)->default_value(numeric_limits<int>::max()), "last page to process")

        ("zoom", po::value<double>(&param.zoom)->default_value(1.0), "zoom ratio")
        ("hdpi", po::value<double>(&param.h_dpi)->default_value(144.0), "horizontal DPI for non-text")
        ("vdpi", po::value<double>(&param.v_dpi)->default_value(144.0), "vertical DPI for non-text")

        ("process-nontext", po::value<int>(&param.process_nontext)->default_value(1), "process nontext objects")
        ("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file")
        ("embed-base-font", po::value<int>(&param.embed_base_font)->default_value(0), "embed local matched font for base 14 fonts in the PDF file")
        ("embed-external-font", po::value<int>(&param.embed_external_font)->default_value(0), "embed local matched font for external fonts in the PDF file")
        ("decompose-ligature", po::value<int>(&param.decompose_ligature)->default_value(0), "decompose ligatures, for example 'fi' -> 'f''i'")

        ("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
        ("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")
        ("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0 / 8), "distance no thiner than (threshold * em) will be considered as a space character")
        ("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
        ("tounicode", po::value<int>(&param.tounicode)->default_value(0), "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
        ("space-as-offset", po::value<int>(&param.space_as_offset)->default_value(0), "treat space characters as offsets")

        ("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
        ("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")

        ("debug", po::value<int>(&param.debug)->default_value(0), "output debug information")
        ("clean-tmp", po::value<int>(&param.clean_tmp)->default_value(1), "clean temporary files after processing")
        ;

    // Hidden options: only reachable through the positional mapping below,
    // and left out of the usage text.
    opt_hidden.add_options()
        ("inputfilename", po::value<string>(&param.input_filename)->default_value(""), "")
        ("outputfilename", po::value<string>(&param.output_filename)->default_value(""), "")
        ;

    opt_positional.add("inputfilename", 1).add("outputfilename", 1);

    opt_all.add(opt_visible).add(opt_hidden);

    try
    {
        po::variables_map opt_vm;
        po::store(po::command_line_parser(argc, argv)
                      .options(opt_all)
                      .positional(opt_positional)
                      .run(),
                  opt_vm);
        // notify() writes the parsed values into the bound `param` fields.
        po::notify(opt_vm);
        return opt_vm;
    }
    catch (const po::error &err)
    {
        // Tell the user what was wrong with the command line before
        // showing the usage text.
        cerr << err.what() << endl;
        show_usage();
        exit(-1);
    }
    catch (...)
    {
        show_usage();
        exit(-1);
    }
}
int main ( int argc , char * * argv )
{
auto opt_map = parse_options ( argc , argv ) ;
if ( opt_map . count ( " version " ) | | opt_map . count ( " help " ) | | ( param . input_filename = = " " ) )
{
show_usage ( ) ;
return - 1 ;
}
2012-08-14 10:12:58 +00:00
//prepare the directories
2012-09-09 06:48:10 +00:00
auto user_dirs = { param . dest_dir , param . tmp_dir } ;
for ( auto iter = user_dirs . begin ( ) ; iter ! = user_dirs . end ( ) ; + + iter )
2012-08-15 07:29:35 +00:00
{
2012-09-09 06:48:10 +00:00
const auto & p = * iter ;
2012-08-17 10:13:21 +00:00
if ( equivalent ( PDF2HTMLEX_DATA_PATH , p ) )
2012-08-15 07:29:35 +00:00
{
cerr < < " The specified directory \" " < < p < < " \" is the library path for pdf2htmlEX. Please use another one. " < < endl ;
return - 1 ;
}
}
2012-08-14 10:12:58 +00:00
try
{
create_directories ( param . dest_dir ) ;
create_directories ( param . tmp_dir ) ;
}
catch ( const filesystem_error & err )
{
cerr < < err . what ( ) < < endl ;
return - 1 ;
}
2012-08-04 18:03:53 +00:00
// read config file
globalParams = new GlobalParams ( ) ;
// open PDF file
ownerPW = ( param . owner_password = = " " ) ? ( nullptr ) : ( new GooString ( param . owner_password . c_str ( ) ) ) ;
userPW = ( param . user_password = = " " ) ? ( nullptr ) : ( new GooString ( param . user_password . c_str ( ) ) ) ;
fileName = new GooString ( param . input_filename . c_str ( ) ) ;
doc = PDFDocFactory ( ) . createPDFDoc ( * fileName , ownerPW , userPW ) ;
delete userPW ;
delete ownerPW ;
if ( ! doc - > isOk ( ) ) {
goto error ;
}
// check for copy permission
if ( ! doc - > okToCopy ( ) ) {
error ( errNotAllowed , - 1 , " Copying of text from this document is not allowed. " ) ;
goto error ;
}
param . first_page = min ( max ( param . first_page , 1 ) , doc - > getNumPages ( ) ) ;
param . last_page = min ( max ( param . last_page , param . first_page ) , doc - > getNumPages ( ) ) ;
if ( param . output_filename = = " " )
{
2012-08-16 07:54:50 +00:00
const string s = path ( param . input_filename ) . filename ( ) . string ( ) ;
2012-09-07 17:18:15 +00:00
if ( ( s . size ( ) > = 4 ) & & ( s . compare ( s . size ( ) - 4 , 4 , " .pdf " ) = = 0 ) )
2012-08-04 18:03:53 +00:00
{
param . output_filename = s . substr ( 0 , s . size ( ) - 4 ) + " .html " ;
}
else
{
param . output_filename = s + " .html " ;
}
}
htmlOut = new HTMLRenderer ( & param ) ;
htmlOut - > process ( doc ) ;
delete htmlOut ;
finished = 0 ;
// clean up
error :
if ( doc ) delete doc ;
delete fileName ;
if ( globalParams ) delete globalParams ;
// check for memory leaks
Object : : memCheck ( stderr ) ;
gMemReport ( stderr ) ;
return finished ;
}