2012-08-04 18:03:53 +00:00
|
|
|
// pdftohtmlEX.cc
|
|
|
|
//
|
2012-08-28 10:27:45 +00:00
|
|
|
// Copyright (C) 2012 Lu Wang coolwanglu<at>gmail.com
|
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
#include <cstdio>
|
|
|
|
#include <cstdlib>
|
|
|
|
#include <cstddef>
|
|
|
|
#include <cstring>
|
|
|
|
#include <ctime>
|
|
|
|
#include <string>
|
|
|
|
#include <limits>
|
2012-08-12 10:53:22 +00:00
|
|
|
#include <iostream>
|
2012-09-10 05:03:25 +00:00
|
|
|
#include <getopt.h>
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-12 10:53:22 +00:00
|
|
|
#include <goo/GooString.h>
|
|
|
|
|
2012-08-13 14:20:38 +00:00
|
|
|
#include <Object.h>
|
|
|
|
#include <PDFDoc.h>
|
|
|
|
#include <PDFDocFactory.h>
|
|
|
|
#include <GlobalParams.h>
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-13 14:20:38 +00:00
|
|
|
#include "HTMLRenderer.h"
|
2012-08-04 18:03:53 +00:00
|
|
|
#include "Param.h"
|
2012-09-10 05:08:47 +00:00
|
|
|
#include "pdf2htmlEX-config.h"
|
2012-09-10 09:01:15 +00:00
|
|
|
#include "ArgParser.h"
|
2012-08-04 18:03:53 +00:00
|
|
|
|
|
|
|
using namespace std;
|
2012-09-11 13:52:46 +00:00
|
|
|
using namespace pdf2htmlEX;
|
2012-08-04 18:03:53 +00:00
|
|
|
|
|
|
|
Param param;
|
2012-09-10 09:01:15 +00:00
|
|
|
ArgParser argparser;
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-09-10 14:22:01 +00:00
|
|
|
void show_usage_and_exit(const char * dummy = nullptr)
|
2012-08-04 18:03:53 +00:00
|
|
|
{
|
2012-08-14 05:56:41 +00:00
|
|
|
cerr << "pdftohtmlEX version " << PDF2HTMLEX_VERSION << endl;
|
2012-08-04 18:03:53 +00:00
|
|
|
cerr << endl;
|
|
|
|
cerr << "Copyright 2012 Lu Wang (coolwanglu<at>gmail.com)" << endl;
|
|
|
|
cerr << endl;
|
2012-09-10 09:01:15 +00:00
|
|
|
cerr << "Usage: pdf2htmlEX [Options] <input.pdf> [<output.html>]" << endl;
|
|
|
|
cerr << endl;
|
|
|
|
cerr << "Options:" << endl;
|
|
|
|
argparser.show_usage(cerr);
|
2012-08-04 18:03:53 +00:00
|
|
|
cerr << endl;
|
2012-09-10 09:01:15 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2012-08-04 18:03:53 +00:00
|
|
|
}
|
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
void parse_options (int argc, char **argv)
|
2012-08-04 18:03:53 +00:00
|
|
|
{
|
2012-09-10 09:01:15 +00:00
|
|
|
argparser
|
|
|
|
.add("help,h", "show all options", &show_usage_and_exit)
|
|
|
|
.add("version,v", "show copyright and version info", &show_usage_and_exit)
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("owner-password,o", ¶m.owner_password, "", "owner password (for encrypted files)")
|
|
|
|
.add("user-password,u", ¶m.user_password, "", "user password (for encrypted files)")
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-12 16:16:34 +00:00
|
|
|
.add("dest-dir", ¶m.dest_dir, ".", "specify destination directory")
|
|
|
|
.add("data-dir", ¶m.data_dir, PDF2HTMLEX_DATA_PATH, "specify data directory")
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("first-page,f", ¶m.first_page, 1, "first page to process")
|
|
|
|
.add("last-page,l", ¶m.last_page, numeric_limits<int>::max(), "last page to process")
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("zoom", ¶m.zoom, 1.0, "zoom ratio")
|
|
|
|
.add("hdpi", ¶m.h_dpi, 144.0, "horizontal DPI for non-text")
|
|
|
|
.add("vdpi", ¶m.v_dpi, 144.0, "vertical DPI for non-text")
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("process-nontext", ¶m.process_nontext, 1, "process nontext objects")
|
|
|
|
.add("single-html", ¶m.single_html, 1, "combine everything into one single HTML file")
|
2012-09-12 15:26:14 +00:00
|
|
|
.add("split-pages", ¶m.split_pages, 0, "split pages into separated files")
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("embed-base-font", ¶m.embed_base_font, 0, "embed local matched font for base 14 fonts in the PDF file")
|
|
|
|
.add("embed-external-font", ¶m.embed_external_font, 0, "embed local matched font for external fonts in the PDF file")
|
|
|
|
.add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, for example 'fi' -> 'f''i'")
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("heps", ¶m.h_eps, 1.0, "max tolerated horizontal offset (in pixels)")
|
|
|
|
.add("veps", ¶m.v_eps, 1.0, "max tolerated vertical offset (in pixels)")
|
|
|
|
.add("space-threshold", ¶m.space_threshold, (1.0/8), "distance no thiner than (threshold * em) will be considered as a space character")
|
2012-09-24 16:55:41 +00:00
|
|
|
.add("font-size-multiplier", ¶m.font_size_multiplier, 4.0, "setting a value greater than 1 would increase the rendering accuracy")
|
2012-09-23 18:28:53 +00:00
|
|
|
.add("auto-hint", ¶m.auto_hint, 0, "Whether to generate hints for fonts")
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("tounicode", ¶m.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
|
|
|
|
.add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets")
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-12 15:26:14 +00:00
|
|
|
.add("css-filename", ¶m.css_filename, "", "Specify the file name of the generated css file")
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("font-suffix", ¶m.font_suffix, ".ttf", "suffix for extracted font files")
|
2012-09-11 14:08:55 +00:00
|
|
|
.add("font-format", ¶m.font_format, "opentype", "format for extracted font files")
|
2012-09-23 18:28:53 +00:00
|
|
|
.add("external-hint-tool", ¶m.external_hint_tool, "", "external tool for hintting fonts.(overrides --auto-hint)")
|
2012-08-31 05:00:24 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
.add("debug", ¶m.debug, 0, "output debug information")
|
|
|
|
.add("clean-tmp", ¶m.clean_tmp, 1, "clean temporary files after processing")
|
2012-09-10 14:22:01 +00:00
|
|
|
.add("", ¶m.input_filename, "", "")
|
|
|
|
.add("", ¶m.output_filename, "", "")
|
2012-08-04 18:03:53 +00:00
|
|
|
;
|
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
try
|
|
|
|
{
|
|
|
|
argparser.parse(argc, argv);
|
2012-08-04 18:03:53 +00:00
|
|
|
}
|
2012-09-10 14:44:19 +00:00
|
|
|
catch(const char * s)
|
|
|
|
{
|
|
|
|
// if s == "", getopt_long would have printed the error message
|
|
|
|
if(s && s[0])
|
|
|
|
{
|
|
|
|
cerr << "Error when parsing the arguments:" << endl;
|
|
|
|
cerr << s << endl;
|
|
|
|
}
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
2012-09-10 09:01:15 +00:00
|
|
|
catch(const std::string & s)
|
|
|
|
{
|
2012-09-10 14:44:19 +00:00
|
|
|
// if s == "", getopt_long would have printed the error message
|
|
|
|
if(s != "")
|
|
|
|
{
|
|
|
|
cerr << "Error when parsing the arguments:" << endl;
|
|
|
|
cerr << s << endl;
|
|
|
|
}
|
2012-09-10 09:01:15 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2012-08-04 18:03:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int main(int argc, char **argv)
|
|
|
|
{
|
2012-09-10 09:01:15 +00:00
|
|
|
parse_options(argc, argv);
|
|
|
|
if (param.input_filename == "")
|
2012-08-04 18:03:53 +00:00
|
|
|
{
|
2012-09-10 14:22:01 +00:00
|
|
|
cerr << "Missing input filename" << endl;
|
|
|
|
exit(EXIT_FAILURE);
|
2012-08-04 18:03:53 +00:00
|
|
|
}
|
|
|
|
|
2012-08-14 10:12:58 +00:00
|
|
|
//prepare the directories
|
2012-09-09 18:22:49 +00:00
|
|
|
{
|
2012-09-10 09:01:15 +00:00
|
|
|
char buf[] = "/tmp/pdf2htmlEX-XXXXXX";
|
|
|
|
auto p = mkdtemp(buf);
|
|
|
|
if(p == nullptr)
|
|
|
|
{
|
|
|
|
cerr << "Cannot create temp directory" << endl;
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
param.tmp_dir = buf;
|
2012-09-09 18:22:49 +00:00
|
|
|
}
|
|
|
|
|
2012-09-09 17:23:28 +00:00
|
|
|
if(param.debug)
|
|
|
|
cerr << "temporary dir: " << (param.tmp_dir) << endl;
|
|
|
|
|
2012-08-14 10:12:58 +00:00
|
|
|
try
|
|
|
|
{
|
|
|
|
create_directories(param.dest_dir);
|
|
|
|
}
|
2012-09-09 17:23:28 +00:00
|
|
|
catch (const string & s)
|
2012-08-14 10:12:58 +00:00
|
|
|
{
|
2012-09-09 17:23:28 +00:00
|
|
|
cerr << s << endl;
|
2012-09-10 09:01:15 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2012-08-14 10:12:58 +00:00
|
|
|
}
|
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
bool finished = false;
|
2012-08-04 18:03:53 +00:00
|
|
|
// read config file
|
|
|
|
globalParams = new GlobalParams();
|
|
|
|
// open PDF file
|
2012-09-10 09:01:15 +00:00
|
|
|
PDFDoc *doc = nullptr;
|
|
|
|
try
|
|
|
|
{
|
|
|
|
{
|
|
|
|
GooString * ownerPW = (param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str()));
|
|
|
|
GooString * userPW = (param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str()));
|
|
|
|
GooString fileName(param.input_filename.c_str());
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
delete userPW;
|
|
|
|
delete ownerPW;
|
|
|
|
}
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
if (!doc->isOk()) {
|
|
|
|
throw "Cannot read the file";
|
|
|
|
}
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
// check for copy permission
|
|
|
|
if (!doc->okToCopy()) {
|
|
|
|
throw "Copying of text from this document is not allowed.";
|
|
|
|
}
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
param.first_page = min(max(param.first_page, 1), doc->getNumPages());
|
|
|
|
param.last_page = min(max(param.last_page, param.first_page), doc->getNumPages());
|
2012-09-09 17:18:09 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
if(param.output_filename == "")
|
2012-08-04 18:03:53 +00:00
|
|
|
{
|
2012-09-10 09:01:15 +00:00
|
|
|
const string s = get_filename(param.input_filename);
|
|
|
|
|
|
|
|
if(get_suffix(param.input_filename) == ".pdf")
|
|
|
|
{
|
2012-09-13 03:38:56 +00:00
|
|
|
if(param.split_pages)
|
|
|
|
param.output_filename = s.substr(0, s.size() - 4);
|
|
|
|
else
|
2012-09-12 15:26:14 +00:00
|
|
|
param.output_filename = s.substr(0, s.size() - 4) + ".html";
|
2012-09-13 03:38:56 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2012-09-13 03:38:56 +00:00
|
|
|
if(param.split_pages)
|
|
|
|
param.output_filename = s;
|
|
|
|
else
|
2012-09-12 15:26:14 +00:00
|
|
|
param.output_filename = s + ".html";
|
2012-09-13 03:38:56 +00:00
|
|
|
|
2012-09-12 15:26:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if(param.css_filename == "")
|
|
|
|
{
|
|
|
|
const string s = get_filename(param.input_filename);
|
|
|
|
|
|
|
|
if(get_suffix(param.input_filename) == ".pdf")
|
|
|
|
{
|
2012-09-12 15:55:29 +00:00
|
|
|
param.css_filename = s.substr(0, s.size() - 4) + ".css";
|
2012-09-12 15:26:14 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if(!param.split_pages)
|
2012-09-12 15:55:29 +00:00
|
|
|
param.css_filename = s + ".css";
|
2012-09-10 09:01:15 +00:00
|
|
|
}
|
2012-08-04 18:03:53 +00:00
|
|
|
}
|
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
HTMLRenderer * htmlOut = new HTMLRenderer(¶m);
|
|
|
|
htmlOut->process(doc);
|
|
|
|
delete htmlOut;
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
finished = true;
|
|
|
|
}
|
2012-09-10 14:44:19 +00:00
|
|
|
catch (const char * s)
|
|
|
|
{
|
|
|
|
cerr << "Error: " << s << endl;
|
|
|
|
}
|
2012-09-10 09:01:15 +00:00
|
|
|
catch (const string & s)
|
|
|
|
{
|
|
|
|
cerr << "Error: " << s << endl;
|
|
|
|
}
|
2012-08-04 18:03:53 +00:00
|
|
|
|
|
|
|
// clean up
|
|
|
|
if(doc) delete doc;
|
|
|
|
if(globalParams) delete globalParams;
|
|
|
|
|
|
|
|
// check for memory leaks
|
|
|
|
Object::memCheck(stderr);
|
|
|
|
gMemReport(stderr);
|
|
|
|
|
2012-09-10 09:01:15 +00:00
|
|
|
exit(finished ? (EXIT_SUCCESS) : (EXIT_FAILURE));
|
2012-09-09 19:30:54 +00:00
|
|
|
|
|
|
|
return 0;
|
2012-08-04 18:03:53 +00:00
|
|
|
}
|