1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-08 10:50:33 +00:00
pdf2htmlEX/src/pdf2htmlEX.cc

286 lines
9.6 KiB
C++
Raw Normal View History

2012-08-04 18:03:53 +00:00
// pdf2htmlEX.cc
//
2013-01-24 14:21:06 +00:00
// Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
2012-08-28 10:27:45 +00:00
2012-08-04 18:03:53 +00:00
#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <cstring>
#include <ctime>
#include <string>
#include <limits>
2012-08-12 10:53:22 +00:00
#include <iostream>
2012-09-10 05:03:25 +00:00
#include <getopt.h>
2012-08-04 18:03:53 +00:00
2013-01-23 12:29:59 +00:00
#include <poppler-config.h>
2012-08-12 10:53:22 +00:00
#include <goo/GooString.h>
2012-08-13 14:20:38 +00:00
#include <Object.h>
#include <PDFDoc.h>
#include <PDFDocFactory.h>
#include <GlobalParams.h>
2012-08-04 18:03:53 +00:00
#include "Param.h"
2012-09-10 05:08:47 +00:00
#include "pdf2htmlEX-config.h"
2012-11-29 09:28:05 +00:00
#include "HTMLRenderer/HTMLRenderer.h"
#include "util/ArgParser.h"
2012-11-29 10:16:05 +00:00
#include "util/path.h"
2013-01-23 12:29:59 +00:00
#include "util/ffw.h"
2012-08-04 18:03:53 +00:00
using namespace std;
using namespace pdf2htmlEX;
2012-08-04 18:03:53 +00:00
Param param;
2012-09-10 09:01:15 +00:00
ArgParser argparser;
2012-08-04 18:03:53 +00:00
2013-01-23 15:02:11 +00:00
void show_usage_and_exit(const char * dummy = nullptr)
2013-01-23 12:29:59 +00:00
{
2013-01-28 22:16:38 +00:00
cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << endl;
2012-09-10 09:01:15 +00:00
argparser.show_usage(cerr);
exit(EXIT_FAILURE);
2012-08-04 18:03:53 +00:00
}
2013-01-23 12:29:59 +00:00
void show_version_and_exit(const char * dummy = nullptr)
{
2013-01-23 15:02:11 +00:00
cerr << "pdftohtmlEX version " << PDF2HTMLEX_VERSION << endl;
2013-01-24 14:21:06 +00:00
cerr << "Copyright 2012,2013 Lu Wang <coolwanglu@gmail.com>" << endl;
2013-01-23 15:02:11 +00:00
cerr << "Libraries: ";
cerr << "poppler " << POPPLER_VERSION << ", ";
cerr << "libfontforge " << ffw_get_version() << endl;
2013-01-25 13:13:27 +00:00
exit(EXIT_SUCCESS);
2013-01-23 12:29:59 +00:00
}
2012-09-10 09:01:15 +00:00
void parse_options (int argc, char **argv)
2012-08-04 18:03:53 +00:00
{
2012-09-10 09:01:15 +00:00
argparser
// pages
2013-01-28 22:45:12 +00:00
.add("first-page,f", &param.first_page, 1, "first page to convert")
.add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to convert")
// dimensions
2012-09-26 16:17:56 +00:00
.add("zoom", &param.zoom, 0, "zoom ratio", nullptr, true)
2013-01-28 22:45:12 +00:00
.add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", nullptr, true)
.add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", nullptr, true)
.add("use-cropbox", &param.use_cropbox, 0, "use CropBox instead of MediaBox")
2013-01-28 22:45:12 +00:00
.add("hdpi", &param.h_dpi, 144.0, "horizontal resolution for graphics in DPI")
.add("vdpi", &param.v_dpi, 144.0, "vertical resolution for graphics in DPI")
// output files
2013-01-28 22:45:12 +00:00
.add("single-html", &param.single_html, 1, "generate a single HTML file")
.add("split-pages", &param.split_pages, 0, "split pages into separate files")
.add("dest-dir", &param.dest_dir, ".", "specify destination directory")
.add("css-filename", &param.css_filename, "", "filename of the generated css file")
.add("outline-filename", &param.outline_filename, "", "filename of the generated outline file")
2013-01-29 10:38:39 +00:00
// fonts
2013-01-28 22:45:12 +00:00
.add("embed-base-font", &param.embed_base_font, 0, "embed local match for standard 14 fonts")
.add("embed-external-font", &param.embed_external_font, 0, "embed local match for external fonts")
.add("font-suffix", &param.font_suffix, ".ttf", "suffix for embedded font files (.ttf,.otf,.woff,.svg)")
.add("font-format", &param.font_format, "opentype", "CSS @font-face format for embedded fonts")
2013-01-28 22:45:12 +00:00
.add("decompose-ligature", &param.decompose_ligature, 0, "decompose ligatures, such as \uFB01 -> fi")
.add("remove-unused-glyph", &param.remove_unused_glyph, 1, "remove unused glyphs in embedded fonts")
.add("auto-hint", &param.auto_hint, 0, "use fontforge autohint on fonts without hints")
.add("external-hint-tool", &param.external_hint_tool, "", "external tool for hinting fonts (overrides --auto-hint)")
.add("stretch-narrow-glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding them")
.add("squeeze-wide-glyph", &param.squeeze_wide_glyph, 1, "shrink wide glyphs instead of truncating them")
// text
2013-01-28 22:45:12 +00:00
.add("heps", &param.h_eps, 1.0, "horizontal threshold for merging text, in pixels")
.add("veps", &param.v_eps, 1.0, "vertical threshold for merging text, in pixels")
.add("space-threshold", &param.space_threshold, (1.0/8), "word break threshold (threshold * em)")
.add("font-size-multiplier", &param.font_size_multiplier, 4.0, "a value greater than 1 increases the rendering accuracy")
2012-09-10 09:01:15 +00:00
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
.add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
// encryption
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", nullptr, true)
.add("user-password,u", &param.user_password, "", "user password (for encrypted files)", nullptr, true)
.add("no-drm", &param.no_drm, 0, "override document DRM settings")
// misc.
2013-01-28 22:45:12 +00:00
.add("clean-tmp", &param.clean_tmp, 1, "remove temporary files after conversion")
.add("process-nontext", &param.process_nontext, 1, "render graphics in addition to text")
.add("data-dir", &param.data_dir, PDF2HTMLEX_DATA_PATH, "specify data directory")
2013-01-28 22:45:12 +00:00
.add("css-draw", &param.css_draw, 0, "[experimental and unsupported] CSS drawing")
.add("debug", &param.debug, 0, "print debugging information")
// meta
.add("version,v", "print copyright and version info", &show_version_and_exit)
.add("help,h", "print usage information", &show_usage_and_exit)
2012-09-10 14:22:01 +00:00
.add("", &param.input_filename, "", "")
.add("", &param.output_filename, "", "")
2012-08-04 18:03:53 +00:00
;
2012-09-10 09:01:15 +00:00
try
{
argparser.parse(argc, argv);
2012-08-04 18:03:53 +00:00
}
2012-09-10 14:44:19 +00:00
catch(const char * s)
{
// if s == "", getopt_long would have printed the error message
if(s && s[0])
{
cerr << "Error when parsing the arguments:" << endl;
cerr << s << endl;
}
exit(EXIT_FAILURE);
}
2012-09-10 09:01:15 +00:00
catch(const std::string & s)
{
2012-09-10 14:44:19 +00:00
// if s == "", getopt_long would have printed the error message
if(s != "")
{
cerr << "Error when parsing the arguments:" << endl;
cerr << s << endl;
}
2012-09-10 09:01:15 +00:00
exit(EXIT_FAILURE);
2012-08-04 18:03:53 +00:00
}
}
int main(int argc, char **argv)
{
2012-09-10 09:01:15 +00:00
parse_options(argc, argv);
if (param.input_filename == "")
2012-08-04 18:03:53 +00:00
{
2013-01-28 22:16:38 +00:00
show_usage_and_exit();
2012-08-04 18:03:53 +00:00
}
2012-08-14 10:12:58 +00:00
//prepare the directories
2012-09-09 18:22:49 +00:00
{
2012-09-10 09:01:15 +00:00
char buf[] = "/tmp/pdf2htmlEX-XXXXXX";
auto p = mkdtemp(buf);
if(p == nullptr)
{
cerr << "Cannot create temp directory" << endl;
exit(EXIT_FAILURE);
}
param.tmp_dir = buf;
2012-09-09 18:22:49 +00:00
}
2012-09-09 17:23:28 +00:00
if(param.debug)
cerr << "temporary dir: " << (param.tmp_dir) << endl;
2012-08-14 10:12:58 +00:00
try
{
create_directories(param.dest_dir);
}
2012-09-09 17:23:28 +00:00
catch (const string & s)
2012-08-14 10:12:58 +00:00
{
2012-09-09 17:23:28 +00:00
cerr << s << endl;
2012-09-10 09:01:15 +00:00
exit(EXIT_FAILURE);
2012-08-14 10:12:58 +00:00
}
2012-09-10 09:01:15 +00:00
bool finished = false;
2012-08-04 18:03:53 +00:00
// read config file
globalParams = new GlobalParams();
// open PDF file
2012-09-10 09:01:15 +00:00
PDFDoc *doc = nullptr;
try
{
{
GooString * ownerPW = (param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str()));
GooString * userPW = (param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str()));
GooString fileName(param.input_filename.c_str());
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
delete userPW;
delete ownerPW;
}
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
if (!doc->isOk()) {
throw "Cannot read the file";
}
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
// check for copy permission
if (!doc->okToCopy()) {
if (param.no_drm == 0) {
2013-01-25 00:56:49 +00:00
throw "Copying of text from this document is not allowed.";
}
cerr << "Document has copy-protection bit set." << endl;
2012-09-10 09:01:15 +00:00
}
2012-08-04 18:03:53 +00:00
2012-10-02 18:19:40 +00:00
param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages());
param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages());
2012-09-09 17:18:09 +00:00
2013-01-28 12:00:20 +00:00
if(param.output_filename.empty())
2012-08-04 18:03:53 +00:00
{
2012-09-10 09:01:15 +00:00
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
2012-09-13 03:38:56 +00:00
if(param.split_pages)
param.output_filename = s.substr(0, s.size() - 4);
else
2012-09-12 15:26:14 +00:00
param.output_filename = s.substr(0, s.size() - 4) + ".html";
2012-09-13 03:38:56 +00:00
2012-09-10 09:01:15 +00:00
}
else
{
2012-09-13 03:38:56 +00:00
if(param.split_pages)
param.output_filename = s;
else
2012-09-12 15:26:14 +00:00
param.output_filename = s + ".html";
2012-09-13 03:38:56 +00:00
2012-09-12 15:26:14 +00:00
}
}
2013-01-28 12:00:20 +00:00
if(param.css_filename.empty())
2012-09-12 15:26:14 +00:00
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.css_filename = s.substr(0, s.size() - 4) + ".css";
2012-09-12 15:26:14 +00:00
}
else
{
if(!param.split_pages)
param.css_filename = s + ".css";
2012-09-10 09:01:15 +00:00
}
2012-08-04 18:03:53 +00:00
}
2013-01-28 12:00:20 +00:00
if(param.outline_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.outline_filename = s.substr(0, s.size() - 4) + ".outline";
}
else
{
if(!param.split_pages)
param.outline_filename = s + ".outline";
}
}
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
HTMLRenderer * htmlOut = new HTMLRenderer(&param);
htmlOut->process(doc);
delete htmlOut;
2012-08-04 18:03:53 +00:00
2012-09-10 09:01:15 +00:00
finished = true;
}
2012-09-10 14:44:19 +00:00
catch (const char * s)
{
cerr << "Error: " << s << endl;
}
2012-09-10 09:01:15 +00:00
catch (const string & s)
{
cerr << "Error: " << s << endl;
}
2012-08-04 18:03:53 +00:00
// clean up
if(doc) delete doc;
if(globalParams) delete globalParams;
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
2012-09-10 09:01:15 +00:00
exit(finished ? (EXIT_SUCCESS) : (EXIT_FAILURE));
2012-09-09 19:30:54 +00:00
return 0;
2012-08-04 18:03:53 +00:00
}