1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

working on getopt

This commit is contained in:
Lu Wang 2012-09-10 17:01:15 +08:00
parent efd15e3735
commit 2cc7da786a
5 changed files with 394 additions and 113 deletions

View File

@ -77,6 +77,8 @@ add_executable(pdf2htmlEX
src/FontPreprocessor.cc
src/include/util.h
src/util.cc
src/include/ArgParser.h
src/ArgParser.cc
src/include/pdf2htmlEX-config.h
)

132
src/ArgParser.cc Normal file
View File

@ -0,0 +1,132 @@
/*
* A wrapper of getopt
*
* by WangLu
* 2012.09.10
*/
#include <getopt.h>
#include <iostream>
#include <vector>
#include <unordered_map>
#include "ArgParser.h"
using std::ostream;
using std::cerr;
using std::endl;
using std::string;
using std::vector;
using std::unordered_map;
using std::make_pair;
using std::ostringstream;
/*
 * Release every entry registered through add().
 * The entries were allocated with new and are stored as raw pointers.
 */
ArgParser::~ArgParser(void)
{
    for(ArgEntryBase * entry : arg_entries)
    {
        delete entry;
    }
}
/*
 * Register an option that takes no argument.
 * Forwards to the templated add() with a null storage location, which is
 * what marks the entry as "no argument needed".
 */
ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback)
{
    char * const no_location = nullptr;
    return add<char>(optname, no_location, 0, description, callback);
}
void ArgParser::parse(int argc, char ** argv) const
{
//prepare optstring and longopts
vector<char> optstring;
optstring.reserve(arg_entries.size() + 1);
vector<struct option> longopts;
longopts.reserve(arg_entries.size() + 1);
unordered_map<int, const ArgEntryBase*> opt_map;
for(auto iter = arg_entries.begin(); iter != arg_entries.end(); ++iter)
{
const ArgEntryBase * p = *iter;
if(p->shortname != 0)
{
optstring.push_back(p->shortname);
if(p->need_arg)
optstring.push_back(':');
int v = p->shortname;
if(!(opt_map.insert(make_pair(v, p)).second))
{
cerr << "Warning: duplicated shortname '" << v << "' used by -" << (char)(p->shortname) << " and -" << (char)(opt_map[p->shortname]->shortname) << endl;
}
}
if(p->name != "")
{
int v = (256 + (iter - arg_entries.begin()));
longopts.push_back({p->name.c_str(), ((p->need_arg) ? required_argument : no_argument), nullptr, v});
if(!(opt_map.insert(make_pair(v, p)).second))
{
cerr << "Warning: duplicated shortname '" << v << "' used by --" << (p->name) << " and --" << (opt_map[p->shortname]->name) << endl;
}
}
}
optstring.push_back(0);
longopts.push_back({0,0,0,0});
{
int r;
int idx;
opterr = 0;
while(true)
{
r = getopt_long(argc, argv, &optstring.front(), &longopts.front(), &idx);
if(r == -1)
return;
if(r == ':')
{
ostringstream sout;
sout << "Missing argument for option ";
if(r < 256)
sout << "-" << (char)(opt_map[optopt]->shortname);
else
sout << "--" << opt_map[optopt]->name;
sout << endl;
throw sout.str();
}
cerr << r << ' ' << idx << ' ' << (optarg ? optarg : "") << endl;
}
}
}
/*
 * Print the usage line of every registered entry, in registration order.
 */
void ArgParser::show_usage(ostream & out) const
{
    for(const ArgEntryBase * entry : arg_entries)
        entry->show_usage(out);
}
/*
 * name may be given as "longname,c": the single character after the last
 * comma becomes the short name and is stripped from the long name.
 * Any other text after the last comma only triggers a warning.
 */
ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * description, bool need_arg)
    : shortname(0), name(name), description(description), need_arg(need_arg)
{
    const size_t comma = this->name.rfind(',');
    if(comma == string::npos)
        return;

    // exactly one character must follow the comma
    const bool single_char_suffix = (comma + 2 == this->name.size());
    if(!single_char_suffix)
    {
        cerr << "Warning: argument '" << this->name << "' may not be parsed correctly" << endl;
        return;
    }

    shortname = this->name.back();
    this->name.erase(comma);
}
/*
 * String defaults are printed surrounded by double quotes.
 */
void dump_default_value(std::ostream & out, const std::string & v)
{
    const char quote = '"';
    out << quote;
    out << v;
    out << quote;
}
// Width of the option column in show_usage(): shorter option texts are
// padded with spaces up to this many characters before the description.
const int ArgParser::arg_col_width = 40;

139
src/include/ArgParser.h Normal file
View File

@ -0,0 +1,139 @@
/*
* A wrapper of getopt
*
* by WangLu
* 2012.09.10
*/
#ifndef ARGPARSER_H__
#define ARGPARSER_H__
#include <string>
#include <vector>
#include <ostream>
#include <sstream>
/*
 * Collects option descriptions and hands them to getopt_long.
 *
 * Entries are allocated in add() and deleted in the destructor, so the
 * parser is the sole owner of them and must not be copied.
 */
class ArgParser
{
public:
    ArgParser() = default;
    // copying would make two parsers delete the same entries
    ArgParser(const ArgParser &) = delete;
    ArgParser & operator=(const ArgParser &) = delete;
    ~ArgParser(void);

    typedef void (*ArgParserCallBack) (void);

    /*
     * optname: name of the argument, should be provided as --optname;
     *          "optname,c" additionally registers the short form -c
     * description: if description is "", the argument won't be shown in show_usage()
     * callback: stored with the entry
     */
    ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback = nullptr);

    /*
     * Same as above for an option taking a value.
     * location: where the value lives; when it is not null it is
     *           initialised to default_value as soon as the option is added
     * default_value: also shown by show_usage()
     */
    template <class T, class Tv>
    ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback = nullptr);

    /* may throw std::string describing the problem */
    void parse(int argc, char ** argv) const;
    void show_usage(std::ostream & out) const;

private:
    // type-independent part of an option
    class ArgEntryBase
    {
    public:
        ArgEntryBase(const char * name, const char * description, bool need_arg);
        virtual ~ArgEntryBase() { }
        char shortname;          // 0 if the option has no short form
        std::string name;        // long name, without the leading "--"
        std::string description;
        bool need_arg;
        virtual void parse (void) const = 0;
        virtual void show_usage (std::ostream & out) const = 0;
    };

    // an option whose value is stored as a T, with the default given as a Tv
    template <class T, class Tv>
    class ArgEntry : public ArgEntryBase
    {
    public:
        ArgEntry(const char * name, T * location, const Tv & default_value, ArgParserCallBack callback, const char * description);

        virtual void parse (void) const;
        virtual void show_usage (std::ostream & out) const;

    private:
        T * location;
        T default_value;
        ArgParserCallBack callback;
    };

    // owned; deleted in ~ArgParser
    std::vector<ArgEntryBase *> arg_entries;
    static const int arg_col_width;
};
/*
 * Create an entry for an option with a value and append it to the list.
 * Returns *this so that calls can be chained.
 */
template<class T, class Tv>
ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback)
{
    ArgEntryBase * entry = new ArgEntry<T, Tv>(optname, location, default_value, callback, description);
    arg_entries.push_back(entry);
    return *this;
}
/*
 * An entry needs an argument exactly when a storage location is given;
 * in that case the location is initialised with the default value.
 */
template<class T, class Tv>
ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, ArgParserCallBack callback, const char * description)
    : ArgEntryBase(name, description, (location != nullptr))
    , location(location)
    , default_value(default_value)
    , callback(callback)
{
    if(!need_arg)
        return;

    *(this->location) = T(default_value);
}
// Currently a no-op: nothing is read or stored here.
template<class T, class Tv>
void ArgParser::ArgEntry<T, Tv>::parse(void) const
{ }
// helper
/*
 * Generic case: write the default value with its own operator<<.
 */
template<class T>
void dump_default_value(std::ostream & out, const T & value)
{
    out << value;
}
extern void dump_default_value(std::ostream & out, const std::string & v);
/*
 * Print one usage line: the option text (short form, long form and,
 * for options with a value, the default), padded to arg_col_width,
 * followed by the description.
 * Entries with an empty description print nothing.
 */
template<class T, class Tv>
void ArgParser::ArgEntry<T, Tv>::show_usage(std::ostream & out) const
{
    if(description.empty())
        return;

    const bool has_short = (shortname != 0);

    std::ostringstream label;
    label << " ";
    if(has_short)
        label << "-" << shortname;

    if(!name.empty())
    {
        if(has_short)
            label << ",";
        label << "--" << name;
    }

    if(need_arg)
    {
        label << " <arg> (=";
        dump_default_value(label, default_value);
        label << ")";
    }

    const std::string text = label.str();
    out << text;

    // pad the option column; longer texts simply overflow it
    const int padding = arg_col_width - static_cast<int>(text.size());
    if(padding > 0)
        out << std::string(padding, ' ');

    out << " " << description << std::endl;
}
#endif //ARGPARSER_H__

View File

@ -0,0 +1,19 @@
/*
* config.h
* Compile time constants
*
* by WangLu
*/
#ifndef PDF2HTMLEX_CONFIG_H__
#define PDF2HTMLEX_CONFIG_H__
#include <string>
// Version string printed in the usage banner.
static const std::string PDF2HTMLEX_VERSION("0.3");
// NOTE(review): presumably the installation prefix chosen at configure time - confirm.
static const std::string PDF2HTMLEX_PREFIX("/usr/local");
// The prefix above followed by "/share/pdf2htmlEX".
static const std::string PDF2HTMLEX_DATA_PATH("/usr/local/share/pdf2htmlEX");
#endif //PDF2HTMLEX_CONFIG_H__

View File

@ -24,113 +24,96 @@
#include "HTMLRenderer.h"
#include "Param.h"
#include "pdf2htmlEX-config.h"
#include "ArgParser.h"
namespace po = boost::program_options;
using namespace std;
Param param;
ArgParser argparser;
// variables
PDFDoc *doc = nullptr;
GooString *fileName = nullptr;
GooString *ownerPW, *userPW;
HTMLRenderer *htmlOut = nullptr;
bool finished = false;
po::options_description opt_visible("Options"), opt_hidden, opt_all;
po::positional_options_description opt_positional;
void show_usage(void)
void show_usage_and_exit(void)
{
cerr << "pdftohtmlEX version " << PDF2HTMLEX_VERSION << endl;
cerr << endl;
cerr << "Copyright 2012 Lu Wang (coolwanglu<at>gmail.com)" << endl;
cerr << endl;
cerr << "Usage: pdf2htmlEX [Options] <PDF-file>" << endl;
cerr << "Usage: pdf2htmlEX [Options] <input.pdf> [<output.html>]" << endl;
cerr << endl;
cerr << opt_visible << endl;
cerr << "Options:" << endl;
argparser.show_usage(cerr);
cerr << endl;
exit(EXIT_FAILURE);
}
po::variables_map parse_options (int argc, char **argv)
void parse_options (int argc, char **argv)
{
opt_visible.add_options()
("help", "show all options")
("version,v", "show copyright and version info")
argparser
.add("help,h", "show all options", &show_usage_and_exit)
.add("version,v", "show copyright and version info", &show_usage_and_exit)
("owner-password,o", po::value<string>(&param.owner_password)->default_value(""), "owner password (for encrypted files)")
("user-password,u", po::value<string>(&param.user_password)->default_value(""), "user password (for encrypted files)")
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)")
.add("user-password,u", &param.user_password, "", "user password (for encrypted files)")
("dest-dir", po::value<string>(&param.dest_dir)->default_value("."), "destination directory")
.add("dest-dir", &param.dest_dir, ".", "destination directory")
("first-page,f", po::value<int>(&param.first_page)->default_value(1), "first page to process")
("last-page,l", po::value<int>(&param.last_page)->default_value(numeric_limits<int>::max()), "last page to process")
.add("first-page,f", &param.first_page, 1, "first page to process")
.add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to process")
("zoom", po::value<double>(&param.zoom)->default_value(1.0), "zoom ratio")
("hdpi", po::value<double>(&param.h_dpi)->default_value(144.0), "horizontal DPI for non-text")
("vdpi", po::value<double>(&param.v_dpi)->default_value(144.0), "vertical DPI for non-text")
.add("zoom", &param.zoom, 1.0, "zoom ratio")
.add("hdpi", &param.h_dpi, 144.0, "horizontal DPI for non-text")
.add("vdpi", &param.v_dpi, 144.0, "vertical DPI for non-text")
("process-nontext", po::value<int>(&param.process_nontext)->default_value(1), "process nontext objects")
("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file")
("embed-base-font", po::value<int>(&param.embed_base_font)->default_value(0), "embed local matched font for base 14 fonts in the PDF file")
("embed-external-font", po::value<int>(&param.embed_external_font)->default_value(0), "embed local matched font for external fonts in the PDF file")
("decompose-ligature", po::value<int>(&param.decompose_ligature)->default_value(0), "decompose ligatures, for example 'fi' -> 'f''i'")
.add("process-nontext", &param.process_nontext, 1, "process nontext objects")
.add("single-html", &param.single_html, 1, "combine everything into one single HTML file")
.add("embed-base-font", &param.embed_base_font, 0, "embed local matched font for base 14 fonts in the PDF file")
.add("embed-external-font", &param.embed_external_font, 0, "embed local matched font for external fonts in the PDF file")
.add("decompose-ligature", &param.decompose_ligature, 0, "decompose ligatures, for example 'fi' -> 'f''i'")
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")
("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/8), "distance no thiner than (threshold * em) will be considered as a space character")
("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
("tounicode", po::value<int>(&param.tounicode)->default_value(0), "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
("space-as-offset", po::value<int>(&param.space_as_offset)->default_value(0), "treat space characters as offsets")
.add("heps", &param.h_eps, 1.0, "max tolerated horizontal offset (in pixels)")
.add("veps", &param.v_eps, 1.0, "max tolerated vertical offset (in pixels)")
.add("space-threshold", &param.space_threshold, (1.0/8), "distance no thiner than (threshold * em) will be considered as a space character")
.add("font-size-multiplier", &param.font_size_multiplier, 10.0, "setting a value greater than 1 would increase the rendering accuracy")
.add("tounicode", &param.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")
.add("font-suffix", &param.font_suffix, ".ttf", "suffix for extracted font files")
.add("font-format", &param.font_format, "truetype", "format for extracted font files")
("debug", po::value<int>(&param.debug)->default_value(0), "output debug information")
("clean-tmp", po::value<int>(&param.clean_tmp)->default_value(1), "clean temporary files after processing")
.add("debug", &param.debug, 0, "output debug information")
.add("clean-tmp", &param.clean_tmp, 1, "clean temporary files after processing")
;
opt_hidden.add_options()
("inputfilename", po::value<string>(&param.input_filename)->default_value(""), "")
("outputfilename", po::value<string>(&param.output_filename)->default_value(""), "")
;
opt_positional.add("inputfilename", 1).add("outputfilename",1);
opt_all.add(opt_visible).add(opt_hidden);
try {
po::variables_map opt_vm;
po::store(po::command_line_parser(argc, argv).options(opt_all).positional(opt_positional).run()
, opt_vm);
po::notify(opt_vm);
return opt_vm;
try
{
argparser.parse(argc, argv);
}
catch(...) {
show_usage();
abort();
catch(const std::string & s)
{
cerr << "Error when parsing the arguments:" << endl;
cerr << s << endl;
exit(EXIT_FAILURE);
}
}
int main(int argc, char **argv)
{
auto opt_map = parse_options(argc, argv);
if (opt_map.count("version") || opt_map.count("help") || (param.input_filename == ""))
parse_options(argc, argv);
if (param.input_filename == "")
{
show_usage();
abort();
show_usage_and_exit();
}
//prepare the directories
char buf[] = "/tmp/pdf2htmlEX-XXXXXX";
auto p = mkdtemp(buf);
if(p == nullptr)
{
cerr << "Cannot create temp directory" << endl;
abort();
char buf[] = "/tmp/pdf2htmlEX-XXXXXX";
auto p = mkdtemp(buf);
if(p == nullptr)
{
cerr << "Cannot create temp directory" << endl;
exit(EXIT_FAILURE);
}
param.tmp_dir = buf;
}
param.tmp_dir = buf;
if(param.debug)
cerr << "temporary dir: " << (param.tmp_dir) << endl;
@ -142,67 +125,73 @@ int main(int argc, char **argv)
catch (const string & s)
{
cerr << s << endl;
abort();
exit(EXIT_FAILURE);
}
bool finished = false;
// read config file
globalParams = new GlobalParams();
// open PDF file
ownerPW = (param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str()));
userPW = (param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str()));
fileName = new GooString(param.input_filename.c_str());
doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);
delete userPW;
delete ownerPW;
if (!doc->isOk()) {
goto error;
}
// check for copy permission
if (!doc->okToCopy()) {
error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
goto error;
}
param.first_page = min(max(param.first_page, 1), doc->getNumPages());
param.last_page = min(max(param.last_page, param.first_page), doc->getNumPages());
if(param.output_filename == "")
PDFDoc *doc = nullptr;
try
{
const string s = get_filename(param.input_filename);
{
GooString * ownerPW = (param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str()));
GooString * userPW = (param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str()));
GooString fileName(param.input_filename.c_str());
if(get_suffix(param.input_filename) == ".pdf")
{
param.output_filename = s.substr(0, s.size() - 4) + ".html";
doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);
delete userPW;
delete ownerPW;
}
else
{
param.output_filename = s + ".html";
if (!doc->isOk()) {
throw "Cannot read the file";
}
// check for copy permission
if (!doc->okToCopy()) {
throw "Copying of text from this document is not allowed.";
}
param.first_page = min(max(param.first_page, 1), doc->getNumPages());
param.last_page = min(max(param.last_page, param.first_page), doc->getNumPages());
if(param.output_filename == "")
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.output_filename = s.substr(0, s.size() - 4) + ".html";
}
else
{
param.output_filename = s + ".html";
}
}
HTMLRenderer * htmlOut = new HTMLRenderer(&param);
htmlOut->process(doc);
delete htmlOut;
finished = true;
}
catch (const string & s)
{
cerr << "Error: " << s << endl;
}
htmlOut = new HTMLRenderer(&param);
htmlOut->process(doc);
delete htmlOut;
finished = true;
// clean up
error:
if(doc) delete doc;
delete fileName;
if(globalParams) delete globalParams;
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
if(!finished)
abort();
exit(finished ? (EXIT_SUCCESS) : (EXIT_FAILURE));
return 0;
}