1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

new option --embed, --embed-*; better ArgEntry

This commit is contained in:
Lu Wang 2013-05-27 07:43:26 +08:00
parent c815f304f3
commit 92454a9883
12 changed files with 202 additions and 115 deletions

0
build_for_ppa.py Executable file → Normal file
View File

0
debian/rules vendored Executable file → Normal file
View File

View File

@ -35,6 +35,7 @@ Specify the last page to process
.SS Dimensions .SS Dimensions
.TP
.B --zoom <ratio>, --fit-width <width>, --fit-height <height> .B --zoom <ratio>, --fit-width <width>, --fit-height <height>
--zoom specifies the zoom factor directly; --fit-width/height specifies the maximum width/height of a page, the values are in pixels. --zoom specifies the zoom factor directly; --fit-width/height specifies the maximum width/height of a page, the values are in pixels.
@ -53,16 +54,26 @@ Specify the horizontal and vertical DPI for images
.SS Output .SS Output
.TP .B --embed <string>
.B --single-html <0|1> (Default: 1) .br
Whether to embed everything into one HTML file. .B --embed-css <0|1> (Default: 1)
.br
.B --embed-font <0|1> (Default: 1)
.br
.B --embed-image <0|1> (Default: 1)
.br
.B --embed-javascript <0|1> (Default: 1)
.br
.B --embed-outline <0|1> (Default: 1)
.RS
Specify which elements should be embedded into the output HTML file.
If switched off, there will be several files generated along with the HTML file including files for fonts, css, images. If switched off, separated files will be generated along with the HTML file for the corresponding elements.
Note that the outline will always be embedded into the main HTML file no matter if this switch is on or not.
And only when this switch is off will there be a separate .outline file containing the outline.
You need to modify the manifest if you do not want outline embedded.
--embed accepts a string as argument. Each letter of the string must be one of `cCfFiIjJoO`, which corresponds
to one of the --embed-*** switches. Lower case letters for 0 and upper case letters for 1. For example,
`--embed cFIJo` means to embed everything but CSS files and outlines.
.RE
.TP .TP
.B --split-pages <0|1> (Default: 0) .B --split-pages <0|1> (Default: 0)
If turned on, the content of each page is stored in a separated file. If turned on, the content of each page is stored in a separated file.

View File

@ -41,9 +41,20 @@ void dump_value(std::ostream & out, const std::string & v)
out << '"' << v << '"'; out << '"' << v << '"';
} }
ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback) ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg)
{ {
return add<char>(optname, nullptr, 0, description, callback, true); // ArgEntry does not accept nullptr as optname nor description
if((!optname) || (!optname[0]))
{
// when optname is nullptr or "", it's optional, and description is dropped
optional_arg_entries.emplace_back(new ArgEntry<string, string>("", "", callback, need_arg));
}
else
{
arg_entries.emplace_back(new ArgEntry<string, string>(optname, (description ? description : ""), callback, need_arg));
}
return *this;
} }
void ArgParser::parse(int argc, char ** argv) const void ArgParser::parse(int argc, char ** argv) const

View File

@ -42,79 +42,86 @@ extern void dump_value(std::ostream & out, const std::string & v);
class ArgParser class ArgParser
{ {
public: public:
typedef void (*ArgParserCallBack) (const char * arg); typedef void (*ArgParserCallBack) (const char * arg);
/* /*
* The 1st is for arg without arguments (i.e. flags), and the 2nd is for general args. * The 1st is for arguments with callbacks(i.e. flags)
* optname: * The 2nd is for arguments linked to variables
* - if not nullptr, it should be the name of the arg, should be in the format of "<long name>[,<short char>]", e.g. "help,h" *
* - if nullptr, it denotes an optional arg, and description will be ignored * optname:
* description: * - if not nullptr, it should be the name of the arg, should be in the format of "<long name>[,<short char>]", e.g. "help,h"
* - if description is nullptr or "", the argument won't be shown in show_usage() * - if nullptr, it denotes an optional arg, and description will be ignored
*/ * description:
ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback = nullptr); * - if description is nullptr or "", the argument won't be shown in show_usage()
template <class T, class Tv> *
ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback = nullptr, bool dont_show_default = false); * location:
* - if not nullptr, the argument for this arg is stored there
* - if nullptr, this arg does not need arguments
*/
ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg = false);
template <class T, class Tv>
ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default = false);
void parse(int argc, char ** argv) const; void parse(int argc, char ** argv) const;
void show_usage(std::ostream & out) const; void show_usage(std::ostream & out) const;
private:
// type names helper
template<class>
static const char * get_type_name(void) { return "unknown"; }
// Common, non-templated interface of one registered command-line argument.
// ArgParser keeps its entries as std::unique_ptr<ArgEntryBase>, so the typed
// ArgEntry<T, Tv> instances are handled through this base.
struct ArgEntryBase
{
/* neither name nor description may be nullptr (callers pass "" instead) */
ArgEntryBase(const char * name, const char * description, bool need_arg);
// virtual: entries are owned and destroyed through pointers to this base
virtual ~ArgEntryBase() { }
// NOTE(review): presumably the <short char> part of an optname written as
// "<long name>[,<short char>]"; the constructor body is not visible here, confirm.
char shortname;
std::string name;
std::string description;
// whether this option expects an argument on the command line
bool need_arg;
// Handle the argument text supplied for this option.
virtual void parse (const char * arg) const = 0;
// Write this option's usage information to `out`.
virtual void show_usage (std::ostream & out) const = 0;
};
template <class T, class Tv>
struct ArgEntry : public ArgEntryBase
{
ArgEntry(const char * name,
const char * description,
ArgParserCallBack callback,
bool need_arg);
ArgEntry(const char * name,
T * location, const Tv & default_value,
const char * description, bool dont_show_default);
virtual void parse (const char * arg) const;
virtual void show_usage (std::ostream & out) const;
private: private:
// type names helper T * location;
template<class> T default_value;
static const char * get_type_name(void) { return "unknown"; } ArgParserCallBack callback;
bool dont_show_default;
};
class ArgEntryBase std::vector<std::unique_ptr<ArgEntryBase>> arg_entries, optional_arg_entries;
{ static const int arg_col_width;
public:
/* name or description cannot be nullptr */
ArgEntryBase(const char * name, const char * description, bool need_arg);
virtual ~ArgEntryBase() { }
char shortname;
std::string name;
std::string description;
bool need_arg;
virtual void parse (const char * arg) const = 0;
virtual void show_usage (std::ostream & out) const = 0;
};
template <class T, class Tv>
class ArgEntry : public ArgEntryBase
{
public:
ArgEntry(const char * name,
T * location, const Tv & deafult_value,
ArgParserCallBack callback,
const char * description, bool dont_show_default);
virtual void parse (const char * arg) const;
virtual void show_usage (std::ostream & out) const;
private:
T * location;
T default_value;
ArgParserCallBack callback;
bool dont_show_default;
};
std::vector<std::unique_ptr<ArgEntryBase>> arg_entries, optional_arg_entries;
static const int arg_col_width;
}; };
template<class T, class Tv> template<class T, class Tv>
ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback, bool dont_show_default) ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default)
{ {
// ArgEntry does not accept nullptr as optname nor description // ArgEntry does not accept nullptr as optname nor description
if((!optname) || (!optname[0])) if((!optname) || (!optname[0]))
{ {
// when optname is nullptr or "", it's optional, and description is dropped // when optname is nullptr or "", it's optional, and description is dropped
optional_arg_entries.emplace_back(new ArgEntry<T, Tv>("", location, default_value, callback, "", dont_show_default)); optional_arg_entries.emplace_back(new ArgEntry<T, Tv>("", location, default_value, "", dont_show_default));
} }
else else
{ {
arg_entries.emplace_back(new ArgEntry<T, Tv>(optname, location, default_value, callback, (description ? description : ""), dont_show_default)); arg_entries.emplace_back(new ArgEntry<T, Tv>(optname, location, default_value, (description ? description : ""), dont_show_default));
} }
return *this; return *this;
@ -126,12 +133,22 @@ template<> const char * ArgParser::get_type_name<double> (void);
template<> const char * ArgParser::get_type_name<std::string> (void); template<> const char * ArgParser::get_type_name<std::string> (void);
template<class T, class Tv> template<class T, class Tv>
ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, ArgParserCallBack callback, const char * description, bool dont_show_default) ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, const char * description, ArgParserCallBack callback, bool need_arg)
: ArgEntryBase(name, description, need_arg)
, location(nullptr)
, default_value()
, callback(callback)
, dont_show_default(true)
{
}
template<class T, class Tv>
ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, const char * description, bool dont_show_default)
: ArgEntryBase(name, description, (location != nullptr)) : ArgEntryBase(name, description, (location != nullptr))
, location(location) , location(location)
, default_value(default_value) , default_value(default_value)
, callback(callback) , callback(nullptr)
, dont_show_default(dont_show_default) , dont_show_default(dont_show_default)
{ {
if(need_arg) if(need_arg)
*location = T(default_value); *location = T(default_value);
@ -145,7 +162,7 @@ void ArgParser::ArgEntry<T, Tv>::parse(const char * arg) const
if(!arg) if(!arg)
throw std::string("Missing argument of option: --") + name; throw std::string("Missing argument of option: --") + name;
if(!read_value(arg, location)) if((location != nullptr) && (!read_value(arg, location)))
throw std::string("Invalid argument: ") + arg; throw std::string("Invalid argument: ") + arg;
} }

View File

@ -562,10 +562,10 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
* We need to reload in order to retrieve/fix accurate ascent/descent, some info won't be written to the font by fontforge until saved. * We need to reload in order to retrieve/fix accurate ascent/descent, some info won't be written to the font by fontforge until saved.
*/ */
string fn = (char*)str_fmt("%s/f%llx%s", string fn = (char*)str_fmt("%s/f%llx%s",
(param.single_html ? param.tmp_dir : param.dest_dir).c_str(), (param.embed_font ? param.tmp_dir : param.dest_dir).c_str(),
info.id, param.font_suffix.c_str()); info.id, param.font_suffix.c_str());
if(param.single_html) if(param.embed_font)
tmp_files.add(fn); tmp_files.add(fn);
ffw_load_font(cur_tmp_fn.c_str()); ffw_load_font(cur_tmp_fn.c_str());
@ -763,7 +763,7 @@ void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suff
{ {
auto fn = str_fmt("f%llx%s", info.id, suffix.c_str()); auto fn = str_fmt("f%llx%s", info.id, suffix.c_str());
if(param.single_html) if(param.embed_font)
{ {
auto path = param.tmp_dir + "/" + (char*)fn; auto path = param.tmp_dir + "/" + (char*)fn;
ifstream fin(path, ifstream::binary); ifstream fin(path, ifstream::binary);

View File

@ -122,8 +122,8 @@ void HTMLRenderer::process(PDFDoc *doc)
if(param.process_nontext) if(param.process_nontext)
{ {
auto fn = str_fmt("%s/bg%x.png", (param.single_html ? param.tmp_dir : param.dest_dir).c_str(), i); auto fn = str_fmt("%s/bg%x.png", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), i);
if(param.single_html) if(param.embed_image)
tmp_files.add((char*)fn); tmp_files.add((char*)fn);
bg_renderer->render_page(doc, i, (char*)fn); bg_renderer->render_page(doc, i, (char*)fn);
@ -196,7 +196,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
{ {
f_pages.fs << "<img class=\"" << CSS::BACKGROUND_IMAGE_CN f_pages.fs << "<img class=\"" << CSS::BACKGROUND_IMAGE_CN
<< "\" alt=\"\" src=\""; << "\" alt=\"\" src=\"";
if(param.single_html) if(param.embed_image)
{ {
auto path = str_fmt("%s/bg%x.png", param.tmp_dir.c_str(), pageNum); auto path = str_fmt("%s/bg%x.png", param.tmp_dir.c_str(), pageNum);
ifstream fin((char*)path, ifstream::binary); ifstream fin((char*)path, ifstream::binary);
@ -280,23 +280,19 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
// we may output utf8 characters, so always use binary // we may output utf8 characters, so always use binary
{ {
/* /*
* If single-html * If embed-css
* we have to keep the generated css file into a temporary place * we have to keep the generated css file into a temporary place
* and embed it into the main html later * and embed it into the main html later
* *
* * otherwise
* If single-html
* as there's no place to embed the css file, just leave it alone (into param.dest_dir)
*
* If !single-html
* leave it in param.dest_dir * leave it in param.dest_dir
*/ */
auto fn = (param.single_html) auto fn = (param.embed_css)
? str_fmt("%s/__css", param.tmp_dir.c_str()) ? str_fmt("%s/__css", param.tmp_dir.c_str())
: str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str()); : str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str());
if(param.single_html) if(param.embed_css)
tmp_files.add((char*)fn); tmp_files.add((char*)fn);
f_css.path = (char*)fn; f_css.path = (char*)fn;
@ -312,11 +308,11 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
* The logic for outline is similar to css * The logic for outline is similar to css
*/ */
auto fn = (param.single_html) auto fn = (param.embed_outline)
? str_fmt("%s/__outline", param.tmp_dir.c_str()) ? str_fmt("%s/__outline", param.tmp_dir.c_str())
: str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str()); : str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str());
if(param.single_html) if(param.embed_outline)
tmp_files.add((char*)fn); tmp_files.add((char*)fn);
f_outline.path = (char*)fn; f_outline.path = (char*)fn;
@ -333,7 +329,6 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
if(!param.split_pages) if(!param.split_pages)
{ {
/* /*
* If single-html
* we have to keep the html file for pages into a temporary place * we have to keep the html file for pages into a temporary place
* because we'll have to embed css before it * because we'll have to embed css before it
* *
@ -430,7 +425,7 @@ void HTMLRenderer::post_process(void)
} }
else if (line == "$outline") else if (line == "$outline")
{ {
if (param.process_outline) if (param.process_outline && param.embed_outline)
{ {
ifstream fin(f_outline.path, ifstream::binary); ifstream fin(f_outline.path, ifstream::binary);
if(!fin) if(!fin)
@ -522,28 +517,31 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string &
string fn = get_filename(path); string fn = get_filename(path);
string suffix = (type == "") ? get_suffix(fn) : type; string suffix = (type == "") ? get_suffix(fn) : type;
auto iter = EMBED_STRING_MAP.find(make_pair(suffix, (bool)param.single_html)); // TODO
auto iter = EMBED_STRING_MAP.find(suffix);
if(iter == EMBED_STRING_MAP.end()) if(iter == EMBED_STRING_MAP.end())
{ {
cerr << "Warning: unknown suffix: " << suffix << endl; cerr << "Warning: unknown suffix: " << suffix << endl;
return; return;
} }
if(param.single_html) const auto & entry = iter->second;
if(param.*(entry.embed_flag))
{ {
ifstream fin(path, ifstream::binary); ifstream fin(path, ifstream::binary);
if(!fin) if(!fin)
throw string("Cannot open file ") + path + " for embedding"; throw string("Cannot open file ") + path + " for embedding";
out << iter->second.first << endl out << entry.prefix_embed << endl
<< fin.rdbuf(); << fin.rdbuf();
out.clear(); // out will set fail big if fin is empty out.clear(); // out will set fail big if fin is empty
out << iter->second.second << endl; out << entry.suffix_embed << endl;
} }
else else
{ {
out << iter->second.first; out << entry.prefix_external;
outputURL(out, fn); outputURL(out, fn);
out << iter->second.second << endl; out << entry.suffix_external << endl;
if(copy) if(copy)
{ {

View File

@ -25,7 +25,11 @@ struct Param
double h_dpi, v_dpi; double h_dpi, v_dpi;
// output // output
int single_html; int embed_css;
int embed_font;
int embed_image;
int embed_javascript;
int embed_outline;
int split_pages; int split_pages;
std::string dest_dir; std::string dest_dir;
std::string css_filename; std::string css_filename;

View File

@ -36,9 +36,9 @@ using namespace pdf2htmlEX;
Param param; Param param;
ArgParser argparser; ArgParser argparser;
void deprecated_embed_base_font(const char * dummy = nullptr) void deprecated_single_html(const char * dummy = nullptr)
{ {
cerr << "--embed-base-font is deprecated. Use --embed-external-font instead." << endl; cerr << "--single_html is deprecated. Use `--embed CFIJO` instead." << endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -60,6 +60,31 @@ void show_version_and_exit(const char * dummy = nullptr)
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
/*
 * Callback for the --embed option.
 *
 * Walks the NUL-terminated string `str` one character at a time. Each of the
 * letters c/f/i/j/o selects one of the param.embed_* flags (css, font, image,
 * javascript, outline): the lower case form stores 0, the upper case form
 * stores 1. Any other character is reported on cerr and skipped; parsing then
 * continues with the next character.
 */
void embed_parser (const char * str)
{
    for(const char * cursor = str; *cursor != '\0'; ++cursor)
    {
        const char letter = *cursor;

        // Pick the flag this letter refers to, regardless of case.
        int * target = nullptr;
        switch(letter)
        {
            case 'c': case 'C': target = &param.embed_css;        break;
            case 'f': case 'F': target = &param.embed_font;       break;
            case 'i': case 'I': target = &param.embed_image;      break;
            case 'j': case 'J': target = &param.embed_javascript; break;
            case 'o': case 'O': target = &param.embed_outline;    break;
            default: break;
        }

        if(target == nullptr)
        {
            cerr << "Unknown character `" << letter << "` for --embed" << endl;
            continue;
        }

        // Upper case switches the element on, lower case switches it off.
        const bool enable = (letter == 'C') || (letter == 'F') || (letter == 'I')
                         || (letter == 'J') || (letter == 'O');
        *target = enable ? 1 : 0;
    }
}
void parse_options (int argc, char **argv) void parse_options (int argc, char **argv)
{ {
argparser argparser
@ -68,15 +93,20 @@ void parse_options (int argc, char **argv)
.add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to convert") .add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to convert")
// dimensions // dimensions
.add("zoom", &param.zoom, 0, "zoom ratio", nullptr, true) .add("zoom", &param.zoom, 0, "zoom ratio", true)
.add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", nullptr, true) .add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", true)
.add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", nullptr, true) .add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", true)
.add("use-cropbox", &param.use_cropbox, 1, "use CropBox instead of MediaBox") .add("use-cropbox", &param.use_cropbox, 1, "use CropBox instead of MediaBox")
.add("hdpi", &param.h_dpi, 144.0, "horizontal resolution for graphics in DPI") .add("hdpi", &param.h_dpi, 144.0, "horizontal resolution for graphics in DPI")
.add("vdpi", &param.v_dpi, 144.0, "vertical resolution for graphics in DPI") .add("vdpi", &param.v_dpi, 144.0, "vertical resolution for graphics in DPI")
// output files // output files
.add("single-html", &param.single_html, 1, "generate a single HTML file") .add("embed", "specify which elements should be embedded into output", embed_parser, true)
.add("embed-css", &param.embed_css, 1, "embed CSS files into output")
.add("embed-font", &param.embed_font, 1, "embed font files into output")
.add("embed-image", &param.embed_image, 1, "embed image files into output")
.add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output")
.add("embed-outline", &param.embed_outline, 1, "embed outlines into output")
.add("split-pages", &param.split_pages, 0, "split pages into separate files") .add("split-pages", &param.split_pages, 0, "split pages into separate files")
.add("dest-dir", &param.dest_dir, ".", "specify destination directory") .add("dest-dir", &param.dest_dir, ".", "specify destination directory")
.add("css-filename", &param.css_filename, "", "filename of the generated css file") .add("css-filename", &param.css_filename, "", "filename of the generated css file")
@ -107,8 +137,8 @@ void parse_options (int argc, char **argv)
.add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text") .add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
// encryption // encryption
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", nullptr, true) .add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true)
.add("user-password,u", &param.user_password, "", "user password (for encrypted files)", nullptr, true) .add("user-password,u", &param.user_password, "", "user password (for encrypted files)", true)
.add("no-drm", &param.no_drm, 0, "override document DRM settings") .add("no-drm", &param.no_drm, 0, "override document DRM settings")
// misc. // misc.
@ -123,7 +153,7 @@ void parse_options (int argc, char **argv)
.add("help,h", "print usage information", &show_usage_and_exit) .add("help,h", "print usage information", &show_usage_and_exit)
// deprecated // deprecated
.add("embed-base-font", "", &deprecated_embed_base_font) .add("single-html", "", &deprecated_single_html)
.add("", &param.input_filename, "", "") .add("", &param.input_filename, "", "")
.add("", &param.output_filename, "", "") .add("", &param.output_filename, "", "")

View File

@ -22,10 +22,16 @@ const map<string, string> GB_ENCODED_FONT_NAME_MAP({
{"\xC1\xA5\xCA\xE9", "SimLi"}, {"\xC1\xA5\xCA\xE9", "SimLi"},
}); });
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({ const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP({
{{".css", 0}, {"<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>"}}, {".css", {&Param::embed_css,
{{".css", 1}, {"<style type=\"text/css\">", "</style>"}}, "<style type=\"text/css\">",
{{".js", 0}, {"<script type=\"text/javascript\" src=\"", "\"></script>"}}, "</style>",
{{".js", 1}, {"<script type=\"text/javascript\">", "</script>"}} "<link rel=\"stylesheet\" type=\"text/css\" href=\"",
"\"/>" }},
{".js", {&Param::embed_javascript,
"<script type=\"text/javascript\">",
"</script>",
"<script type=\"text/javascript\" src=\"",
"\"></script>" }}
}); });
} //namespace pdf2htmlEX } //namespace pdf2htmlEX

View File

@ -11,6 +11,8 @@
#include <map> #include <map>
#include <string> #include <string>
#include "Param.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
#ifndef nullptr #ifndef nullptr
@ -24,9 +26,17 @@ extern const double ID_MATRIX[6];
// For GB encoded font names // For GB encoded font names
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP; extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
// map to embed files into html // map to embed files into html
// key: (suffix, if_embed_content) struct EmbedStringEntry
// value: (prefix string, suffix string) {
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP; int Param::*embed_flag;
// used when *embed_flag == true
std::string prefix_embed;
std::string suffix_embed;
// used when *embed_flag == false
std::string prefix_external;
std::string suffix_external;
};
extern const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP;
} // namespace pdf2htmlEX } // namespace pdf2htmlEX

0
test/test.py Executable file → Normal file
View File