1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-01 07:59:00 +00:00

new option --embed, --embed-*; better ArgEntry

This commit is contained in:
Lu Wang 2013-05-27 07:43:26 +08:00
parent c815f304f3
commit 92454a9883
12 changed files with 202 additions and 115 deletions

0
build_for_ppa.py Executable file → Normal file
View File

0
debian/rules vendored Executable file → Normal file
View File

View File

@ -35,6 +35,7 @@ Specify the last page to process
.SS Dimensions
.TP
.B --zoom <ratio>, --fit-width <width>, --fit-height <height>
--zoom specifies the zoom factor directly; --fit-width/height specifies the maximum width/height of a page, the values are in pixels.
@ -53,16 +54,26 @@ Specify the horizontal and vertical DPI for images
.SS Output
.TP
.B --single-html <0|1> (Default: 1)
Whether to embed everything into one HTML file.
.B --embed <string>
.br
.B --embed-css <0|1> (Default: 1)
.br
.B --embed-font <0|1> (Default: 1)
.br
.B --embed-image <0|1> (Default: 1)
.br
.B --embed-javascript <0|1> (Default: 1)
.br
.B --embed-outline <0|1> (Default: 1)
.RS
Specify which elements should be embedded into the output HTML file.
If switched off, there will be several files generated along with the HTML file including files for fonts, css, images.
Note that the outline will always be embedded into the main HTML file no matter if this switch is on or not.
And only when this switch is off will there be a separate .outline file containing the outline.
You need to modify the manifest if you do not want outline embedded.
If switched off, separate files will be generated along with the HTML file for the corresponding elements.
--embed accepts a string as argument. Each letter of the string must be one of `cCfFiIjJoO`, and each letter corresponds
to one of the --embed-*** switches. Lower case letters stand for 0 and upper case letters stand for 1. For example,
`--embed cFIJo` means to embed everything but CSS files and outlines.
.RE
.TP
.B --split-pages <0|1> (Default: 0)
If turned on, the content of each page is stored in a separate file.

View File

@ -41,9 +41,20 @@ void dump_value(std::ostream & out, const std::string & v)
out << '"' << v << '"';
}
ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback)
ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg)
{
return add<char>(optname, nullptr, 0, description, callback, true);
// ArgEntry does not accept nullptr as optname nor description
if((!optname) || (!optname[0]))
{
// when optname is nullptr or "", it's optional, and description is dropped
optional_arg_entries.emplace_back(new ArgEntry<string, string>("", "", callback, need_arg));
}
else
{
arg_entries.emplace_back(new ArgEntry<string, string>(optname, (description ? description : ""), callback, need_arg));
}
return *this;
}
void ArgParser::parse(int argc, char ** argv) const

View File

@ -42,79 +42,86 @@ extern void dump_value(std::ostream & out, const std::string & v);
class ArgParser
{
public:
typedef void (*ArgParserCallBack) (const char * arg);
public:
typedef void (*ArgParserCallBack) (const char * arg);
/*
* The 1st is for arg without arguments (i.e. flags), and the 2nd is for general args.
* optname:
* - if not nullptr, it should be the name of the arg, should be in the format of "<long name>[,<short char>]", e.g. "help,h"
* - if nullptr, it denotes an optional arg, and description will be ignored
* description:
* - if description is nullptr or "", the argument won't be shown in show_usage()
*/
ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback = nullptr);
template <class T, class Tv>
ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback = nullptr, bool dont_show_default = false);
/*
* The 1st is for arguments with callbacks(i.e. flags)
* The 2nd is for arguments linked to variables
*
* optname:
* - if not nullptr, it should be the name of the arg, should be in the format of "<long name>[,<short char>]", e.g. "help,h"
* - if nullptr, it denotes an optional arg, and description will be ignored
* description:
* - if description is nullptr or "", the argument won't be shown in show_usage()
*
* location:
* - if not nullptr, the argument for this arg is stored there
* - if nullptr, this arg does not need arguments
*/
ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg = false);
template <class T, class Tv>
ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default = false);
void parse(int argc, char ** argv) const;
void show_usage(std::ostream & out) const;
void parse(int argc, char ** argv) const;
void show_usage(std::ostream & out) const;
private:
// type names helper
template<class>
static const char * get_type_name(void) { return "unknown"; }
struct ArgEntryBase
{
/* name or description cannot be nullptr */
ArgEntryBase(const char * name, const char * description, bool need_arg);
virtual ~ArgEntryBase() { }
char shortname;
std::string name;
std::string description;
bool need_arg;
virtual void parse (const char * arg) const = 0;
virtual void show_usage (std::ostream & out) const = 0;
};
template <class T, class Tv>
struct ArgEntry : public ArgEntryBase
{
ArgEntry(const char * name,
const char * description,
ArgParserCallBack callback,
bool need_arg);
ArgEntry(const char * name,
T * location, const Tv & default_value,
const char * description, bool dont_show_default);
virtual void parse (const char * arg) const;
virtual void show_usage (std::ostream & out) const;
private:
// type names helper
template<class>
static const char * get_type_name(void) { return "unknown"; }
T * location;
T default_value;
ArgParserCallBack callback;
bool dont_show_default;
};
class ArgEntryBase
{
public:
/* name or description cannot be nullptr */
ArgEntryBase(const char * name, const char * description, bool need_arg);
virtual ~ArgEntryBase() { }
char shortname;
std::string name;
std::string description;
bool need_arg;
virtual void parse (const char * arg) const = 0;
virtual void show_usage (std::ostream & out) const = 0;
};
template <class T, class Tv>
class ArgEntry : public ArgEntryBase
{
public:
ArgEntry(const char * name,
T * location, const Tv & deafult_value,
ArgParserCallBack callback,
const char * description, bool dont_show_default);
virtual void parse (const char * arg) const;
virtual void show_usage (std::ostream & out) const;
private:
T * location;
T default_value;
ArgParserCallBack callback;
bool dont_show_default;
};
std::vector<std::unique_ptr<ArgEntryBase>> arg_entries, optional_arg_entries;
static const int arg_col_width;
std::vector<std::unique_ptr<ArgEntryBase>> arg_entries, optional_arg_entries;
static const int arg_col_width;
};
template<class T, class Tv>
ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, ArgParserCallBack callback, bool dont_show_default)
ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default)
{
// ArgEntry does not accept nullptr as optname nor description
if((!optname) || (!optname[0]))
{
// when optname is nullptr or "", it's optional, and description is dropped
optional_arg_entries.emplace_back(new ArgEntry<T, Tv>("", location, default_value, callback, "", dont_show_default));
optional_arg_entries.emplace_back(new ArgEntry<T, Tv>("", location, default_value, "", dont_show_default));
}
else
{
arg_entries.emplace_back(new ArgEntry<T, Tv>(optname, location, default_value, callback, (description ? description : ""), dont_show_default));
arg_entries.emplace_back(new ArgEntry<T, Tv>(optname, location, default_value, (description ? description : ""), dont_show_default));
}
return *this;
@ -126,12 +133,22 @@ template<> const char * ArgParser::get_type_name<double> (void);
template<> const char * ArgParser::get_type_name<std::string> (void);
template<class T, class Tv>
ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, ArgParserCallBack callback, const char * description, bool dont_show_default)
ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, const char * description, ArgParserCallBack callback, bool need_arg)
// Callback-style entry: no variable is bound, only the callback is remembered.
: ArgEntryBase(name, description, need_arg) // need_arg is passed through to the base unchanged
, location(nullptr)                         // nothing to store a parsed value into
, default_value()                           // value-initialized, no default is supplied to this constructor
, callback(callback)
, dont_show_default(true)                   // this constructor always sets the flag to true
{
}
template<class T, class Tv>
ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, const char * description, bool dont_show_default)
: ArgEntryBase(name, description, (location != nullptr))
, location(location)
, default_value(default_value)
, callback(callback)
, dont_show_default(dont_show_default)
, location(location)
, default_value(default_value)
, callback(nullptr)
, dont_show_default(dont_show_default)
{
if(need_arg)
*location = T(default_value);
@ -145,7 +162,7 @@ void ArgParser::ArgEntry<T, Tv>::parse(const char * arg) const
if(!arg)
throw std::string("Missing argument of option: --") + name;
if(!read_value(arg, location))
if((location != nullptr) && (!read_value(arg, location)))
throw std::string("Invalid argument: ") + arg;
}

View File

@ -562,10 +562,10 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
* We need to reload in order to retrieve/fix accurate ascent/descent, some info won't be written to the font by fontforge until saved.
*/
string fn = (char*)str_fmt("%s/f%llx%s",
(param.single_html ? param.tmp_dir : param.dest_dir).c_str(),
(param.embed_font ? param.tmp_dir : param.dest_dir).c_str(),
info.id, param.font_suffix.c_str());
if(param.single_html)
if(param.embed_font)
tmp_files.add(fn);
ffw_load_font(cur_tmp_fn.c_str());
@ -763,7 +763,7 @@ void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suff
{
auto fn = str_fmt("f%llx%s", info.id, suffix.c_str());
if(param.single_html)
if(param.embed_font)
{
auto path = param.tmp_dir + "/" + (char*)fn;
ifstream fin(path, ifstream::binary);

View File

@ -122,8 +122,8 @@ void HTMLRenderer::process(PDFDoc *doc)
if(param.process_nontext)
{
auto fn = str_fmt("%s/bg%x.png", (param.single_html ? param.tmp_dir : param.dest_dir).c_str(), i);
if(param.single_html)
auto fn = str_fmt("%s/bg%x.png", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), i);
if(param.embed_image)
tmp_files.add((char*)fn);
bg_renderer->render_page(doc, i, (char*)fn);
@ -196,7 +196,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
{
f_pages.fs << "<img class=\"" << CSS::BACKGROUND_IMAGE_CN
<< "\" alt=\"\" src=\"";
if(param.single_html)
if(param.embed_image)
{
auto path = str_fmt("%s/bg%x.png", param.tmp_dir.c_str(), pageNum);
ifstream fin((char*)path, ifstream::binary);
@ -280,23 +280,19 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
// we may output utf8 characters, so always use binary
{
/*
* If single-html
* If embed-css
* we have to keep the generated css file into a temporary place
* and embed it into the main html later
*
*
* If single-html
* as there's no place to embed the css file, just leave it alone (into param.dest_dir)
*
* If !single-html
* otherwise
* leave it in param.dest_dir
*/
auto fn = (param.single_html)
auto fn = (param.embed_css)
? str_fmt("%s/__css", param.tmp_dir.c_str())
: str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str());
if(param.single_html)
if(param.embed_css)
tmp_files.add((char*)fn);
f_css.path = (char*)fn;
@ -312,11 +308,11 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
* The logic for outline is similar to css
*/
auto fn = (param.single_html)
auto fn = (param.embed_outline)
? str_fmt("%s/__outline", param.tmp_dir.c_str())
: str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str());
if(param.single_html)
if(param.embed_outline)
tmp_files.add((char*)fn);
f_outline.path = (char*)fn;
@ -333,7 +329,6 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
if(!param.split_pages)
{
/*
* If single-html
* we have to keep the html file for pages into a temporary place
* because we'll have to embed css before it
*
@ -430,7 +425,7 @@ void HTMLRenderer::post_process(void)
}
else if (line == "$outline")
{
if (param.process_outline)
if (param.process_outline && param.embed_outline)
{
ifstream fin(f_outline.path, ifstream::binary);
if(!fin)
@ -522,28 +517,31 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string &
string fn = get_filename(path);
string suffix = (type == "") ? get_suffix(fn) : type;
auto iter = EMBED_STRING_MAP.find(make_pair(suffix, (bool)param.single_html));
// TODO
auto iter = EMBED_STRING_MAP.find(suffix);
if(iter == EMBED_STRING_MAP.end())
{
cerr << "Warning: unknown suffix: " << suffix << endl;
return;
}
const auto & entry = iter->second;
if(param.single_html)
if(param.*(entry.embed_flag))
{
ifstream fin(path, ifstream::binary);
if(!fin)
throw string("Cannot open file ") + path + " for embedding";
out << iter->second.first << endl
out << entry.prefix_embed << endl
<< fin.rdbuf();
out.clear(); // out will set failbit if fin is empty
out << iter->second.second << endl;
out << entry.suffix_embed << endl;
}
else
{
out << iter->second.first;
out << entry.prefix_external;
outputURL(out, fn);
out << iter->second.second << endl;
out << entry.suffix_external << endl;
if(copy)
{

View File

@ -25,7 +25,11 @@ struct Param
double h_dpi, v_dpi;
// output
int single_html;
int embed_css;
int embed_font;
int embed_image;
int embed_javascript;
int embed_outline;
int split_pages;
std::string dest_dir;
std::string css_filename;

View File

@ -36,9 +36,9 @@ using namespace pdf2htmlEX;
Param param;
ArgParser argparser;
void deprecated_embed_base_font(const char * dummy = nullptr)
void deprecated_single_html(const char * dummy = nullptr)
{
cerr << "--embed-base-font is deprecated. Use --embed-external-font instead." << endl;
cerr << "--single_html is deprecated. Use `--embed CFIJO` instead." << endl;
exit(EXIT_FAILURE);
}
@ -60,6 +60,31 @@ void show_version_and_exit(const char * dummy = nullptr)
exit(EXIT_SUCCESS);
}
/*
 * Callback for --embed
 *
 * Walks through the given string one character at a time.
 * Each recognised letter updates one of the param.embed_* flags:
 * a lower case letter stores 0, an upper case letter stores 1.
 * Any other character produces a warning on cerr and is skipped.
 */
void embed_parser (const char * str)
{
    for(const char * cur = str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;

        // pick the flag addressed by this letter, regardless of its case
        int * flag = nullptr;
        switch(ch)
        {
            case 'c': case 'C': flag = &param.embed_css;        break;
            case 'f': case 'F': flag = &param.embed_font;       break;
            case 'i': case 'I': flag = &param.embed_image;      break;
            case 'j': case 'J': flag = &param.embed_javascript; break;
            case 'o': case 'O': flag = &param.embed_outline;    break;
            default: break;
        }

        if(flag == nullptr)
        {
            cerr << "Unknown character `" << ch << "` for --embed" << endl;
            continue;
        }

        // only the ten letters above reach this point, so a range check tells the case
        *flag = ((ch >= 'A') && (ch <= 'Z')) ? 1 : 0;
    }
}
void parse_options (int argc, char **argv)
{
argparser
@ -68,15 +93,20 @@ void parse_options (int argc, char **argv)
.add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to convert")
// dimensions
.add("zoom", &param.zoom, 0, "zoom ratio", nullptr, true)
.add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", nullptr, true)
.add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", nullptr, true)
.add("zoom", &param.zoom, 0, "zoom ratio", true)
.add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", true)
.add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", true)
.add("use-cropbox", &param.use_cropbox, 1, "use CropBox instead of MediaBox")
.add("hdpi", &param.h_dpi, 144.0, "horizontal resolution for graphics in DPI")
.add("vdpi", &param.v_dpi, 144.0, "vertical resolution for graphics in DPI")
// output files
.add("single-html", &param.single_html, 1, "generate a single HTML file")
.add("embed", "specify which elements should be embedded into output", embed_parser, true)
.add("embed-css", &param.embed_css, 1, "embed CSS files into output")
.add("embed-font", &param.embed_font, 1, "embed font files into output")
.add("embed-image", &param.embed_image, 1, "embed image files into output")
.add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output")
.add("embed-outline", &param.embed_outline, 1, "embed outlines into output")
.add("split-pages", &param.split_pages, 0, "split pages into separate files")
.add("dest-dir", &param.dest_dir, ".", "specify destination directory")
.add("css-filename", &param.css_filename, "", "filename of the generated css file")
@ -107,8 +137,8 @@ void parse_options (int argc, char **argv)
.add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
// encryption
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", nullptr, true)
.add("user-password,u", &param.user_password, "", "user password (for encrypted files)", nullptr, true)
.add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true)
.add("user-password,u", &param.user_password, "", "user password (for encrypted files)", true)
.add("no-drm", &param.no_drm, 0, "override document DRM settings")
// misc.
@ -123,7 +153,7 @@ void parse_options (int argc, char **argv)
.add("help,h", "print usage information", &show_usage_and_exit)
// deprecated
.add("embed-base-font", "", &deprecated_embed_base_font)
.add("single-html", "", &deprecated_single_html)
.add("", &param.input_filename, "", "")
.add("", &param.output_filename, "", "")

View File

@ -22,10 +22,16 @@ const map<string, string> GB_ENCODED_FONT_NAME_MAP({
{"\xC1\xA5\xCA\xE9", "SimLi"},
});
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({
{{".css", 0}, {"<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>"}},
{{".css", 1}, {"<style type=\"text/css\">", "</style>"}},
{{".js", 0}, {"<script type=\"text/javascript\" src=\"", "\"></script>"}},
{{".js", 1}, {"<script type=\"text/javascript\">", "</script>"}}
// Keyed by file suffix. Each value lists, in order: the Param flag to consult,
// the prefix/suffix strings for the embedded form, and the prefix/suffix
// strings for the external (linked) form.
const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP({
// style sheets: inline <style> element vs. <link> to an external file
{".css", {&Param::embed_css,
"<style type=\"text/css\">",
"</style>",
"<link rel=\"stylesheet\" type=\"text/css\" href=\"",
"\"/>" }},
// scripts: inline <script> element vs. <script src="..."> reference
{".js", {&Param::embed_javascript,
"<script type=\"text/javascript\">",
"</script>",
"<script type=\"text/javascript\" src=\"",
"\"></script>" }}
});
} //namespace pdf2htmlEX

View File

@ -11,6 +11,8 @@
#include <map>
#include <string>
#include "Param.h"
namespace pdf2htmlEX {
#ifndef nullptr
@ -24,9 +26,17 @@ extern const double ID_MATRIX[6];
// For GB encoded font names
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
// map to embed files into html
// key: (suffix, if_embed_content)
// value: (prefix string, suffix string)
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
// One entry of EMBED_STRING_MAP: the flag to consult plus the two pairs of
// prefix/suffix strings to choose from.
// Member order matters: entries are brace-initialized positionally.
struct EmbedStringEntry
{
// pointer to the int member of Param that selects which pair of strings is used
int Param::*embed_flag;
// used when *embed_flag == true
std::string prefix_embed;
std::string suffix_embed;
// used when *embed_flag == false
std::string prefix_external;
std::string suffix_external;
};
extern const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP;
} // namespace pdf2htmlEX

0
test/test.py Executable file → Normal file
View File