1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-05 01:28:39 +00:00

updated split page filename formatting to not rely on regex to be compatible with older compilers. added several test cases to account for new implementation. updated documentation to more accurately reflect how split page filenames are generated.

This commit is contained in:
Ryan Morlok 2013-03-17 23:31:43 -05:00
parent 83c947462a
commit 3cafb540c6
5 changed files with 144 additions and 20 deletions

View File

@ -65,12 +65,34 @@ You need to modify the manifest if you do not want outline embedded.
.TP
.B --split-pages <0|1> (Default: 0)
If turned on, pages will be stored into separated files. By default, these files will be named as <output-filename>0.page, <output-filename>1.page, ..., however the name of the files can be customized by adding a %d marker in the <output-filename> to specify how the page should be used to generate the name. E.g. p%d.page yielding p1.page, p2.page ... or p%03d.page yielding p001.page, p002.page etc. Only %d may be used, no other formatting markers.
If turned on, the pages, css, and outline will be stored into separated files and no consolidated <output-filename>.html will be generated.
Also the css and outline will be stored into separated files, and there will be no <output-filename>.html generated.
<output-filename> may be used to specify the format for the filenames for individual pages. <output-filename> may contain a %d placeholder to indicate where the page number should be placed.
If <output-filename> does not contain a placeholder for the page number, the page number will be inserted directly before the file extension. If the filename does not have an extension, the page number will be placed at the end of the file name.
If <output-filename> is not specified, <input-filename> will be used for the output filename, replacing the extension with .page and adding the page number directly before the extension.
This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary.
.B Examples
.B pdf2htmlEX --split-pages 1 foo.pdf
Yields page files foo1.page, foo2.page, etc.
.B pdf2htmlEX --split-pages 1 foo.pdf bar.baz
Yields page files bar1.baz, bar2.baz, etc.
.B pdf2htmlEX --split-pages 1 foo.pdf page%dbar.baz
Yields page files page1bar.baz, page2bar.baz, etc.
.B pdf2htmlEX --split-pages 1 foo.pdf bar%03d.baz
Yields page files bar001.baz, bar002.baz, etc.
.TP
.B --dest-dir <dir> (Default: .)
Specify destination folder

View File

@ -10,7 +10,6 @@
#include <string>
#include <limits>
#include <iostream>
#include <regex>
#include <getopt.h>
#include <poppler-config.h>
@ -216,7 +215,7 @@ int main(int argc, char **argv)
if(get_suffix(param.input_filename) == ".pdf")
{
if(param.split_pages)
param.output_filename = sanitize_filename(s.substr(0, s.size() - 4) + "%d.page", true);
param.output_filename = sanitize_filename(s.substr(0, s.size() - 4) + "%d.page");
else
param.output_filename = s.substr(0, s.size() - 4) + ".html";
@ -224,7 +223,7 @@ int main(int argc, char **argv)
else
{
if(param.split_pages)
param.output_filename = sanitize_filename(s + "%d.page", true);
param.output_filename = sanitize_filename(s + "%d.page");
else
param.output_filename = s + ".html";
@ -233,16 +232,16 @@ int main(int argc, char **argv)
else if(param.split_pages)
{
// Need to make sure we have a page number placeholder in the filename
if(!std::regex_match(param.output_filename, std::regex("^.*%[0-9]*d.*$")))
if(!contains_integer_placeholder(param.output_filename))
{
// Inject the placeholder just before the file extension
const string suffix = get_suffix(param.output_filename);
param.output_filename = sanitize_filename(param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix, true);
param.output_filename = sanitize_filename(param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix);
}
else
{
// Already have the placeholder, just make sure the name is safe.
param.output_filename = sanitize_filename(param.output_filename, true);
param.output_filename = sanitize_filename(param.output_filename);
}
}
if(param.css_filename.empty())

View File

@ -6,7 +6,6 @@
*/
#include <errno.h>
#include <regex>
#include <sys/stat.h>
#include <sys/types.h>
@ -40,20 +39,99 @@ void create_directories(const string & path)
}
}
/*
 * Make a filename safe to use as a printf-style format string.
 *
 * The first valid integer placeholder is copied through unchanged so the
 * caller can later substitute a page number for it. A valid placeholder is
 * a '%' followed by zero or more characters from "+-#0123456789." and then
 * a 'd' (e.g. "%d", "%03d"). Every other '%' in the name, including any
 * further placeholders after the first, is escaped as "%%".
 *
 * @param filename the filename to be sanitized.
 *
 * @return the sanitized filename.
 */
std::string sanitize_filename(const std::string & filename)
{
    // Characters that may sit between '%' and the conversion character.
    // std::string searches are used instead of strchr(): strchr() reports a
    // match for '\0' (it finds the set's own terminator), which would let an
    // embedded NUL pass as a flag character.
    static const char * const specifier_chars = "+-#0123456789.";

    std::string sanitized;
    sanitized.reserve(filename.size() + 2);

    bool placeholder_kept = false;
    std::string::size_type pos = 0;

    while(pos < filename.size())
    {
        const char current = filename[pos];
        if(current != '%')
        {
            sanitized.push_back(current);
            ++pos;
            continue;
        }

        if(!placeholder_kept)
        {
            // Find the first character after the '%' that is not a flag,
            // width or precision character; the specifier is valid only if
            // that character is 'd'.
            const std::string::size_type end =
                filename.find_first_not_of(specifier_chars, pos + 1);
            if((end != std::string::npos) && (filename[end] == 'd'))
            {
                sanitized.append(filename, pos, end - pos + 1);
                placeholder_kept = true;
                pos = end + 1;
                continue;
            }
        }

        // Either the single allowed placeholder was already consumed or this
        // '%' does not start a valid one: escape it and carry on with the
        // character right after it.
        sanitized.append("%%");
        ++pos;
    }

    return sanitized;
}
/*
 * Check whether a filename contains at least one integer placeholder
 * usable in a printf-style format string.
 *
 * A placeholder is a '%' followed by zero or more characters from
 * "+-#0123456789." and then a 'd' (e.g. "%d", "%03d").
 *
 * @param filename the filename to check.
 *
 * @return true if the filename contains an integer placeholder, false otherwise.
 */
bool contains_integer_placeholder(const std::string & filename)
{
    // Characters that may sit between '%' and the conversion character.
    // std::string searches are used instead of strchr(): strchr() reports a
    // match for '\0', which would let an embedded NUL pass as a flag.
    static const char * const specifier_chars = "+-#0123456789.";

    std::string::size_type percent = filename.find('%');
    while(percent != std::string::npos)
    {
        const std::string::size_type end =
            filename.find_first_not_of(specifier_chars, percent + 1);
        if(end == std::string::npos)
        {
            // Everything after this '%' is a flag/width character (or there
            // is nothing after it), so no later '%' can exist either.
            return false;
        }
        if(filename[end] == 'd')
        {
            return true;
        }
        // Not a valid specifier. The characters skipped above cannot be '%',
        // so the next candidate is at or after the character that stopped
        // the scan.
        percent = filename.find('%', end);
    }
    return false;
}
bool is_truetype_suffix(const string & suffix)
{

View File

@ -20,15 +20,24 @@ std::string get_filename(const std::string & path);
std::string get_suffix(const std::string & path);
/**
* Function to sanitize a filename so that it can be eventually safely used in a printf statement.
* Function to sanitize a filename so that it can be eventually safely used in a printf
* statement. Allows a single %d placeholder, but no other format specifiers.
*
* @param filename the filename to be sanitized.
* @param allow_single_form_number boolean flag indicating if a single format (e.g. %d) should be allowed
* in the filename for use in templating of pages. e.g. page%02d.html is ok.
*
* @return the sanitized filename.
*/
std::string sanitize_filename(const std::string & filename, bool allow_single_format_number);
std::string sanitize_filename(const std::string & filename);
/**
* Function to check if a filename contains at least one %d integer placeholder
* for use in a printf statement.
*
* @param filename the filename to check
*
* @return true if the filename contains an integer placeholder, false otherwise.
*/
bool contains_integer_placeholder(const std::string & filename);
} //namespace pdf2htmlEX
#endif //PATH_H__

View File

@ -190,6 +190,22 @@ class OutputNamingTests(unittest.TestCase):
])
self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f%%oo1.xyz', 'f%%oo2.xyz', 'f%%oo3.xyz']))
def test_generate_split_pages_specify_name_only_formatter_starts_part_way_through_invalid_formatter(self):
    # The requested name starts an incomplete specifier ("%02" immediately
    # followed by another "%"). Only the trailing "%d" is expected to be
    # replaced by the page number; the "%02" must survive literally.
    files = execute_pdf2htmlex_and_get_files([
        '--split-pages', 1,
        path_to_test_file('3-pages.pdf'),
        'f%02%doo.xyz'
    ])
    expected = sorted(['3-pages.css', '3-pages.outline', 'f%021oo.xyz', 'f%022oo.xyz', 'f%023oo.xyz'])
    # assertEqual is the supported spelling; assertEquals is a deprecated alias.
    self.assertEqual(files, expected)
def test_generate_split_pages_specify_output_filename_no_formatter_no_extension(self):
    # The requested name has neither a page-number placeholder nor a file
    # extension, so the page number is expected at the very end of the name.
    files = execute_pdf2htmlex_and_get_files([
        '--split-pages', 1,
        path_to_test_file('1-page.pdf'),
        'foo'
    ])
    expected = sorted(['1-page.css', '1-page.outline', 'foo1'])
    # assertEqual is the supported spelling; assertEquals is a deprecated alias.
    self.assertEqual(files, expected)
def test_generate_single_html_name_specified_format_characters_percent_d(self):
files = execute_pdf2htmlex_and_get_files([
path_to_test_file('2-pages.pdf'),
@ -217,7 +233,7 @@ class OutputNamingTests(unittest.TestCase):
'foo%%.html'
])
self.assertEquals(files, ['foo%%.html'])
if __name__=="__main__":
if not os.path.isfile(PDF2HTMLEX_PATH) or not os.access(PDF2HTMLEX_PATH, os.X_OK):
print >> sys.stderr, "Cannot locate pdf2htmlEX executable. Make sure source was built before running this test."