1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-05 01:28:39 +00:00

Added input sanitization for split page generation when accepting a custom output file name format. Added unit tests for various file name generation scenarios.

This commit is contained in:
Ryan Morlok 2013-03-17 00:08:06 -05:00
parent af8e9c10ae
commit 83c947462a
9 changed files with 274 additions and 10 deletions

View File

@ -65,9 +65,9 @@ You need to modify the manifest if you do not want outline embedded.
.TP
.B --split-pages <0|1> (Default: 0)
If turned on, pages will be stored into separated files named as <output-filename>0.page, <output-filename>1.page, ...
If turned on, pages will be stored into separated files. By default, these files will be named as <output-filename>1.page, <output-filename>2.page, ..., however the name of the files can be customized by adding a %d marker in the <output-filename> to specify how the page number should be used to generate the name. E.g. p%d.page yielding p1.page, p2.page ... or p%03d.page yielding p001.page, p002.page etc. Only %d may be used, no other formatting markers.
Also the css and outline will be stored into separated files, and the will be no <output-filename>.html generated.
Also the css and outline will be stored into separated files, and there will be no <output-filename>.html generated.
This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary.

View File

@ -101,8 +101,8 @@ void HTMLRenderer::process(PDFDoc *doc)
if(param->split_pages)
{
auto page_template_fn = str_fmt("%s/%s", param->dest_dir.c_str(), param->output_filename.c_str());
auto page_fn = str_fmt(page_template_fn, i);
auto filled_template_filename = str_fmt(param->output_filename.c_str(), i);
auto page_fn = str_fmt("%s/%s", param->dest_dir.c_str(), string((char*)filled_template_filename).c_str());
f_pages.fs.open((char*)page_fn, ofstream::binary);
if(!f_pages.fs)
throw string("Cannot open ") + (char*)page_fn + " for writing";

View File

@ -216,7 +216,7 @@ int main(int argc, char **argv)
if(get_suffix(param.input_filename) == ".pdf")
{
if(param.split_pages)
param.output_filename = s.substr(0, s.size() - 4) + "%d.page";
param.output_filename = sanitize_filename(s.substr(0, s.size() - 4) + "%d.page", true);
else
param.output_filename = s.substr(0, s.size() - 4) + ".html";
@ -224,16 +224,26 @@ int main(int argc, char **argv)
else
{
if(param.split_pages)
param.output_filename = s + "%d.page";
param.output_filename = sanitize_filename(s + "%d.page", true);
else
param.output_filename = s + ".html";
}
}
else if(param.split_pages && !std::regex_match(param.output_filename, std::regex("^.*%[0-9]*d.*$")))
else if(param.split_pages)
{
// Need to make sure we have a page number placeholder in the filename
if(!std::regex_match(param.output_filename, std::regex("^.*%[0-9]*d.*$")))
{
// Inject the placeholder just before the file extension
const string suffix = get_suffix(param.output_filename);
param.output_filename = param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix;
param.output_filename = sanitize_filename(param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix, true);
}
else
{
// Already have the placeholder, just make sure the name is safe.
param.output_filename = sanitize_filename(param.output_filename, true);
}
}
if(param.css_filename.empty())
{

View File

@ -6,6 +6,7 @@
*/
#include <errno.h>
#include <regex>
#include <sys/stat.h>
#include <sys/types.h>
@ -39,6 +40,21 @@ void create_directories(const string & path)
}
}
/*
 * Turn a file name into a string that is safe to hand to printf as a format.
 *
 * Every '%' in the input is doubled so printf prints it literally.  When
 * allow_single_format_number is true, the first "%d" or "%<digits>d" marker
 * (e.g. "%03d") is left as a live conversion; any later marker is escaped
 * like every other '%'.
 */
string sanitize_filename(const string & filename, bool allow_single_format_number)
{
    string result;
    result.reserve(filename.size() + 8);

    // True while one page-number marker may still be passed through unescaped.
    bool marker_available = allow_single_format_number;

    const string::size_type len = filename.size();
    for(string::size_type pos = 0; pos < len; ++pos)
    {
        const char ch = filename[pos];
        if(ch != '%')
        {
            result += ch;
            continue;
        }

        if(marker_available)
        {
            // Step over the optional run of width digits that follows the '%'.
            string::size_type after_digits = pos + 1;
            while(after_digits < len && filename[after_digits] >= '0' && filename[after_digits] <= '9')
                ++after_digits;

            if(after_digits < len && filename[after_digits] == 'd')
            {
                // Keep this '%' single; the digits and the 'd' are copied
                // verbatim by the following loop iterations.
                result += '%';
                marker_available = false;
                continue;
            }
        }

        // Anything else starting with '%' is escaped for printf.
        result += "%%";
    }
    return result;
}
bool is_truetype_suffix(const string & suffix)
{
return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf");

View File

@ -19,5 +19,16 @@ bool is_truetype_suffix(const std::string & suffix);
std::string get_filename(const std::string & path);
std::string get_suffix(const std::string & path);
/**
 * Function to sanitize a filename so that it can be eventually safely used in a printf statement.
 *
 * Every '%' in the filename is escaped as "%%" so that printf treats it as a literal character.
 *
 * @param filename the filename to be sanitized.
 * @param allow_single_format_number boolean flag indicating if a single format (e.g. %d) should be allowed
 * in the filename for use in templating of pages. e.g. page%02d.html is ok. Only the first
 * %d / %<digits>d marker is kept; any further markers are escaped.
 *
 * @return the sanitized filename.
 */
std::string sanitize_filename(const std::string & filename, bool allow_single_format_number);
} //namespace pdf2htmlEX
#endif //PATH_H__

BIN
test/test_data/1-page.pdf Normal file

Binary file not shown.

BIN
test/test_data/2-pages.pdf Normal file

Binary file not shown.

BIN
test/test_data/3-pages.pdf Normal file

Binary file not shown.

227
test/test_naming.py Normal file
View File

@ -0,0 +1,227 @@
#!/usr/bin/env python
import unittest
import os
import sys
import tempfile
import shutil
import subprocess
# The location where the executable is generated by the build.
# execute_pdf2htmlex_with_args joins this onto the directory containing this script.
PDF2HTMLEX_PATH = '../pdf2htmlEX'
# The location where the base css file, etc is stored in the build folder.
# Joined onto this script's directory and passed to the executable as --data-dir.
DATA_DIR = '../share'
# The location where our test PDFs are stored.
# path_to_test_file joins this onto this script's directory.
TEST_DATA_DIR = './test_data'
def execute_pdf2htmlex_with_args(args):
    """
    Execute the pdf2htmlEX with the specified arguments.

    The executable and its data directory are located relative to this file,
    and ``--data-dir`` is always passed ahead of the caller's arguments.

    :type args: list of values
    :param args: flat list of command line arguments (option names and values, in order).
        Each entry is converted with ``str()`` before being passed to the executable.
    :rtype: int
    :return: The exit code of the command
    """
    base_dir = os.path.dirname(__file__)
    executable = os.path.abspath(os.path.join(base_dir, PDF2HTMLEX_PATH))
    data_dir = os.path.abspath(os.path.join(base_dir, DATA_DIR))

    cmd = [executable, '--data-dir', data_dir]
    cmd.extend(str(val) for val in args)

    return_code = subprocess.call(cmd)
    if return_code != 0:
        # sys.stderr.write works on both Python 2 and Python 3, whereas the
        # Python 2-only "print >> sys.stderr" statement raises TypeError on 3.
        sys.stderr.write("Command return code %d: %s\n" % (return_code, ' '.join(cmd)))
    return return_code
def execute_pdf2htmlex_and_get_files(args):
    """
    Run pdf2htmlEX into a scratch directory and report which files it produced.

    A temporary directory is created and passed to pdf2htmlEX as ``--dest-dir``
    ahead of the supplied arguments. The directory is always removed again
    before returning, whether or not the command succeeded.

    :type args: list of values
    :param args: list of arguments to pass to the executable, in order.
    :rtype: list of str
    :return: Names of the generated output files in sorted order. None if the command does not execute successfully.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        exit_code = execute_pdf2htmlex_with_args(['--dest-dir', temp_dir] + args)
        if exit_code == 0:
            return sorted(os.listdir(temp_dir))
        return None
    finally:
        # Best-effort cleanup: errors while deleting the scratch directory are ignored.
        shutil.rmtree(path=temp_dir, ignore_errors=True)
def path_to_test_file(filename):
    """
    Build the absolute path of a file inside the test data directory.

    :type filename: str
    :param filename: the name of the test file to get the path to
    :rtype: str
    :returns: the full path to the test file
    """
    script_dir = os.path.dirname(__file__)
    relative_path = os.path.join(script_dir, TEST_DATA_DIR, filename)
    return os.path.abspath(relative_path)
class OutputNamingTests(unittest.TestCase):
    # Checks the names of the files pdf2htmlEX writes for single-file and
    # --split-pages output, including custom output names that contain
    # printf-style '%' sequences.
    #
    # assertEqual is used instead of the deprecated assertEquals alias, which
    # was removed in Python 3.12; assertEqual exists on Python 2.7 as well.
    # Test methods deliberately carry no docstrings so that the verbose test
    # runner keeps printing the method names.

    # --- single HTML output -------------------------------------------------

    def test_generate_single_html_default_name_single_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('1-page.pdf')
        ])
        self.assertEqual(files, ['1-page.html'])

    def test_generate_single_html_default_name_multiple_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf')
        ])
        self.assertEqual(files, ['2-pages.html'])

    def test_generate_single_html_specify_name_single_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('1-page.pdf'),
            'foo.html'
        ])
        self.assertEqual(files, ['foo.html'])

    def test_generate_single_html_specify_name_multiple_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo.html'
        ])
        self.assertEqual(files, ['foo.html'])

    # --- split pages, default and plain custom names ------------------------

    def test_generate_split_pages_default_name_single_page(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('1-page.pdf')
        ])
        self.assertEqual(files, sorted(['1-page.css', '1-page.outline', '1-page1.page']))

    def test_generate_split_pages_default_name_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf')
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', '3-pages1.page', '3-pages2.page', '3-pages3.page']))

    def test_generate_split_pages_specify_name_single_page(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('1-page.pdf'),
            'foo.xyz'
        ])
        self.assertEqual(files, sorted(['1-page.css', '1-page.outline', 'foo1.xyz']))

    def test_generate_split_pages_specify_name_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'foo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'foo1.xyz', 'foo2.xyz', 'foo3.xyz']))

    # --- split pages, custom names with a %d placeholder --------------------

    def test_generate_split_pages_specify_name_formatter_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'fo%do.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'fo1o.xyz', 'fo2o.xyz', 'fo3o.xyz']))

    def test_generate_split_pages_specify_name_formatter_with_padded_zeros_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'fo%03do.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'fo001o.xyz', 'fo002o.xyz', 'fo003o.xyz']))

    def test_generate_split_pages_specify_name_only_first_formatter_gets_taken(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%do%do.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f1o%do.xyz', 'f2o%do.xyz', 'f3o%do.xyz']))

    # --- split pages, other '%' sequences must be taken literally -----------

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_s(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%soo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%soo1.xyz', 'f%soo2.xyz', 'f%soo3.xyz']))

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_p(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%poo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%poo1.xyz', 'f%poo2.xyz', 'f%poo3.xyz']))

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_n(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%noo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%noo1.xyz', 'f%noo2.xyz', 'f%noo3.xyz']))

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_percent(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%%oo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%%oo1.xyz', 'f%%oo2.xyz', 'f%%oo3.xyz']))

    # --- single HTML output, '%' sequences in the name are never expanded ---

    def test_generate_single_html_name_specified_format_characters_percent_d(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%d.html'
        ])
        self.assertEqual(files, ['foo%d.html'])

    def test_generate_single_html_name_specified_format_characters_percent_p(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%p.html'
        ])
        self.assertEqual(files, ['foo%p.html'])

    def test_generate_single_html_name_specified_format_characters_percent_n(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%n.html'
        ])
        self.assertEqual(files, ['foo%n.html'])

    def test_generate_single_html_name_specified_format_characters_percent_percent(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%%.html'
        ])
        self.assertEqual(files, ['foo%%.html'])
if __name__=="__main__":
    # Resolve the executable relative to this file, the same way
    # execute_pdf2htmlex_with_args does, so the pre-flight check does not
    # depend on the current working directory.
    executable = os.path.abspath(os.path.join(os.path.dirname(__file__), PDF2HTMLEX_PATH))
    if not os.path.isfile(executable) or not os.access(executable, os.X_OK):
        # sys.stderr.write works on both Python 2 and Python 3, unlike the
        # Python 2-only "print >> sys.stderr" statement.
        sys.stderr.write("Cannot locate pdf2htmlEX executable. Make sure source was built before running this test.\n")
        sys.exit(1)
    suite = unittest.TestLoader().loadTestsFromTestCase(OutputNamingTests)
    result = unittest.TextTestRunner(verbosity=2).run(suite)
    # Report test failures through the exit status so callers can detect them.
    sys.exit(0 if result.wasSuccessful() else 1)