mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 04:50:09 +00:00
added input sanitation for split page generation when accepting a custom outfile file name format. Added unit tests for various file name generation scenarios.
This commit is contained in:
parent
af8e9c10ae
commit
83c947462a
@ -65,9 +65,9 @@ You need to modify the manifest if you do not want outline embedded.
|
||||
|
||||
.TP
|
||||
.B --split-pages <0|1> (Default: 0)
|
||||
If turned on, pages will be stored into separated files named as <output-filename>0.page, <output-filename>1.page, ...
|
||||
If turned on, pages will be stored into separated files. By default, these files will be named as <output-filename>0.page, <output-filename>1.page, ..., however the name of the files can be customized by adding a %d marker in the <output-filename> to specify how the page number should be used to generate the name. E.g. p%d.page yielding p1.page, p2.page ... or p%03d.page yielding p001.page, p002.page etc. Only %d may be used, no other formatting markers.
|
||||
|
||||
Also the css and outline will be stored into separated files, and the will be no <output-filename>.html generated.
|
||||
Also the css and outline will be stored into separated files, and there will be no <output-filename>.html generated.
|
||||
|
||||
This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary.
|
||||
|
||||
@ -83,7 +83,7 @@ If it's empty, the file name will be determined automatically.
|
||||
|
||||
.TP
|
||||
.B --outline-filename <filename> (Default: <none>)
|
||||
Specify the filename of the generated outline file, if not embedded.
|
||||
Specify the filename of the generated outline file, if not embedded.
|
||||
|
||||
If it's empty, the file name will be determined automatically.
|
||||
|
||||
|
@ -101,8 +101,8 @@ void HTMLRenderer::process(PDFDoc *doc)
|
||||
|
||||
if(param->split_pages)
|
||||
{
|
||||
auto page_template_fn = str_fmt("%s/%s", param->dest_dir.c_str(), param->output_filename.c_str());
|
||||
auto page_fn = str_fmt(page_template_fn, i);
|
||||
auto filled_template_filename = str_fmt(param->output_filename.c_str(), i);
|
||||
auto page_fn = str_fmt("%s/%s", param->dest_dir.c_str(), string((char*)filled_template_filename).c_str());
|
||||
f_pages.fs.open((char*)page_fn, ofstream::binary);
|
||||
if(!f_pages.fs)
|
||||
throw string("Cannot open ") + (char*)page_fn + " for writing";
|
||||
|
@ -216,7 +216,7 @@ int main(int argc, char **argv)
|
||||
if(get_suffix(param.input_filename) == ".pdf")
|
||||
{
|
||||
if(param.split_pages)
|
||||
param.output_filename = s.substr(0, s.size() - 4) + "%d.page";
|
||||
param.output_filename = sanitize_filename(s.substr(0, s.size() - 4) + "%d.page", true);
|
||||
else
|
||||
param.output_filename = s.substr(0, s.size() - 4) + ".html";
|
||||
|
||||
@ -224,16 +224,26 @@ int main(int argc, char **argv)
|
||||
else
|
||||
{
|
||||
if(param.split_pages)
|
||||
param.output_filename = s + "%d.page";
|
||||
param.output_filename = sanitize_filename(s + "%d.page", true);
|
||||
else
|
||||
param.output_filename = s + ".html";
|
||||
|
||||
}
|
||||
}
|
||||
else if(param.split_pages && !std::regex_match(param.output_filename, std::regex("^.*%[0-9]*d.*$")))
|
||||
else if(param.split_pages)
|
||||
{
|
||||
const string suffix = get_suffix(param.output_filename);
|
||||
param.output_filename = param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix;
|
||||
// Need to make sure we have a page number placeholder in the filename
|
||||
if(!std::regex_match(param.output_filename, std::regex("^.*%[0-9]*d.*$")))
|
||||
{
|
||||
// Inject the placeholder just before the file extension
|
||||
const string suffix = get_suffix(param.output_filename);
|
||||
param.output_filename = sanitize_filename(param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Already have the placeholder, just make sure the name is safe.
|
||||
param.output_filename = sanitize_filename(param.output_filename, true);
|
||||
}
|
||||
}
|
||||
if(param.css_filename.empty())
|
||||
{
|
||||
|
@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <regex>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
@ -39,6 +40,21 @@ void create_directories(const string & path)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Make a filename safe to be used as a printf-style format string.
 *
 * Every '%' in the input is doubled so that printf emits it literally.
 * When allow_single_format_number is set, the first escaped marker of the
 * form "%d" or "%<digits>d" (e.g. "%03d") is turned back into a live
 * conversion; any later markers stay escaped.
 *
 * @param filename the filename to be sanitized.
 * @param allow_single_format_number whether one %d-style marker may be kept.
 *
 * @return the sanitized filename.
 */
string sanitize_filename(const string & filename, bool allow_single_format_number)
{
    // Double every percent sign by hand; the result is the same as replacing "%" with "%%".
    string escaped;
    escaped.reserve(filename.size());
    for(const char ch : filename)
    {
        escaped += ch;
        if(ch == '%')
            escaped += ch;
    }

    if(!allow_single_format_number)
        return escaped;

    // Un-escape only the first "%%<digits>d" so exactly one number conversion remains.
    const std::regex escaped_number_marker("%%([0-9]*)d");
    return std::regex_replace(escaped, escaped_number_marker, "%$1d", std::regex_constants::format_first_only);
}
|
||||
|
||||
|
||||
bool is_truetype_suffix(const string & suffix)
|
||||
{
|
||||
return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf");
|
||||
|
@ -19,5 +19,16 @@ bool is_truetype_suffix(const std::string & suffix);
|
||||
std::string get_filename(const std::string & path);
|
||||
std::string get_suffix(const std::string & path);
|
||||
|
||||
/**
|
||||
* Function to sanitize a filename so that it can be eventually safely used in a printf statement.
|
||||
*
|
||||
* @param filename the filename to be sanitized.
|
||||
* @param allow_single_format_number boolean flag indicating whether a single format (e.g. %d) should be allowed
|
||||
* in the filename for use in templating of pages. e.g. page%02d.html is ok.
|
||||
*
|
||||
* @return the sanitized filename.
|
||||
*/
|
||||
std::string sanitize_filename(const std::string & filename, bool allow_single_format_number);
|
||||
|
||||
} //namespace pdf2htmlEX
|
||||
#endif //PATH_H__
|
||||
|
BIN
test/test_data/1-page.pdf
Normal file
BIN
test/test_data/1-page.pdf
Normal file
Binary file not shown.
BIN
test/test_data/2-pages.pdf
Normal file
BIN
test/test_data/2-pages.pdf
Normal file
Binary file not shown.
BIN
test/test_data/3-pages.pdf
Normal file
BIN
test/test_data/3-pages.pdf
Normal file
Binary file not shown.
227
test/test_naming.py
Normal file
227
test/test_naming.py
Normal file
@ -0,0 +1,227 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import unittest
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
# Path to the pdf2htmlEX executable produced by the build.
PDF2HTMLEX_PATH = '../pdf2htmlEX'

# Path to the build's data directory (base css file, etc.), passed as --data-dir.
DATA_DIR = '../share'

# Directory holding the PDF files used as test input.
TEST_DATA_DIR = './test_data'
|
||||
|
||||
def execute_pdf2htmlex_with_args(args):
    """
    Execute the pdf2htmlEX with the specified arguments.

    The executable and the --data-dir value are both resolved relative to the
    directory containing this file.

    :type args: list of values
    :param args: flat list of command line arguments to pass to the executable; each value is converted with str().

    :rtype: int
    :return: The exit code of the command
    """
    here = os.path.dirname(__file__)
    executable = os.path.abspath(os.path.join(here, PDF2HTMLEX_PATH))

    cmd = [executable, '--data-dir', os.path.abspath(os.path.join(here, DATA_DIR))]
    cmd.extend(str(val) for val in args)

    return_code = subprocess.call(cmd)

    if return_code != 0:
        # sys.stderr.write works on both Python 2 and 3, unlike the "print >>" statement.
        sys.stderr.write("Command return code %d: %s\n" % (return_code, ' '.join(cmd)))

    return return_code
|
||||
|
||||
def execute_pdf2htmlex_and_get_files(args):
    """
    Run pdf2htmlEX into a fresh temporary directory and report what it produced.

    A temporary directory is created and passed to pdf2htmlEX as --dest-dir ahead
    of the supplied arguments. The directory is removed again before returning,
    whether or not the command succeeded.

    :type args: list of values
    :param args: flat list of additional command line arguments for the executable.

    :rtype: list of str
    :return: Names of the generated files in sorted order, or None if the command exits with a non-zero code.
    """
    output_dir = tempfile.mkdtemp()

    try:
        status = execute_pdf2htmlex_with_args(['--dest-dir', output_dir] + args)
        if status == 0:
            return sorted(os.listdir(output_dir))
        return None
    finally:
        # Best-effort cleanup; errors while deleting are ignored.
        shutil.rmtree(path=output_dir, ignore_errors=True)
|
||||
|
||||
def path_to_test_file(filename):
    """
    Build the absolute path of a file inside the test data directory.

    The test data directory is located relative to the directory of this file.

    :type filename: str
    :param filename: name of the test file to locate

    :rtype: str
    :returns: absolute path to the test file
    """
    script_dir = os.path.dirname(__file__)
    relative_path = os.path.join(script_dir, TEST_DATA_DIR, filename)
    return os.path.abspath(relative_path)
|
||||
|
||||
class OutputNamingTests(unittest.TestCase):
    """
    Checks the names of the files pdf2htmlEX generates, both for single-file
    output and for --split-pages output, with default and user-specified
    output file names (including names containing printf-style markers).

    Each test runs the executable and compares the sorted directory listing of
    the output directory with the expected file names.
    """

    def test_generate_single_html_default_name_single_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('1-page.pdf')
        ])
        self.assertEqual(files, ['1-page.html'])

    def test_generate_single_html_default_name_multiple_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf')
        ])
        self.assertEqual(files, ['2-pages.html'])

    def test_generate_single_html_specify_name_single_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('1-page.pdf'),
            'foo.html'
        ])
        self.assertEqual(files, ['foo.html'])

    def test_generate_single_html_specify_name_multiple_page_pdf(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo.html'
        ])
        self.assertEqual(files, ['foo.html'])

    def test_generate_split_pages_default_name_single_page(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('1-page.pdf')
        ])
        self.assertEqual(files, sorted(['1-page.css', '1-page.outline', '1-page1.page']))

    def test_generate_split_pages_default_name_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf')
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', '3-pages1.page', '3-pages2.page', '3-pages3.page']))

    def test_generate_split_pages_specify_name_single_page(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('1-page.pdf'),
            'foo.xyz'
        ])
        self.assertEqual(files, sorted(['1-page.css', '1-page.outline', 'foo1.xyz']))

    def test_generate_split_pages_specify_name_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'foo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'foo1.xyz', 'foo2.xyz', 'foo3.xyz']))

    def test_generate_split_pages_specify_name_formatter_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'fo%do.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'fo1o.xyz', 'fo2o.xyz', 'fo3o.xyz']))

    def test_generate_split_pages_specify_name_formatter_with_padded_zeros_multiple_pages(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'fo%03do.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'fo001o.xyz', 'fo002o.xyz', 'fo003o.xyz']))

    def test_generate_split_pages_specify_name_only_first_formatter_gets_taken(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%do%do.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f1o%do.xyz', 'f2o%do.xyz', 'f3o%do.xyz']))

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_s(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%soo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%soo1.xyz', 'f%soo2.xyz', 'f%soo3.xyz']))

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_p(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%poo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%poo1.xyz', 'f%poo2.xyz', 'f%poo3.xyz']))

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_n(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%noo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%noo1.xyz', 'f%noo2.xyz', 'f%noo3.xyz']))

    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_percent(self):
        files = execute_pdf2htmlex_and_get_files([
            '--split-pages', 1,
            path_to_test_file('3-pages.pdf'),
            'f%%oo.xyz'
        ])
        self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%%oo1.xyz', 'f%%oo2.xyz', 'f%%oo3.xyz']))

    def test_generate_single_html_name_specified_format_characters_percent_d(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%d.html'
        ])
        self.assertEqual(files, ['foo%d.html'])

    def test_generate_single_html_name_specified_format_characters_percent_p(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%p.html'
        ])
        self.assertEqual(files, ['foo%p.html'])

    def test_generate_single_html_name_specified_format_characters_percent_n(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%n.html'
        ])
        self.assertEqual(files, ['foo%n.html'])

    def test_generate_single_html_name_specified_format_characters_percent_percent(self):
        files = execute_pdf2htmlex_and_get_files([
            path_to_test_file('2-pages.pdf'),
            'foo%%.html'
        ])
        self.assertEqual(files, ['foo%%.html'])
|
||||
|
||||
if __name__=="__main__":
    # Resolve the executable relative to this script (as the command helper does),
    # so the check does not depend on the current working directory.
    executable = os.path.abspath(os.path.join(os.path.dirname(__file__), PDF2HTMLEX_PATH))

    if not os.path.isfile(executable) or not os.access(executable, os.X_OK):
        # sys.stderr.write and sys.exit work on both Python 2 and 3.
        sys.stderr.write("Cannot locate pdf2htmlEX executable. Make sure source was built before running this test.\n")
        sys.exit(1)

    suite = unittest.loader.TestLoader().loadTestsFromTestCase(OutputNamingTests)
    unittest.TextTestRunner(verbosity=2).run(suite)
|
Loading…
Reference in New Issue
Block a user