Added input sanitization for split-page generation when accepting a custom output file name format. Added unit tests for various file name generation scenarios.

2024-12-22 13:00:08 +00:00 · 2013-03-17 00:08:06 -05:00 · 2013-03-17 00:08:06 -05:00 · 83c947462a
commit 83c947462a
parent af8e9c10ae
9 changed files with 274 additions and 10 deletions
--- a/pdf2htmlEX.1.in
+++ b/pdf2htmlEX.1.in
@ -65,9 +65,9 @@ You need to modify the manifest if you do not want outline embedded.

 .TP
 .B --split-pages <0|1> (Default: 0)
-If turned on, pages will be stored into separated files named as <output-filename>0.page, <output-filename>1.page, ...
+If turned on, pages will be stored into separated files. By default, these files will be named as <output-filename>0.page, <output-filename>1.page, ...; however, the names of the files can be customized by adding a %d marker in the <output-filename> to specify how the page number should be used to generate the name, e.g. p%d.page yielding p1.page, p2.page, ... or p%03d.page yielding p001.page, p002.page, etc. Only %d may be used, no other formatting markers.

-Also the css and outline will be stored into separated files, and the will be no <output-filename>.html generated.
+Also the css and outline will be stored into separated files, and there will be no <output-filename>.html generated.

 This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary.

--- a/src/HTMLRenderer/general.cc
+++ b/src/HTMLRenderer/general.cc
@ -101,8 +101,8 @@ void HTMLRenderer::process(PDFDoc *doc)

        if(param->split_pages)
        {
-            auto page_template_fn = str_fmt("%s/%s", param->dest_dir.c_str(), param->output_filename.c_str());
-            auto page_fn = str_fmt(page_template_fn, i);
+            auto filled_template_filename = str_fmt(param->output_filename.c_str(), i);
+            auto page_fn = str_fmt("%s/%s", param->dest_dir.c_str(), string((char*)filled_template_filename).c_str());
            f_pages.fs.open((char*)page_fn, ofstream::binary); 
            if(!f_pages.fs)
                throw string("Cannot open ") + (char*)page_fn + " for writing";
--- a/src/pdf2htmlEX.cc
+++ b/src/pdf2htmlEX.cc
@ -216,7 +216,7 @@ int main(int argc, char **argv)
            if(get_suffix(param.input_filename) == ".pdf")
            {
                if(param.split_pages)
-                    param.output_filename = s.substr(0, s.size() - 4) + "%d.page";
+                    param.output_filename = sanitize_filename(s.substr(0, s.size() - 4) + "%d.page", true);
                else
                    param.output_filename = s.substr(0, s.size() - 4) + ".html";

@ -224,16 +224,26 @@ int main(int argc, char **argv)
            else
            {
                if(param.split_pages)
-                    param.output_filename = s + "%d.page";
+                    param.output_filename = sanitize_filename(s + "%d.page", true);
                else
                    param.output_filename = s + ".html";
                
            }
        }
-		else if(param.split_pages && !std::regex_match(param.output_filename, std::regex("^.*%[0-9]*d.*$")))
+		else if(param.split_pages)
        {
-            const string suffix = get_suffix(param.output_filename);
-            param.output_filename = param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix;
+            // Need to make sure we have a page number placeholder in the filename
+            if(!std::regex_match(param.output_filename, std::regex("^.*%[0-9]*d.*$")))
+            {
+                // Inject the placeholder just before the file extension
+                const string suffix = get_suffix(param.output_filename);
+                param.output_filename = sanitize_filename(param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix, true);
+            }
+            else
+            {
+                // Already have the placeholder, just make sure the name is safe.
+                param.output_filename = sanitize_filename(param.output_filename, true);
+            }
        }
        if(param.css_filename.empty())
        {
--- a/src/util/path.cc
+++ b/src/util/path.cc
@ -6,6 +6,7 @@
 */

 #include <errno.h>
+#include <regex>
 #include <sys/stat.h>
 #include <sys/types.h>

@ -39,6 +40,21 @@ void create_directories(const string & path)
    }
 }

+string sanitize_filename(const string & filename, bool allow_single_format_number)
+{
+    // Double every '%' so that no character sequence in the name acts as a printf conversion.
+    string sanitized = std::regex_replace(filename, std::regex("%"), "%%");
+    
+    if(allow_single_format_number)
+    {
+        // Un-escape only the first "%%<digits>d" back to "%<digits>d" (e.g. %d or %03d); any later one stays escaped.
+        sanitized = std::regex_replace(sanitized, std::regex("%%([0-9]*)d"), "%$1d", std::regex_constants::format_first_only);
+    }
+    
+    return sanitized;
+}
+
+
 bool is_truetype_suffix(const string & suffix)
 {
    return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf");
--- a/src/util/path.h
+++ b/src/util/path.h
@ -19,5 +19,16 @@ bool is_truetype_suffix(const std::string & suffix);
 std::string get_filename(const std::string & path);
 std::string get_suffix(const std::string & path);

+/**
+ * Sanitize a filename so that it can safely be used as a printf-style format string.
+ * Every '%' in the name is escaped as "%%".
+ *
+ * @param filename the filename to be sanitized.
+ * @param allow_single_format_number if true, the first integer placeholder in the name
+ *     (%d, optionally with digits between '%' and 'd', e.g. page%02d.html) is kept as a
+ *     placeholder for the page number; any further placeholders are escaped.
+ *
+ * @return the sanitized filename.
+ */ 
+std::string sanitize_filename(const std::string & filename, bool allow_single_format_number);
+
 } //namespace pdf2htmlEX 
 #endif //PATH_H__
--- a/test/test_data/1-page.pdf
+++ b/test/test_data/1-page.pdf
--- a/test/test_data/2-pages.pdf
+++ b/test/test_data/2-pages.pdf
--- a/test/test_data/3-pages.pdf
+++ b/test/test_data/3-pages.pdf
--- a/test/test_naming.py
+++ b/test/test_naming.py
@ -0,0 +1,227 @@
+#!/usr/bin/env python
+
+import unittest
+import os
+import sys
+import tempfile
+import shutil
+import subprocess
+
+# Location of the built pdf2htmlEX executable; the test helpers join it with this file's directory
+PDF2HTMLEX_PATH = '../pdf2htmlEX'
+
+# Location of the base css file, etc. in the build folder; joined with this file's directory and passed as --data-dir
+DATA_DIR = '../share'
+
+# Location of the test PDFs; joined with this file's directory by path_to_test_file
+TEST_DATA_DIR = './test_data'
+
+def execute_pdf2htmlex_with_args(args):
+    """
+    Run the pdf2htmlEX executable with the specified arguments. '--data-dir <DATA_DIR>' is always passed first.
+
+    :type args: list of values
+    :param args: flat list of command-line arguments; each item is converted with str() and appended in order.
+
+    :rtype: int
+    :return: The exit code of the command. A non-zero code is also reported on stderr together with the command line.
+    """
+    executable = os.path.abspath(os.path.join(os.path.dirname(__file__), PDF2HTMLEX_PATH))
+
+    cmd = [executable, '--data-dir', os.path.abspath(os.path.join(os.path.dirname(__file__), DATA_DIR))]
+
+    for val in args:
+        cmd.append(str(val))
+
+    return_code = subprocess.call(cmd)
+
+    if return_code != 0:
+        print >> sys.stderr, "Command return code %d: %s" % (return_code, ' '.join(cmd))
+
+    return return_code
+
+def execute_pdf2htmlex_and_get_files(args):
+    """
+    Execute the pdf2htmlEX with the specified arguments, and get the names of the output files. Will automatically create
+    a temporary directory for the output, pass that as the output dir to pdf2htmlEX, determine the files generated, and
+    clean up the temporary directory afterwards.
+
+    :type args: list of values
+    :param args: flat list of extra command-line arguments; they are appended after '--dest-dir <temp dir>'.
+
+    :rtype: list of str
+    :return: Sorted list of the names found in the output directory. None if the command does not execute successfully.
+    """
+    temp_dir = tempfile.mkdtemp()
+
+    try:
+        if execute_pdf2htmlex_with_args(['--dest-dir', temp_dir] + args) != 0:
+            return None
+
+        files = os.listdir(temp_dir)
+        files.sort()
+        return files
+    finally:
+        shutil.rmtree(path=temp_dir, ignore_errors=True)
+
+def path_to_test_file(filename):
+    """
+    Retrieve an absolute path to the specified test file, resolved against TEST_DATA_DIR next to this file.
+
+    :type filename: str
+    :param filename: the name of the test file to get the path to
+
+    :rtype: str
+    :returns: the full path to the test file
+    """
+    return os.path.abspath(os.path.join(os.path.dirname(__file__), TEST_DATA_DIR, filename))
+
+class OutputNamingTests(unittest.TestCase):  # end-to-end checks of the output file names pdf2htmlEX produces
+    def test_generate_single_html_default_name_single_page_pdf(self):
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('1-page.pdf')
+        ])
+        self.assertEquals(files, ['1-page.html'])
+
+    def test_generate_single_html_default_name_multiple_page_pdf(self):
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('2-pages.pdf')
+        ])
+        self.assertEquals(files, ['2-pages.html'])
+
+    def test_generate_single_html_specify_name_single_page_pdf(self):
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('1-page.pdf'),
+            'foo.html'
+        ])
+        self.assertEquals(files, ['foo.html'])
+
+    def test_generate_single_html_specify_name_multiple_page_pdf(self):
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('2-pages.pdf'),
+            'foo.html'
+        ])
+        self.assertEquals(files, ['foo.html'])
+
+    def test_generate_split_pages_default_name_single_page(self):  # expects <input stem><page>.page, pages counted from 1
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('1-page.pdf')
+        ])
+        self.assertEquals(files, sorted(['1-page.css', '1-page.outline', '1-page1.page']))
+
+    def test_generate_split_pages_default_name_multiple_pages(self):
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf')
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', '3-pages1.page', '3-pages2.page', '3-pages3.page']))
+
+    def test_generate_split_pages_specify_name_single_page(self):  # no %d given: page number is expected just before the extension
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('1-page.pdf'),
+            'foo.xyz'
+        ])
+        self.assertEquals(files, sorted(['1-page.css', '1-page.outline', 'foo1.xyz']))
+
+    def test_generate_split_pages_specify_name_multiple_pages(self):
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'foo.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'foo1.xyz', 'foo2.xyz', 'foo3.xyz']))
+
+    def test_generate_split_pages_specify_name_formatter_multiple_pages(self):
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'fo%do.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'fo1o.xyz', 'fo2o.xyz', 'fo3o.xyz']))
+
+    def test_generate_split_pages_specify_name_formatter_with_padded_zeros_multiple_pages(self):
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'fo%03do.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'fo001o.xyz', 'fo002o.xyz', 'fo003o.xyz']))
+
+    def test_generate_split_pages_specify_name_only_first_formatter_gets_taken(self):  # only the first %d is expected to be substituted; the second stays literal
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'f%do%do.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f1o%do.xyz', 'f2o%do.xyz', 'f3o%do.xyz']))
+
+    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_s(self):  # non-%d markers are expected to stay literal; the page number goes before the extension
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'f%soo.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f%soo1.xyz', 'f%soo2.xyz', 'f%soo3.xyz']))
+
+    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_p(self):
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'f%poo.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f%poo1.xyz', 'f%poo2.xyz', 'f%poo3.xyz']))
+
+
+    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_n(self):
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'f%noo.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f%noo1.xyz', 'f%noo2.xyz', 'f%noo3.xyz']))
+
+    def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_percent(self):
+        files = execute_pdf2htmlex_and_get_files([
+            '--split-pages', 1,
+            path_to_test_file('3-pages.pdf'),
+            'f%%oo.xyz'
+        ])
+        self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f%%oo1.xyz', 'f%%oo2.xyz', 'f%%oo3.xyz']))
+
+    def test_generate_single_html_name_specified_format_characters_percent_d(self):  # without --split-pages, format markers are expected to appear literally in the name
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('2-pages.pdf'),
+            'foo%d.html'
+        ])
+        self.assertEquals(files, ['foo%d.html'])
+
+    def test_generate_single_html_name_specified_format_characters_percent_p(self):
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('2-pages.pdf'),
+            'foo%p.html'
+        ])
+        self.assertEquals(files, ['foo%p.html'])
+
+    def test_generate_single_html_name_specified_format_characters_percent_n(self):
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('2-pages.pdf'),
+            'foo%n.html'
+        ])
+        self.assertEquals(files, ['foo%n.html'])
+
+    def test_generate_single_html_name_specified_format_characters_percent_percent(self):
+        files = execute_pdf2htmlex_and_get_files([
+            path_to_test_file('2-pages.pdf'),
+            'foo%%.html'
+        ])
+        self.assertEquals(files, ['foo%%.html'])
+        
+if __name__=="__main__":  # NOTE(review): the check below resolves PDF2HTMLEX_PATH against the current directory, while the tests resolve it against this file's directory
+    if not os.path.isfile(PDF2HTMLEX_PATH) or not os.access(PDF2HTMLEX_PATH, os.X_OK):
+        print >> sys.stderr, "Cannot locate pdf2htmlEX executable. Make sure source was built before running this test."
+        exit(1)
+
+    suite = unittest.loader.TestLoader().loadTestsFromTestCase(OutputNamingTests)
+    unittest.TextTestRunner(verbosity=2).run(suite)  # the run result is not turned into an exit code here