1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

combined the sanitize and validate functions into a single function; limited the format characters supported to avoid validation complexity; updated documentation; feature implemented by Ryan Morlok (ryan.morlok@morlok.com) on behalf of Docalytics (http://www.docalytics.com/)

This commit is contained in:
Ryan Morlok 2013-03-18 01:39:02 -05:00
parent ccc8ff4761
commit 6298c19a3b
5 changed files with 45 additions and 64 deletions

View File

@ -67,7 +67,7 @@ You need to modify the manifest if you do not want outline embedded.
.B --split-pages <0|1> (Default: 0) .B --split-pages <0|1> (Default: 0)
If turned on, the pages, css, and outline will be stored into separated files and no consolidated <output-filename>.html will be generated. If turned on, the pages, css, and outline will be stored into separated files and no consolidated <output-filename>.html will be generated.
<output-filename> may be used to specify the format for the filenames for individual pages. <output-filename> may contain a %d placeholder to indicate where the page number should be placed. <output-filename> may be used to specify the format for the filenames for individual pages. <output-filename> may contain a %d placeholder to indicate where the page number should be placed. The placeholder supports a limited subset of normal numerical placeholders, including specified width and zero padding.
If <output-filename> does not contain a placeholder for the page number, the page number will be inserted directly before the file extension. If the filename does not have an extension, the page number will be placed at the end of the file name. If <output-filename> does not contain a placeholder for the page number, the page number will be inserted directly before the file extension. If the filename does not have an extension, the page number will be placed at the end of the file name.

View File

@ -215,33 +215,39 @@ int main(int argc, char **argv)
if(get_suffix(param.input_filename) == ".pdf") if(get_suffix(param.input_filename) == ".pdf")
{ {
if(param.split_pages) if(param.split_pages)
param.output_filename = sanitize_filename(s.substr(0, s.size() - 4) + "%d.page"); {
param.output_filename = s.substr(0, s.size() - 4) + "%d.page";
sanitize_filename(param.output_filename);
}
else else
{
param.output_filename = s.substr(0, s.size() - 4) + ".html"; param.output_filename = s.substr(0, s.size() - 4) + ".html";
}
} }
else else
{ {
if(param.split_pages) if(param.split_pages)
param.output_filename = sanitize_filename(s + "%d.page"); {
param.output_filename = s + "%d.page";
sanitize_filename(param.output_filename);
}
else else
{
param.output_filename = s + ".html"; param.output_filename = s + ".html";
}
} }
} }
else if(param.split_pages) else if(param.split_pages)
{ {
// Need to make sure we have a page number placeholder in the filename // Need to make sure we have a page number placeholder in the filename
if(!contains_integer_placeholder(param.output_filename)) if(!sanitize_filename(param.output_filename))
{ {
// Inject the placeholder just before the file extension // Inject the placeholder just before the file extension
const string suffix = get_suffix(param.output_filename); const string suffix = get_suffix(param.output_filename);
param.output_filename = sanitize_filename(param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix); param.output_filename = param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix;
} sanitize_filename(param.output_filename);
else
{
// Already have the placeholder, just make sure the name is safe.
param.output_filename = sanitize_filename(param.output_filename);
} }
} }
if(param.css_filename.empty()) if(param.css_filename.empty())

View File

@ -40,7 +40,7 @@ void create_directories(const string & path)
} }
} }
string sanitize_filename(const string & filename) bool sanitize_filename(string & filename)
{ {
string sanitized; string sanitized;
bool format_specifier_found = false; bool format_specifier_found = false;
@ -65,13 +65,13 @@ string sanitize_filename(const string & filename)
tmp.push_back(filename[i]); tmp.push_back(filename[i]);
// If we aren't still in option specifiers, stop looking // If we aren't still in option specifiers, stop looking
if(!strchr("+-#0123456789.", filename[i])) if(!strchr("0123456789", filename[i]))
{ {
break; break;
} }
} }
// Check to see if we yielded a valid format speifier // Check to see if we yielded a valid format specifier
if('d' == tmp.back()) if('d' == tmp.back())
{ {
// Found a valid integer format // Found a valid integer format
@ -94,46 +94,15 @@ string sanitize_filename(const string & filename)
} }
} }
return sanitized; // Only sanitize if it is a valid format.
} if(format_specifier_found)
bool contains_integer_placeholder(const string & filename)
{
for(size_t i = 0; i < filename.size(); i++)
{ {
if('%' == filename[i]) filename.assign(sanitized);
{
size_t original_i = i;
char last_char = '%';
while(++i < filename.size())
{
last_char = filename[i];
// If we aren't still in option specifiers, stop looking
if(!strchr("+-#0123456789.", last_char))
{
break;
}
}
// Check to see if we yielded a valid format speifier
if('d' == last_char)
{
// Yep.
return true;
}
else
{
// Nope. Resume looking where we left off.
i = original_i;
}
}
} }
return false; return format_specifier_found;
} }
bool is_truetype_suffix(const string & suffix) bool is_truetype_suffix(const string & suffix)
{ {
return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"); return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf");

View File

@ -20,24 +20,14 @@ std::string get_filename(const std::string & path);
std::string get_suffix(const std::string & path); std::string get_suffix(const std::string & path);
/** /**
* Function to sanitize a filename so that it can be eventually safely used in a printf * Sanitize all occurrences of '%' except for the first valid format specifier. Filename
* statement. Allows a single %d placeholder, but no other format specifiers. * is only sanitized if a formatter is found, and the function returns true.
* *
* @param filename the filename to be sanitized. * @param filename the filename to be sanitized. Value will be modified.
* *
* @return the sanitized filename. * @return true if a format specifier was found, false otherwise.
*/ */
std::string sanitize_filename(const std::string & filename); bool sanitize_filename(std::string & filename);
/**
* Function to check if a filename contains at least one %d integer placeholder
* for use in a printf statement.
*
* @param filename the filename to check
*
* @return true if the filename contains an integer placeholder, false otherwise.
*/
bool contains_integer_placeholder(const std::string & filename);
} //namespace pdf2htmlEX } //namespace pdf2htmlEX
#endif //PATH_H__ #endif //PATH_H__

View File

@ -190,6 +190,22 @@ class OutputNamingTests(unittest.TestCase):
]) ])
self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f%%oo1.xyz', 'f%%oo2.xyz', 'f%%oo3.xyz'])) self.assertEquals(files, sorted(['3-pages.css', '3-pages.outline', 'f%%oo1.xyz', 'f%%oo2.xyz', 'f%%oo3.xyz']))
def test_generate_split_pages_specify_name_only_percent_d_is_used_percent_percent_with_actual_placeholder(self):
    """Split-pages naming: a literal '%%' precedes the '%d' placeholder.

    Runs the converter with '--split-pages' on the 3-page test PDF using the
    output name 'f%%o%do.xyz' and expects one file per page in which '%d' is
    replaced by the page number while the '%%' sequence is kept verbatim.
    """
    files = execute_pdf2htmlex_and_get_files([
        '--split-pages', 1,
        path_to_test_file('3-pages.pdf'),
        'f%%o%do.xyz'
    ])
    # assertEqual rather than the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'f%%o1o.xyz', 'f%%o2o.xyz', 'f%%o3o.xyz']))
def test_generate_split_pages_specify_name_only_percent_d_is_used_actual_placeholder_before_percent_percent(self):
    """Split-pages naming: the '%d' placeholder precedes a literal '%%'.

    Runs the converter with '--split-pages' on the 3-page test PDF using the
    output name 'fo%do%%.xyz' and expects one file per page in which '%d' is
    replaced by the page number while the trailing '%%' is kept verbatim.

    This method needs a name of its own: a second method with the same name
    as an earlier one in the class replaces it, so the earlier test would
    never be collected or run.
    """
    files = execute_pdf2htmlex_and_get_files([
        '--split-pages', 1,
        path_to_test_file('3-pages.pdf'),
        'fo%do%%.xyz'
    ])
    # assertEqual rather than the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(files, sorted(['3-pages.css', '3-pages.outline', 'fo1o%%.xyz', 'fo2o%%.xyz', 'fo3o%%.xyz']))
def test_generate_split_pages_specify_name_only_formatter_starts_part_way_through_invalid_formatter(self): def test_generate_split_pages_specify_name_only_formatter_starts_part_way_through_invalid_formatter(self):
files = execute_pdf2htmlex_and_get_files([ files = execute_pdf2htmlex_and_get_files([
'--split-pages', 1, '--split-pages', 1,