From 64279bd26d3f1af456bb59570e2c119e1ebd747d Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 2 May 2013 16:09:42 +0800 Subject: [PATCH] --page-filename; a default HTML when --split-pages is on; Fixes #125 --- pdf2htmlEX.1.in | 18 ++++++++---- share/manifest | 7 ++++- share/pdf2htmlEX.js.in | 40 +++++++++++++++++++------- src/HTMLRenderer/HTMLRenderer.h | 2 ++ src/HTMLRenderer/general.cc | 50 ++++++++++++++++++++++----------- src/Param.h | 1 + src/pdf2htmlEX.cc | 48 +++++++++++++++---------------- 7 files changed, 107 insertions(+), 59 deletions(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 22d5fad..3f6528e 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -15,7 +15,7 @@ .PP pdf2htmlEX is a utility that converts PDF files to HTML files. -pdf2htmlEX tries its best to render the PDF precisely, maintain proper styling, while retaining text and optmizing for Web. +pdf2htmlEX tries its best to render the PDF precisely, maintain proper styling, while retaining text and optimizing for Web. Fonts are extracted form PDF and then embedded into HTML (Type 3 fonts are not supported). Text in the converted HTML file is usually selectable and copyable. @@ -65,15 +65,15 @@ You need to modify the manifest if you do not want outline embedded. .TP .B --split-pages <0|1> (Default: 0) -If turned on, the pages, css, and outline will be stored into separated files and no consolidated .html will be generated. +If turned on, the content of each page is stored in a separated file. - may be used to specify the format for the filenames for individual pages. may contain a %d placeholder to indicate where the page number should be placed. The placeholder supports a limited subset of normal numerical placeholders, including specified width and zero padding. +--page-filename may be used to specify the format for the filenames for individual pages. A %d placeholder may be included to indicate where the page number should be placed. 
The placeholder supports a limited subset of normal numerical placeholders, including specified width and zero padding. -If <output-filename> does not contain a placeholder for the page number, the page number will be inserted directly before the file extension. If the filename does not have an extension, the page number will be placed at the end of the file name. +If --page-filename does not contain a placeholder for the page number, the page number will be inserted directly before the file extension. If the filename does not have an extension, the page number will be placed at the end of the file name. -If <output-filename> is not specified, <input-filename> will be used for the output filename, replacing the extension with .page and adding the page number directly before the extension. +If --page-filename is not specified, <input-filename> will be used for the output filename, replacing the extension with .page and adding the page number directly before the extension. -This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary. +This switch is useful if you want pages to be loaded separately & dynamically -- a supporting server might be necessary. .B Examples @@ -103,6 +103,12 @@ Specify the filename of the generated css file, if not embedded. If it's empty, the file name will be determined automatically. +.TP +.B --page-filename <filename> (Default: <none>) +Specify the filename template for pages. This is only useful when --split-pages is 1 + +If it's empty, a default one will be used, see description of --split-pages + .TP .B --outline-filename <filename> (Default: <none>) Specify the filename of the generated outline file, if not embedded. 
diff --git a/share/manifest b/share/manifest index 2fa7a85..fb38d05 100644 --- a/share/manifest +++ b/share/manifest @@ -38,7 +38,12 @@ $css new pdf2htmlEX.Viewer({ container_id : 'page-container', sidebar_id : 'sidebar', - outline_id : 'outline' + outline_id : 'outline', + page_urls : [ +""" +$page_urls +""" +] }); """ diff --git a/share/pdf2htmlEX.js.in b/share/pdf2htmlEX.js.in index ef9d24c..a1b70cf 100644 --- a/share/pdf2htmlEX.js.in +++ b/share/pdf2htmlEX.js.in @@ -107,6 +107,7 @@ var pdf2htmlEX = (function(){ this.container_id = config['container_id']; this.sidebar_id = config['sidebar_id']; this.outline_id = config['outline_id']; + this.page_urls = config['page_urls']; this.init_before_loading_content(); var _ = this; @@ -122,7 +123,6 @@ var pdf2htmlEX = (function(){ /*hide all pages before loading, will reveal only visible ones later */ this.pre_hide_pages(); }, - init_after_loading_content : function() { this.sidebar = $('#'+this.sidebar_id); this.outline = $('#'+this.outline_id); @@ -133,15 +133,7 @@ var pdf2htmlEX = (function(){ this.sidebar.addClass('opened'); } - // collect pages - var new_pages = new Array(); - var pl= $('.'+CSS_CLASS_NAMES['page_frame'], this.container); - /* don't use for(..in..) */ - for(var i = 0, l = pl.length; i < l; ++i) { - var p = new Page(pl[i], this.container); - new_pages[p.n] = p; - } - this.pages = new_pages; + this.find_pages(); // register schedule rendering var _ = this; @@ -156,6 +148,34 @@ var pdf2htmlEX = (function(){ $('.'+CSS_CLASS_NAMES['background_image'], this.container).on('dragstart', function(e){return false;}); this.render(); + + // load split pages + // has no effect if --split-pages is 0 + this.load_page(0); + }, + find_pages : function() { + var new_pages = new Array(); + var pl= $('.'+CSS_CLASS_NAMES['page_frame'], this.container); + /* don't use for(..in..) 
*/ + for(var i = 0, l = pl.length; i < l; ++i) { + var p = new Page(pl[i], this.container); + new_pages[p.n] = p; + } + this.pages = new_pages; + }, + load_page : function(idx) { + if(idx < this.page_urls.length){ + var _ = this; + $.ajax({ + url: this.page_urls[idx], + dataType: 'text' + }).done(function(data){ + $('#'+_.container_id).append(data); + _.find_pages(); + _.schedule_render(); + _.load_page(idx+1); + }); + } }, pre_hide_pages : function() { /* pages might have not been loaded yet, so add a CSS rule */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 85d9eb0..844a1d1 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -309,6 +309,8 @@ protected: Preprocessor preprocessor; TmpFiles tmp_files; + // for splitted pages + std::vector page_filenames; // for string formatting StringFormatter str_fmt; diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index d327867..950501f 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -110,12 +110,14 @@ void HTMLRenderer::process(PDFDoc *doc) if(param.split_pages) { - string filled_template_filename = (char*)str_fmt(param.output_filename.c_str(), i); + string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i); auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str()); f_pages.fs.open((char*)page_fn, ofstream::binary); if(!f_pages.fs) throw string("Cannot open ") + (char*)page_fn + " for writing"; set_stream_flags(f_pages.fs); + + page_filenames.push_back(filled_template_filename); } if(param.process_nontext) @@ -273,23 +275,23 @@ void HTMLRenderer::pre_process(PDFDoc * doc) // we may output utf8 characters, so always use binary { /* - * If single-html && !split-pages + * If single-html * we have to keep the generated css file into a temporary place * and embed it into the main html later * * - * If single-html && split-page + * If single-html * as 
there's no place to embed the css file, just leave it alone (into param.dest_dir) * * If !single-html * leave it in param.dest_dir */ - auto fn = (param.single_html && (!param.split_pages)) + auto fn = (param.single_html) ? str_fmt("%s/__css", param.tmp_dir.c_str()) : str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str()); - if(param.single_html && (!param.split_pages)) + if(param.single_html) tmp_files.add((char*)fn); f_css.path = (char*)fn; @@ -305,11 +307,11 @@ void HTMLRenderer::pre_process(PDFDoc * doc) * The logic for outline is similar to css */ - auto fn = (param.single_html && (!param.split_pages)) + auto fn = (param.single_html) ? str_fmt("%s/__outline", param.tmp_dir.c_str()) : str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str()); - if(param.single_html && (!param.split_pages)) + if(param.single_html) tmp_files.add((char*)fn); f_outline.path = (char*)fn; @@ -355,10 +357,7 @@ void HTMLRenderer::post_process(void) f_pages.fs.close(); f_css.fs.close(); - //only when split-page == 0, do we have some work left to do - if(param.split_pages) - return; - + // build the main HTML file ofstream output; { auto fn = str_fmt("%s/%s", param.dest_dir.c_str(), param.output_filename.c_str()); @@ -375,8 +374,11 @@ void HTMLRenderer::post_process(void) bool embed_string = false; string line; + long line_no = 0; while(getline(manifest_fin, line)) { + ++line_no; + if(line == "\"\"\"") { embed_string = !embed_string; @@ -420,15 +422,29 @@ void HTMLRenderer::post_process(void) } else if (line == "$pages") { - ifstream fin(f_pages.path, ifstream::binary); - if(!fin) - throw "Cannot open pages for reading"; - output << fin.rdbuf(); - output.clear(); // output will set fail big if fin is empty + if(!param.split_pages) + { + ifstream fin(f_pages.path, ifstream::binary); + if(!fin) + throw "Cannot open pages for reading"; + output << fin.rdbuf(); + output.clear(); // output will set fail bit if fin is empty + } + } + else if (line == "$page_urls") 
+ { + for(auto iter = page_filenames.begin(); iter != page_filenames.end(); ++iter) + { + if(iter != page_filenames.begin()) + output << ","; + output << "'"; + outputURL(output, *iter); + output << "'"; + } } else { - cerr << "Warning: unknown line in manifest: " << line << endl; + cerr << "Warning: manifest line " << line_no << ": Unknown content \"" << line << "\"" << endl; } continue; } diff --git a/src/Param.h b/src/Param.h index 0178ec3..5ce0d4d 100644 --- a/src/Param.h +++ b/src/Param.h @@ -29,6 +29,7 @@ struct Param int split_pages; std::string dest_dir; std::string css_filename; + std::string page_filename; std::string outline_filename; int process_nontext; int process_outline; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index a5258ea..dfb89bf 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -74,6 +74,7 @@ void parse_options (int argc, char **argv) .add("split-pages", &param.split_pages, 0, "split pages into separate files") .add("dest-dir", &param.dest_dir, ".", "specify destination directory") .add("css-filename", &param.css_filename, "", "filename of the generated css file") + .add("page-filename", &param.page_filename, "", "filename template for splitted pages ") .add("outline-filename", &param.outline_filename, "", "filename of the generated outline file") .add("process-nontext", &param.process_nontext, 1, "render graphics in addition to text") .add("process-outline", &param.process_outline, 1, "show outline in HTML") @@ -215,43 +216,40 @@ int main(int argc, char **argv) if(param.output_filename.empty()) { const string s = get_filename(param.input_filename); - if(get_suffix(param.input_filename) == ".pdf") { - if(param.split_pages) - { - param.output_filename = s.substr(0, s.size() - 4) + "%d.page"; - sanitize_filename(param.output_filename); - } - else - { - param.output_filename = s.substr(0, s.size() - 4) + ".html"; - } + param.output_filename = s.substr(0, s.size() - 4) + ".html"; } else { - if(param.split_pages) - { - param.output_filename = s + "%d.page"; - 
sanitize_filename(param.output_filename); - } - else - { - param.output_filename = s + ".html"; - } - + param.output_filename = s + ".html"; } } - else if(param.split_pages) + + if(param.page_filename.empty()) + { + const string s = get_filename(param.input_filename); + if(get_suffix(param.input_filename) == ".pdf") + { + param.page_filename = s.substr(0, s.size() - 4) + "%d.page"; + } + else + { + param.page_filename = s + "%d.page"; + } + sanitize_filename(param.page_filename); + } + + else { // Need to make sure we have a page number placeholder in the filename - if(!sanitize_filename(param.output_filename)) + if(!sanitize_filename(param.page_filename)) { // Inject the placeholder just before the file extension - const string suffix = get_suffix(param.output_filename); - param.output_filename = param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix; - sanitize_filename(param.output_filename); + const string suffix = get_suffix(param.page_filename); + param.page_filename = param.page_filename.substr(0, param.page_filename.size() - suffix.size()) + "%d" + suffix; + sanitize_filename(param.page_filename); } } if(param.css_filename.empty())