--page-filename; a default HTML when --split-pages is on; Fixes #125

2024-12-22 13:00:08 +00:00 · 2013-05-02 16:09:42 +08:00 · 2013-05-02 16:09:42 +08:00 · 64279bd26d
commit 64279bd26d
parent 3d0b872a7b
7 changed files with 107 additions and 59 deletions
--- a/pdf2htmlEX.1.in
+++ b/pdf2htmlEX.1.in
@ -15,7 +15,7 @@
 .PP
 pdf2htmlEX is a utility that converts PDF files to HTML files.

-pdf2htmlEX tries its best to render the PDF precisely, maintain proper styling, while retaining text and optmizing for Web.
+pdf2htmlEX tries its best to render the PDF precisely, maintain proper styling, while retaining text and optimizing for Web.

 Fonts are extracted form PDF and then embedded into HTML (Type 3 fonts are not supported). Text in the converted HTML file is usually selectable and copyable. 

@ -65,15 +65,15 @@ You need to modify the manifest if you do not want outline embedded.

 .TP
 .B --split-pages <0|1> (Default: 0)
-If turned on, the pages, css, and outline will be stored into separated files and no consolidated <output-filename>.html will be generated. 
+If turned on, the content of each page is stored in a separate file.

-<output-filename> may be used to specify the format for the filenames for individual pages. <output-filename> may contain a %d placeholder to indicate where the page number should be placed. The placeholder supports a limited subset of normal numerical placeholders, including specified width and zero padding.
+--page-filename may be used to specify the format for the filenames for individual pages. A %d placeholder may be included to indicate where the page number should be placed. The placeholder supports a limited subset of normal numerical placeholders, including specified width and zero padding.

-If <output-filename> does not contain a placeholder for the page number, the page number will be inserted directly before the file extension. If the filename does not have an extension, the page number will be placed at the end of the file name.
+If --page-filename does not contain a placeholder for the page number, the page number will be inserted directly before the file extension. If the filename does not have an extension, the page number will be placed at the end of the file name.

-If <output-filename> is not specified, <input-filename> will be used for the output filename, replacing the extension with .page and adding the page number directly before the extension.
+If --page-filename is not specified, <input-filename> will be used for the output filename, replacing the extension with .page and adding the page number directly before the extension.

-This switch is useful if you want pages to be loaded separately & dynamically -- in which case you need to compose the page yourself, and a supporting backend might be necessary.
+This switch is useful if you want pages to be loaded separately & dynamically, in which case a supporting server might be necessary.

 .B Examples

@ -103,6 +103,12 @@ Specify the filename of the generated css file, if not embedded.

 If it's empty, the file name will be determined automatically.

+.TP
+.B --page-filename <filename> (Default: <none>)
+Specify the filename template for pages. This is only useful when --split-pages is 1.
+
+If it's empty, a default one will be used; see the description of --split-pages.
+
 .TP
 .B --outline-filename <filename> (Default: <none>)
 Specify the filename of the generated outline file, if not embedded. 
--- a/share/manifest
+++ b/share/manifest
@ -38,7 +38,12 @@ $css
 new pdf2htmlEX.Viewer({
    container_id : 'page-container', 
    sidebar_id : 'sidebar',
-    outline_id : 'outline'
+    outline_id : 'outline',
+    page_urls : [
+"""
+$page_urls
+"""
+]
 });
 </script>
 """
--- a/share/pdf2htmlEX.js.in
+++ b/share/pdf2htmlEX.js.in
@ -107,6 +107,7 @@ var pdf2htmlEX = (function(){
    this.container_id = config['container_id'];
    this.sidebar_id = config['sidebar_id'];
    this.outline_id = config['outline_id'];
+    this.page_urls = config['page_urls'];
    this.init_before_loading_content();

    var _ = this;
@ -122,7 +123,6 @@ var pdf2htmlEX = (function(){
      /*hide all pages before loading, will reveal only visible ones later */
      this.pre_hide_pages();
    },
-
    init_after_loading_content : function() {
      this.sidebar = $('#'+this.sidebar_id);
      this.outline = $('#'+this.outline_id);
@ -133,15 +133,7 @@ var pdf2htmlEX = (function(){
        this.sidebar.addClass('opened');
      }
      
-      // collect pages
-      var new_pages = new Array();
-      var pl= $('.'+CSS_CLASS_NAMES['page_frame'], this.container);
-      /* don't use for(..in..) */
-      for(var i = 0, l = pl.length; i < l; ++i) {
-        var p = new Page(pl[i], this.container);
-        new_pages[p.n] = p;
-      }
-      this.pages = new_pages;
+      this.find_pages();

      // register schedule rendering
      var _ = this;
@ -156,6 +148,34 @@ var pdf2htmlEX = (function(){
      $('.'+CSS_CLASS_NAMES['background_image'], this.container).on('dragstart', function(e){return false;});

      this.render();
+
+      // load split pages
+      // has no effect if --split-pages is 0
+      this.load_page(0);
+    },
+    find_pages : function() {
+      var new_pages = new Array();
+      var pl= $('.'+CSS_CLASS_NAMES['page_frame'], this.container);
+      /* don't use for(..in..) */
+      for(var i = 0, l = pl.length; i < l; ++i) {
+        var p = new Page(pl[i], this.container);
+        new_pages[p.n] = p;
+      }
+      this.pages = new_pages;
+    },
+    load_page : function(idx) {
+      if(idx < this.page_urls.length){
+        var _ = this;
+        $.ajax({
+          url: this.page_urls[idx],
+          dataType: 'text'
+        }).done(function(data){
+          $('#'+_.container_id).append(data);
+          _.find_pages();
+          _.schedule_render();
+          _.load_page(idx+1);
+        });
+      }
    },
    pre_hide_pages : function() {
      /* pages might have not been loaded yet, so add a CSS rule */
--- a/src/HTMLRenderer/HTMLRenderer.h
+++ b/src/HTMLRenderer/HTMLRenderer.h
@ -309,6 +309,8 @@ protected:

    Preprocessor preprocessor;
    TmpFiles tmp_files;
+    // for split pages
+    std::vector<std::string> page_filenames;

    // for string formatting
    StringFormatter str_fmt;
--- a/src/HTMLRenderer/general.cc
+++ b/src/HTMLRenderer/general.cc
@ -110,12 +110,14 @@ void HTMLRenderer::process(PDFDoc *doc)

        if(param.split_pages)
        {
-            string filled_template_filename = (char*)str_fmt(param.output_filename.c_str(), i);
+            string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i);
            auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str());
            f_pages.fs.open((char*)page_fn, ofstream::binary); 
            if(!f_pages.fs)
                throw string("Cannot open ") + (char*)page_fn + " for writing";
            set_stream_flags(f_pages.fs);
+
+            page_filenames.push_back(filled_template_filename);
        }

        if(param.process_nontext)
@ -273,23 +275,23 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
    // we may output utf8 characters, so always use binary
    {
        /*
-         * If single-html && !split-pages
+         * If single-html
         * we have to keep the generated css file into a temporary place
         * and embed it into the main html later
         *
         *
-         * If single-html && split-page
+         * If single-html
         * as there's no place to embed the css file, just leave it alone (into param.dest_dir)
         *
         * If !single-html
         * leave it in param.dest_dir
         */

-        auto fn = (param.single_html && (!param.split_pages))
+        auto fn = (param.single_html)
            ? str_fmt("%s/__css", param.tmp_dir.c_str())
            : str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str());

-        if(param.single_html && (!param.split_pages))
+        if(param.single_html)
            tmp_files.add((char*)fn);

        f_css.path = (char*)fn;
@ -305,11 +307,11 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
         * The logic for outline is similar to css
         */

-        auto fn = (param.single_html && (!param.split_pages))
+        auto fn = (param.single_html)
            ? str_fmt("%s/__outline", param.tmp_dir.c_str())
            : str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str());

-        if(param.single_html && (!param.split_pages))
+        if(param.single_html)
            tmp_files.add((char*)fn);

        f_outline.path = (char*)fn;
@ -355,10 +357,7 @@ void HTMLRenderer::post_process(void)
    f_pages.fs.close(); 
    f_css.fs.close();

-    //only when split-page == 0, do we have some work left to do
-    if(param.split_pages)
-        return;
-
+    // build the main HTML file
    ofstream output;
    {
        auto fn = str_fmt("%s/%s", param.dest_dir.c_str(), param.output_filename.c_str());
@ -375,8 +374,11 @@ void HTMLRenderer::post_process(void)

    bool embed_string = false;
    string line;
+    long line_no = 0;
    while(getline(manifest_fin, line))
    {
+        ++line_no;
+
        if(line == "\"\"\"")
        {
            embed_string = !embed_string;
@ -420,15 +422,29 @@ void HTMLRenderer::post_process(void)
            }
            else if (line == "$pages")
            {
-                ifstream fin(f_pages.path, ifstream::binary);
-                if(!fin)
-                    throw "Cannot open pages for reading";
-                output << fin.rdbuf();
-                output.clear(); // output will set fail big if fin is empty
+                if(!param.split_pages)
+                {
+                    ifstream fin(f_pages.path, ifstream::binary);
+                    if(!fin)
+                        throw "Cannot open pages for reading";
+                    output << fin.rdbuf();
+                    output.clear(); // output will set fail bit if fin is empty
+                }
+            }
+            else if (line == "$page_urls")
+            {
+                for(auto iter = page_filenames.begin(); iter != page_filenames.end(); ++iter)
+                {
+                    if(iter != page_filenames.begin())
+                        output << ",";
+                    output << "'";
+                    outputURL(output, *iter);
+                    output << "'";
+                }
            }
            else
            {
-                cerr << "Warning: unknown line in manifest: " << line << endl;
+                cerr << "Warning: manifest line " << line_no << ": Unknown content \"" << line << "\"" << endl;
            }
            continue;
        }
--- a/src/Param.h
+++ b/src/Param.h
@ -29,6 +29,7 @@ struct Param
    int split_pages;
    std::string dest_dir;
    std::string css_filename;
+    std::string page_filename;
    std::string outline_filename;
    int process_nontext;
    int process_outline;
--- a/src/pdf2htmlEX.cc
+++ b/src/pdf2htmlEX.cc
@ -74,6 +74,7 @@ void parse_options (int argc, char **argv)
        .add("split-pages", &param.split_pages, 0, "split pages into separate files")
        .add("dest-dir", &param.dest_dir, ".", "specify destination directory")
        .add("css-filename", &param.css_filename, "", "filename of the generated css file")
+        .add("page-filename", &param.page_filename, "", "filename template for splitted pages ")
        .add("outline-filename", &param.outline_filename, "", "filename of the generated outline file")
        .add("process-nontext", &param.process_nontext, 1, "render graphics in addition to text")
        .add("process-outline", &param.process_outline, 1, "show outline in HTML")
@ -215,43 +216,40 @@ int main(int argc, char **argv)
        if(param.output_filename.empty())
        {
            const string s = get_filename(param.input_filename);
-
            if(get_suffix(param.input_filename) == ".pdf")
            {
-                if(param.split_pages)
-                {
-                    param.output_filename = s.substr(0, s.size() - 4) + "%d.page";
-                    sanitize_filename(param.output_filename);
-                }
-                else
-                {
-                    param.output_filename = s.substr(0, s.size() - 4) + ".html";
-                }
+                param.output_filename = s.substr(0, s.size() - 4) + ".html";

            }
            else
            {
-                if(param.split_pages)
-                {
-                    param.output_filename = s + "%d.page";
-                    sanitize_filename(param.output_filename);
-                }
-                else
-                {
-                    param.output_filename = s + ".html";
-                }
-                
+                param.output_filename = s + ".html";
            }
        }
-		else if(param.split_pages)
+
+        if(param.page_filename.empty())
+        {
+            const string s = get_filename(param.input_filename);
+            if(get_suffix(param.input_filename) == ".pdf")
+            {
+                param.page_filename = s.substr(0, s.size() - 4) + "%d.page";
+            }
+            else
+            {
+                param.page_filename = s + "%d.page";
+            }
+            sanitize_filename(param.page_filename);
+        }
+
+		else
        {
            // Need to make sure we have a page number placeholder in the filename
-            if(!sanitize_filename(param.output_filename))
+            if(!sanitize_filename(param.page_filename))
            {
                // Inject the placeholder just before the file extension
-                const string suffix = get_suffix(param.output_filename);
-                param.output_filename = param.output_filename.substr(0, param.output_filename.size() - suffix.size()) + "%d" + suffix;
-                sanitize_filename(param.output_filename);
+                const string suffix = get_suffix(param.page_filename);
+                param.page_filename = param.page_filename.substr(0, param.page_filename.size() - suffix.size()) + "%d" + suffix;
+                sanitize_filename(param.page_filename);
            }
        }
        if(param.css_filename.empty())