From 57c02b19727acb067e3ead7702579425ef91517f Mon Sep 17 00:00:00 2001 From: Marc Sanfacon Date: Fri, 22 Nov 2013 16:39:28 -0500 Subject: [PATCH] Added a parameter to limit the output size. This is an estimate, but should be good enough. --- src/HTMLRenderer/general.cc | 57 ++++++++++++++++++++----------------- src/Param.h | 18 ++++++------ src/TmpFiles.cc | 16 +++++++++-- src/TmpFiles.h | 11 +++---- src/pdf2htmlEX.cc | 3 +- 5 files changed, 62 insertions(+), 43 deletions(-) diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 416725f..2d5721a 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -45,7 +45,7 @@ HTMLRenderer::HTMLRenderer(const Param & param) ,param(param) ,html_text_page(param, all_manager) ,preprocessor(param) - ,tmp_files(param) + ,tmp_files(param) { if(!(param.debug)) { @@ -79,7 +79,7 @@ HTMLRenderer::HTMLRenderer(const Param & param) } HTMLRenderer::~HTMLRenderer() -{ +{ ffw_finalize(); delete [] cur_mapping; delete [] cur_mapping2; @@ -96,7 +96,7 @@ void HTMLRenderer::process(PDFDoc *doc) /////////////////// // Process pages - + bg_renderer = nullptr; if(param.process_nontext) { @@ -107,15 +107,20 @@ void HTMLRenderer::process(PDFDoc *doc) } int page_count = (param.last_page - param.first_page + 1); - for(int i = param.first_page; i <= param.last_page ; ++i) + for(int i = param.first_page; i <= param.last_page ; ++i) { + if (param.max_size != -1 && tmp_files.get_total_size() > param.max_size * 1024) { + cerr << "Stop processing, reach max size\n"; + break; + } + cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush; if(param.split_pages) { string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i); auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str()); - f_curpage = new ofstream((char*)page_fn, ofstream::binary); + f_curpage = new ofstream((char*)page_fn, ofstream::binary); if(!(*f_curpage)) throw string("Cannot 
open ") + (char*)page_fn + " for writing"; set_stream_flags((*f_curpage)); @@ -128,9 +133,9 @@ void HTMLRenderer::process(PDFDoc *doc) bg_renderer->render_page(doc, i); } - doc->displayPage(this, i, + doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, - 0, + 0, (!(param.use_cropbox)), true, // crop false, // printing @@ -149,7 +154,7 @@ void HTMLRenderer::process(PDFDoc *doc) //////////////////////// // Process Outline if(param.process_outline) - process_outline(); + process_outline(); post_process(); @@ -170,7 +175,7 @@ void HTMLRenderer::setDefaultCTM(double *ctm) #if POPPLER_OLDER_THAN_0_23_0 void HTMLRenderer::startPage(int pageNum, GfxState *state) #else -void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) +void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) #endif { this->pageNum = pageNum; @@ -183,12 +188,12 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) long long wid = all_manager.width.install(pageWidth); long long hid = all_manager.height.install(pageHeight); (*f_curpage) - << "
" - << "
"; - + // close page (*f_curpage) << "
" << endl; @@ -266,7 +271,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc) */ { vector zoom_factors; - + if(is_positive(param.zoom)) { zoom_factors.push_back(param.zoom); @@ -283,8 +288,8 @@ void HTMLRenderer::pre_process(PDFDoc * doc) } double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end()))); - - text_scale_factor1 = max(zoom, param.font_size_multiplier); + + text_scale_factor1 = max(zoom, param.font_size_multiplier); text_scale_factor2 = zoom / text_scale_factor1; } @@ -340,13 +345,13 @@ void HTMLRenderer::pre_process(PDFDoc * doc) * we have to keep the html file for pages into a temporary place * because we'll have to embed css before it * - * Otherwise just generate it + * Otherwise just generate it */ auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str()); tmp_files.add((char*)fn); f_pages.path = (char*)fn; - f_pages.fs.open(f_pages.path, ofstream::binary); + f_pages.fs.open(f_pages.path, ofstream::binary); if(!f_pages.fs) throw string("Cannot open ") + (char*)fn + " for writing"; set_stream_flags(f_pages.fs); @@ -371,7 +376,7 @@ void HTMLRenderer::post_process(void) { f_outline.fs.close(); } - f_pages.fs.close(); + f_pages.fs.close(); f_css.fs.close(); // build the main HTML file @@ -492,7 +497,7 @@ void HTMLRenderer::dump_css (void) all_manager.width .dump_css(f_css.fs); all_manager.left .dump_css(f_css.fs); all_manager.bgimage_size .dump_css(f_css.fs); - + // print css if(param.printing) { @@ -518,8 +523,8 @@ void HTMLRenderer::dump_css (void) void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy) { string fn = get_filename(path); - string suffix = (type == "") ? get_suffix(fn) : type; - + string suffix = (type == "") ? 
get_suffix(fn) : type; + // TODO auto iter = EMBED_STRING_MAP.find(suffix); if(iter == EMBED_STRING_MAP.end()) @@ -529,14 +534,14 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string & } const auto & entry = iter->second; - + if(param.*(entry.embed_flag)) { ifstream fin(path, ifstream::binary); if(!fin) throw string("Cannot open file ") + path + " for embedding"; out << entry.prefix_embed; - + if(entry.base64_encode) { out << Base64Stream(fin); diff --git a/src/Param.h b/src/Param.h index 8a566e7..4816f72 100644 --- a/src/Param.h +++ b/src/Param.h @@ -17,20 +17,21 @@ struct Param { // pages int first_page, last_page; - + // dimensions double zoom; double fit_width, fit_height; int use_cropbox; double h_dpi, v_dpi; - - // output + + // output int embed_css; int embed_font; int embed_image; int embed_javascript; int embed_outline; int split_pages; + int max_size; std::string dest_dir; std::string css_filename; std::string page_filename; @@ -39,7 +40,7 @@ struct Param int process_outline; int printing; int fallback; - + // fonts int embed_external_font; std::string font_format; @@ -50,7 +51,7 @@ struct Param int squeeze_wide_glyph; int override_fstype; int process_type3; - + // text double h_eps, v_eps; double space_threshold; @@ -61,17 +62,18 @@ struct Param // background image std::string bg_format; - + // encryption std::string owner_password, user_password; int no_drm; - + // misc. 
int clean_tmp; std::string data_dir; + std::string basetmp_dir; int css_draw; int debug; - + std::string input_filename, output_filename; // not a paramater diff --git a/src/TmpFiles.cc b/src/TmpFiles.cc index efaf0cf..b55e341 100644 --- a/src/TmpFiles.cc +++ b/src/TmpFiles.cc @@ -9,6 +9,7 @@ #include #include +#include #include "TmpFiles.h" #include "Param.h" @@ -19,11 +20,11 @@ namespace pdf2htmlEX { TmpFiles::TmpFiles( const Param& param ) - : param( param ) + : param( param ) { } TmpFiles::~TmpFiles() -{ +{ clean(); } @@ -54,5 +55,16 @@ void TmpFiles::clean() cerr << "Remove temporary directory: " << param.tmp_dir << endl; } +double TmpFiles::get_total_size() const +{ + double total_size = 0; + struct _stat st; + for(auto iter = tmp_files.begin(); iter != tmp_files.end(); ++iter) { + _stat(iter->c_str(), &st); + total_size += st.st_size; + } + + return total_size; +} } // namespace pdf2htmlEX diff --git a/src/TmpFiles.h b/src/TmpFiles.h index b7ad46c..277281d 100644 --- a/src/TmpFiles.h +++ b/src/TmpFiles.h @@ -7,19 +7,20 @@ namespace pdf2htmlEX { -class TmpFiles +class TmpFiles { public: explicit TmpFiles( const Param& param ); ~TmpFiles(); - void add( const std::string& fn); + void add( const std::string& fn); + double get_total_size() const; private: - void clean(); - + void clean(); + const Param& param; - std::set tmp_files; + std::set tmp_files; }; } // namespace pdf2htmlEX diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index cbfce42..e6d47b1 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -141,7 +141,6 @@ void prepare_directories() stringstream ss; ss << setw(6) << rand_value; - std::cout << "1- " << tmp_dir << endl; tmp_dir.erase(tmp_dir.size() - 6); param.tmp_dir = tmp_dir + ss.str(); ::CreateDirectory(param.tmp_dir.c_str(), NULL); @@ -170,6 +169,7 @@ void parse_options (int argc, char **argv) .add("embed-image", &param.embed_image, 1, "embed image files into output") .add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files
into output") .add("embed-outline", &param.embed_outline, 1, "embed outlines into output") + .add("max-output-size", &param.max_size, -1, "maximum output size, in KB (-1 for no max)") .add("split-pages", &param.split_pages, 0, "split pages into separate files") .add("dest-dir", &param.dest_dir, ".", "specify destination directory") .add("css-filename", &param.css_filename, "", "filename of the generated css file") @@ -390,7 +390,6 @@ int main(int argc, char **argv) cerr << "temporary dir: " << (param.tmp_dir) << endl; } - exit(0); try { create_directories(param.dest_dir);