1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-08 10:50:33 +00:00

Added a parameter (--max-output-size, in KB) to limit the output size. The size check is an estimate based on the total size of the temporary files, but should be good enough.

This commit is contained in:
Marc Sanfacon 2013-11-22 16:39:28 -05:00
parent 589047144a
commit 57c02b1972
5 changed files with 62 additions and 43 deletions

View File

@ -45,7 +45,7 @@ HTMLRenderer::HTMLRenderer(const Param & param)
,param(param) ,param(param)
,html_text_page(param, all_manager) ,html_text_page(param, all_manager)
,preprocessor(param) ,preprocessor(param)
,tmp_files(param) ,tmp_files(param)
{ {
if(!(param.debug)) if(!(param.debug))
{ {
@ -79,7 +79,7 @@ HTMLRenderer::HTMLRenderer(const Param & param)
} }
HTMLRenderer::~HTMLRenderer() HTMLRenderer::~HTMLRenderer()
{ {
ffw_finalize(); ffw_finalize();
delete [] cur_mapping; delete [] cur_mapping;
delete [] cur_mapping2; delete [] cur_mapping2;
@ -96,7 +96,7 @@ void HTMLRenderer::process(PDFDoc *doc)
/////////////////// ///////////////////
// Process pages // Process pages
bg_renderer = nullptr; bg_renderer = nullptr;
if(param.process_nontext) if(param.process_nontext)
{ {
@ -107,15 +107,20 @@ void HTMLRenderer::process(PDFDoc *doc)
} }
int page_count = (param.last_page - param.first_page + 1); int page_count = (param.last_page - param.first_page + 1);
for(int i = param.first_page; i <= param.last_page ; ++i) for(int i = param.first_page; i <= param.last_page ; ++i)
{ {
if (param.max_size != -1 && tmp_files.get_total_size() > param.max_size * 1024) {
cerr << "Stop processing, reach max size\n";
break;
}
cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush; cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush;
if(param.split_pages) if(param.split_pages)
{ {
string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i); string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i);
auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str()); auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str());
f_curpage = new ofstream((char*)page_fn, ofstream::binary); f_curpage = new ofstream((char*)page_fn, ofstream::binary);
if(!(*f_curpage)) if(!(*f_curpage))
throw string("Cannot open ") + (char*)page_fn + " for writing"; throw string("Cannot open ") + (char*)page_fn + " for writing";
set_stream_flags((*f_curpage)); set_stream_flags((*f_curpage));
@ -128,9 +133,9 @@ void HTMLRenderer::process(PDFDoc *doc)
bg_renderer->render_page(doc, i); bg_renderer->render_page(doc, i);
} }
doc->displayPage(this, i, doc->displayPage(this, i,
text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
0, 0,
(!(param.use_cropbox)), (!(param.use_cropbox)),
true, // crop true, // crop
false, // printing false, // printing
@ -149,7 +154,7 @@ void HTMLRenderer::process(PDFDoc *doc)
//////////////////////// ////////////////////////
// Process Outline // Process Outline
if(param.process_outline) if(param.process_outline)
process_outline(); process_outline();
post_process(); post_process();
@ -170,7 +175,7 @@ void HTMLRenderer::setDefaultCTM(double *ctm)
#if POPPLER_OLDER_THAN_0_23_0 #if POPPLER_OLDER_THAN_0_23_0
void HTMLRenderer::startPage(int pageNum, GfxState *state) void HTMLRenderer::startPage(int pageNum, GfxState *state)
#else #else
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
#endif #endif
{ {
this->pageNum = pageNum; this->pageNum = pageNum;
@ -183,12 +188,12 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
long long wid = all_manager.width.install(pageWidth); long long wid = all_manager.width.install(pageWidth);
long long hid = all_manager.height.install(pageHeight); long long hid = all_manager.height.install(pageHeight);
(*f_curpage) (*f_curpage)
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
<< "\" class=\"" << CSS::PAGE_FRAME_CN << "\" class=\"" << CSS::PAGE_FRAME_CN
<< " " << CSS::WIDTH_CN << wid << " " << CSS::WIDTH_CN << wid
<< " " << CSS::HEIGHT_CN << hid << " " << CSS::HEIGHT_CN << hid
<< "\" data-page-no=\"" << pageNum << "\">" << "\" data-page-no=\"" << pageNum << "\">"
<< "<div class=\"" << CSS::PAGE_CONTENT_BOX_CN << "<div class=\"" << CSS::PAGE_CONTENT_BOX_CN
<< " " << CSS::PAGE_CONTENT_BOX_CN << pageNum << " " << CSS::PAGE_CONTENT_BOX_CN << pageNum
<< " " << CSS::WIDTH_CN << wid << " " << CSS::WIDTH_CN << wid
<< " " << CSS::HEIGHT_CN << hid << " " << CSS::HEIGHT_CN << hid
@ -201,11 +206,11 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
if(param.split_pages) if(param.split_pages)
{ {
f_pages.fs f_pages.fs
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
<< "\" class=\"" << CSS::PAGE_FRAME_CN << "\" class=\"" << CSS::PAGE_FRAME_CN
<< " " << CSS::WIDTH_CN << wid << " " << CSS::WIDTH_CN << wid
<< " " << CSS::HEIGHT_CN << hid << " " << CSS::HEIGHT_CN << hid
<< "\" data-page-no=\"" << pageNum << "\" data-page-no=\"" << pageNum
<< "\" data-page-url=\""; << "\" data-page-url=\"";
writeAttribute(f_pages.fs, cur_page_filename); writeAttribute(f_pages.fs, cur_page_filename);
@ -236,7 +241,7 @@ void HTMLRenderer::endPage() {
// TODO: create a function for this // TODO: create a function for this
// BE CAREFUL WITH ESCAPES // BE CAREFUL WITH ESCAPES
(*f_curpage) << "<div class=\"" << CSS::PAGE_DATA_CN << "\" data-data='{"; (*f_curpage) << "<div class=\"" << CSS::PAGE_DATA_CN << "\" data-data='{";
//default CTM //default CTM
(*f_curpage) << "\"ctm\":["; (*f_curpage) << "\"ctm\":[";
for(int i = 0; i < 6; ++i) for(int i = 0; i < 6; ++i)
@ -247,7 +252,7 @@ void HTMLRenderer::endPage() {
(*f_curpage) << "]"; (*f_curpage) << "]";
(*f_curpage) << "}'></div>"; (*f_curpage) << "}'></div>";
// close page // close page
(*f_curpage) << "</div>" << endl; (*f_curpage) << "</div>" << endl;
@ -266,7 +271,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
*/ */
{ {
vector<double> zoom_factors; vector<double> zoom_factors;
if(is_positive(param.zoom)) if(is_positive(param.zoom))
{ {
zoom_factors.push_back(param.zoom); zoom_factors.push_back(param.zoom);
@ -283,8 +288,8 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
} }
double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end()))); double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end())));
text_scale_factor1 = max<double>(zoom, param.font_size_multiplier); text_scale_factor1 = max<double>(zoom, param.font_size_multiplier);
text_scale_factor2 = zoom / text_scale_factor1; text_scale_factor2 = zoom / text_scale_factor1;
} }
@ -340,13 +345,13 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
* we have to keep the html file for pages into a temporary place * we have to keep the html file for pages into a temporary place
* because we'll have to embed css before it * because we'll have to embed css before it
* *
* Otherwise just generate it * Otherwise just generate it
*/ */
auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str()); auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str());
tmp_files.add((char*)fn); tmp_files.add((char*)fn);
f_pages.path = (char*)fn; f_pages.path = (char*)fn;
f_pages.fs.open(f_pages.path, ofstream::binary); f_pages.fs.open(f_pages.path, ofstream::binary);
if(!f_pages.fs) if(!f_pages.fs)
throw string("Cannot open ") + (char*)fn + " for writing"; throw string("Cannot open ") + (char*)fn + " for writing";
set_stream_flags(f_pages.fs); set_stream_flags(f_pages.fs);
@ -371,7 +376,7 @@ void HTMLRenderer::post_process(void)
{ {
f_outline.fs.close(); f_outline.fs.close();
} }
f_pages.fs.close(); f_pages.fs.close();
f_css.fs.close(); f_css.fs.close();
// build the main HTML file // build the main HTML file
@ -492,7 +497,7 @@ void HTMLRenderer::dump_css (void)
all_manager.width .dump_css(f_css.fs); all_manager.width .dump_css(f_css.fs);
all_manager.left .dump_css(f_css.fs); all_manager.left .dump_css(f_css.fs);
all_manager.bgimage_size .dump_css(f_css.fs); all_manager.bgimage_size .dump_css(f_css.fs);
// print css // print css
if(param.printing) if(param.printing)
{ {
@ -518,8 +523,8 @@ void HTMLRenderer::dump_css (void)
void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy) void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy)
{ {
string fn = get_filename(path); string fn = get_filename(path);
string suffix = (type == "") ? get_suffix(fn) : type; string suffix = (type == "") ? get_suffix(fn) : type;
// TODO // TODO
auto iter = EMBED_STRING_MAP.find(suffix); auto iter = EMBED_STRING_MAP.find(suffix);
if(iter == EMBED_STRING_MAP.end()) if(iter == EMBED_STRING_MAP.end())
@ -529,14 +534,14 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string &
} }
const auto & entry = iter->second; const auto & entry = iter->second;
if(param.*(entry.embed_flag)) if(param.*(entry.embed_flag))
{ {
ifstream fin(path, ifstream::binary); ifstream fin(path, ifstream::binary);
if(!fin) if(!fin)
throw string("Cannot open file ") + path + " for embedding"; throw string("Cannot open file ") + path + " for embedding";
out << entry.prefix_embed; out << entry.prefix_embed;
if(entry.base64_encode) if(entry.base64_encode)
{ {
out << Base64Stream(fin); out << Base64Stream(fin);

View File

@ -17,20 +17,21 @@ struct Param
{ {
// pages // pages
int first_page, last_page; int first_page, last_page;
// dimensions // dimensions
double zoom; double zoom;
double fit_width, fit_height; double fit_width, fit_height;
int use_cropbox; int use_cropbox;
double h_dpi, v_dpi; double h_dpi, v_dpi;
// output // output
int embed_css; int embed_css;
int embed_font; int embed_font;
int embed_image; int embed_image;
int embed_javascript; int embed_javascript;
int embed_outline; int embed_outline;
int split_pages; int split_pages;
int max_size;
std::string dest_dir; std::string dest_dir;
std::string css_filename; std::string css_filename;
std::string page_filename; std::string page_filename;
@ -39,7 +40,7 @@ struct Param
int process_outline; int process_outline;
int printing; int printing;
int fallback; int fallback;
// fonts // fonts
int embed_external_font; int embed_external_font;
std::string font_format; std::string font_format;
@ -50,7 +51,7 @@ struct Param
int squeeze_wide_glyph; int squeeze_wide_glyph;
int override_fstype; int override_fstype;
int process_type3; int process_type3;
// text // text
double h_eps, v_eps; double h_eps, v_eps;
double space_threshold; double space_threshold;
@ -61,17 +62,18 @@ struct Param
// background image // background image
std::string bg_format; std::string bg_format;
// encryption // encryption
std::string owner_password, user_password; std::string owner_password, user_password;
int no_drm; int no_drm;
// misc. // misc.
int clean_tmp; int clean_tmp;
std::string data_dir; std::string data_dir;
std::string basetmp_dir;
int css_draw; int css_draw;
int debug; int debug;
std::string input_filename, output_filename; std::string input_filename, output_filename;
// not a parameter // not a parameter

View File

@ -9,6 +9,7 @@
#include <iostream> #include <iostream>
#include <cstdio> #include <cstdio>
#include <sys/stat.h>
#include "TmpFiles.h" #include "TmpFiles.h"
#include "Param.h" #include "Param.h"
@ -19,11 +20,11 @@ namespace pdf2htmlEX {
TmpFiles::TmpFiles( const Param& param ) TmpFiles::TmpFiles( const Param& param )
: param( param ) : param( param )
{ } { }
TmpFiles::~TmpFiles() TmpFiles::~TmpFiles()
{ {
clean(); clean();
} }
@ -54,5 +55,16 @@ void TmpFiles::clean()
cerr << "Remove temporary directory: " << param.tmp_dir << endl; cerr << "Remove temporary directory: " << param.tmp_dir << endl;
} }
// Returns the combined on-disk size, in bytes, of every registered temporary file.
//
// The result is only an estimate of the output size: it reflects whatever has
// been flushed to the temporary files at the time of the call. Files that cannot
// be stat'ed (for example, not created yet or already removed) are skipped and
// contribute nothing, rather than adding an indeterminate value to the total.
double TmpFiles::get_total_size() const
{
// _stat is specific to the Windows CRT; every other platform provides stat.
#ifdef _WIN32
    typedef struct _stat file_stat_t;
#else
    typedef struct stat file_stat_t;
#endif

    double total_size = 0;
    for(const auto & fn : tmp_files)
    {
        file_stat_t st;
#ifdef _WIN32
        const int rc = _stat(fn.c_str(), &st);
#else
        const int rc = stat(fn.c_str(), &st);
#endif
        // On failure the contents of st are unspecified, so do not read st_size.
        if(rc != 0)
            continue;
        total_size += static_cast<double>(st.st_size);
    }
    return total_size;
}
} // namespace pdf2htmlEX } // namespace pdf2htmlEX

View File

@ -7,19 +7,20 @@
namespace pdf2htmlEX { namespace pdf2htmlEX {
class TmpFiles class TmpFiles
{ {
public: public:
explicit TmpFiles( const Param& param ); explicit TmpFiles( const Param& param );
~TmpFiles(); ~TmpFiles();
void add( const std::string& fn); void add( const std::string& fn);
double get_total_size() const;
private: private:
void clean(); void clean();
const Param& param; const Param& param;
std::set<std::string> tmp_files; std::set<std::string> tmp_files;
}; };
} // namespace pdf2htmlEX } // namespace pdf2htmlEX

View File

@ -141,7 +141,6 @@ void prepare_directories()
stringstream ss; stringstream ss;
ss << setw(6) << rand_value; ss << setw(6) << rand_value;
std::cout << "1- " << tmp_dir << endl;
tmp_dir.erase(tmp_dir.size() - 6); tmp_dir.erase(tmp_dir.size() - 6);
param.tmp_dir = tmp_dir + ss.str(); param.tmp_dir = tmp_dir + ss.str();
::CreateDirectory(param.tmp_dir.c_str(), NULL); ::CreateDirectory(param.tmp_dir.c_str(), NULL);
@ -170,6 +169,7 @@ void parse_options (int argc, char **argv)
.add("embed-image", &param.embed_image, 1, "embed image files into output") .add("embed-image", &param.embed_image, 1, "embed image files into output")
.add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output") .add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output")
.add("embed-outline", &param.embed_outline, 1, "embed outlines into output") .add("embed-outline", &param.embed_outline, 1, "embed outlines into output")
.add("max-output-size", &param.max_size, -1, "maximum output size, in KB (-1 for no max)")
.add("split-pages", &param.split_pages, 0, "split pages into separate files") .add("split-pages", &param.split_pages, 0, "split pages into separate files")
.add("dest-dir", &param.dest_dir, ".", "specify destination directory") .add("dest-dir", &param.dest_dir, ".", "specify destination directory")
.add("css-filename", &param.css_filename, "", "filename of the generated css file") .add("css-filename", &param.css_filename, "", "filename of the generated css file")
@ -390,7 +390,6 @@ int main(int argc, char **argv)
cerr << "temporary dir: " << (param.tmp_dir) << endl; cerr << "temporary dir: " << (param.tmp_dir) << endl;
} }
exit(0);
try try
{ {
create_directories(param.dest_dir); create_directories(param.dest_dir);