mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
Added a parameter (--max-output-size, in KB) to limit the output size. The size check is an estimate based on the temporary files written so far, but should be good enough.
This commit is contained in:
parent
589047144a
commit
57c02b1972
@ -45,7 +45,7 @@ HTMLRenderer::HTMLRenderer(const Param & param)
|
|||||||
,param(param)
|
,param(param)
|
||||||
,html_text_page(param, all_manager)
|
,html_text_page(param, all_manager)
|
||||||
,preprocessor(param)
|
,preprocessor(param)
|
||||||
,tmp_files(param)
|
,tmp_files(param)
|
||||||
{
|
{
|
||||||
if(!(param.debug))
|
if(!(param.debug))
|
||||||
{
|
{
|
||||||
@ -79,7 +79,7 @@ HTMLRenderer::HTMLRenderer(const Param & param)
|
|||||||
}
|
}
|
||||||
|
|
||||||
HTMLRenderer::~HTMLRenderer()
|
HTMLRenderer::~HTMLRenderer()
|
||||||
{
|
{
|
||||||
ffw_finalize();
|
ffw_finalize();
|
||||||
delete [] cur_mapping;
|
delete [] cur_mapping;
|
||||||
delete [] cur_mapping2;
|
delete [] cur_mapping2;
|
||||||
@ -96,7 +96,7 @@ void HTMLRenderer::process(PDFDoc *doc)
|
|||||||
|
|
||||||
///////////////////
|
///////////////////
|
||||||
// Process pages
|
// Process pages
|
||||||
|
|
||||||
bg_renderer = nullptr;
|
bg_renderer = nullptr;
|
||||||
if(param.process_nontext)
|
if(param.process_nontext)
|
||||||
{
|
{
|
||||||
@ -107,15 +107,20 @@ void HTMLRenderer::process(PDFDoc *doc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
int page_count = (param.last_page - param.first_page + 1);
|
int page_count = (param.last_page - param.first_page + 1);
|
||||||
for(int i = param.first_page; i <= param.last_page ; ++i)
|
for(int i = param.first_page; i <= param.last_page ; ++i)
|
||||||
{
|
{
|
||||||
|
if (param.max_size != -1 && tmp_files.get_total_size() > param.max_size * 1024) {
|
||||||
|
cerr << "Stop processing, reach max size\n";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush;
|
cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush;
|
||||||
|
|
||||||
if(param.split_pages)
|
if(param.split_pages)
|
||||||
{
|
{
|
||||||
string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i);
|
string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i);
|
||||||
auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str());
|
auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str());
|
||||||
f_curpage = new ofstream((char*)page_fn, ofstream::binary);
|
f_curpage = new ofstream((char*)page_fn, ofstream::binary);
|
||||||
if(!(*f_curpage))
|
if(!(*f_curpage))
|
||||||
throw string("Cannot open ") + (char*)page_fn + " for writing";
|
throw string("Cannot open ") + (char*)page_fn + " for writing";
|
||||||
set_stream_flags((*f_curpage));
|
set_stream_flags((*f_curpage));
|
||||||
@ -128,9 +133,9 @@ void HTMLRenderer::process(PDFDoc *doc)
|
|||||||
bg_renderer->render_page(doc, i);
|
bg_renderer->render_page(doc, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
doc->displayPage(this, i,
|
doc->displayPage(this, i,
|
||||||
text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
|
text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
|
||||||
0,
|
0,
|
||||||
(!(param.use_cropbox)),
|
(!(param.use_cropbox)),
|
||||||
true, // crop
|
true, // crop
|
||||||
false, // printing
|
false, // printing
|
||||||
@ -149,7 +154,7 @@ void HTMLRenderer::process(PDFDoc *doc)
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
// Process Outline
|
// Process Outline
|
||||||
if(param.process_outline)
|
if(param.process_outline)
|
||||||
process_outline();
|
process_outline();
|
||||||
|
|
||||||
post_process();
|
post_process();
|
||||||
|
|
||||||
@ -170,7 +175,7 @@ void HTMLRenderer::setDefaultCTM(double *ctm)
|
|||||||
#if POPPLER_OLDER_THAN_0_23_0
|
#if POPPLER_OLDER_THAN_0_23_0
|
||||||
void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||||
#else
|
#else
|
||||||
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
this->pageNum = pageNum;
|
this->pageNum = pageNum;
|
||||||
@ -183,12 +188,12 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
|||||||
long long wid = all_manager.width.install(pageWidth);
|
long long wid = all_manager.width.install(pageWidth);
|
||||||
long long hid = all_manager.height.install(pageHeight);
|
long long hid = all_manager.height.install(pageHeight);
|
||||||
(*f_curpage)
|
(*f_curpage)
|
||||||
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
|
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
|
||||||
<< "\" class=\"" << CSS::PAGE_FRAME_CN
|
<< "\" class=\"" << CSS::PAGE_FRAME_CN
|
||||||
<< " " << CSS::WIDTH_CN << wid
|
<< " " << CSS::WIDTH_CN << wid
|
||||||
<< " " << CSS::HEIGHT_CN << hid
|
<< " " << CSS::HEIGHT_CN << hid
|
||||||
<< "\" data-page-no=\"" << pageNum << "\">"
|
<< "\" data-page-no=\"" << pageNum << "\">"
|
||||||
<< "<div class=\"" << CSS::PAGE_CONTENT_BOX_CN
|
<< "<div class=\"" << CSS::PAGE_CONTENT_BOX_CN
|
||||||
<< " " << CSS::PAGE_CONTENT_BOX_CN << pageNum
|
<< " " << CSS::PAGE_CONTENT_BOX_CN << pageNum
|
||||||
<< " " << CSS::WIDTH_CN << wid
|
<< " " << CSS::WIDTH_CN << wid
|
||||||
<< " " << CSS::HEIGHT_CN << hid
|
<< " " << CSS::HEIGHT_CN << hid
|
||||||
@ -201,11 +206,11 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
|||||||
if(param.split_pages)
|
if(param.split_pages)
|
||||||
{
|
{
|
||||||
f_pages.fs
|
f_pages.fs
|
||||||
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
|
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
|
||||||
<< "\" class=\"" << CSS::PAGE_FRAME_CN
|
<< "\" class=\"" << CSS::PAGE_FRAME_CN
|
||||||
<< " " << CSS::WIDTH_CN << wid
|
<< " " << CSS::WIDTH_CN << wid
|
||||||
<< " " << CSS::HEIGHT_CN << hid
|
<< " " << CSS::HEIGHT_CN << hid
|
||||||
<< "\" data-page-no=\"" << pageNum
|
<< "\" data-page-no=\"" << pageNum
|
||||||
<< "\" data-page-url=\"";
|
<< "\" data-page-url=\"";
|
||||||
|
|
||||||
writeAttribute(f_pages.fs, cur_page_filename);
|
writeAttribute(f_pages.fs, cur_page_filename);
|
||||||
@ -236,7 +241,7 @@ void HTMLRenderer::endPage() {
|
|||||||
// TODO: create a function for this
|
// TODO: create a function for this
|
||||||
// BE CAREFUL WITH ESCAPES
|
// BE CAREFUL WITH ESCAPES
|
||||||
(*f_curpage) << "<div class=\"" << CSS::PAGE_DATA_CN << "\" data-data='{";
|
(*f_curpage) << "<div class=\"" << CSS::PAGE_DATA_CN << "\" data-data='{";
|
||||||
|
|
||||||
//default CTM
|
//default CTM
|
||||||
(*f_curpage) << "\"ctm\":[";
|
(*f_curpage) << "\"ctm\":[";
|
||||||
for(int i = 0; i < 6; ++i)
|
for(int i = 0; i < 6; ++i)
|
||||||
@ -247,7 +252,7 @@ void HTMLRenderer::endPage() {
|
|||||||
(*f_curpage) << "]";
|
(*f_curpage) << "]";
|
||||||
|
|
||||||
(*f_curpage) << "}'></div>";
|
(*f_curpage) << "}'></div>";
|
||||||
|
|
||||||
// close page
|
// close page
|
||||||
(*f_curpage) << "</div>" << endl;
|
(*f_curpage) << "</div>" << endl;
|
||||||
|
|
||||||
@ -266,7 +271,7 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
|
|||||||
*/
|
*/
|
||||||
{
|
{
|
||||||
vector<double> zoom_factors;
|
vector<double> zoom_factors;
|
||||||
|
|
||||||
if(is_positive(param.zoom))
|
if(is_positive(param.zoom))
|
||||||
{
|
{
|
||||||
zoom_factors.push_back(param.zoom);
|
zoom_factors.push_back(param.zoom);
|
||||||
@ -283,8 +288,8 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end())));
|
double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end())));
|
||||||
|
|
||||||
text_scale_factor1 = max<double>(zoom, param.font_size_multiplier);
|
text_scale_factor1 = max<double>(zoom, param.font_size_multiplier);
|
||||||
text_scale_factor2 = zoom / text_scale_factor1;
|
text_scale_factor2 = zoom / text_scale_factor1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -340,13 +345,13 @@ void HTMLRenderer::pre_process(PDFDoc * doc)
|
|||||||
* we have to keep the html file for pages into a temporary place
|
* we have to keep the html file for pages into a temporary place
|
||||||
* because we'll have to embed css before it
|
* because we'll have to embed css before it
|
||||||
*
|
*
|
||||||
* Otherwise just generate it
|
* Otherwise just generate it
|
||||||
*/
|
*/
|
||||||
auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str());
|
auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str());
|
||||||
tmp_files.add((char*)fn);
|
tmp_files.add((char*)fn);
|
||||||
|
|
||||||
f_pages.path = (char*)fn;
|
f_pages.path = (char*)fn;
|
||||||
f_pages.fs.open(f_pages.path, ofstream::binary);
|
f_pages.fs.open(f_pages.path, ofstream::binary);
|
||||||
if(!f_pages.fs)
|
if(!f_pages.fs)
|
||||||
throw string("Cannot open ") + (char*)fn + " for writing";
|
throw string("Cannot open ") + (char*)fn + " for writing";
|
||||||
set_stream_flags(f_pages.fs);
|
set_stream_flags(f_pages.fs);
|
||||||
@ -371,7 +376,7 @@ void HTMLRenderer::post_process(void)
|
|||||||
{
|
{
|
||||||
f_outline.fs.close();
|
f_outline.fs.close();
|
||||||
}
|
}
|
||||||
f_pages.fs.close();
|
f_pages.fs.close();
|
||||||
f_css.fs.close();
|
f_css.fs.close();
|
||||||
|
|
||||||
// build the main HTML file
|
// build the main HTML file
|
||||||
@ -492,7 +497,7 @@ void HTMLRenderer::dump_css (void)
|
|||||||
all_manager.width .dump_css(f_css.fs);
|
all_manager.width .dump_css(f_css.fs);
|
||||||
all_manager.left .dump_css(f_css.fs);
|
all_manager.left .dump_css(f_css.fs);
|
||||||
all_manager.bgimage_size .dump_css(f_css.fs);
|
all_manager.bgimage_size .dump_css(f_css.fs);
|
||||||
|
|
||||||
// print css
|
// print css
|
||||||
if(param.printing)
|
if(param.printing)
|
||||||
{
|
{
|
||||||
@ -518,8 +523,8 @@ void HTMLRenderer::dump_css (void)
|
|||||||
void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy)
|
void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy)
|
||||||
{
|
{
|
||||||
string fn = get_filename(path);
|
string fn = get_filename(path);
|
||||||
string suffix = (type == "") ? get_suffix(fn) : type;
|
string suffix = (type == "") ? get_suffix(fn) : type;
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
auto iter = EMBED_STRING_MAP.find(suffix);
|
auto iter = EMBED_STRING_MAP.find(suffix);
|
||||||
if(iter == EMBED_STRING_MAP.end())
|
if(iter == EMBED_STRING_MAP.end())
|
||||||
@ -529,14 +534,14 @@ void HTMLRenderer::embed_file(ostream & out, const string & path, const string &
|
|||||||
}
|
}
|
||||||
|
|
||||||
const auto & entry = iter->second;
|
const auto & entry = iter->second;
|
||||||
|
|
||||||
if(param.*(entry.embed_flag))
|
if(param.*(entry.embed_flag))
|
||||||
{
|
{
|
||||||
ifstream fin(path, ifstream::binary);
|
ifstream fin(path, ifstream::binary);
|
||||||
if(!fin)
|
if(!fin)
|
||||||
throw string("Cannot open file ") + path + " for embedding";
|
throw string("Cannot open file ") + path + " for embedding";
|
||||||
out << entry.prefix_embed;
|
out << entry.prefix_embed;
|
||||||
|
|
||||||
if(entry.base64_encode)
|
if(entry.base64_encode)
|
||||||
{
|
{
|
||||||
out << Base64Stream(fin);
|
out << Base64Stream(fin);
|
||||||
|
18
src/Param.h
18
src/Param.h
@ -17,20 +17,21 @@ struct Param
|
|||||||
{
|
{
|
||||||
// pages
|
// pages
|
||||||
int first_page, last_page;
|
int first_page, last_page;
|
||||||
|
|
||||||
// dimensions
|
// dimensions
|
||||||
double zoom;
|
double zoom;
|
||||||
double fit_width, fit_height;
|
double fit_width, fit_height;
|
||||||
int use_cropbox;
|
int use_cropbox;
|
||||||
double h_dpi, v_dpi;
|
double h_dpi, v_dpi;
|
||||||
|
|
||||||
// output
|
// output
|
||||||
int embed_css;
|
int embed_css;
|
||||||
int embed_font;
|
int embed_font;
|
||||||
int embed_image;
|
int embed_image;
|
||||||
int embed_javascript;
|
int embed_javascript;
|
||||||
int embed_outline;
|
int embed_outline;
|
||||||
int split_pages;
|
int split_pages;
|
||||||
|
int max_size;
|
||||||
std::string dest_dir;
|
std::string dest_dir;
|
||||||
std::string css_filename;
|
std::string css_filename;
|
||||||
std::string page_filename;
|
std::string page_filename;
|
||||||
@ -39,7 +40,7 @@ struct Param
|
|||||||
int process_outline;
|
int process_outline;
|
||||||
int printing;
|
int printing;
|
||||||
int fallback;
|
int fallback;
|
||||||
|
|
||||||
// fonts
|
// fonts
|
||||||
int embed_external_font;
|
int embed_external_font;
|
||||||
std::string font_format;
|
std::string font_format;
|
||||||
@ -50,7 +51,7 @@ struct Param
|
|||||||
int squeeze_wide_glyph;
|
int squeeze_wide_glyph;
|
||||||
int override_fstype;
|
int override_fstype;
|
||||||
int process_type3;
|
int process_type3;
|
||||||
|
|
||||||
// text
|
// text
|
||||||
double h_eps, v_eps;
|
double h_eps, v_eps;
|
||||||
double space_threshold;
|
double space_threshold;
|
||||||
@ -61,17 +62,18 @@ struct Param
|
|||||||
|
|
||||||
// background image
|
// background image
|
||||||
std::string bg_format;
|
std::string bg_format;
|
||||||
|
|
||||||
// encryption
|
// encryption
|
||||||
std::string owner_password, user_password;
|
std::string owner_password, user_password;
|
||||||
int no_drm;
|
int no_drm;
|
||||||
|
|
||||||
// misc.
|
// misc.
|
||||||
int clean_tmp;
|
int clean_tmp;
|
||||||
std::string data_dir;
|
std::string data_dir;
|
||||||
|
std::string basetmp_dir;
|
||||||
int css_draw;
|
int css_draw;
|
||||||
int debug;
|
int debug;
|
||||||
|
|
||||||
std::string input_filename, output_filename;
|
std::string input_filename, output_filename;
|
||||||
|
|
||||||
// not a parameter
|
// not a parameter
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
|
||||||
#include "TmpFiles.h"
|
#include "TmpFiles.h"
|
||||||
#include "Param.h"
|
#include "Param.h"
|
||||||
@ -19,11 +20,11 @@ namespace pdf2htmlEX {
|
|||||||
|
|
||||||
|
|
||||||
TmpFiles::TmpFiles( const Param& param )
|
TmpFiles::TmpFiles( const Param& param )
|
||||||
: param( param )
|
: param( param )
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
TmpFiles::~TmpFiles()
|
TmpFiles::~TmpFiles()
|
||||||
{
|
{
|
||||||
clean();
|
clean();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,5 +55,16 @@ void TmpFiles::clean()
|
|||||||
cerr << "Remove temporary directory: " << param.tmp_dir << endl;
|
cerr << "Remove temporary directory: " << param.tmp_dir << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double TmpFiles::get_total_size() const
|
||||||
|
{
|
||||||
|
double total_size = 0;
|
||||||
|
struct _stat st;
|
||||||
|
for(auto iter = tmp_files.begin(); iter != tmp_files.end(); ++iter) {
|
||||||
|
_stat(iter->c_str(), &st);
|
||||||
|
total_size += st.st_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
return total_size;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace pdf2htmlEX
|
} // namespace pdf2htmlEX
|
||||||
|
@ -7,19 +7,20 @@
|
|||||||
|
|
||||||
namespace pdf2htmlEX {
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
class TmpFiles
|
class TmpFiles
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit TmpFiles( const Param& param );
|
explicit TmpFiles( const Param& param );
|
||||||
~TmpFiles();
|
~TmpFiles();
|
||||||
|
|
||||||
void add( const std::string& fn);
|
void add( const std::string& fn);
|
||||||
|
double get_total_size() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void clean();
|
void clean();
|
||||||
|
|
||||||
const Param& param;
|
const Param& param;
|
||||||
std::set<std::string> tmp_files;
|
std::set<std::string> tmp_files;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace pdf2htmlEX
|
} // namespace pdf2htmlEX
|
||||||
|
@ -141,7 +141,6 @@ void prepare_directories()
|
|||||||
stringstream ss;
|
stringstream ss;
|
||||||
ss << setw(6) << rand_value;
|
ss << setw(6) << rand_value;
|
||||||
|
|
||||||
std::cout << "1- " << tmp_dir << endl;
|
|
||||||
tmp_dir.erase(tmp_dir.size() - 6);
|
tmp_dir.erase(tmp_dir.size() - 6);
|
||||||
param.tmp_dir = tmp_dir + ss.str();
|
param.tmp_dir = tmp_dir + ss.str();
|
||||||
::CreateDirectory(param.tmp_dir.c_str(), NULL);
|
::CreateDirectory(param.tmp_dir.c_str(), NULL);
|
||||||
@ -170,6 +169,7 @@ void parse_options (int argc, char **argv)
|
|||||||
.add("embed-image", &param.embed_image, 1, "embed image files into output")
|
.add("embed-image", &param.embed_image, 1, "embed image files into output")
|
||||||
.add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output")
|
.add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output")
|
||||||
.add("embed-outline", &param.embed_outline, 1, "embed outlines into output")
|
.add("embed-outline", &param.embed_outline, 1, "embed outlines into output")
|
||||||
|
.add("max-output-size", &param.max_size, -1, "maximum output size, in KB (-1 for no max)")
|
||||||
.add("split-pages", &param.split_pages, 0, "split pages into separate files")
|
.add("split-pages", &param.split_pages, 0, "split pages into separate files")
|
||||||
.add("dest-dir", &param.dest_dir, ".", "specify destination directory")
|
.add("dest-dir", &param.dest_dir, ".", "specify destination directory")
|
||||||
.add("css-filename", &param.css_filename, "", "filename of the generated css file")
|
.add("css-filename", &param.css_filename, "", "filename of the generated css file")
|
||||||
@ -390,7 +390,6 @@ int main(int argc, char **argv)
|
|||||||
cerr << "temporary dir: " << (param.tmp_dir) << endl;
|
cerr << "temporary dir: " << (param.tmp_dir) << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
exit(0);
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
create_directories(param.dest_dir);
|
create_directories(param.dest_dir);
|
||||||
|
Loading…
Reference in New Issue
Block a user