mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
add manifest & split-pages
This commit is contained in:
parent
78bf7fcaa5
commit
61cdce7e7d
10
pdf2htmlEX.1
10
pdf2htmlEX.1
@ -58,6 +58,11 @@ Whether to embed everything into one HTML file.
|
||||
|
||||
If switched off, several files will be generated along with the HTML file, including files for fonts, CSS, and images.
|
||||
.TP
|
||||
.B --split-pages <0|1> (Default: 0)
|
||||
If turned on, each page is saved in a separate file, and the generated CSS file is stored separately, as if single-html=0.
|
||||
|
||||
The output files will be named <output-filename>0.page, <output-filename>1.page, ...
|
||||
.TP
|
||||
.B --embed-base-font <0|1> (Default: 1)
|
||||
Whether to embed base 14 fonts.
|
||||
|
||||
@ -100,6 +105,11 @@ Treat space characters as offsets, which may increase the size of the output.
|
||||
|
||||
Turn it on if space characters are not displayed correctly, or you want to remove positional spaces.
|
||||
.TP
|
||||
.B --css-filename <filename> (Default: "")
|
||||
Specify the filename of the generated css file, if not embedded.
|
||||
|
||||
If it's empty, the file name will be determined automatically.
|
||||
.TP
|
||||
.B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
|
||||
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
|
||||
.TP
|
||||
|
@ -1,7 +0,0 @@
|
||||
<!-- Created by pdf2htmlEX (http://github.com/coolwanglu/pdf2htmlEX) -->
|
||||
<!-- head.html by WangLu 2012.08.14 -->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<!-- head.html END -->
|
2
share/jquery.js
vendored
Normal file
2
share/jquery.js
vendored
Normal file
File diff suppressed because one or more lines are too long
48
share/manifest
Normal file
48
share/manifest
Normal file
@ -0,0 +1,48 @@
|
||||
# manifest
|
||||
# by WangLu
|
||||
# 2012.09.12
|
||||
#
|
||||
# Syntax
|
||||
# The first char of each line is the command
|
||||
# Empty lines are ignored
|
||||
#
|
||||
# # - comment
|
||||
# @ - include file
|
||||
# $ - special use for pdf2htmlEX
|
||||
#
|
||||
# Special
|
||||
# If a line contains """ only, all text until next """ will be included
|
||||
|
||||
"""
|
||||
<!-- Created by pdf2htmlEX (http://github.com/coolwanglu/pdf2htmlEX) -->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
"""
|
||||
|
||||
@base.css
|
||||
$css
|
||||
@jquery.js
|
||||
|
||||
"""
|
||||
<title></title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="pdf-main">
|
||||
"""
|
||||
|
||||
$pages
|
||||
|
||||
"""
|
||||
</div>
|
||||
"""
|
||||
|
||||
@scroll.js
|
||||
|
||||
"""
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
# MANIFEST END
|
@ -1,7 +0,0 @@
|
||||
<!-- neck.html by WangLu 2012.08.15 -->
|
||||
<title>pdf2htmlEX</title>
|
||||
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.8.1/jquery.min.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<div id="pdf-main">
|
||||
<!-- neck.html END -->
|
@ -1,10 +1,6 @@
|
||||
<!-- tail.html by Hongliang Tian 2012.09.11 -->
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
$(function() {
|
||||
var $pages = $(".p"),
|
||||
$pageWrappers = $(".b"),
|
||||
$pageWrappers = $(".pw"),
|
||||
$main = $("#pdf-main"),
|
||||
l = $pages.length;
|
||||
|
||||
@ -63,7 +59,3 @@ $(function() {
|
||||
// Trigger the event
|
||||
$("#pdf-main").scroll();
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
<!-- tail.html END -->
|
@ -12,6 +12,8 @@
|
||||
#include "HTMLRenderer.h"
|
||||
#include "namespace.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::min;
|
||||
using std::max;
|
||||
using std::vector;
|
||||
@ -272,3 +274,4 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const
|
||||
}
|
||||
|
||||
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";
|
||||
} //namespace pdf2htmlEX
|
||||
|
@ -13,23 +13,25 @@
|
||||
#include "HTMLRenderer.h"
|
||||
#include "namespace.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font)
|
||||
{
|
||||
allcss_fout << "@font-face{font-family:f" << info.id << ";src:url(";
|
||||
css_fout << "@font-face{font-family:f" << info.id << ";src:url(";
|
||||
|
||||
{
|
||||
auto fn = str_fmt("f%llx%s", info.id, suffix.c_str());
|
||||
if(param->single_html)
|
||||
{
|
||||
allcss_fout << "'data:font/opentype;base64," << base64stream(ifstream(tmp_dir + "/" + (char*)fn, ifstream::binary)) << "'";
|
||||
css_fout << "'data:font/opentype;base64," << base64stream(ifstream(tmp_dir + "/" + (char*)fn, ifstream::binary)) << "'";
|
||||
}
|
||||
else
|
||||
{
|
||||
allcss_fout << (char*)fn;
|
||||
css_fout << (char*)fn;
|
||||
}
|
||||
}
|
||||
|
||||
allcss_fout << ")format(\"" << fontfileformat << "\");}.f" << info.id << "{font-family:f" << info.id << ";line-height:" << (info.ascent - info.descent) << ";}" << endl;
|
||||
css_fout << ")format(\"" << fontfileformat << "\");}.f" << info.id << "{font-family:f" << info.id << ";line-height:" << (info.ascent - info.descent) << ";}" << endl;
|
||||
}
|
||||
|
||||
static string general_font_family(GfxFont * font)
|
||||
@ -45,39 +47,39 @@ static string general_font_family(GfxFont * font)
|
||||
// TODO: this function is called when some font is unable to process, may use the name there as a hint
|
||||
void HTMLRenderer::export_remote_default_font(long long fn_id)
|
||||
{
|
||||
allcss_fout << ".f" << fn_id << "{font-family:sans-serif;color:transparent;visibility:hidden;}" << endl;
|
||||
css_fout << ".f" << fn_id << "{font-family:sans-serif;color:transparent;visibility:hidden;}" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
|
||||
{
|
||||
allcss_fout << ".f" << info.id << "{";
|
||||
allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
|
||||
css_fout << ".f" << info.id << "{";
|
||||
css_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
|
||||
|
||||
string fn = original_font_name;
|
||||
for(auto iter = fn.begin(); iter != fn.end(); ++iter)
|
||||
*iter = tolower(*iter);
|
||||
|
||||
if(font->isBold() || (fn.find("bold") != string::npos))
|
||||
allcss_fout << "font-weight:bold;";
|
||||
css_fout << "font-weight:bold;";
|
||||
|
||||
if(fn.find("oblique") != string::npos)
|
||||
allcss_fout << "font-style:oblique;";
|
||||
css_fout << "font-style:oblique;";
|
||||
else if(font->isItalic() || (fn.find("italic") != string::npos))
|
||||
allcss_fout << "font-style:italic;";
|
||||
css_fout << "font-style:italic;";
|
||||
|
||||
allcss_fout << "line-height:" << (info.ascent - info.descent) << ";";
|
||||
css_fout << "line-height:" << (info.ascent - info.descent) << ";";
|
||||
|
||||
allcss_fout << "}" << endl;
|
||||
css_fout << "}" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::export_font_size (long long fs_id, double font_size)
|
||||
{
|
||||
allcss_fout << ".s" << fs_id << "{font-size:" << font_size << "px;}" << endl;
|
||||
css_fout << ".s" << fs_id << "{font-size:" << font_size << "px;}" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
|
||||
{
|
||||
allcss_fout << ".t" << tm_id << "{";
|
||||
css_fout << ".t" << tm_id << "{";
|
||||
|
||||
// always ignore tm[4] and tm[5] because
|
||||
// we have already shifted the origin
|
||||
@ -88,30 +90,30 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
|
||||
{
|
||||
const auto & prefix = *iter;
|
||||
// PDF use a different coordinate system from Web
|
||||
allcss_fout << prefix << "transform:matrix("
|
||||
css_fout << prefix << "transform:matrix("
|
||||
<< tm[0] << ','
|
||||
<< -tm[1] << ','
|
||||
<< -tm[2] << ','
|
||||
<< tm[3] << ',';
|
||||
|
||||
allcss_fout << "0,0);";
|
||||
css_fout << "0,0);";
|
||||
}
|
||||
allcss_fout << "}" << endl;
|
||||
css_fout << "}" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
|
||||
{
|
||||
allcss_fout << ".l" << ls_id << "{letter-spacing:" << letter_space << "px;}" << endl;
|
||||
css_fout << ".l" << ls_id << "{letter-spacing:" << letter_space << "px;}" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::export_word_space (long long ws_id, double word_space)
|
||||
{
|
||||
allcss_fout << ".w" << ws_id << "{word-spacing:" << word_space << "px;}" << endl;
|
||||
css_fout << ".w" << ws_id << "{word-spacing:" << word_space << "px;}" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
|
||||
{
|
||||
allcss_fout << ".c" << color_id << "{color:rgb("
|
||||
css_fout << ".c" << color_id << "{color:rgb("
|
||||
<< dec << (int)colToByte(rgb->r) << "," << (int)colToByte(rgb->g) << "," << (int)colToByte(rgb->b) << ");}" << hex
|
||||
<< endl;
|
||||
}
|
||||
@ -119,13 +121,14 @@ void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
|
||||
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
|
||||
{
|
||||
if(ws_width > 0)
|
||||
allcss_fout << "._" << ws_id << "{display:inline-block;width:" << ws_width << "px;}" << endl;
|
||||
css_fout << "._" << ws_id << "{display:inline-block;width:" << ws_width << "px;}" << endl;
|
||||
else
|
||||
allcss_fout << "._" << ws_id << "{display:inline;margin-left:" << ws_width << "px;}" << endl;
|
||||
css_fout << "._" << ws_id << "{display:inline;margin-left:" << ws_width << "px;}" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::export_rise (long long rise_id, double rise)
|
||||
{
|
||||
allcss_fout << ".r" << rise_id << "{top:" << (-rise) << "px;}" << endl;
|
||||
css_fout << ".r" << rise_id << "{top:" << (-rise) << "px;}" << endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -8,6 +8,7 @@
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <ostream>
|
||||
|
||||
#include <splash/SplashBitmap.h>
|
||||
|
||||
@ -17,8 +18,11 @@
|
||||
#include "ff.h"
|
||||
#include "pdf2htmlEX-config.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::fixed;
|
||||
using std::flush;
|
||||
using std::ostream;
|
||||
|
||||
static void dummy(void *, enum ErrorCategory, int pos, char *)
|
||||
{
|
||||
@ -83,6 +87,14 @@ void HTMLRenderer::process(PDFDoc *doc)
|
||||
|
||||
for(int i = param->first_page; i <= param->last_page ; ++i)
|
||||
{
|
||||
if(param->split_pages)
|
||||
{
|
||||
auto page_fn = str_fmt("%s/__pages%x", tmp_dir.c_str(), i);
|
||||
html_fout.open((char*)page_fn, ofstream::binary);
|
||||
add_tmp_file((char*)page_fn);
|
||||
fix_stream(html_fout);
|
||||
}
|
||||
|
||||
if(param->process_nontext)
|
||||
{
|
||||
doc->displayPage(bg_renderer, i, param->h_dpi, param->v_dpi,
|
||||
@ -104,8 +116,14 @@ void HTMLRenderer::process(PDFDoc *doc)
|
||||
0, true, false, false,
|
||||
nullptr, nullptr, nullptr, nullptr);
|
||||
|
||||
if(param->split_pages)
|
||||
{
|
||||
html_fout.close();
|
||||
}
|
||||
|
||||
cerr << "." << flush;
|
||||
}
|
||||
|
||||
post_process();
|
||||
|
||||
if(bg_renderer)
|
||||
@ -116,53 +134,116 @@ void HTMLRenderer::process(PDFDoc *doc)
|
||||
|
||||
void HTMLRenderer::pre_process()
|
||||
{
|
||||
// we may output utf8 characters, so use binary
|
||||
if(param->single_html)
|
||||
// we may output utf8 characters, so always use binary
|
||||
{
|
||||
{
|
||||
auto fn = str_fmt("%s/%s", tmp_dir.c_str(), CSS_FILENAME.c_str());
|
||||
allcss_fout.open((char*)fn, ofstream::binary);
|
||||
/*
|
||||
* If single-html && !split-pages
|
||||
* we have to keep the generated css file into a temporary place
|
||||
* and embed it into the main html later
|
||||
*
|
||||
*
|
||||
* If single-html && split-page
|
||||
* as there's no place to embed the css file, just leave it alone (into dest_dir)
|
||||
*
|
||||
* If !single-html
|
||||
* leave it in dest_dir
|
||||
*/
|
||||
|
||||
auto fn = (param->single_html && (!param->split_pages))
|
||||
? str_fmt("%s/__css", tmp_dir.c_str())
|
||||
: str_fmt("%s/%s", dest_dir.c_str(), param->css_filename.c_str());
|
||||
|
||||
if(param->single_html)
|
||||
add_tmp_file((char*)fn);
|
||||
}
|
||||
|
||||
{
|
||||
// don't use output_file directly
|
||||
// otherwise it'll be a disaster when tmp_dir == dest_dir
|
||||
auto tmp_output_fn = str_fmt("%s/%s.part", tmp_dir.c_str(), param->output_filename.c_str());
|
||||
add_tmp_file((char*)tmp_output_fn);
|
||||
|
||||
html_fout.open((char*)tmp_output_fn, ofstream::binary);
|
||||
}
|
||||
css_path = (char*)fn,
|
||||
css_fout.open(css_path, ofstream::binary);
|
||||
fix_stream(css_fout);
|
||||
}
|
||||
else
|
||||
|
||||
// if split-pages is specified, open & close the file in the process loop
|
||||
// if not, open the file here:
|
||||
if(!param->split_pages);
|
||||
{
|
||||
html_fout.open(str_fmt("%s/%s", dest_dir.c_str(), param->output_filename.c_str()), ofstream::binary);
|
||||
allcss_fout.open(str_fmt("%s/%s", dest_dir.c_str(), CSS_FILENAME.c_str()), ofstream::binary);
|
||||
/*
|
||||
* If single-html
|
||||
* we have to keep the html file (for page) into a temporary place
|
||||
* because we'll have to embed css before it
|
||||
*
|
||||
* Otherwise just generate it
|
||||
*/
|
||||
auto fn = (param->single_html)
|
||||
? str_fmt("%s/__pages", tmp_dir.c_str())
|
||||
: str_fmt("%s/%s", dest_dir.c_str(), param->output_filename.c_str());
|
||||
|
||||
html_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), HEAD_HTML_FILENAME.c_str()), ifstream::binary).rdbuf();
|
||||
html_fout << "<link rel=\"stylesheet\" type=\"text/css\" href=\"" << CSS_FILENAME << "\"/>" << endl;
|
||||
html_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), NECK_HTML_FILENAME.c_str()), ifstream::binary).rdbuf();
|
||||
if(param->single_html)
|
||||
add_tmp_file((char*)fn);
|
||||
|
||||
html_path = (char*)fn;
|
||||
html_fout.open(html_path, ofstream::binary);
|
||||
fix_stream(html_fout);
|
||||
}
|
||||
|
||||
fix_stream(html_fout);
|
||||
fix_stream(allcss_fout);
|
||||
|
||||
allcss_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), CSS_FILENAME.c_str()), ifstream::binary).rdbuf();
|
||||
}
|
||||
|
||||
void HTMLRenderer::post_process()
|
||||
{
|
||||
if(!param->single_html)
|
||||
{
|
||||
html_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), TAIL_HTML_FILENAME.c_str()), ifstream::binary).rdbuf();
|
||||
}
|
||||
|
||||
// close files
|
||||
html_fout.close();
|
||||
allcss_fout.close();
|
||||
css_fout.close();
|
||||
|
||||
if(param->single_html)
|
||||
//only when !split-page, do we have some work left to do
|
||||
if(!param->split_pages)
|
||||
return;
|
||||
|
||||
ofstream output((char*)str_fmt("%s/%s", dest_dir.c_str(), param->output_filename.c_str()));
|
||||
fix_stream(output);
|
||||
|
||||
// apply manifest
|
||||
ifstream manifest_fin((char*)str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), MANIFEST_FILENAME.c_str()));
|
||||
|
||||
bool embed_string = false;
|
||||
string line;
|
||||
while(getline(manifest_fin, line))
|
||||
{
|
||||
process_single_html();
|
||||
if(embed_string)
|
||||
{
|
||||
output << line << endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
if(line.empty() || line[0] == '#')
|
||||
continue;
|
||||
|
||||
if(line == "\"\"\"")
|
||||
{
|
||||
embed_string = !embed_string;
|
||||
continue;
|
||||
}
|
||||
|
||||
if(line[0] == '@')
|
||||
{
|
||||
embed_file(output, PDF2HTMLEX_DATA_PATH + "/" + line.substr(1), true);
|
||||
continue;
|
||||
}
|
||||
|
||||
if(line[0] == '$')
|
||||
{
|
||||
if(line == "$css")
|
||||
{
|
||||
embed_file(output, css_path, false);
|
||||
}
|
||||
else if (line == "$pages")
|
||||
{
|
||||
output << ifstream(html_path, ifstream::binary).rdbuf();
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Warning: unknown line in manifest: " << line << endl;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
cerr << "Warning: unknown line in manifest: " << line << endl;
|
||||
}
|
||||
}
|
||||
|
||||
@ -182,7 +263,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||
{
|
||||
if(param->single_html)
|
||||
{
|
||||
html_fout << "'data:image/png;base64," << base64stream(ifstream(str_fmt("%s/p%llx.png", tmp_dir.c_str(), pageNum) , ifstream::binary)) << "'";
|
||||
html_fout << "'data:image/png;base64," << base64stream(ifstream((char*)str_fmt("%s/p%llx.png", tmp_dir.c_str(), pageNum) , ifstream::binary)) << "'";
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -225,23 +306,6 @@ void HTMLRenderer::endPage() {
|
||||
html_fout << "</div></div>" << endl;
|
||||
}
|
||||
|
||||
void HTMLRenderer::process_single_html()
|
||||
{
|
||||
ofstream out (dest_dir + "/" + param->output_filename, ofstream::binary);
|
||||
|
||||
out << ifstream(PDF2HTMLEX_DATA_PATH + "/" + HEAD_HTML_FILENAME , ifstream::binary).rdbuf();
|
||||
|
||||
out << "<style type=\"text/css\">" << endl;
|
||||
out << ifstream(tmp_dir + "/" + CSS_FILENAME, ifstream::binary).rdbuf();
|
||||
out << "</style>" << endl;
|
||||
|
||||
out << ifstream(PDF2HTMLEX_DATA_PATH + "/" + NECK_HTML_FILENAME, ifstream::binary).rdbuf();
|
||||
|
||||
out << ifstream(tmp_dir + "/" + (param->output_filename + ".part"), ifstream::binary).rdbuf();
|
||||
|
||||
out << ifstream(PDF2HTMLEX_DATA_PATH + "/" + TAIL_HTML_FILENAME, ifstream::binary).rdbuf();
|
||||
}
|
||||
|
||||
void HTMLRenderer::fix_stream (std::ostream & out)
|
||||
{
|
||||
out << fixed << hex;
|
||||
@ -274,7 +338,32 @@ void HTMLRenderer::clean_tmp_files()
|
||||
cerr << "Remove temporary directory: " << tmp_dir << endl;
|
||||
}
|
||||
|
||||
const std::string HTMLRenderer::HEAD_HTML_FILENAME = "head.html";
|
||||
const std::string HTMLRenderer::NECK_HTML_FILENAME = "neck.html";
|
||||
const std::string HTMLRenderer::TAIL_HTML_FILENAME = "tail.html";
|
||||
const std::string HTMLRenderer::CSS_FILENAME = "all.css";
|
||||
// Emit one file into the html output, either inline or as a reference.
//
// out  - stream receiving the generated markup
// path - path of the file to embed; its suffix selects the markup from
//        EMBED_STRING_MAP (a warning is printed for unknown suffixes)
// copy - when the file is only referenced (single-html off), also copy it
//        into dest_dir so that the reference resolves
//
// With single-html the file content is written between the opening and
// closing strings; otherwise only the file name is written between them.
void HTMLRenderer::embed_file(ostream & out, const string & path, bool copy)
{
    string fn = get_filename(path);
    string suffix = get_suffix(fn);

    auto iter = EMBED_STRING_MAP.find(make_pair(suffix, param->single_html != 0));
    if(iter == EMBED_STRING_MAP.end())
    {
        cerr << "Warning: unknown suffix in manifest: " << suffix << endl;
        return;
    }

    if(param->single_html)
    {
        out << iter->second.first << endl;

        // Inserting a stream buffer that yields no characters sets failbit on
        // "out" and would silently drop all later output, so only insert
        // when there is something to read.
        ifstream fin(path, ifstream::binary);
        if(!fin)
            cerr << "Warning: cannot open file for embedding: " << path << endl;
        else if(fin.peek() != ifstream::traits_type::eof())
            out << fin.rdbuf();

        out << iter->second.second << endl;
    }
    else
    {
        out << iter->second.first
            << fn
            << iter->second.second;

        if(copy)
        {
            ifstream fin(path, ifstream::binary);
            if(!fin)
            {
                cerr << "Warning: cannot open file for copying: " << path << endl;
            }
            else
            {
                ofstream fout(dest_dir + "/" + fn, ofstream::binary);
                if(!fout)
                    cerr << "Warning: cannot copy file into destination directory: " << fn << endl;
                else if(fin.peek() != ifstream::traits_type::eof())
                    fout << fin.rdbuf();
            }
        }
    }
}
|
||||
|
||||
const std::string HTMLRenderer::MANIFEST_FILENAME = "manifest";
|
||||
|
||||
}// namespace pdf2htmlEX
|
||||
|
@ -10,6 +10,8 @@
|
||||
#include "HTMLRenderer.h"
|
||||
#include "namespace.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
|
||||
{
|
||||
return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
|
||||
@ -59,3 +61,5 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w
|
||||
++ image_count;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -17,6 +17,8 @@
|
||||
#include "namespace.h"
|
||||
#include "util.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::abs;
|
||||
|
||||
const FontInfo * HTMLRenderer::install_font(GfxFont * font)
|
||||
@ -290,3 +292,5 @@ long long HTMLRenderer::install_rise(double rise)
|
||||
export_rise(new_rise_id, rise);
|
||||
return new_rise_id;
|
||||
}
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -17,6 +17,9 @@
|
||||
#include "HTMLRenderer.h"
|
||||
#include "namespace.h"
|
||||
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::max;
|
||||
using std::abs;
|
||||
|
||||
@ -358,3 +361,5 @@ void HTMLRenderer::close_line()
|
||||
line_buf.flush();
|
||||
}
|
||||
}
|
||||
|
||||
} //namespace pdf2htmlEX
|
||||
|
@ -19,6 +19,8 @@
|
||||
#include "HTMLRenderer.h"
|
||||
#include "namespace.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::unordered_set;
|
||||
using std::min;
|
||||
using std::all_of;
|
||||
@ -486,3 +488,5 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
draw_tx += dx + dxerr * cur_font_size * hs;
|
||||
draw_ty += dy;
|
||||
}
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -84,7 +84,6 @@ class HTMLRenderer : public OutputDev
|
||||
|
||||
virtual void pre_process();
|
||||
virtual void post_process();
|
||||
virtual void process_single_html();
|
||||
|
||||
// Start a page.
|
||||
virtual void startPage(int pageNum, GfxState *state);
|
||||
@ -169,6 +168,11 @@ class HTMLRenderer : public OutputDev
|
||||
void export_whitespace(long long ws_id, double ws_width);
|
||||
void export_rise(long long rise_id, double rise);
|
||||
|
||||
|
||||
// depending on single-html, to embed the content or add a link to it
|
||||
// "copy": indicates whether to copy the file into dest_dir, if not embedded
|
||||
void embed_file(std::ostream & out, const std::string & path, bool copy);
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// state tracking
|
||||
////////////////////////////////////////////////////
|
||||
@ -360,13 +364,11 @@ class HTMLRenderer : public OutputDev
|
||||
|
||||
const Param * param;
|
||||
std::string dest_dir, tmp_dir;
|
||||
std::ofstream html_fout, allcss_fout;
|
||||
std::ofstream html_fout, css_fout;
|
||||
std::string html_path, css_path;
|
||||
std::set<std::string> tmp_files;
|
||||
|
||||
static const std::string HEAD_HTML_FILENAME;
|
||||
static const std::string NECK_HTML_FILENAME;
|
||||
static const std::string TAIL_HTML_FILENAME;
|
||||
static const std::string CSS_FILENAME;
|
||||
static const std::string MANIFEST_FILENAME;
|
||||
};
|
||||
|
||||
} //namespace pdf2htmlEX
|
||||
|
@ -30,6 +30,7 @@ struct Param
|
||||
|
||||
int process_nontext;
|
||||
int single_html;
|
||||
int split_pages;
|
||||
int embed_base_font;
|
||||
int embed_external_font;
|
||||
int decompose_ligature;
|
||||
@ -41,6 +42,7 @@ struct Param
|
||||
int tounicode;
|
||||
int space_as_offset;
|
||||
|
||||
std::string css_filename;
|
||||
std::string font_suffix, font_format;
|
||||
|
||||
int debug;
|
||||
|
@ -19,7 +19,5 @@ using std::make_pair;
|
||||
using std::ifstream;
|
||||
using std::ofstream;
|
||||
|
||||
using namespace pdf2htmlEX;
|
||||
|
||||
#endif // NAMESPACE_H__
|
||||
|
||||
|
@ -33,6 +33,10 @@ static const double DEFAULT_DPI = 72.0;
|
||||
|
||||
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
|
||||
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
|
||||
// map to embed files into html
|
||||
// key: (suffix, if_embed_content)
|
||||
// value: (prefix string, suffix string)
|
||||
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
|
||||
|
||||
// mute gcc warning of unused function
|
||||
namespace
|
||||
|
@ -64,6 +64,7 @@ void parse_options (int argc, char **argv)
|
||||
|
||||
.add("process-nontext", ¶m.process_nontext, 1, "process nontext objects")
|
||||
.add("single-html", ¶m.single_html, 1, "combine everything into one single HTML file")
|
||||
.add("split-pages", ¶m.split_pages, 0, "split pages into separated files")
|
||||
.add("embed-base-font", ¶m.embed_base_font, 0, "embed local matched font for base 14 fonts in the PDF file")
|
||||
.add("embed-external-font", ¶m.embed_external_font, 0, "embed local matched font for external fonts in the PDF file")
|
||||
.add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, for example 'fi' -> 'f''i'")
|
||||
@ -75,6 +76,7 @@ void parse_options (int argc, char **argv)
|
||||
.add("tounicode", ¶m.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
|
||||
.add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets")
|
||||
|
||||
.add("css-filename", ¶m.css_filename, "", "Specify the file name of the generated css file")
|
||||
.add("font-suffix", ¶m.font_suffix, ".ttf", "suffix for extracted font files")
|
||||
.add("font-format", ¶m.font_format, "opentype", "format for extracted font files")
|
||||
|
||||
@ -180,11 +182,28 @@ int main(int argc, char **argv)
|
||||
|
||||
if(get_suffix(param.input_filename) == ".pdf")
|
||||
{
|
||||
param.output_filename = s.substr(0, s.size() - 4) + ".html";
|
||||
param.output_filename = s.substr(0, s.size() - 4);
|
||||
if(!param.split_pages)
|
||||
param.output_filename = s.substr(0, s.size() - 4) + ".html";
|
||||
}
|
||||
else
|
||||
{
|
||||
param.output_filename = s + ".html";
|
||||
if(!param.split_pages)
|
||||
param.output_filename = s + ".html";
|
||||
}
|
||||
}
|
||||
// No --css-filename given: derive the css file name from the input file name.
// The result must go into css_filename; writing it to output_filename would
// replace the html output name and leave the css name empty.
if(param.css_filename == "")
{
    const string s = get_filename(param.input_filename);

    if(get_suffix(param.input_filename) == ".pdf")
    {
        // replace the ".pdf" suffix
        param.css_filename = s.substr(0, s.size() - 4) + ".css";
    }
    else
    {
        // A css file is generated in every mode, so the name is needed
        // regardless of split-pages.
        param.css_filename = s + ".css";
    }
}
|
||||
|
||||
|
@ -46,6 +46,13 @@ const map<string, string> GB_ENCODED_FONT_NAME_MAP({
|
||||
{"\xC1\xA5\xCA\xE9", "SimLi"},
|
||||
});
|
||||
|
||||
// Markup placed around a file that is embedded into, or referenced from, the html output.
// key:   (file suffix, true when the file content is embedded)
// value: (text written before, text written after)
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({
    // referenced by file name
    {{".css", false}, {"<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>"}},
    {{".js",  false}, {"<script type=\"text/javascript\" src=\"", "\"></script>"}},
    // content embedded inline
    {{".css", true},  {"<style type=\"text/css\">", "</style>"}},
    {{".js",  true},  {"<script type=\"text/javascript\">", "</script>"}}
});
|
||||
|
||||
bool isLegalUnicode(Unicode u)
|
||||
{
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user