mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
add manifest & split-pages
This commit is contained in:
parent
78bf7fcaa5
commit
61cdce7e7d
10
pdf2htmlEX.1
10
pdf2htmlEX.1
@ -58,6 +58,11 @@ Whether to embed everything into one HTML file.
|
|||||||
|
|
||||||
If switched off, there will be several files generated along with the HTML file including files for fonts, css, images.
|
If switched off, there will be several files generated along with the HTML file including files for fonts, css, images.
|
||||||
.TP
|
.TP
|
||||||
|
.B --split-pages <0|1> (Default: 0)
|
||||||
|
If turned on, each page is saved in a separate file, and the generated css file will also be stored separately, as if single-html=0
|
||||||
|
|
||||||
|
The output files will be named as <output-filename>0.page, <output-filename>1.page, ...
|
||||||
|
.TP
|
||||||
.B --embed-base-font <0|1> (Default: 1)
|
.B --embed-base-font <0|1> (Default: 1)
|
||||||
Whether to embed base 14 fonts.
|
Whether to embed base 14 fonts.
|
||||||
|
|
||||||
@ -100,6 +105,11 @@ Treat space characters as offsets, which may increase the size of the output.
|
|||||||
|
|
||||||
Turn it on if space characters are not displayed correctly, or you want to remove positional spaces.
|
Turn it on if space characters are not displayed correctly, or you want to remove positional spaces.
|
||||||
.TP
|
.TP
|
||||||
|
.B --css-filename <filename> (Default: "")
|
||||||
|
Specify the filename of the generated css file, if not embedded.
|
||||||
|
|
||||||
|
If it's empty, the file name will be determined automatically.
|
||||||
|
.TP
|
||||||
.B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
|
.B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
|
||||||
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
|
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
|
||||||
.TP
|
.TP
|
||||||
|
@ -1,7 +0,0 @@
|
|||||||
<!-- Created by pdf2htmlEX (http://github.com/coolwanglu/pdf2htmlEX) -->
|
|
||||||
<!-- head.html by WangLu 2012.08.14 -->
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8">
|
|
||||||
<!-- head.html END -->
|
|
2
share/jquery.js
vendored
Normal file
2
share/jquery.js
vendored
Normal file
File diff suppressed because one or more lines are too long
48
share/manifest
Normal file
48
share/manifest
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
# manifest
|
||||||
|
# by WangLu
|
||||||
|
# 2012.09.12
|
||||||
|
#
|
||||||
|
# Syntax
|
||||||
|
# The first char of each line is the command
|
||||||
|
# Empty lines are ignored
|
||||||
|
#
|
||||||
|
# # - comment
|
||||||
|
# @ - include file
|
||||||
|
# $ - special use for pdf2htmlEX
|
||||||
|
#
|
||||||
|
# Special
|
||||||
|
# If a line contains """ only, all text until next """ will be included
|
||||||
|
|
||||||
|
"""
|
||||||
|
<!-- Created by pdf2htmlEX (http://github.com/coolwanglu/pdf2htmlEX) -->
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
"""
|
||||||
|
|
||||||
|
@base.css
|
||||||
|
$css
|
||||||
|
@jquery.js
|
||||||
|
|
||||||
|
"""
|
||||||
|
<title></title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="pdf-main">
|
||||||
|
"""
|
||||||
|
|
||||||
|
$pages
|
||||||
|
|
||||||
|
"""
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
@scroll.js
|
||||||
|
|
||||||
|
"""
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# MANIFEST END
|
@ -1,7 +0,0 @@
|
|||||||
<!-- neck.html by WangLu 2012.08.15 -->
|
|
||||||
<title>pdf2htmlEX</title>
|
|
||||||
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.8.1/jquery.min.js"></script>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<div id="pdf-main">
|
|
||||||
<!-- neck.html END -->
|
|
@ -1,10 +1,6 @@
|
|||||||
<!-- tail.html by Hongliang Tian 2012.09.11 -->
|
|
||||||
</div>
|
|
||||||
<script type="text/javascript">
|
|
||||||
$(function() {
|
$(function() {
|
||||||
var $pages = $(".p"),
|
var $pages = $(".p"),
|
||||||
$pageWrappers = $(".b"),
|
$pageWrappers = $(".b"),
|
||||||
$pageWrappers = $(".pw"),
|
|
||||||
$main = $("#pdf-main"),
|
$main = $("#pdf-main"),
|
||||||
l = $pages.length;
|
l = $pages.length;
|
||||||
|
|
||||||
@ -63,7 +59,3 @@ $(function() {
|
|||||||
// Trigger the event
|
// Trigger the event
|
||||||
$("#pdf-main").scroll();
|
$("#pdf-main").scroll();
|
||||||
});
|
});
|
||||||
</script>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
<!-- tail.html END -->
|
|
@ -12,6 +12,8 @@
|
|||||||
#include "HTMLRenderer.h"
|
#include "HTMLRenderer.h"
|
||||||
#include "namespace.h"
|
#include "namespace.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::min;
|
using std::min;
|
||||||
using std::max;
|
using std::max;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
@ -272,3 +274,4 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";
|
const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr";
|
||||||
|
} //namespace pdf2htmlEX
|
||||||
|
@ -13,23 +13,25 @@
|
|||||||
#include "HTMLRenderer.h"
|
#include "HTMLRenderer.h"
|
||||||
#include "namespace.h"
|
#include "namespace.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font)
|
void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font)
|
||||||
{
|
{
|
||||||
allcss_fout << "@font-face{font-family:f" << info.id << ";src:url(";
|
css_fout << "@font-face{font-family:f" << info.id << ";src:url(";
|
||||||
|
|
||||||
{
|
{
|
||||||
auto fn = str_fmt("f%llx%s", info.id, suffix.c_str());
|
auto fn = str_fmt("f%llx%s", info.id, suffix.c_str());
|
||||||
if(param->single_html)
|
if(param->single_html)
|
||||||
{
|
{
|
||||||
allcss_fout << "'data:font/opentype;base64," << base64stream(ifstream(tmp_dir + "/" + (char*)fn, ifstream::binary)) << "'";
|
css_fout << "'data:font/opentype;base64," << base64stream(ifstream(tmp_dir + "/" + (char*)fn, ifstream::binary)) << "'";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
allcss_fout << (char*)fn;
|
css_fout << (char*)fn;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
allcss_fout << ")format(\"" << fontfileformat << "\");}.f" << info.id << "{font-family:f" << info.id << ";line-height:" << (info.ascent - info.descent) << ";}" << endl;
|
css_fout << ")format(\"" << fontfileformat << "\");}.f" << info.id << "{font-family:f" << info.id << ";line-height:" << (info.ascent - info.descent) << ";}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
static string general_font_family(GfxFont * font)
|
static string general_font_family(GfxFont * font)
|
||||||
@ -45,39 +47,39 @@ static string general_font_family(GfxFont * font)
|
|||||||
// TODO: this function is called when some font is unable to process, may use the name there as a hint
|
// TODO: this function is called when some font is unable to process, may use the name there as a hint
|
||||||
void HTMLRenderer::export_remote_default_font(long long fn_id)
|
void HTMLRenderer::export_remote_default_font(long long fn_id)
|
||||||
{
|
{
|
||||||
allcss_fout << ".f" << fn_id << "{font-family:sans-serif;color:transparent;visibility:hidden;}" << endl;
|
css_fout << ".f" << fn_id << "{font-family:sans-serif;color:transparent;visibility:hidden;}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
|
void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
|
||||||
{
|
{
|
||||||
allcss_fout << ".f" << info.id << "{";
|
css_fout << ".f" << info.id << "{";
|
||||||
allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
|
css_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";";
|
||||||
|
|
||||||
string fn = original_font_name;
|
string fn = original_font_name;
|
||||||
for(auto iter = fn.begin(); iter != fn.end(); ++iter)
|
for(auto iter = fn.begin(); iter != fn.end(); ++iter)
|
||||||
*iter = tolower(*iter);
|
*iter = tolower(*iter);
|
||||||
|
|
||||||
if(font->isBold() || (fn.find("bold") != string::npos))
|
if(font->isBold() || (fn.find("bold") != string::npos))
|
||||||
allcss_fout << "font-weight:bold;";
|
css_fout << "font-weight:bold;";
|
||||||
|
|
||||||
if(fn.find("oblique") != string::npos)
|
if(fn.find("oblique") != string::npos)
|
||||||
allcss_fout << "font-style:oblique;";
|
css_fout << "font-style:oblique;";
|
||||||
else if(font->isItalic() || (fn.find("italic") != string::npos))
|
else if(font->isItalic() || (fn.find("italic") != string::npos))
|
||||||
allcss_fout << "font-style:italic;";
|
css_fout << "font-style:italic;";
|
||||||
|
|
||||||
allcss_fout << "line-height:" << (info.ascent - info.descent) << ";";
|
css_fout << "line-height:" << (info.ascent - info.descent) << ";";
|
||||||
|
|
||||||
allcss_fout << "}" << endl;
|
css_fout << "}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::export_font_size (long long fs_id, double font_size)
|
void HTMLRenderer::export_font_size (long long fs_id, double font_size)
|
||||||
{
|
{
|
||||||
allcss_fout << ".s" << fs_id << "{font-size:" << font_size << "px;}" << endl;
|
css_fout << ".s" << fs_id << "{font-size:" << font_size << "px;}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
|
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
|
||||||
{
|
{
|
||||||
allcss_fout << ".t" << tm_id << "{";
|
css_fout << ".t" << tm_id << "{";
|
||||||
|
|
||||||
// always ignore tm[4] and tm[5] because
|
// always ignore tm[4] and tm[5] because
|
||||||
// we have already shifted the origin
|
// we have already shifted the origin
|
||||||
@ -88,30 +90,30 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
|
|||||||
{
|
{
|
||||||
const auto & prefix = *iter;
|
const auto & prefix = *iter;
|
||||||
// PDF use a different coordinate system from Web
|
// PDF use a different coordinate system from Web
|
||||||
allcss_fout << prefix << "transform:matrix("
|
css_fout << prefix << "transform:matrix("
|
||||||
<< tm[0] << ','
|
<< tm[0] << ','
|
||||||
<< -tm[1] << ','
|
<< -tm[1] << ','
|
||||||
<< -tm[2] << ','
|
<< -tm[2] << ','
|
||||||
<< tm[3] << ',';
|
<< tm[3] << ',';
|
||||||
|
|
||||||
allcss_fout << "0,0);";
|
css_fout << "0,0);";
|
||||||
}
|
}
|
||||||
allcss_fout << "}" << endl;
|
css_fout << "}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
|
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
|
||||||
{
|
{
|
||||||
allcss_fout << ".l" << ls_id << "{letter-spacing:" << letter_space << "px;}" << endl;
|
css_fout << ".l" << ls_id << "{letter-spacing:" << letter_space << "px;}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::export_word_space (long long ws_id, double word_space)
|
void HTMLRenderer::export_word_space (long long ws_id, double word_space)
|
||||||
{
|
{
|
||||||
allcss_fout << ".w" << ws_id << "{word-spacing:" << word_space << "px;}" << endl;
|
css_fout << ".w" << ws_id << "{word-spacing:" << word_space << "px;}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
|
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
|
||||||
{
|
{
|
||||||
allcss_fout << ".c" << color_id << "{color:rgb("
|
css_fout << ".c" << color_id << "{color:rgb("
|
||||||
<< dec << (int)colToByte(rgb->r) << "," << (int)colToByte(rgb->g) << "," << (int)colToByte(rgb->b) << ");}" << hex
|
<< dec << (int)colToByte(rgb->r) << "," << (int)colToByte(rgb->g) << "," << (int)colToByte(rgb->b) << ");}" << hex
|
||||||
<< endl;
|
<< endl;
|
||||||
}
|
}
|
||||||
@ -119,13 +121,14 @@ void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
|
|||||||
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
|
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
|
||||||
{
|
{
|
||||||
if(ws_width > 0)
|
if(ws_width > 0)
|
||||||
allcss_fout << "._" << ws_id << "{display:inline-block;width:" << ws_width << "px;}" << endl;
|
css_fout << "._" << ws_id << "{display:inline-block;width:" << ws_width << "px;}" << endl;
|
||||||
else
|
else
|
||||||
allcss_fout << "._" << ws_id << "{display:inline;margin-left:" << ws_width << "px;}" << endl;
|
css_fout << "._" << ws_id << "{display:inline;margin-left:" << ws_width << "px;}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::export_rise (long long rise_id, double rise)
|
void HTMLRenderer::export_rise (long long rise_id, double rise)
|
||||||
{
|
{
|
||||||
allcss_fout << ".r" << rise_id << "{top:" << (-rise) << "px;}" << endl;
|
css_fout << ".r" << rise_id << "{top:" << (-rise) << "px;}" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <ostream>
|
||||||
|
|
||||||
#include <splash/SplashBitmap.h>
|
#include <splash/SplashBitmap.h>
|
||||||
|
|
||||||
@ -17,8 +18,11 @@
|
|||||||
#include "ff.h"
|
#include "ff.h"
|
||||||
#include "pdf2htmlEX-config.h"
|
#include "pdf2htmlEX-config.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::fixed;
|
using std::fixed;
|
||||||
using std::flush;
|
using std::flush;
|
||||||
|
using std::ostream;
|
||||||
|
|
||||||
static void dummy(void *, enum ErrorCategory, int pos, char *)
|
static void dummy(void *, enum ErrorCategory, int pos, char *)
|
||||||
{
|
{
|
||||||
@ -83,6 +87,14 @@ void HTMLRenderer::process(PDFDoc *doc)
|
|||||||
|
|
||||||
for(int i = param->first_page; i <= param->last_page ; ++i)
|
for(int i = param->first_page; i <= param->last_page ; ++i)
|
||||||
{
|
{
|
||||||
|
if(param->split_pages)
|
||||||
|
{
|
||||||
|
auto page_fn = str_fmt("%s/__pages%x", tmp_dir.c_str(), i);
|
||||||
|
html_fout.open((char*)page_fn, ofstream::binary);
|
||||||
|
add_tmp_file((char*)page_fn);
|
||||||
|
fix_stream(html_fout);
|
||||||
|
}
|
||||||
|
|
||||||
if(param->process_nontext)
|
if(param->process_nontext)
|
||||||
{
|
{
|
||||||
doc->displayPage(bg_renderer, i, param->h_dpi, param->v_dpi,
|
doc->displayPage(bg_renderer, i, param->h_dpi, param->v_dpi,
|
||||||
@ -104,8 +116,14 @@ void HTMLRenderer::process(PDFDoc *doc)
|
|||||||
0, true, false, false,
|
0, true, false, false,
|
||||||
nullptr, nullptr, nullptr, nullptr);
|
nullptr, nullptr, nullptr, nullptr);
|
||||||
|
|
||||||
|
if(param->split_pages)
|
||||||
|
{
|
||||||
|
html_fout.close();
|
||||||
|
}
|
||||||
|
|
||||||
cerr << "." << flush;
|
cerr << "." << flush;
|
||||||
}
|
}
|
||||||
|
|
||||||
post_process();
|
post_process();
|
||||||
|
|
||||||
if(bg_renderer)
|
if(bg_renderer)
|
||||||
@ -116,53 +134,116 @@ void HTMLRenderer::process(PDFDoc *doc)
|
|||||||
|
|
||||||
void HTMLRenderer::pre_process()
|
void HTMLRenderer::pre_process()
|
||||||
{
|
{
|
||||||
// we may output utf8 characters, so use binary
|
// we may output utf8 characters, so always use binary
|
||||||
if(param->single_html)
|
|
||||||
{
|
{
|
||||||
{
|
/*
|
||||||
auto fn = str_fmt("%s/%s", tmp_dir.c_str(), CSS_FILENAME.c_str());
|
* If single-html && !split-pages
|
||||||
allcss_fout.open((char*)fn, ofstream::binary);
|
* we have to keep the generated css file into a temporary place
|
||||||
|
* and embed it into the main html later
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* If single-html && split-page
|
||||||
|
* as there's no place to embed the css file, just leave it alone (into dest_dir)
|
||||||
|
*
|
||||||
|
* If !single-html
|
||||||
|
* leave it in dest_dir
|
||||||
|
*/
|
||||||
|
|
||||||
|
auto fn = (param->single_html && (!param->split_pages))
|
||||||
|
? str_fmt("%s/__css", tmp_dir.c_str())
|
||||||
|
: str_fmt("%s/%s", dest_dir.c_str(), param->css_filename.c_str());
|
||||||
|
|
||||||
|
if(param->single_html)
|
||||||
add_tmp_file((char*)fn);
|
add_tmp_file((char*)fn);
|
||||||
}
|
|
||||||
|
|
||||||
{
|
css_path = (char*)fn,
|
||||||
// don't use output_file directly
|
css_fout.open(css_path, ofstream::binary);
|
||||||
// otherwise it'll be a disaster when tmp_dir == dest_dir
|
fix_stream(css_fout);
|
||||||
auto tmp_output_fn = str_fmt("%s/%s.part", tmp_dir.c_str(), param->output_filename.c_str());
|
|
||||||
add_tmp_file((char*)tmp_output_fn);
|
|
||||||
|
|
||||||
html_fout.open((char*)tmp_output_fn, ofstream::binary);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
|
||||||
|
// if split-pages is specified, open & close the file in the process loop
|
||||||
|
// if not, open the file here:
|
||||||
|
if(!param->split_pages);
|
||||||
{
|
{
|
||||||
html_fout.open(str_fmt("%s/%s", dest_dir.c_str(), param->output_filename.c_str()), ofstream::binary);
|
/*
|
||||||
allcss_fout.open(str_fmt("%s/%s", dest_dir.c_str(), CSS_FILENAME.c_str()), ofstream::binary);
|
* If single-html
|
||||||
|
* we have to keep the html file (for page) into a temporary place
|
||||||
|
* because we'll have to embed css before it
|
||||||
|
*
|
||||||
|
* Otherwise just generate it
|
||||||
|
*/
|
||||||
|
auto fn = (param->single_html)
|
||||||
|
? str_fmt("%s/__pages", tmp_dir.c_str())
|
||||||
|
: str_fmt("%s/%s", dest_dir.c_str(), param->output_filename.c_str());
|
||||||
|
|
||||||
html_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), HEAD_HTML_FILENAME.c_str()), ifstream::binary).rdbuf();
|
if(param->single_html)
|
||||||
html_fout << "<link rel=\"stylesheet\" type=\"text/css\" href=\"" << CSS_FILENAME << "\"/>" << endl;
|
add_tmp_file((char*)fn);
|
||||||
html_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), NECK_HTML_FILENAME.c_str()), ifstream::binary).rdbuf();
|
|
||||||
|
html_path = (char*)fn;
|
||||||
|
html_fout.open(html_path, ofstream::binary);
|
||||||
|
fix_stream(html_fout);
|
||||||
}
|
}
|
||||||
|
|
||||||
fix_stream(html_fout);
|
|
||||||
fix_stream(allcss_fout);
|
|
||||||
|
|
||||||
allcss_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), CSS_FILENAME.c_str()), ifstream::binary).rdbuf();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::post_process()
|
void HTMLRenderer::post_process()
|
||||||
{
|
{
|
||||||
if(!param->single_html)
|
// close files
|
||||||
{
|
|
||||||
html_fout << ifstream(str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), TAIL_HTML_FILENAME.c_str()), ifstream::binary).rdbuf();
|
|
||||||
}
|
|
||||||
|
|
||||||
html_fout.close();
|
html_fout.close();
|
||||||
allcss_fout.close();
|
css_fout.close();
|
||||||
|
|
||||||
if(param->single_html)
|
//only when !split-page, do we have some work left to do
|
||||||
|
if(!param->split_pages)
|
||||||
|
return;
|
||||||
|
|
||||||
|
ofstream output((char*)str_fmt("%s/%s", dest_dir.c_str(), param->output_filename.c_str()));
|
||||||
|
fix_stream(output);
|
||||||
|
|
||||||
|
// apply manifest
|
||||||
|
ifstream manifest_fin((char*)str_fmt("%s/%s", PDF2HTMLEX_DATA_PATH.c_str(), MANIFEST_FILENAME.c_str()));
|
||||||
|
|
||||||
|
bool embed_string = false;
|
||||||
|
string line;
|
||||||
|
while(getline(manifest_fin, line))
|
||||||
{
|
{
|
||||||
process_single_html();
|
if(embed_string)
|
||||||
|
{
|
||||||
|
output << line << endl;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(line.empty() || line[0] == '#')
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if(line == "\"\"\"")
|
||||||
|
{
|
||||||
|
embed_string = !embed_string;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(line[0] == '@')
|
||||||
|
{
|
||||||
|
embed_file(output, PDF2HTMLEX_DATA_PATH + "/" + line.substr(1), true);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(line[0] == '$')
|
||||||
|
{
|
||||||
|
if(line == "$css")
|
||||||
|
{
|
||||||
|
embed_file(output, css_path, false);
|
||||||
|
}
|
||||||
|
else if (line == "$pages")
|
||||||
|
{
|
||||||
|
output << ifstream(html_path, ifstream::binary).rdbuf();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cerr << "Warning: unknown line in manifest: " << line << endl;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
cerr << "Warning: unknown line in manifest: " << line << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -182,7 +263,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
|||||||
{
|
{
|
||||||
if(param->single_html)
|
if(param->single_html)
|
||||||
{
|
{
|
||||||
html_fout << "'data:image/png;base64," << base64stream(ifstream(str_fmt("%s/p%llx.png", tmp_dir.c_str(), pageNum) , ifstream::binary)) << "'";
|
html_fout << "'data:image/png;base64," << base64stream(ifstream((char*)str_fmt("%s/p%llx.png", tmp_dir.c_str(), pageNum) , ifstream::binary)) << "'";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -225,23 +306,6 @@ void HTMLRenderer::endPage() {
|
|||||||
html_fout << "</div></div>" << endl;
|
html_fout << "</div></div>" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::process_single_html()
|
|
||||||
{
|
|
||||||
ofstream out (dest_dir + "/" + param->output_filename, ofstream::binary);
|
|
||||||
|
|
||||||
out << ifstream(PDF2HTMLEX_DATA_PATH + "/" + HEAD_HTML_FILENAME , ifstream::binary).rdbuf();
|
|
||||||
|
|
||||||
out << "<style type=\"text/css\">" << endl;
|
|
||||||
out << ifstream(tmp_dir + "/" + CSS_FILENAME, ifstream::binary).rdbuf();
|
|
||||||
out << "</style>" << endl;
|
|
||||||
|
|
||||||
out << ifstream(PDF2HTMLEX_DATA_PATH + "/" + NECK_HTML_FILENAME, ifstream::binary).rdbuf();
|
|
||||||
|
|
||||||
out << ifstream(tmp_dir + "/" + (param->output_filename + ".part"), ifstream::binary).rdbuf();
|
|
||||||
|
|
||||||
out << ifstream(PDF2HTMLEX_DATA_PATH + "/" + TAIL_HTML_FILENAME, ifstream::binary).rdbuf();
|
|
||||||
}
|
|
||||||
|
|
||||||
void HTMLRenderer::fix_stream (std::ostream & out)
|
void HTMLRenderer::fix_stream (std::ostream & out)
|
||||||
{
|
{
|
||||||
out << fixed << hex;
|
out << fixed << hex;
|
||||||
@ -274,7 +338,32 @@ void HTMLRenderer::clean_tmp_files()
|
|||||||
cerr << "Remove temporary directory: " << tmp_dir << endl;
|
cerr << "Remove temporary directory: " << tmp_dir << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string HTMLRenderer::HEAD_HTML_FILENAME = "head.html";
|
// Depending on single-html, embed the content of a file into `out`
// or write a link to it.
//
// out  - stream receiving the generated markup
// path - path of the file; its suffix selects the markup from EMBED_STRING_MAP
// copy - declared as "copy the file into dest_dir if not embedded";
//        TODO(review): not acted upon in this function yet
//
// An unknown suffix only produces a warning on cerr; nothing is written to `out`.
void HTMLRenderer::embed_file(ostream & out, const string & path, bool copy)
{
    string fn = get_filename(path);
    string suffix = get_suffix(fn);

    auto iter = EMBED_STRING_MAP.find(make_pair(suffix, param->single_html != 0));
    if(iter == EMBED_STRING_MAP.end())
    {
        cerr << "Warning: unknown suffix in manifest: " << suffix << endl;
        return;
    }

    const string & prefix = iter->second.first;
    const string & postfix = iter->second.second;

    if(param->single_html)
    {
        out << prefix << endl;

        ifstream fin(path, ifstream::binary);
        if(!fin)
        {
            cerr << "Warning: cannot open file to embed: " << path << endl;
        }
        else if(fin.peek() != ifstream::traits_type::eof())
        {
            // Inserting a stream buffer that yields no characters sets failbit
            // on `out`, which would swallow the closing tag; skip empty files.
            out << fin.rdbuf();
        }

        out << postfix << endl;
    }
    else
    {
        out << prefix << fn << postfix;
    }
}
|
||||||
|
|
||||||
|
const std::string HTMLRenderer::MANIFEST_FILENAME = "manifest";
|
||||||
|
|
||||||
|
}// namespace pdf2htmlEX
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
#include "HTMLRenderer.h"
|
#include "HTMLRenderer.h"
|
||||||
#include "namespace.h"
|
#include "namespace.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
|
void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
|
||||||
{
|
{
|
||||||
return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
|
return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
|
||||||
@ -59,3 +61,5 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w
|
|||||||
++ image_count;
|
++ image_count;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // namespace pdf2htmlEX
|
||||||
|
@ -17,6 +17,8 @@
|
|||||||
#include "namespace.h"
|
#include "namespace.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::abs;
|
using std::abs;
|
||||||
|
|
||||||
const FontInfo * HTMLRenderer::install_font(GfxFont * font)
|
const FontInfo * HTMLRenderer::install_font(GfxFont * font)
|
||||||
@ -290,3 +292,5 @@ long long HTMLRenderer::install_rise(double rise)
|
|||||||
export_rise(new_rise_id, rise);
|
export_rise(new_rise_id, rise);
|
||||||
return new_rise_id;
|
return new_rise_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // namespace pdf2htmlEX
|
||||||
|
@ -17,6 +17,9 @@
|
|||||||
#include "HTMLRenderer.h"
|
#include "HTMLRenderer.h"
|
||||||
#include "namespace.h"
|
#include "namespace.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::max;
|
using std::max;
|
||||||
using std::abs;
|
using std::abs;
|
||||||
|
|
||||||
@ -358,3 +361,5 @@ void HTMLRenderer::close_line()
|
|||||||
line_buf.flush();
|
line_buf.flush();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} //namespace pdf2htmlEX
|
||||||
|
@ -19,6 +19,8 @@
|
|||||||
#include "HTMLRenderer.h"
|
#include "HTMLRenderer.h"
|
||||||
#include "namespace.h"
|
#include "namespace.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::unordered_set;
|
using std::unordered_set;
|
||||||
using std::min;
|
using std::min;
|
||||||
using std::all_of;
|
using std::all_of;
|
||||||
@ -486,3 +488,5 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
|||||||
draw_tx += dx + dxerr * cur_font_size * hs;
|
draw_tx += dx + dxerr * cur_font_size * hs;
|
||||||
draw_ty += dy;
|
draw_ty += dy;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // namespace pdf2htmlEX
|
||||||
|
@ -84,7 +84,6 @@ class HTMLRenderer : public OutputDev
|
|||||||
|
|
||||||
virtual void pre_process();
|
virtual void pre_process();
|
||||||
virtual void post_process();
|
virtual void post_process();
|
||||||
virtual void process_single_html();
|
|
||||||
|
|
||||||
// Start a page.
|
// Start a page.
|
||||||
virtual void startPage(int pageNum, GfxState *state);
|
virtual void startPage(int pageNum, GfxState *state);
|
||||||
@ -169,6 +168,11 @@ class HTMLRenderer : public OutputDev
|
|||||||
void export_whitespace(long long ws_id, double ws_width);
|
void export_whitespace(long long ws_id, double ws_width);
|
||||||
void export_rise(long long rise_id, double rise);
|
void export_rise(long long rise_id, double rise);
|
||||||
|
|
||||||
|
|
||||||
|
// depending on single-html, to embed the content or add a link to it
|
||||||
|
// "copy": indicates whether to copy the file into dest_dir, if not embedded
|
||||||
|
void embed_file(std::ostream & out, const std::string & path, bool copy);
|
||||||
|
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
// state tracking
|
// state tracking
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
@ -360,13 +364,11 @@ class HTMLRenderer : public OutputDev
|
|||||||
|
|
||||||
const Param * param;
|
const Param * param;
|
||||||
std::string dest_dir, tmp_dir;
|
std::string dest_dir, tmp_dir;
|
||||||
std::ofstream html_fout, allcss_fout;
|
std::ofstream html_fout, css_fout;
|
||||||
|
std::string html_path, css_path;
|
||||||
std::set<std::string> tmp_files;
|
std::set<std::string> tmp_files;
|
||||||
|
|
||||||
static const std::string HEAD_HTML_FILENAME;
|
static const std::string MANIFEST_FILENAME;
|
||||||
static const std::string NECK_HTML_FILENAME;
|
|
||||||
static const std::string TAIL_HTML_FILENAME;
|
|
||||||
static const std::string CSS_FILENAME;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} //namespace pdf2htmlEX
|
} //namespace pdf2htmlEX
|
||||||
|
@ -30,6 +30,7 @@ struct Param
|
|||||||
|
|
||||||
int process_nontext;
|
int process_nontext;
|
||||||
int single_html;
|
int single_html;
|
||||||
|
int split_pages;
|
||||||
int embed_base_font;
|
int embed_base_font;
|
||||||
int embed_external_font;
|
int embed_external_font;
|
||||||
int decompose_ligature;
|
int decompose_ligature;
|
||||||
@ -41,6 +42,7 @@ struct Param
|
|||||||
int tounicode;
|
int tounicode;
|
||||||
int space_as_offset;
|
int space_as_offset;
|
||||||
|
|
||||||
|
std::string css_filename;
|
||||||
std::string font_suffix, font_format;
|
std::string font_suffix, font_format;
|
||||||
|
|
||||||
int debug;
|
int debug;
|
||||||
|
@ -19,7 +19,5 @@ using std::make_pair;
|
|||||||
using std::ifstream;
|
using std::ifstream;
|
||||||
using std::ofstream;
|
using std::ofstream;
|
||||||
|
|
||||||
using namespace pdf2htmlEX;
|
|
||||||
|
|
||||||
#endif // NAMESPACE_H__
|
#endif // NAMESPACE_H__
|
||||||
|
|
||||||
|
@ -33,6 +33,10 @@ static const double DEFAULT_DPI = 72.0;
|
|||||||
|
|
||||||
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
|
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
|
||||||
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
|
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
|
||||||
|
// map to embed files into html
|
||||||
|
// key: (suffix, if_embed_content)
|
||||||
|
// value: (prefix string, suffix string)
|
||||||
|
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
|
||||||
|
|
||||||
// mute gcc warning of unused function
|
// mute gcc warning of unused function
|
||||||
namespace
|
namespace
|
||||||
|
@ -64,6 +64,7 @@ void parse_options (int argc, char **argv)
|
|||||||
|
|
||||||
.add("process-nontext", ¶m.process_nontext, 1, "process nontext objects")
|
.add("process-nontext", ¶m.process_nontext, 1, "process nontext objects")
|
||||||
.add("single-html", ¶m.single_html, 1, "combine everything into one single HTML file")
|
.add("single-html", ¶m.single_html, 1, "combine everything into one single HTML file")
|
||||||
|
.add("split-pages", ¶m.split_pages, 0, "split pages into separated files")
|
||||||
.add("embed-base-font", ¶m.embed_base_font, 0, "embed local matched font for base 14 fonts in the PDF file")
|
.add("embed-base-font", ¶m.embed_base_font, 0, "embed local matched font for base 14 fonts in the PDF file")
|
||||||
.add("embed-external-font", ¶m.embed_external_font, 0, "embed local matched font for external fonts in the PDF file")
|
.add("embed-external-font", ¶m.embed_external_font, 0, "embed local matched font for external fonts in the PDF file")
|
||||||
.add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, for example 'fi' -> 'f''i'")
|
.add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, for example 'fi' -> 'f''i'")
|
||||||
@ -75,6 +76,7 @@ void parse_options (int argc, char **argv)
|
|||||||
.add("tounicode", ¶m.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
|
.add("tounicode", ¶m.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
|
||||||
.add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets")
|
.add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets")
|
||||||
|
|
||||||
|
.add("css-filename", ¶m.css_filename, "", "Specify the file name of the generated css file")
|
||||||
.add("font-suffix", ¶m.font_suffix, ".ttf", "suffix for extracted font files")
|
.add("font-suffix", ¶m.font_suffix, ".ttf", "suffix for extracted font files")
|
||||||
.add("font-format", ¶m.font_format, "opentype", "format for extracted font files")
|
.add("font-format", ¶m.font_format, "opentype", "format for extracted font files")
|
||||||
|
|
||||||
@ -180,11 +182,28 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
if(get_suffix(param.input_filename) == ".pdf")
|
if(get_suffix(param.input_filename) == ".pdf")
|
||||||
{
|
{
|
||||||
param.output_filename = s.substr(0, s.size() - 4) + ".html";
|
param.output_filename = s.substr(0, s.size() - 4);
|
||||||
|
if(!param.split_pages)
|
||||||
|
param.output_filename = s.substr(0, s.size() - 4) + ".html";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
param.output_filename = s + ".html";
|
if(!param.split_pages)
|
||||||
|
param.output_filename = s + ".html";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Derive the css file name from the input file name when none was given:
// "<name>.pdf" -> "<name>.css", anything else -> "<input>.css".
// The css file is needed with or without split-pages, so the name is
// always assigned here.
if(param.css_filename.empty())
{
    const string s = get_filename(param.input_filename);

    if(get_suffix(param.input_filename) == ".pdf")
    {
        param.css_filename = s.substr(0, s.size() - 4) + ".css";
    }
    else
    {
        param.css_filename = s + ".css";
    }
}
|
||||||
|
|
||||||
|
@ -46,6 +46,13 @@ const map<string, string> GB_ENCODED_FONT_NAME_MAP({
|
|||||||
{"\xC1\xA5\xCA\xE9", "SimLi"},
|
{"\xC1\xA5\xCA\xE9", "SimLi"},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Markup wrapped around a file that is embedded into, or linked from, the html.
// key:   (file suffix, true if the content itself is embedded)
// value: (text written before, text written after)
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({
    // stylesheets
    {{".css", false}, {"<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>"}},
    {{".css", true},  {"<style type=\"text/css\">", "</style>"}},
    // scripts
    {{".js", false},  {"<script type=\"text/javascript\" src=\"", "\"></script>"}},
    {{".js", true},   {"<script type=\"text/javascript\">", "</script>"}}
});
|
||||||
|
|
||||||
bool isLegalUnicode(Unicode u)
|
bool isLegalUnicode(Unicode u)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
Loading…
Reference in New Issue
Block a user