1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

removed the 'optimize' option; the greedy method is now used automatically

This commit is contained in:
Lu Wang 2012-09-06 14:37:09 +08:00
parent 95331d061b
commit e191f8127d
6 changed files with 50 additions and 134 deletions

View File

@ -89,9 +89,6 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o
However, there is often better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of the ToUnicode map. This behavior may be changed by turning on this switch. However, there is often better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of the ToUnicode map. This behavior may be changed by turning on this switch.
.TP .TP
.B --optimize <0|1> (Default: 0)
Try to optimize the output HTML file, might be slow.
.TP
.B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype") .B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
.TP .TP

View File

@ -293,7 +293,6 @@ class HTMLRenderer : public OutputDev
size_t start_idx; // index of the first Text using this state size_t start_idx; // index of the first Text using this state
// for optimization // for optimization
long long hash_value; long long hash_value;
int depth; // the depth in the state tree
bool need_close; bool need_close;
static const char * format_str; // class names for each id static const char * format_str; // class names for each id
@ -315,8 +314,6 @@ class HTMLRenderer : public OutputDev
private: private:
// retrieve state from renderer // retrieve state from renderer
void set_state(State & state); void set_state(State & state);
// build the state tree in order to minimize the size of output
void optimize_states(void);
HTMLRenderer * renderer; HTMLRenderer * renderer;
@ -327,6 +324,9 @@ class HTMLRenderer : public OutputDev
std::vector<Offset> offsets; std::vector<Offset> offsets;
std::vector<Unicode> text; std::vector<Unicode> text;
// for flush
std::vector<State*> stack;
} line_buf; } line_buf;
friend class LineBuffer; friend class LineBuffer;

View File

@ -8,7 +8,6 @@
*/ */
#include <vector> #include <vector>
#include <stack>
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "HTMLRenderer/namespace.h" #include "HTMLRenderer/namespace.h"
@ -16,7 +15,6 @@
using std::min; using std::min;
using std::max; using std::max;
using std::vector; using std::vector;
using std::stack;
using std::function; using std::function;
void HTMLRenderer::LineBuffer::reset(GfxState * state) void HTMLRenderer::LineBuffer::reset(GfxState * state)
@ -66,19 +64,8 @@ void HTMLRenderer::LineBuffer::flush(void)
for(auto & s : states) for(auto & s : states)
s.hash(); s.hash();
if((renderer->param->optimize) && (states.size() > 2))
{
optimize_states();
}
else
{
for(size_t i = 0; i < states.size(); ++i)
states[i].depth = i;
}
states.resize(states.size() + 1); states.resize(states.size() + 1);
states.back().start_idx = text.size(); states.back().start_idx = text.size();
states.back().depth = 0;
offsets.push_back({text.size(), 0}); offsets.push_back({text.size(), 0});
@ -100,27 +87,44 @@ void HTMLRenderer::LineBuffer::flush(void)
//accumulated horizontal offset; //accumulated horizontal offset;
double dx = 0; double dx = 0;
stack<State*> stack; stack.clear();
stack.push(nullptr); stack.push_back(nullptr);
int last_depth = -1;
// whenever a negative offset appears, we should not pop out that <span>
// otherwise the effect of negative margin-left would disappear
size_t last_text_pos_with_negative_offset = -1;
size_t cur_text_idx = 0; size_t cur_text_idx = 0;
while(cur_text_idx < text.size()) while(cur_text_idx < text.size())
{ {
if(cur_text_idx >= cur_state_iter->start_idx) if(cur_text_idx >= cur_state_iter->start_idx)
{ {
int depth = cur_state_iter -> depth; // greedy
int cnt = last_depth + 1 - depth; int best_cost = State::ID_COUNT;
assert(cnt >= 0);
while(cnt--) // we have a nullptr at the beginning, so no need to check for rend
for(auto iter = stack.rbegin(); *iter; ++iter)
{ {
stack.top()->end(out); int cost = cur_state_iter->diff(**iter);
stack.pop(); if(cost < best_cost)
} {
while(stack.back() != *iter)
{
stack.back()->end(out);
stack.pop_back();
}
best_cost = cost;
cur_state_iter->begin(out, stack.top()); if(best_cost == 0)
stack.push(&*cur_state_iter); break;
last_depth = depth; }
// cannot go further
if((*iter)->start_idx <= last_text_pos_with_negative_offset)
break;
}
cur_state_iter->begin(out, stack.back());
stack.push_back(&*cur_state_iter);
++ cur_state_iter; ++ cur_state_iter;
} }
@ -132,6 +136,9 @@ void HTMLRenderer::LineBuffer::flush(void)
auto wid = renderer->install_whitespace(target, w); auto wid = renderer->install_whitespace(target, w);
if(w < 0)
last_text_pos_with_negative_offset = cur_text_idx;
double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold); double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : ""); out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
@ -146,10 +153,10 @@ void HTMLRenderer::LineBuffer::flush(void)
} }
// we have a nullptr in the bottom // we have a nullptr in the bottom
while(stack.top()) while(stack.back())
{ {
stack.top()->end(out); stack.back()->end(out);
stack.pop(); stack.pop_back();
} }
out << "</div>"; out << "</div>";
@ -176,103 +183,8 @@ void HTMLRenderer::LineBuffer::set_state (State & state)
state.draw_font_size = renderer->draw_font_size; state.draw_font_size = renderer->draw_font_size;
} }
// One cell of the dynamic-programming table: the chosen last child and the
// minimal cost recorded for that cell. Kept as a plain aggregate so it can be
// brace-initialised as {last_child, min_cost}.
struct DPBufferEntry
{
    int last_child; // index of the last child selected for this cell
    int min_cost;   // smallest accumulated cost found for this cell
};
// Backing storage for the triangular DP table; file-scope so the vectors are
// shared by every call and resized to fit the current line.
static vector<DPBufferEntry> flattened_dp_buffer;
// dp_buffer[d] points at the start of row d inside flattened_dp_buffer.
static vector<DPBufferEntry*> dp_buffer;
// Assign a tree depth to every entry of `states` using an interval DP.
//
// dp_buffer[depth][i] describes state i together with the `depth` states that
// follow it, i.e. [i+1, i+depth+1), treated as its possible children:
// min_cost is the smallest total of State::diff() values found for that range
// and last_child is the index of the last direct child in that best layout.
// NOTE(review): diff() is presumably the cost of switching between two
// states -- confirm against State::diff.
//
// The result is written to states[k].depth; nothing is returned.
// The visible caller (flush) only invokes this when states.size() > 2; with an
// empty `states` the final func(0, n-1, 0) call would index states[0].
void HTMLRenderer::LineBuffer::optimize_states (void)
{
int n = states.size();
// triangular table: row 0 has n cells, row 1 has n-1, ..., row n-1 has 1
flattened_dp_buffer.resize(n*(n+1)/2);
dp_buffer.resize(n);
{
// lay the rows out back to back in the flat buffer
int incre = n;
auto iter = dp_buffer.begin();
DPBufferEntry * p = flattened_dp_buffer.data();
while(incre > 0)
{
*(iter++) = p;
p += (incre--);
}
}
// depth 0
// (row 0 occupies the first n cells of the flat buffer: a state with no
// following states costs nothing)
for(int i = 0; i < n; ++i)
flattened_dp_buffer[i].min_cost = 0;
// number of valid cells in the row being filled; shrinks by one per row
int last_at_this_depth = n;
for(int depth = 1; depth < n; ++depth)
{
--last_at_this_depth;
for(int i = 0; i < last_at_this_depth; ++i)
{
//determine dp_buffer[depth][i]
// initial candidate: i+1 is the only direct child and owns the remaining depth-1 states
int best_last_child = i+1;
int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost;
// at depth, we consider [i+1, i+depth+1) as possible children of i
for(int j = 2; j <= depth; ++j)
{
// i+j as last child: i keeps the j-1 states before it, i+j owns the depth-j states after it
int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost;
// avoid calling diff() when possible
if (cost >= best_min_cost) continue;
cost += states[i].diff(states[i+j]);
if(cost < best_min_cost)
{
best_last_child = i+j;
best_min_cost = cost;
}
}
dp_buffer[depth][i] = {best_last_child, best_min_cost};
}
}
// now fill in the depths
// use recursion for now, until someone finds a PDF that would cause this to overflow
// func(idx, depth, tree_depth): state idx sits at tree_depth and covers the
// `depth` states after it; walk its children from the last one backwards.
function<void(int,int,int)> func = [&](int idx, int depth, int tree_depth) -> void {
states[idx].depth = tree_depth;
while(depth > 0)
{
int last_child = dp_buffer[depth][idx].last_child;
assert((last_child > idx) && (last_child <= idx + depth));
// the last child covers everything from itself to the end of idx's range
func(last_child, idx + depth - last_child, tree_depth + 1);
// continue with the states that lie before that child
depth = last_child - idx - 1;
}
};
// the root is state 0, covering all remaining n-1 states
func(0, n-1, 0);
}
void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state) void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state)
{ {
if(prev_state && (prev_state->hash_value == hash_value))
{
// check ids again
int i;
for(i = 0; i < ID_COUNT; ++i)
if(ids[i] != prev_state->ids[i])
break;
if(i == ID_COUNT)
{
need_close = false;
return;
}
}
need_close = true;
out << "<span class=\"";
bool first = true; bool first = true;
for(int i = 0; i < ID_COUNT; ++i) for(int i = 0; i < ID_COUNT; ++i)
{ {
@ -281,6 +193,7 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s
if(first) if(first)
{ {
out << "<span class=\"";
first = false; first = false;
} }
else else
@ -291,7 +204,15 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s
out << format("%1%%|2$x|") % format_str[i] % ids[i]; out << format("%1%%|2$x|") % format_str[i] % ids[i];
} }
out << "\">"; if(first)
{
need_close = false;
}
else
{
out << "\">";
need_close = true;
}
} }
void HTMLRenderer::LineBuffer::State::end(ostream & out) const void HTMLRenderer::LineBuffer::State::end(ostream & out) const

View File

@ -36,7 +36,6 @@ struct Param
double space_threshold; double space_threshold;
double font_size_multiplier; double font_size_multiplier;
int always_apply_tounicode; int always_apply_tounicode;
int optimize;
std::string font_suffix, font_format; std::string font_suffix, font_format;

View File

@ -84,7 +84,6 @@ po::variables_map parse_options (int argc, char **argv)
("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character") ("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character")
("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy") ("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on") ("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
("optimize", po::value<int>(&param.optimize)->default_value(0), "Optimize HTML, might be very slow")
("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files") ("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files") ("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")

View File

@ -12,7 +12,7 @@ for f in os.listdir(DIR):
if not f.lower().endswith('.pdf'): if not f.lower().endswith('.pdf'):
continue continue
print f print f
os.system('pdf2htmlEX --optimize 1 -l 10 --dest-dir html "%s/%s"' % (DIR,f)) os.system('pdf2htmlEX -l 10 --dest-dir html "%s/%s"' % (DIR,f))
ff = f[:-3]+'html' ff = f[:-3]+'html'
outf.write('<a href="html/%s" target="pdf">%s</a><br/>' % (ff,ff)) outf.write('<a href="html/%s" target="pdf">%s</a><br/>' % (ff,ff))
outf.flush(); outf.flush();