mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
removed 'optmized' option, use greedy method automatically
This commit is contained in:
parent
95331d061b
commit
e191f8127d
@ -89,9 +89,6 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o
|
||||
|
||||
However often there is better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of ToUnicode map. This behavior may be changed by turning on this switch.
|
||||
.TP
|
||||
.B --optimize <0|1> (Default: 0)
|
||||
Try to optimize the output HTML file, might be slow.
|
||||
.TP
|
||||
.B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
|
||||
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
|
||||
.TP
|
||||
|
@ -293,7 +293,6 @@ class HTMLRenderer : public OutputDev
|
||||
size_t start_idx; // index of the first Text using this state
|
||||
// for optimzation
|
||||
long long hash_value;
|
||||
int depth; // the depth in the state tree
|
||||
bool need_close;
|
||||
|
||||
static const char * format_str; // class names for each id
|
||||
@ -315,8 +314,6 @@ class HTMLRenderer : public OutputDev
|
||||
private:
|
||||
// retrieve state from renderer
|
||||
void set_state(State & state);
|
||||
// build the state tree in order to minimize the size of output
|
||||
void optimize_states(void);
|
||||
|
||||
HTMLRenderer * renderer;
|
||||
|
||||
@ -327,6 +324,9 @@ class HTMLRenderer : public OutputDev
|
||||
std::vector<Offset> offsets;
|
||||
std::vector<Unicode> text;
|
||||
|
||||
// for flush
|
||||
std::vector<State*> stack;
|
||||
|
||||
} line_buf;
|
||||
friend class LineBuffer;
|
||||
|
||||
|
@ -8,7 +8,6 @@
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <stack>
|
||||
|
||||
#include "HTMLRenderer.h"
|
||||
#include "HTMLRenderer/namespace.h"
|
||||
@ -16,7 +15,6 @@
|
||||
using std::min;
|
||||
using std::max;
|
||||
using std::vector;
|
||||
using std::stack;
|
||||
using std::function;
|
||||
|
||||
void HTMLRenderer::LineBuffer::reset(GfxState * state)
|
||||
@ -66,19 +64,8 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
for(auto & s : states)
|
||||
s.hash();
|
||||
|
||||
if((renderer->param->optimize) && (states.size() > 2))
|
||||
{
|
||||
optimize_states();
|
||||
}
|
||||
else
|
||||
{
|
||||
for(size_t i = 0; i < states.size(); ++i)
|
||||
states[i].depth = i;
|
||||
}
|
||||
|
||||
states.resize(states.size() + 1);
|
||||
states.back().start_idx = text.size();
|
||||
states.back().depth = 0;
|
||||
|
||||
offsets.push_back({text.size(), 0});
|
||||
|
||||
@ -100,27 +87,44 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
//accumulated horizontal offset;
|
||||
double dx = 0;
|
||||
|
||||
stack<State*> stack;
|
||||
stack.push(nullptr);
|
||||
int last_depth = -1;
|
||||
stack.clear();
|
||||
stack.push_back(nullptr);
|
||||
|
||||
// whenever a negative offset appears, we should not pop out that <span>
|
||||
// otherwise the effect of negative margin-left would disappear
|
||||
size_t last_text_pos_with_negative_offset = -1;
|
||||
|
||||
size_t cur_text_idx = 0;
|
||||
while(cur_text_idx < text.size())
|
||||
{
|
||||
if(cur_text_idx >= cur_state_iter->start_idx)
|
||||
{
|
||||
int depth = cur_state_iter -> depth;
|
||||
int cnt = last_depth + 1 - depth;
|
||||
assert(cnt >= 0);
|
||||
while(cnt--)
|
||||
// greedy
|
||||
int best_cost = State::ID_COUNT;
|
||||
|
||||
// we have a nullptr at the beginning, so no need to check for rend
|
||||
for(auto iter = stack.rbegin(); *iter; ++iter)
|
||||
{
|
||||
stack.top()->end(out);
|
||||
stack.pop();
|
||||
int cost = cur_state_iter->diff(**iter);
|
||||
if(cost < best_cost)
|
||||
{
|
||||
while(stack.back() != *iter)
|
||||
{
|
||||
stack.back()->end(out);
|
||||
stack.pop_back();
|
||||
}
|
||||
best_cost = cost;
|
||||
|
||||
if(best_cost == 0)
|
||||
break;
|
||||
}
|
||||
|
||||
cur_state_iter->begin(out, stack.top());
|
||||
stack.push(&*cur_state_iter);
|
||||
last_depth = depth;
|
||||
// cannot go further
|
||||
if((*iter)->start_idx <= last_text_pos_with_negative_offset)
|
||||
break;
|
||||
}
|
||||
cur_state_iter->begin(out, stack.back());
|
||||
stack.push_back(&*cur_state_iter);
|
||||
|
||||
++ cur_state_iter;
|
||||
}
|
||||
@ -132,6 +136,9 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
|
||||
auto wid = renderer->install_whitespace(target, w);
|
||||
|
||||
if(w < 0)
|
||||
last_text_pos_with_negative_offset = cur_text_idx;
|
||||
|
||||
double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
|
||||
out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
|
||||
|
||||
@ -146,10 +153,10 @@ void HTMLRenderer::LineBuffer::flush(void)
|
||||
}
|
||||
|
||||
// we have a nullptr in the bottom
|
||||
while(stack.top())
|
||||
while(stack.back())
|
||||
{
|
||||
stack.top()->end(out);
|
||||
stack.pop();
|
||||
stack.back()->end(out);
|
||||
stack.pop_back();
|
||||
}
|
||||
|
||||
out << "</div>";
|
||||
@ -176,103 +183,8 @@ void HTMLRenderer::LineBuffer::set_state (State & state)
|
||||
state.draw_font_size = renderer->draw_font_size;
|
||||
}
|
||||
|
||||
class DPBufferEntry
|
||||
{
|
||||
public:
|
||||
int last_child;
|
||||
int min_cost;
|
||||
};
|
||||
|
||||
static vector<DPBufferEntry> flattened_dp_buffer;
|
||||
static vector<DPBufferEntry*> dp_buffer;
|
||||
|
||||
void HTMLRenderer::LineBuffer::optimize_states (void)
|
||||
{
|
||||
int n = states.size();
|
||||
|
||||
flattened_dp_buffer.resize(n*(n+1)/2);
|
||||
dp_buffer.resize(n);
|
||||
|
||||
{
|
||||
int incre = n;
|
||||
auto iter = dp_buffer.begin();
|
||||
DPBufferEntry * p = flattened_dp_buffer.data();
|
||||
while(incre > 0)
|
||||
{
|
||||
*(iter++) = p;
|
||||
p += (incre--);
|
||||
}
|
||||
}
|
||||
|
||||
// depth 0
|
||||
for(int i = 0; i < n; ++i)
|
||||
flattened_dp_buffer[i].min_cost = 0;
|
||||
|
||||
int last_at_this_depth = n;
|
||||
for(int depth = 1; depth < n; ++depth)
|
||||
{
|
||||
--last_at_this_depth;
|
||||
for(int i = 0; i < last_at_this_depth; ++i)
|
||||
{
|
||||
//determine dp_buffer[depth][i]
|
||||
int best_last_child = i+1;
|
||||
int best_min_cost = states[i].diff(states[i+1]) + dp_buffer[depth-1][i+1].min_cost;
|
||||
// at depth, we consider [i+1, i+depth+1) as possible children of i
|
||||
for(int j = 2; j <= depth; ++j)
|
||||
{
|
||||
int cost = dp_buffer[j-1][i].min_cost + dp_buffer[depth-j][i+j].min_cost;
|
||||
// avoid calling diff() when possible
|
||||
if (cost >= best_min_cost) continue;
|
||||
|
||||
cost += states[i].diff(states[i+j]);
|
||||
|
||||
if(cost < best_min_cost)
|
||||
{
|
||||
best_last_child = i+j;
|
||||
best_min_cost = cost;
|
||||
}
|
||||
}
|
||||
|
||||
dp_buffer[depth][i] = {best_last_child, best_min_cost};
|
||||
}
|
||||
}
|
||||
|
||||
// now fill in the depths
|
||||
// use recursion for now, until someone finds a PDF that would causes this overflow
|
||||
function<void(int,int,int)> func = [&](int idx, int depth, int tree_depth) -> void {
|
||||
states[idx].depth = tree_depth;
|
||||
while(depth > 0)
|
||||
{
|
||||
int last_child = dp_buffer[depth][idx].last_child;
|
||||
assert((last_child > idx) && (last_child <= idx + depth));
|
||||
func(last_child, idx + depth - last_child, tree_depth + 1);
|
||||
depth = last_child - idx - 1;
|
||||
}
|
||||
};
|
||||
|
||||
func(0, n-1, 0);
|
||||
}
|
||||
|
||||
void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_state)
|
||||
{
|
||||
if(prev_state && (prev_state->hash_value == hash_value))
|
||||
{
|
||||
// check ids again
|
||||
int i;
|
||||
for(i = 0; i < ID_COUNT; ++i)
|
||||
if(ids[i] != prev_state->ids[i])
|
||||
break;
|
||||
|
||||
if(i == ID_COUNT)
|
||||
{
|
||||
need_close = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
need_close = true;
|
||||
|
||||
out << "<span class=\"";
|
||||
bool first = true;
|
||||
for(int i = 0; i < ID_COUNT; ++i)
|
||||
{
|
||||
@ -281,6 +193,7 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s
|
||||
|
||||
if(first)
|
||||
{
|
||||
out << "<span class=\"";
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
@ -291,7 +204,15 @@ void HTMLRenderer::LineBuffer::State::begin (ostream & out, const State * prev_s
|
||||
out << format("%1%%|2$x|") % format_str[i] % ids[i];
|
||||
}
|
||||
|
||||
if(first)
|
||||
{
|
||||
need_close = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
out << "\">";
|
||||
need_close = true;
|
||||
}
|
||||
}
|
||||
|
||||
void HTMLRenderer::LineBuffer::State::end(ostream & out) const
|
||||
|
@ -36,7 +36,6 @@ struct Param
|
||||
double space_threshold;
|
||||
double font_size_multiplier;
|
||||
int always_apply_tounicode;
|
||||
int optimize;
|
||||
|
||||
std::string font_suffix, font_format;
|
||||
|
||||
|
@ -84,7 +84,6 @@ po::variables_map parse_options (int argc, char **argv)
|
||||
("space-threshold", po::value<double>(¶m.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character")
|
||||
("font-size-multiplier", po::value<double>(¶m.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
|
||||
("always-apply-tounicode", po::value<int>(¶m.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
|
||||
("optimize", po::value<int>(¶m.optimize)->default_value(0), "Optimize HTML, might be very slow")
|
||||
|
||||
("font-suffix", po::value<string>(¶m.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
|
||||
("font-format", po::value<string>(¶m.font_format)->default_value("truetype"), "format for extracted font files")
|
||||
|
@ -12,7 +12,7 @@ for f in os.listdir(DIR):
|
||||
if not f.lower().endswith('.pdf'):
|
||||
continue
|
||||
print f
|
||||
os.system('pdf2htmlEX --optimize 1 -l 10 --dest-dir html "%s/%s"' % (DIR,f))
|
||||
os.system('pdf2htmlEX -l 10 --dest-dir html "%s/%s"' % (DIR,f))
|
||||
ff = f[:-3]+'html'
|
||||
outf.write('<a href="html/%s" target="pdf">%s</a><br/>' % (ff,ff))
|
||||
outf.flush();
|
||||
|
Loading…
Reference in New Issue
Block a user