1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

add an option 'optimize'

This commit is contained in:
Lu Wang 2012-09-05 16:19:01 +08:00
parent bbe9b99b4e
commit 2d558253a5
5 changed files with 17 additions and 8 deletions

2
TODO
View File

@ -6,7 +6,7 @@ option to break ligatures
detect duplicate base fonts when embedding detect duplicate base fonts when embedding
compress div/span states consider left-shift in optimization
multiple charcode mapped to a same glyph multiple charcode mapped to a same glyph
re-encoded only used glyphs re-encoded only used glyphs

View File

@ -89,6 +89,9 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o
However, there is often better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of the ToUnicode map. This behavior may be changed by turning on this switch. However, there is often better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of the ToUnicode map. This behavior may be changed by turning on this switch.
.TP .TP
.B --optimize <0|1> (Default: 0)
Try to optimize the output HTML file; this might be slow.
.TP
.B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype") .B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
.TP .TP

View File

@ -66,14 +66,14 @@ void HTMLRenderer::LineBuffer::flush(void)
for(auto & s : states) for(auto & s : states)
s.hash(); s.hash();
if(states.size() < 3) if((renderer->param->optimize) && (states.size() > 2))
{ {
for(size_t i = 0; i < states.size(); ++i) optimize_states();
states[i].depth = i;
} }
else else
{ {
optimize_states(); for(size_t i = 0; i < states.size(); ++i)
states[i].depth = i;
} }
states.resize(states.size() + 1); states.resize(states.size() + 1);
@ -203,6 +203,10 @@ void HTMLRenderer::LineBuffer::optimize_states (void)
p += (incre--); p += (incre--);
} }
} }
// depth 0
for(int i = 0; i < n; ++i)
flattened_dp_buffer[i].min_cost = 0;
int last_at_this_depth = n; int last_at_this_depth = n;
for(int depth = 1; depth < n; ++depth) for(int depth = 1; depth < n; ++depth)
@ -240,8 +244,8 @@ void HTMLRenderer::LineBuffer::optimize_states (void)
while(depth > 0) while(depth > 0)
{ {
int last_child = dp_buffer[depth][idx].last_child; int last_child = dp_buffer[depth][idx].last_child;
assert(last_child > idx); assert((last_child > idx) && (last_child <= idx + depth));
func(last_child, depth - last_child, tree_depth + 1); func(last_child, idx + depth - last_child, tree_depth + 1);
depth = last_child - idx - 1; depth = last_child - idx - 1;
} }
}; };
@ -310,7 +314,7 @@ int HTMLRenderer::LineBuffer::State::diff(const State & s) const
/* /*
* A quick check based on hash_value * A quick check based on hash_value
* it could be wrong when there are more than 256 classes, * it could be wrong when there are more than 256 classes,
* in which case the output may not be optimal, but still 'correct' * in which case the output may not be optimal, but still 'correct' in terms of HTML
*/ */
if(hash_value == s.hash_value) return 0; if(hash_value == s.hash_value) return 0;

View File

@ -36,6 +36,7 @@ struct Param
double space_threshold; double space_threshold;
double font_size_multiplier; double font_size_multiplier;
int always_apply_tounicode; int always_apply_tounicode;
int optimize;
std::string font_suffix, font_format; std::string font_suffix, font_format;

View File

@ -84,6 +84,7 @@ po::variables_map parse_options (int argc, char **argv)
("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character") ("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character")
("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy") ("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on") ("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
("optimize", po::value<int>(&param.optimize)->default_value(0), "Optimize HTML, might be very slow")
("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files") ("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files") ("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")