1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

working on new --tounicode

This commit is contained in:
Lu Wang 2012-09-09 01:49:47 +08:00
parent 634394d771
commit 0eb3c5af99
4 changed files with 28 additions and 5 deletions

19
ChangeLog Normal file
View File

@ -0,0 +1,19 @@
Latest
* Removed dependency on boost::format and boost::algorithm
* New option --space-as-offset
* A font preprocessor, for solving encoding problems
* Better HTML optimization, states are reused
* HTML should work when Javascript is disabled
v0.2
2012.09.06
* Fontforge is now linked as a library instead of being called via scripts
* Better accuracy of rendering, with a new line model
* New option --decompose-ligature
v0.1
2012.08.28
* The first release

View File

@ -89,10 +89,14 @@ Specify a ratio greater than 1 would resolve this issue.
For some versions of Firefox, however, there will be a problem when the font size is too large, in which case a smaller value should be specified here. For some versions of Firefox, however, there will be a problem when the font size is too large, in which case a smaller value should be specified here.
.TP .TP
.B --always-apply-tounicode <0|1> (Default: 0) .B --tounicode <-1|0|1> (Default: 0)
A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' of the characters. A ToUnicode map may be provided for each font in PDF which indicates the 'meaning' of the characters. However often there is better "ToUnicode" info in Type 0/1 fonts, and sometimes the ToUnicode map provided is wrong.
However often there is better "ToUnicode" info in Type 0/1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of ToUnicode map. This behavior may be changed by turning on this switch. If this value is set to 1, the ToUnicode Map is always applied, if provided in PDF, and characters may not render correctly in HTML if there are collisions.
If set to -1, a customized map is used such that rendering will be correct in HTML (visually the same), but you may not get correct characters by select & copy & paste.
If set to 0, pdf2htmlEX would try its best to balance the two methods above.
.TP .TP
.B --space-as-offset <0|1> (Default: 0) .B --space-as-offset <0|1> (Default: 0)
Treat space characters as offsets, which may increase the size of the output. Treat space characters as offsets, which may increase the size of the output.

View File

@ -36,7 +36,7 @@ struct Param
double h_eps, v_eps; double h_eps, v_eps;
double space_threshold; double space_threshold;
double font_size_multiplier; double font_size_multiplier;
int always_apply_tounicode; int tounicode;
int space_as_offset; int space_as_offset;
std::string font_suffix, font_format; std::string font_suffix, font_format;

View File

@ -83,7 +83,7 @@ po::variables_map parse_options (int argc, char **argv)
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)") ("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")
("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/8), "distance no thiner than (threshold * em) will be considered as a space character") ("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/8), "distance no thiner than (threshold * em) will be considered as a space character")
("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy") ("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on") ("tounicode", po::value<int>(&param.tounicode)->default_value(0), "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled")
("space-as-offset", po::value<int>(&param.space_as_offset)->default_value(0), "treat space characters as offsets") ("space-as-offset", po::value<int>(&param.space_as_offset)->default_value(0), "treat space characters as offsets")
("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files") ("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")