1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

add option 'decompose-ligature'

This commit is contained in:
Lu Wang 2012-09-06 15:09:47 +08:00
parent 61bd8d6919
commit 79778e0ff3
5 changed files with 19 additions and 10 deletions

4
TODO
View File

@ -1,13 +1,9 @@
cmake - pkgconfig - fontforge - python
fix glyph width as provided in pdf fix glyph width as provided in pdf
option to break ligatures option to break ligatures
detect duplicate base fonts when embedding detect duplicate base fonts when embedding
consider left-shift in optimization
multiple charcode mapped to a same glyph multiple charcode mapped to a same glyph
re-encoded only used glyphs re-encoded only used glyphs

View File

@ -69,7 +69,10 @@ There are several base font defined in PDF standards, which are supposed to be p
If this switch is on, local matched font will be used and embedded; otherwise only font names are exported such that web browsers may try to find proper fonts themselves. If this switch is on, local matched font will be used and embedded; otherwise only font names are exported such that web browsers may try to find proper fonts themselves.
.TP .TP
.B --embed-external-font <0|1> (Default: 0) .B --embed-external-font <0|1> (Default: 0)
Similar to the above, but for non-base fonts Similar to the above, but for non-base fonts.
.TP
.B --decompose-ligature <0|1> (Default: 0)
Decompose ligatures. For example 'fi' -> 'f''i'.
.TP .TP
.B --heps <len>, --veps <len> (Default: 1) .B --heps <len>, --veps <len> (Default: 1)
Specify the maximum tolerable horizontal/vertical offset (in pixels). Specify the maximum tolerable horizontal/vertical offset (in pixels).
@ -77,7 +80,7 @@ Specify the maximum tolerable horizontal/vertical offset (in pixels).
pdf2htmlEX would try to optimize the generated HTML file by moving text within this distance. pdf2htmlEX would try to optimize the generated HTML file by moving text within this distance.
.TP .TP
.B --space-threshold <ratio> (Default: 1.0/6) .B --space-threshold <ratio> (Default: 1.0/6)
pdf2htmlEX would insert a whitespace character ' ' if the distance between two consecutive letters in the same line is wider than ratio * font_size pdf2htmlEX would insert a whitespace character ' ' if the distance between two consecutive letters in the same line is wider than ratio * font_size.
.TP .TP
.B --font-size-multiplier <ratio> (Default: 10) .B --font-size-multiplier <ratio> (Default: 10)
Many web browsers limit the minimum font size, and many would round the given font size, which results in incorrect rendering. Many web browsers limit the minimum font size, and many would round the given font size, which results in incorrect rendering.
@ -104,10 +107,10 @@ If switched off, intermediate files won't be cleaned in the end.
Convert file.pdf into file.html Convert file.pdf into file.html
.TP .TP
.B pdf2htmlEX --tmp-dir tmp --clean-tmp 0 --debug 1 /path/to/file.pdf .B pdf2htmlEX --tmp-dir tmp --clean-tmp 0 --debug 1 /path/to/file.pdf
Convert file.pdf and leave all intermediate files Convert file.pdf and leave all intermediate files.
.TP .TP
.B pdf2htmlEX --dest-dir out --single-html 0 --debug 1 /path/to/file.pdf .B pdf2htmlEX --dest-dir out --single-html 0 --debug 1 /path/to/file.pdf
Convert file.pdf into out/file.html and leave font/image files separated Convert file.pdf into out/file.html and leave font/image files separated.
.SH COPYRIGHT .SH COPYRIGHT
.PP .PP

View File

@ -26,6 +26,7 @@
using boost::algorithm::to_lower; using boost::algorithm::to_lower;
using std::unordered_set; using std::unordered_set;
using std::min; using std::min;
using std::all_of;
path HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) path HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id)
{ {
@ -422,8 +423,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
++nSpaces; ++nSpaces;
} }
Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode))
line_buf.append_unicodes(&uu, 1); {
line_buf.append_unicodes(u, uLen);
}
else
{
Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
line_buf.append_unicodes(&uu, 1);
}
dx += dx1; dx += dx1;
dy += dy1; dy += dy1;

View File

@ -30,6 +30,7 @@ struct Param
int single_html; int single_html;
int embed_base_font; int embed_base_font;
int embed_external_font; int embed_external_font;
int decompose_ligature;
// Advanced tweak // Advanced tweak
double h_eps, v_eps; double h_eps, v_eps;

View File

@ -78,6 +78,7 @@ po::variables_map parse_options (int argc, char **argv)
("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file") ("single-html", po::value<int>(&param.single_html)->default_value(1), "combine everything into one single HTML file")
("embed-base-font", po::value<int>(&param.embed_base_font)->default_value(0), "embed local matched font for base 14 fonts in the PDF file") ("embed-base-font", po::value<int>(&param.embed_base_font)->default_value(0), "embed local matched font for base 14 fonts in the PDF file")
("embed-external-font", po::value<int>(&param.embed_external_font)->default_value(0), "embed local matched font for external fonts in the PDF file") ("embed-external-font", po::value<int>(&param.embed_external_font)->default_value(0), "embed local matched font for external fonts in the PDF file")
("decompose-ligature", po::value<int>(&param.decompose_ligature)->default_value(0), "decompose ligatures, for example 'fi' -> 'f''i'")
("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)") ("heps", po::value<double>(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)")
("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)") ("veps", po::value<double>(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")