diff --git a/TODO b/TODO index 9058b5c..c8cf3ad 100644 --- a/TODO +++ b/TODO @@ -1,13 +1,9 @@ -cmake - pkgconfig - fontforge - python - fix glyph width as provided in pdf option to break ligatures detect duplicate base fonts when embedding -consider left-shift in optimization - multiple charcode mapped to a same glyph re-encoded only used glyphs diff --git a/pdf2htmlEX.1 b/pdf2htmlEX.1 index 6692517..7798b38 100644 --- a/pdf2htmlEX.1 +++ b/pdf2htmlEX.1 @@ -69,7 +69,10 @@ There are several base font defined in PDF standards, which are supposed to be p If this switch is on, local matched font will be used and embedded; otherwise only font names are exported such that web browsers may try to find proper fonts themselves. .TP .B --embed-external-font <0|1> (Default: 0) -Similar as above but for non-base fonts +Similar as above but for non-base fonts. +.TP +.B --decompose-ligature <0|1> (Default: 0) +Decompose ligatures. For example 'fi' -> 'f''i'. .TP .B --heps , --veps (Default: 1) Specify the maximum tolerable horizontal/vertical offset (in pixels). @@ -77,7 +80,7 @@ Specify the maximum tolerable horizontal/vertical offset (in pixels). pdf2htmlEX would try to optimize the generated HTML file moving Text within this distance. .TP .B --space-threshold (Default: 1.0/6) -pdf2htmlEX would insert a whitespace character ' ' if the distance between two consecutive letters in the same line is wider than ratio * font_size +pdf2htmlEX would insert a whitespace character ' ' if the distance between two consecutive letters in the same line is wider than ratio * font_size. .TP .B --font-size-multiplier (Default: 10) Many web browsers limit the minimum font size, and many would round the given font size, which results in incorrect rendering. @@ -104,10 +107,10 @@ If switched off, intermediate files won't be cleaned in the end. 
Convert file.pdf into file.html .TP .B pdf2htmlEX --tmp-dir tmp --clean-tmp 0 --debug 1 /path/to/file.pdf -Convert file.pdf and leave all intermediate files +Convert file.pdf and leave all intermediate files. .TP .B pdf2htmlEX --dest-dir out --single-html 0 --debug 1 /path/to/file.pdf -Convert file.pdf into out/file.html and leave font/image files separated +Convert file.pdf into out/file.html and leave font/image files separated. .SH COPYRIGHT .PP diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index a03be44..f329d14 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -26,6 +26,7 @@ using boost::algorithm::to_lower; using std::unordered_set; using std::min; +using std::all_of; path HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) { @@ -422,8 +423,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) ++nSpaces; } - Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); - line_buf.append_unicodes(&uu, 1); + if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode)) + { + line_buf.append_unicodes(u, uLen); + } + else + { + Unicode uu = (cur_font_info->use_tounicode ? 
check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); + line_buf.append_unicodes(&uu, 1); + } dx += dx1; dy += dy1; diff --git a/src/Param.h b/src/Param.h index 136d6d6..2ba865e 100644 --- a/src/Param.h +++ b/src/Param.h @@ -30,6 +30,7 @@ struct Param int single_html; int embed_base_font; int embed_external_font; + int decompose_ligature; // Advanced tweak double h_eps, v_eps; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 4bc971f..1fbf2cf 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -78,6 +78,7 @@ po::variables_map parse_options (int argc, char **argv) ("single-html", po::value(&param.single_html)->default_value(1), "combine everything into one single HTML file") ("embed-base-font", po::value(&param.embed_base_font)->default_value(0), "embed local matched font for base 14 fonts in the PDF file") ("embed-external-font", po::value(&param.embed_external_font)->default_value(0), "embed local matched font for external fonts in the PDF file") + ("decompose-ligature", po::value(&param.decompose_ligature)->default_value(0), "decompose ligatures, for example 'fi' -> 'f''i'") ("heps", po::value(&param.h_eps)->default_value(1.0), "max tolerated horizontal offset (in pixels)") ("veps", po::value(&param.v_eps)->default_value(1.0), "max tolerated vertical offset (in pixels)")