From e85ff593ee873b600b5b3299a848ee38b218a38a Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 27 Sep 2012 17:17:08 +0800 Subject: [PATCH 01/10] always use default font files for base 14 fonts --- src/HTMLRenderer/install.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index 81de4bd..0555b7a 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -11,8 +11,9 @@ #include #include -#include "Param.h" +#include +#include "Param.h" #include "HTMLRenderer.h" #include "namespace.h" #include "util.h" @@ -109,15 +110,24 @@ void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, Font string psname(font_loc->path->getCString()); string basename = psname.substr(0, psname.find('-')); - GfxFontLoc * localfontloc = font->locateFont(xref, gFalse); + GfxFontLoc * localfontloc = font->locateFont(xref, gTrue); if(param->embed_base_font) { if(localfontloc != nullptr) { - embed_font(string(localfontloc->path->getCString()), font, info); - export_remote_font(info, param->font_suffix, param->font_format, font); - delete localfontloc; - return; + GooString * path = globalParams->findBase14FontFile(localfontloc->path, font); + if(path) + { + embed_font(string(path->getCString()), font, info); + export_remote_font(info, param->font_suffix, param->font_format, font); + delete localfontloc; + delete path; + return; + } + else + { + cerr << "Cannot embed base font: f" << hex << info.id << dec << ' ' << psname << endl; + } } else { From 88df24f78fb3fab26ece477dd6cf2f0abdb283ba Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 27 Sep 2012 20:17:11 +0800 Subject: [PATCH 02/10] avoid using round() --- src/HTMLRenderer/text.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 2750626..5459099 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -25,7 +25,7 
@@ namespace pdf2htmlEX { using std::unordered_set; using std::min; using std::all_of; -using std::round; +using std::floor; using std::swap; string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) @@ -384,14 +384,14 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo if(font_8bit) { - width_list[k] = (int)round(font_8bit->getWidth(i) * info.em_size); + width_list[k] = (int)floor(font_8bit->getWidth(i) * info.em_size + 0.5); } else { char buf[2]; buf[0] = (i >> 8) & 0xff; buf[1] = (i & 0xff); - width_list[k] = (int)round(font_cid->getWidth(buf, 2) * info.em_size); + width_list[k] = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5); } } From 10c55cffcb50fe1d0ebf7f145907451398777165 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 28 Sep 2012 17:25:12 +0800 Subject: [PATCH 03/10] don't decompose ligature when tounicode map is not used, need more fix laster --- src/HTMLRenderer/text.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 5459099..5e3408c 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -522,13 +522,21 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) } else { - if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode)) + if(cur_font_info->use_tounicode) { - line_buf.append_unicodes(u, uLen); + if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode)) + { + line_buf.append_unicodes(u, uLen); + } + else + { + Unicode uu = check_unicode(u, uLen, code, font); + line_buf.append_unicodes(&uu, 1); + } } else { - Unicode uu = (cur_font_info->use_tounicode ? 
check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); + Unicode uu = unicode_from_font(code, font); line_buf.append_unicodes(&uu, 1); } } From f2af4be0946e999f8fef430c259a534f95aebcc2 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Fri, 28 Sep 2012 17:53:36 +0800 Subject: [PATCH 04/10] dirty fix:decompose ligatures when tounicode is not used --- src/HTMLRenderer/text.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 5459099..d876391 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -522,7 +522,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) } else { - if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode)) + if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { line_buf.append_unicodes(u, uLen); } From dafc4b86dd22991bd58c1b1ffc3f579f0dc312f5 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Sun, 30 Sep 2012 15:43:23 +0800 Subject: [PATCH 05/10] fix glyph widths when not consistent with PDF [need more work] --- TODO | 8 ++++++++ src/HTMLRenderer/install.cc | 19 +++++-------------- src/HTMLRenderer/text.cc | 10 +++++++--- src/ffw.c | 25 +++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/TODO b/TODO index fba441a..e891d9a 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,11 @@ +about glyph width: + - IE + - parameter for transform/trunc when width is too wide/narrow + - stretching ratio might not be correct.. + +better wrapper of ff + - need C++ class + try harder finding glyph names (using fontforge) for CID Type 0 rename single-html -> embed-font/image/css ... 
create a glyph for ' ', if there is not in a font diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc index 0555b7a..e2bd654 100644 --- a/src/HTMLRenderer/install.cc +++ b/src/HTMLRenderer/install.cc @@ -110,24 +110,15 @@ void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, Font string psname(font_loc->path->getCString()); string basename = psname.substr(0, psname.find('-')); - GfxFontLoc * localfontloc = font->locateFont(xref, gTrue); + GfxFontLoc * localfontloc = font->locateFont(xref, gFalse); if(param->embed_base_font) { if(localfontloc != nullptr) { - GooString * path = globalParams->findBase14FontFile(localfontloc->path, font); - if(path) - { - embed_font(string(path->getCString()), font, info); - export_remote_font(info, param->font_suffix, param->font_format, font); - delete localfontloc; - delete path; - return; - } - else - { - cerr << "Cannot embed base font: f" << hex << info.id << dec << ' ' << psname << endl; - } + embed_font(localfontloc->path->getCString(), font, info); + export_remote_font(info, param->font_suffix, param->font_format, font); + delete localfontloc; + return; } else { diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 1a4b44f..9743fd0 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -189,14 +189,17 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo info.em_size = ffw_get_em_size(); if(get_metric_only) + { + ffw_metric(&info.ascent, &info.descent); + ffw_close(); return; + } used_map = preprocessor.get_code_map(hash_ref(font->getID())); /* * Step 1 - * dump the font file directly from the font descriptor and put the glyphs into the correct slots - * + * dump the font file directly from the font descriptor and put the glyphs into the correct slots * * for 8bit + nonTrueType * re-encoding the font using a PostScript encoding list (glyph id <-> glpyh name) * @@ -395,8 +398,8 @@ void HTMLRenderer::embed_font(const string & 
filepath, GfxFont * font, FontInfo } } - ffw_reencode_raw(cur_mapping, max_key + 1, 1); ffw_set_widths(width_list, max_key + 1); + ffw_reencode_raw(cur_mapping, max_key + 1, 1); if(ctu) ctu->decRefCnt(); @@ -458,6 +461,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo ffw_load_font(cur_tmp_fn.c_str()); ffw_metric(&info.ascent, &info.descent); ffw_save(fn.c_str()); + ffw_close(); } diff --git a/src/ffw.c b/src/ffw.c index e52e29e..99deb30 100644 --- a/src/ffw.c +++ b/src/ffw.c @@ -268,8 +268,10 @@ void ffw_metric(double * ascent, double * descent) if(a < 0) a = 0; if(d > 0) d = 0; + /* sf->ascent = min(a, em); sf->descent = em - bb.maxy; + */ info->os2_winascent = a; info->os2_typoascent = a; @@ -294,6 +296,15 @@ void ffw_metric(double * ascent, double * descent) */ void ffw_set_widths(int * width_list, int mapping_len) { + memset(cur_fv->selected, 1, cur_fv->map->enccount); + // remove kern + FVRemoveKerns(cur_fv); + FVRemoveVKerns(cur_fv); + // remove bearing + // TODO: optimize this, merge the transform matrix with width setting (below) + //FVSetWidthScript(cur_fv, wt_lbearing, 0, 0); + //FVSetWidthScript(cur_fv, wt_rbearing, 0, 0); + SplineFont * sf = cur_fv->sf; if(sf->onlybitmaps @@ -319,6 +330,20 @@ void ffw_set_widths(int * width_list, int mapping_len) SplineChar * sc = sf->glyphs[j]; if(sc == NULL) continue; + DBounds bb; + SplineCharFindBounds(sc, &bb); + + // TODO: add an option + double glyph_width = bb.maxx - bb.minx; + if(glyph_width > width_list[i]) + { + real transform[6]; + transform[0] = ((double)width_list[i]) / glyph_width; + transform[3] = 1.0; + transform[1] = transform[2] = transform[4] = transform[5] = 0; + FVTrans(cur_fv, sc, transform, NULL, fvt_alllayers | fvt_dontmovewidth); + } + sc->width = width_list[i]; } } From c8133d64e36a3fd70066f5c035c10b8259240aae Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Mon, 1 Oct 2012 00:37:53 +0800 Subject: [PATCH 06/10] new option for fixing glyph width --- 
pdf2htmlEX.1.in | 18 ++++++++++++------ src/HTMLRenderer/text.cc | 2 +- src/ffw.c | 20 +++++++++++--------- src/include/Param.h | 4 ++++ src/include/ffw.h | 2 +- src/pdf2htmlEX.cc | 4 ++++ 6 files changed, 33 insertions(+), 17 deletions(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index e84d668..f708532 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -35,7 +35,7 @@ Specify owner password .B -u, --user-password Specify user password .TP -.B --dest-dir (Default: ".") +.B --dest-dir (Default: .) Specify destination folder .TP .B --data-dir (Default: @CMAKE_INSTALL_PREFIX@/share/pdf2htmlEX) @@ -117,15 +117,21 @@ Treat space characters as offsets, which may increase the size of the output. Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. .TP -.B --css-filename (Default: "") +.B --stretch-narrow-glyph <0|1> (Default: 0) +If set to 1, glyphs narrower than described in PDF will be stretched; otherwise space will be padded to the right of the glyphs +.TP +.B --squeeze_wide_glyph <0|1> (Default: 0) +If set to 1, glyphs wider than described in PDF will be squeezed; otherwise it will be truncated. +.TP +.B --css-filename (Default: ) Specify the filename of the generated css file, if not embedded. If it's empty, the file name will be determined automatically. .TP -.B --font-suffix (Default: ".ttf"), --font-format (Default: "truetype") +.B --font-suffix (Default: .ttf), --font-format (Default: truetype) Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. .TP -.B --external-hint-tool (Default: "") +.B --external-hint-tool (Default: ) If specified, the tool will be called in order to enhanced hinting for fonts, this will precede --auto-hint. The tool will be called as ' ', where suffix will be the same as specified for --font-suffix. @@ -141,10 +147,10 @@ If switched off, intermediate files won't be cleaned in the end. 
.B pdf2htmlEX /path/to/file.pdf Convert file.pdf into file.html .TP -.B pdf2htmlEX --tmp-dir tmp --clean-tmp 0 --debug 1 /path/to/file.pdf +.B pdf2htmlEX --clean-tmp 0 --debug 1 /path/to/file.pdf Convert file.pdf and leave all intermediate files. .TP -.B pdf2htmlEX --dest-dir out --single-html 0 --debug 1 /path/to/file.pdf +.B pdf2htmlEX --dest-dir out --single-html 0 /path/to/file.pdf Convert file.pdf into out/file.html and leave font/image files separated. .SH COPYRIGHT diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 9743fd0..47a24ec 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -398,7 +398,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo } } - ffw_set_widths(width_list, max_key + 1); + ffw_set_widths(width_list, max_key + 1, param->stretch_narrow_glyph, param->squeeze_wide_glyph); ffw_reencode_raw(cur_mapping, max_key + 1, 1); if(ctu) diff --git a/src/ffw.c b/src/ffw.c index 99deb30..dc8e913 100644 --- a/src/ffw.c +++ b/src/ffw.c @@ -19,6 +19,8 @@ #include "ffw.h" +static real EPS=1e-6; + static inline int min(int a, int b) { return (aselected, 1, cur_fv->map->enccount); // remove kern FVRemoveKerns(cur_fv); FVRemoveVKerns(cur_fv); - // remove bearing - // TODO: optimize this, merge the transform matrix with width setting (below) - //FVSetWidthScript(cur_fv, wt_lbearing, 0, 0); - //FVSetWidthScript(cur_fv, wt_rbearing, 0, 0); + */ SplineFont * sf = cur_fv->sf; @@ -333,12 +335,12 @@ void ffw_set_widths(int * width_list, int mapping_len) DBounds bb; SplineCharFindBounds(sc, &bb); - // TODO: add an option double glyph_width = bb.maxx - bb.minx; - if(glyph_width > width_list[i]) + if((glyph_width > EPS) + && (((glyph_width > width_list[i] + EPS) && (squeeze_wide)) + || ((glyph_width < width_list[i] - EPS) && (stretch_narrow)))) { - real transform[6]; - transform[0] = ((double)width_list[i]) / glyph_width; + real transform[6]; transform[0] = ((double)width_list[i]) / glyph_width; 
transform[3] = 1.0; transform[1] = transform[2] = transform[4] = transform[5] = 0; FVTrans(cur_fv, sc, transform, NULL, fvt_alllayers | fvt_dontmovewidth); diff --git a/src/include/Param.h b/src/include/Param.h index 02548dc..db90188 100644 --- a/src/include/Param.h +++ b/src/include/Param.h @@ -40,9 +40,13 @@ struct Param double h_eps, v_eps; double space_threshold; double font_size_multiplier; + int auto_hint; int tounicode; int space_as_offset; + + int stretch_narrow_glyph; + int squeeze_wide_glyph; std::string css_filename; std::string font_suffix, font_format; diff --git a/src/include/ffw.h b/src/include/ffw.h index 6a1f27a..939f241 100644 --- a/src/include/ffw.h +++ b/src/include/ffw.h @@ -34,7 +34,7 @@ int ffw_get_em_size(void); // fix metrics and get them void ffw_metric(double * ascent, double * descent); -void ffw_set_widths(int * width_list, int mapping_len); +void ffw_set_widths(int * width_list, int mapping_len, int stretch_narrow, int squeeze_wide); void ffw_auto_hint(void); diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 5197026..c712df0 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -41,6 +41,8 @@ void show_usage_and_exit(const char * dummy = nullptr) cerr << "Options:" << endl; argparser.show_usage(cerr); cerr << endl; + cerr << "Run 'man pdf2htmlEX' for detailed information" << endl; + cerr << endl; exit(EXIT_FAILURE); } @@ -79,6 +81,8 @@ void parse_options (int argc, char **argv) .add("auto-hint", &param.auto_hint, 0, "Whether to generate hints for fonts") .add("tounicode", &param.tounicode, 0, "Specify how to deal with ToUnicode map, 0 for auto, 1 for forced, -1 for disabled") .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets") + .add("stretch_narrow_glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding space") + .add("squeeze_wide_glyph", &param.squeeze_wide_glyph, 0, "squeeze wide glyphs instead of truncating") .add("css-filename", &param.css_filename, "", "Specify the file name of 
the generated css file") .add("font-suffix", &param.font_suffix, ".ttf", "suffix for extracted font files") From d1adb53a6d3b0e32679f48dc9f1e4cfd04056ce8 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Mon, 1 Oct 2012 00:38:37 +0800 Subject: [PATCH 07/10] changelog --- ChangeLog | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog b/ChangeLog index aa4ae51..6d6d3dd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ Latest v0.5 +* New options: --stretch-narrow-glyph, --squeeze-wide-glyph + v0.4 2012.09.26 From 05a77d24cbf6261012cbb0ea026e9e007fa4fc73 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Mon, 1 Oct 2012 00:39:33 +0800 Subject: [PATCH 08/10] TODO --- TODO | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/TODO b/TODO index e891d9a..06ef809 100644 --- a/TODO +++ b/TODO @@ -1,7 +1,6 @@ about glyph width: - IE - - parameter for transform/trunc when width is too wide/narrow - - stretching ratio might not be correct.. + - stretching ratio might not be correct.. letter 'f' better wrapper of ff - need C++ class @@ -14,6 +13,8 @@ merge sub/sup into one line bug found in baidu(ubuntu...) precise link dest: zoom +position history stack (popstate) + ==Future== argument auto-completion From 8e1e77265ff39ee81c22c366a15cbfe842c07d51 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Mon, 1 Oct 2012 00:49:56 +0800 Subject: [PATCH 09/10] TODO --- TODO | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/TODO b/TODO index 06ef809..faf9ff4 100644 --- a/TODO +++ b/TODO @@ -1,20 +1,23 @@ + about glyph width: - IE - stretching ratio might not be correct.. letter 'f' -better wrapper of ff - - need C++ class +draw lines with CSS + +create a glyph for ' ', if there is not in a font + +position history stack (popstate) + +==Wait until someone asks== try harder finding glyph names (using fontforge) for CID Type 0 rename single-html -> embed-font/image/css ... 
-create a glyph for ' ', if there is not in a font merge sub/sup into one line bug found in baidu(ubuntu...) precise link dest: zoom -position history stack (popstate) - ==Future== argument auto-completion From 9014b274e0b096c4a497fa296417353302190a13 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Mon, 1 Oct 2012 18:08:18 +0800 Subject: [PATCH 10/10] README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index aef2889..7c08ff7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # pdf2html**EX** [**Donate Now**](http://coolwanglu.github.com/pdf2htmlEX/donate.html) +[**Feature Commissions**](https://github.com/coolwanglu/pdf2htmlEX/wiki/FAQ#wiki-feature_commission) are now accepted. A beautiful demo is worth a thousand words: