diff --git a/pdf2htmlEX.1 b/pdf2htmlEX.1 index 7798b38..ce36b76 100644 --- a/pdf2htmlEX.1 +++ b/pdf2htmlEX.1 @@ -92,6 +92,11 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o However often there is better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of ToUnicode map. This behavior may be changed by turning on this switch. .TP +.B --space-as-offset <0|1> (Default: 0) +Treat space characters as offsets, which may increase the size of the output. + +Turn it on if space characters are not displayed correctly, or you want to remove positional spaces. +.TP .B --font-suffix (Default: ".ttf"), --font-format (Default: "truetype") Specify the suffix and format of fonts extracted from the PDF file. They should be consistent. .TP diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc index 672b932..959be83 100644 --- a/src/HTMLRenderer/LineBuffer.cc +++ b/src/HTMLRenderer/LineBuffer.cc @@ -139,7 +139,9 @@ void HTMLRenderer::LineBuffer::flush(void) if(w < 0) last_text_pos_with_negative_offset = cur_text_idx; - double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold); + auto * p = stack.back(); + double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold); + out << format("%2%") % wid % (target > (threshold - EPS) ? 
" " : ""); dx = target - w; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 9372a36..3f14352 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -426,19 +426,29 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) cerr << "TODO: non-zero origins" << endl; } + bool is_space = false; if (n == 1 && *p == ' ') { ++nSpaces; + is_space = true; } - if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode)) + if(is_space && (param->space_as_offset)) { - line_buf.append_unicodes(u, uLen); + // ignore horiz_scaling, as it's merged in CTM + line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_scale); } else { - Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); - line_buf.append_unicodes(&uu, 1); + if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode)) + { + line_buf.append_unicodes(u, uLen); + } + else + { + Unicode uu = (cur_font_info->use_tounicode ? 
check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); + line_buf.append_unicodes(&uu, 1); + } } dx += dx1; diff --git a/src/Param.h b/src/Param.h index 2ba865e..35fbce6 100644 --- a/src/Param.h +++ b/src/Param.h @@ -37,6 +37,7 @@ struct Param double space_threshold; double font_size_multiplier; int always_apply_tounicode; + int space_as_offset; std::string font_suffix, font_format; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index 8ef205a..a0e5c70 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -85,6 +85,7 @@ po::variables_map parse_options (int argc, char **argv) ("space-threshold", po::value(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character") ("font-size-multiplier", po::value(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy") ("always-apply-tounicode", po::value(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on") ("space-as-offset", po::value(&param.space_as_offset)->default_value(0), "treat space characters as offsets") ("font-suffix", po::value(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files") ("font-format", po::value(&param.font_format)->default_value("truetype"), "format for extracted font files")