option 'space-as-offset'

2024-12-22 04:50:09 +00:00 · 2012-09-07 08:39:21 +08:00 · 2012-09-07 08:39:21 +08:00 · c82b140243
commit c82b140243
parent 9ae3e4dcb9
5 changed files with 24 additions and 5 deletions
--- a/pdf2htmlEX.1
+++ b/pdf2htmlEX.1
@ -92,6 +92,11 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o

 However often there is better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of ToUnicode map. This behavior may be changed by turning on this switch.
 .TP
+.B --space-as-offset <0|1> (Default: 0)
+Treat space characters as offsets, which may increase the size of the output.
+
+Turn it on if space characters are not displayed correctly, or if you want to remove positional spaces.
+.TP
 .B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
 Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
 .TP
--- a/src/HTMLRenderer/LineBuffer.cc
+++ b/src/HTMLRenderer/LineBuffer.cc
@ -139,7 +139,9 @@ void HTMLRenderer::LineBuffer::flush(void)
            if(w < 0)
                last_text_pos_with_negative_offset = cur_text_idx;

-            double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
+            auto * p = stack.back();
+            double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold);
+
            out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");

            dx = target - w;
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@ -426,19 +426,29 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
            cerr << "TODO: non-zero origins" << endl;
        }

+        bool is_space = false;
        if (n == 1 && *p == ' ') 
        {
            ++nSpaces;
+            is_space = true;
        }
        
-        if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode))
+        if(is_space && (param->space_as_offset))
        {
-            line_buf.append_unicodes(u, uLen);
+            // ignore horiz_scaling, as it's merged in CTM
+            line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_scale); 
        }
        else
        {
-            Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
-            line_buf.append_unicodes(&uu, 1);
+            if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode))
+            {
+                line_buf.append_unicodes(u, uLen);
+            }
+            else
+            {
+                Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
+                line_buf.append_unicodes(&uu, 1);
+            }
        }

        dx += dx1;
--- a/src/Param.h
+++ b/src/Param.h
@ -37,6 +37,7 @@ struct Param
    double space_threshold;
    double font_size_multiplier;
    int always_apply_tounicode;
+    int space_as_offset;

    std::string font_suffix, font_format;

--- a/src/pdf2htmlEX.cc
+++ b/src/pdf2htmlEX.cc
@ -85,6 +85,7 @@ po::variables_map parse_options (int argc, char **argv)
        ("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character")
        ("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
        ("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
+        ("space-as-offset", po::value<int>(&param.space_as_offset)->default_value(0), "treat space characters as offsets")

        ("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
        ("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")