WIP: HTMLTextLine

2024-12-22 13:00:08 +00:00 · 2013-12-22 16:59:59 +08:00 · 2013-12-22 16:59:59 +08:00 · aea00515fe
commit aea00515fe
parent 11b488e236
5 changed files with 83 additions and 38 deletions
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@ -29,6 +29,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
    // unscaled
    double cur_letter_space = state->getCharSpace();
    double cur_word_space   = state->getWordSpace();
+    double cur_horiz_scaling = state->getHorizScaling();
+

    // Writing mode fonts and Type 3 fonts are rendered as images
    // I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly
@ -89,15 +91,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
        
        if(is_space && (param.space_as_offset))
        {
-            // ignore horiz_scaling, as it's merged in CTM
+            // ignore horiz_scaling, as it has been merged into CTM
            html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); 
        }
        else
        {
            if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
            {
-                html_text_page.get_cur_line()->append_unicodes(u, uLen);
-                // TODO: decomposed characters may be not with the same width as the original ligature, need to fix it.
+                // TODO: why multiply cur_horiz_scaling here?
+                html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
            }
            else
            {
@ -110,7 +112,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
                {
                    uu = unicode_from_font(code, font);
                }
-                html_text_page.get_cur_line()->append_unicodes(&uu, 1);
+                // TODO: why multiply cur_horiz_scaling here?
+                html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
                /*
                 * In PDF, word_space is appended if (n == 1 and *p = ' ')
                 * but in HTML, word_space is appended if (uu == ' ')
@ -131,12 +134,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
        len -= n;
    }

-    double hs = state->getHorizScaling();
-
    // horiz_scaling is merged into ctm now, 
    // so the coordinate system is ugly
-    // TODO: why multiply hs here
-    dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs;
+    // TODO: why multiply cur_horiz_scaling here
+    dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling;
    
    dy *= cur_font_size;

--- a/src/HTMLTextLine.cc
+++ b/src/HTMLTextLine.cc
@ -31,11 +31,13 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
    ,line_state(line_state)
    ,clip_x1(0)
    ,clip_y1(0)
+    ,width(0)
 { }

-void HTMLTextLine::append_unicodes(const Unicode * u, int l)
+// Append the l code points starting at u to this line's text and add
+// `width` to the line's accumulated width.
+void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
 {
    text.insert(text.end(), u, u+l);
+    // the parameter shadows the member of the same name, hence the explicit this->
+    this->width += width;
 }

 void HTMLTextLine::append_offset(double width)
@ -49,6 +51,7 @@ void HTMLTextLine::append_offset(double width)
        offsets.back().width += width;
    else
        offsets.emplace_back(text.size(), width);
+    this->width += width;
 }

 void HTMLTextLine::append_state(const HTMLTextState & text_state)
@ -188,28 +191,17 @@ void HTMLTextLine::dump_text(ostream & out)
                    // finally, just dump it
                    if(!done)
                    {
-                        if(param.optimize_text < 3) 
+                        long long wid = all_manager.whitespace.install(target, &actual_offset);
+
+                        if(!equal(actual_offset, 0))
                        {
-                            long long wid = all_manager.whitespace.install(target, &actual_offset);
+                            if(is_positive(-actual_offset))
+                                last_text_pos_with_negative_offset = cur_text_idx;

-                            if(!equal(actual_offset, 0))
-                            {
-                                if(is_positive(-actual_offset))
-                                    last_text_pos_with_negative_offset = cur_text_idx;
-
-                                double threshold = state_iter1->em_size() * (param.space_threshold);
-
-                                out << "<span class=\"" << CSS::WHITESPACE_CN
-                                    << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
-                            }
-                        }
-                        else 
-                        {
-                            // aggressive optimization
                            double threshold = state_iter1->em_size() * (param.space_threshold);
-                            if(target > threshold)
-                                out << ' ';
-                            actual_offset = target;
+
+                            out << "<span class=\"" << CSS::WHITESPACE_CN
+                                << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
                        }
                    }
                }
@ -255,9 +247,6 @@ void HTMLTextLine::clip(const HTMLClipState & clip_state)

 void HTMLTextLine::prepare(void)
 {
-    if(param.optimize_text)
-        optimize();
-
    // max_ascent determines the height of the div
    double accum_vertical_align = 0; // accumulated
    ascent = 0;
@ -285,11 +274,22 @@ void HTMLTextLine::prepare(void)
 }


+// Entry point for text optimization: dispatches on param.optimize_text and
+// lets the selected strategy append the resulting line(s) to `lines`.
+void HTMLTextLine::optimize(std::vector<HTMLTextLine*> & lines)
+{
+    // every level other than 3 uses the normal strategy
+    if(param.optimize_text != 3)
+    {
+        optimize_normal(lines);
+        return;
+    }
+
+    optimize_aggressive(lines);
+}
 /*
 * Adjust letter space and word space in order to reduce the number of HTML elements
 * May also unmask word space
 */
-void HTMLTextLine::optimize()
+void HTMLTextLine::optimize_normal(std::vector<HTMLTextLine*> & lines)
 {
    // remove unuseful states in the end
    while((!states.empty()) && (states.back().start_idx >= text.size()))
@ -465,6 +465,32 @@ void HTMLTextLine::optimize()
    
    // apply optimization
    std::swap(offsets, new_offsets);
+
+    lines.push_back(this);
+}
+
+// for optimize-text == 3
+// Aggressive optimization is still work in progress: line splitting is not
+// implemented yet, so this line is appended to `lines` exactly once, unchanged.
+void HTMLTextLine::optimize_aggressive(std::vector<HTMLTextLine*> & lines)
+{
+    HTMLTextLine *cur_line = this;
+    while(true) 
+    {
+        lines.push_back(cur_line);
+
+        // TODO: break the line if there is a large (positive or negative) shift
+        // letter space / word space are not taken into consideration (yet)
+
+        // Until splitting is implemented there is never a next line to visit.
+        // Leave the loop here; otherwise it never terminates and `lines`
+        // grows until memory is exhausted.
+        break;
+    }
+
+    /*
+    // aggressive optimization
+    if(target > state_iter1->em_size() * (param.space_threshold) - EPS)
+        out << ' ';
+    dx = 0;
+    lines.push_back(this);
+    */
+}

 // this state will be converted to a child node of the node of prev_state
--- a/src/HTMLTextLine.h
+++ b/src/HTMLTextLine.h
@ -73,7 +73,7 @@ public:
        double width;
    };

-    void append_unicodes(const Unicode * u, int l);
+    void append_unicodes(const Unicode * u, int l, double width);
    void append_offset(double width);
    void append_state(const HTMLTextState & text_state);
    void dump_text(std::ostream & out);
@ -87,8 +87,10 @@ public:
     * Optimize and calculate necessary values
     */
    void prepare(void);
+    void optimize(std::vector<HTMLTextLine*> &);
 private:
-    void optimize(void);
+    void optimize_normal(std::vector<HTMLTextLine*> &);
+    void optimize_aggressive(std::vector<HTMLTextLine*> &);

    const Param & param;
    AllStateManager & all_manager;
@ -96,6 +98,7 @@ private:
    HTMLLineState line_state;
    double ascent, descent;
    double clip_x1, clip_y1;
+    double width;

    std::vector<State> states;
    std::vector<Offset> offsets;
--- a/src/HTMLTextPage.cc
+++ b/src/HTMLTextPage.cc
@ -12,7 +12,6 @@
 namespace pdf2htmlEX {

 using std::ostream;
-using std::unique_ptr;

 HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
    : param(param)
@ -22,8 +21,24 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
    , page_height(0)
 { } 

+// Deletes every HTMLTextLine pointer currently held in text_lines.
+HTMLTextPage::~HTMLTextPage()
+{
+    for(HTMLTextLine * line : text_lines)
+        delete line;
+}
+
 void HTMLTextPage::dump_text(ostream & out)
 {
+    if(param.optimize_text)
+    {
+        // text lines may be split during optimization, collect them
+        std::vector<HTMLTextLine*> new_text_lines;
+        for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
+            (*iter)->optimize(new_text_lines);
+        std::swap(text_lines, new_text_lines);
+    }
    for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
        (*iter)->prepare();
    if(param.optimize_text)
@ -98,7 +113,7 @@ void HTMLTextPage::open_new_line(const HTMLLineState & line_state)
    // do not reused the last text_line even if it's empty
    // because the clip states may point to the next index
    text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager));
-    cur_line = text_lines.back().get();
+    cur_line = text_lines.back();
 }

 void HTMLTextPage::set_page_size(double width, double height)
--- a/src/HTMLTextPage.h
+++ b/src/HTMLTextPage.h
@ -7,7 +7,6 @@
 #define HTMLTEXTPAGE_H__

 #include <vector>
-#include <memory>
 #include <ostream>

 #include "Param.h"
@ -26,6 +25,7 @@ class HTMLTextPage
 {
 public:
    HTMLTextPage (const Param & param, AllStateManager & all_manager);
+    ~HTMLTextPage();

    HTMLTextLine * get_cur_line(void) const { return cur_line; }

@ -47,7 +47,7 @@ private:
    HTMLTextLine * cur_line;
    double page_width, page_height;

-    std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
+    std::vector<HTMLTextLine*> text_lines;

    struct Clip {
        HTMLClipState clip_state;