From b83611bd65e52799b0ea0474b629b0cb29bf52d3 Mon Sep 17 00:00:00 2001
From: Lu Wang <coolwanglu@gmail.com>
Date: Wed, 20 Mar 2013 23:46:58 +0800
Subject: [PATCH 1/4] working

---
 src/HTMLRenderer/HTMLRenderer.h    |  2 +-
 src/HTMLRenderer/TextLineBuffer.cc | 59 +++++++++++++++++++++++++-----
 src/HTMLRenderer/TextLineBuffer.h  |  7 +++-
 src/HTMLRenderer/font.cc           | 27 ++++++++------
 4 files changed, 71 insertions(+), 24 deletions(-)
diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h
index b079800..131d824 100644
--- a/src/HTMLRenderer/HTMLRenderer.h
+++ b/src/HTMLRenderer/HTMLRenderer.h
@@ -30,13 +30,13 @@
 
 namespace pdf2htmlEX {
 
-// we may need more info of a font in the future
 class FontInfo
 {
 public:
     long long id;
     bool use_tounicode;
     int em_size;
+    int space_width;
     double ascent, descent;
     bool is_type3;
 };
diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc
index 2a678f8..f8ef903 100644
--- a/src/HTMLRenderer/TextLineBuffer.cc
+++ b/src/HTMLRenderer/TextLineBuffer.cc
@@ -7,6 +7,7 @@
  */
 
 #include <vector>
+#include <algorithm>
 
 #include "HTMLRenderer.h"
 #include "TextLineBuffer.h"
@@ -24,6 +25,7 @@ using std::vector;
 using std::ostream;
 using std::cerr;
 using std::endl;
+using std::find;
 
 void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
 {
@@ -50,6 +52,7 @@ void HTMLRenderer::TextLineBuffer::append_state(void)
     {
         states.resize(states.size() + 1);
         states.back().start_idx = text.size();
+        states.back().hash_umask = 0;
     }
 
     set_state(states.back());
@@ -69,6 +72,8 @@ void HTMLRenderer::TextLineBuffer::flush(void)
         return;
     }
 
+    optimize();
+
     for(auto iter = states.begin(); iter != states.end(); ++iter)
         iter->hash();
 
@@ -81,7 +86,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
     for(auto iter = states.begin(); iter != states.end(); ++iter)
     {
         const auto & s = *iter;
-        max_ascent = max<double>(max_ascent, s.ascent * s.draw_font_size);
+        max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
     }
 
     ostream & out = renderer->f_pages.fs;
@@ -157,7 +162,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
                 last_text_pos_with_negative_offset = cur_text_idx;
 
             auto * p = stack.back();
-            double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold);
+            double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
 
             out << "<span class=\"" << CSS::WHITESPACE_CN
                 << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
@@ -199,18 +204,47 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state)
     state.ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id();
     state.ids[State::RISE_ID] = renderer->rise_manager.get_id();
 
-    const FontInfo * info = renderer->cur_font_info;
-    state.ascent = info->ascent;
-    state.descent = info->descent;
+    state.font_info = renderer->cur_font_info;
     state.draw_font_size = renderer->font_size_manager.get_value();
 }
 
+void HTMLRenderer::TextLineBuffer::optimize(void)
+{
+    assert(!states.empty());
+
+    // TODO
+   
+    // set proper hash_umask
+    
+    // In some PDF files all spaces are converted into positionig shifts
+    // We may try to change them to ' ' and adjusted word_spaces
+    // This can also be applied when param->space_as_offset is set
+
+}
+
+// this state will be converted to a child node of the node of prev_state
+// dump the difference from the previous state
+// also clone corresponding states
 void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
 {
+    long long cur_mask = 0xff;
     bool first = true;
-    for(int i = 0; i < ID_COUNT; ++i)
+    for(int i = 0; i < ID_COUNT; ++i, cur_mask<<=8)
     {
-        if(prev_state && (prev_state->ids[i] == ids[i]))
+        if(hash_umask & cur_mask) // we don't care about this ID
+        {
+            if (prev_state && (!(prev_state->hash_umask & cur_mask))) // if prev_state have it set
+            {
+                // we have to inherit it
+                ids[i] = prev_state->ids[i]; 
+                hash_umask &= (~cur_mask);
+            }
+            //anyway we don't have to output it
+            continue;
+        }
+
+        // now we care about the ID
+        if(prev_state && (!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))
             continue;
 
         if(first)
@@ -231,7 +265,7 @@ void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * pr
             out << ids[i];
     }
 
-    if(first)
+    if(first) // we actually just inherit the whole prev_state
     {
         need_close = false;
     }
@@ -264,12 +298,17 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
      * it could be wrong when there are more then 256 classes, 
      * in which case the output may not be optimal, but still 'correct' in terms of HTML
      */
-    if(hash_value == s.hash_value) return 0;
+    long long common_mask = ~(hash_umask | s.hash_umask);
+    if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0;
 
+    long long cur_mask = 0xff;
     int d = 0;
     for(int i = 0; i < ID_COUNT; ++i)
-        if(ids[i] != s.ids[i])
+    {
+        if((common_mask & cur_mask) && (ids[i] != s.ids[i]))
             ++ d;
+        cur_mask <<= 8;
+    }
     return d;
 }
 
diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h
index 2cc288a..7051e0b 100644
--- a/src/HTMLRenderer/TextLineBuffer.h
+++ b/src/HTMLRenderer/TextLineBuffer.h
@@ -44,13 +44,14 @@ public:
 
             long long ids[ID_COUNT];
 
-            double ascent;
-            double descent;
+            const FontInfo * font_info;
             double draw_font_size;
+            double word_space;
 
             size_t start_idx; // index of the first Text using this state
             // for optimzation
             long long hash_value;
+            long long hash_umask; // some states may not be actually used
             bool need_close;
 
             static const char * const css_class_names []; // class names for each id
@@ -73,6 +74,8 @@ private:
     // retrieve state from renderer
     void set_state(State & state);
 
+    void optimize(void);
+
     HTMLRenderer * renderer;
 
     double x, y;
diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc
index ca4eaa0..f686871 100644
--- a/src/HTMLRenderer/font.cc
+++ b/src/HTMLRenderer/font.cc
@@ -204,6 +204,19 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
 
     info.em_size = ffw_get_em_size();
 
+    if(!font->isCIDFont())
+    {
+        if(font_8bit)
+        {
+            info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5);
+        }
+        else
+        {
+            char buf[2] = {0, ' '};
+            info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
+        }
+    }
+
     if(get_metric_only)
     {
         ffw_metric(&info.ascent, &info.descent);
@@ -424,17 +437,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
         // Might be a problem if ' ' is in the font, but not empty
         if(!has_space)
         {
-            int space_width;
-            if(font_8bit)
-            {
-                space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5);
-            }
-            else
-            {
-                char buf[2] = {0, ' '};
-                space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
-            }
-            ffw_add_empty_char((int32_t)' ', space_width);
+            ffw_add_empty_char((int32_t)' ', info.space_width);
         }
 
         if(ctu)
@@ -525,6 +528,8 @@ const FontInfo * HTMLRenderer::install_font(GfxFont * font)
 
     if(font == nullptr)
     {
+        new_font_info.em_size = 0;
+        new_font_info.space_width = 0;
         new_font_info.ascent = 0;
         new_font_info.descent = 0;
         new_font_info.is_type3 = false;

From 8ef466714e0d03a30bbb3efdeb71258bd7afa32f Mon Sep 17 00:00:00 2001
From: Lu Wang <coolwanglu@gmail.com>
Date: Thu, 21 Mar 2013 12:18:26 +0800
Subject: [PATCH 2/4] working on space optimization

---
 src/HTMLRenderer/HTMLRenderer.h    |   2 +-
 src/HTMLRenderer/TextLineBuffer.cc | 155 +++++++++++++++++++++++++----
 src/HTMLRenderer/TextLineBuffer.h  |   3 +
 src/HTMLRenderer/font.cc           |  23 ++---
 4 files changed, 147 insertions(+), 36 deletions(-)

diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h
index 131d824..32bfb33 100644
--- a/src/HTMLRenderer/HTMLRenderer.h
+++ b/src/HTMLRenderer/HTMLRenderer.h
@@ -36,7 +36,7 @@ public:
     long long id;
     bool use_tounicode;
     int em_size;
-    int space_width;
+    double space_width;
     double ascent, descent;
     bool is_type3;
 };
diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc
index f8ef903..f1c956f 100644
--- a/src/HTMLRenderer/TextLineBuffer.cc
+++ b/src/HTMLRenderer/TextLineBuffer.cc
@@ -7,6 +7,7 @@
  */
 
 #include <vector>
+#include <cmath>
 #include <algorithm>
 
 #include "HTMLRenderer.h"
@@ -26,6 +27,7 @@ using std::ostream;
 using std::cerr;
 using std::endl;
 using std::find;
+using std::abs;
 
 void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
 {
@@ -74,14 +76,6 @@ void HTMLRenderer::TextLineBuffer::flush(void)
 
     optimize();
 
-    for(auto iter = states.begin(); iter != states.end(); ++iter)
-        iter->hash();
-
-    states.resize(states.size() + 1);
-    states.back().start_idx = text.size();
-
-    offsets.push_back(Offset({text.size(), 0}));
-
     double max_ascent = 0;
     for(auto iter = states.begin(); iter != states.end(); ++iter)
     {
@@ -89,6 +83,16 @@ void HTMLRenderer::TextLineBuffer::flush(void)
         max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
     }
 
+    // append a dummy state for convenience
+    states.resize(states.size() + 1);
+    states.back().start_idx = text.size();
+    
+    for(auto iter = states.begin(); iter != states.end(); ++iter)
+        iter->hash();
+
+    // append a dummy offset for convenience
+    offsets.push_back(Offset({text.size(), 0}));
+
     ostream & out = renderer->f_pages.fs;
     renderer->height_manager.install(max_ascent);
     renderer->left_manager  .install(x);
@@ -153,21 +157,30 @@ void HTMLRenderer::TextLineBuffer::flush(void)
         {
             double target = cur_offset_iter->width + dx;
 
-            auto & wm = renderer->whitespace_manager;
-            wm.install(target);
-            auto wid = wm.get_id();
-            double w = wm.get_actual_value();
+            if(equal(target, stack.back()->single_space_offset()))
+            {
+                Unicode u = ' ';
+                outputUnicodes(out, &u, 1);
+                dx = 0;
+            }
+            else
+            {
+                auto & wm = renderer->whitespace_manager;
+                wm.install(target);
+                auto wid = wm.get_id();
+                double w = wm.get_actual_value();
 
-            if(w < 0)
-                last_text_pos_with_negative_offset = cur_text_idx;
+                if(w < 0)
+                    last_text_pos_with_negative_offset = cur_text_idx;
 
-            auto * p = stack.back();
-            double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
+                auto * p = stack.back();
+                double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
 
-            out << "<span class=\"" << CSS::WHITESPACE_CN
-                << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
+                out << "<span class=\"" << CSS::WHITESPACE_CN
+                    << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
 
-            dx = target - w;
+                dx = target - w;
+            }
 
             ++ cur_offset_iter;
         }
@@ -205,21 +218,114 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state)
     state.ids[State::RISE_ID] = renderer->rise_manager.get_id();
 
     state.font_info = renderer->cur_font_info;
-    state.draw_font_size = renderer->font_size_manager.get_value();
+    state.draw_font_size = renderer->font_size_manager.get_actual_value();
+    state.letter_space = renderer->letter_space_manager.get_actual_value();
+    state.word_space = renderer->word_space_manager.get_actual_value();
 }
 
 void HTMLRenderer::TextLineBuffer::optimize(void)
 {
     assert(!states.empty());
 
-    // TODO
-   
     // set proper hash_umask
+    long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID));
+    for(auto iter = states.begin(); iter != states.end(); ++iter)
+    {
+        auto text_iter1 = text.begin() + (iter->start_idx);
+        auto next_iter = iter;
+        ++next_iter;
+        auto text_iter2 =  (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx));
+        if(find(text_iter1, text_iter2, ' ') == text_iter2)
+        {
+            // if there's no space, word_space does not matter;
+            iter->hash_umask |= word_space_umask;
+        }
+    }
+
+    // clean zero offsets
+    {
+        auto write_iter = offsets.begin();
+        for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
+        {
+            if(!equal(iter->width, 0))
+            {
+                *write_iter = *iter;
+                ++write_iter;
+            }
+        }
+        offsets.erase(write_iter, offsets.end());
+    }
     
     // In some PDF files all spaces are converted into positionig shifts
     // We may try to change them to ' ' and adjusted word_spaces
     // This can also be applied when param->space_as_offset is set
 
+    // for now, we consider only the no-space scenario
+    if(offsets.size() > 0)
+    {
+        // Since GCC 4.4.6 is supported, I cannot use all_of + lambda here
+        bool all_ws_umask = true;
+        for(auto iter = states.begin(); iter != states.end(); ++iter)
+        {
+            if(!(iter->hash_umask & word_space_umask))
+            {
+                all_ws_umask = false;
+                break;
+            }
+        }
+        if(all_ws_umask)
+        {
+            double avg_width = 0;
+            int posive_offset_count = 0;
+            for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
+            {
+                if(is_positive(iter->width))
+                {
+                    ++posive_offset_count;
+                    avg_width += iter->width;
+                }
+            }
+            avg_width /= posive_offset_count;
+
+            // now check if the width of offsets are close enough
+            // TODO: it might make more sense if the threshold is proportional to the font size
+            bool ok = true;
+            double accum_off = 0;
+            double orig_accum_off = 0;
+            for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
+            {
+                orig_accum_off += iter->width;
+                accum_off += avg_width;
+                if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
+                {
+                    ok = false;
+                    break;
+                }
+            }
+            if(ok)
+            {
+                // ok, make all offsets equi-width
+                for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
+                {
+                    if(is_positive(iter->width))
+                        iter->width = avg_width;
+                }
+                // set new word_space
+                for(auto iter = states.begin(); iter != states.end(); ++iter)
+                {
+                    double new_word_space = avg_width - iter->single_space_offset();
+
+                    // install new word_space
+                    // we might introduce more variance here
+                    auto & wm = renderer->word_space_manager;
+                    wm.install(new_word_space);
+                    iter->ids[State::WORD_SPACE_ID] = wm.get_id();
+                    iter->word_space = wm.get_actual_value();
+                    iter->hash_umask &= (~word_space_umask);
+                }
+            }
+        }
+    }
 }
 
 // this state will be converted to a child node of the node of prev_state
@@ -312,6 +418,11 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
     return d;
 }
 
+double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const
+{
+    return letter_space + font_info->space_width * draw_font_size;
+}
+
 // the order should be the same as in the enum
 const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {
     CSS::FONT_FAMILY_CN,
diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h
index 7051e0b..c289eb7 100644
--- a/src/HTMLRenderer/TextLineBuffer.h
+++ b/src/HTMLRenderer/TextLineBuffer.h
@@ -29,6 +29,8 @@ public:
             void hash(void);
             // calculate the difference between another State
             int diff(const State & s) const;
+            // the offset caused by a single ' ' char
+            double single_space_offset(void) const;
 
             enum {
                 FONT_ID,
@@ -46,6 +48,7 @@ public:
 
             const FontInfo * font_info;
             double draw_font_size;
+            double letter_space;
             double word_space;
 
             size_t start_idx; // index of the first Text using this state
diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc
index f686871..f9d15b5 100644
--- a/src/HTMLRenderer/font.cc
+++ b/src/HTMLRenderer/font.cc
@@ -206,15 +206,14 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
 
     if(!font->isCIDFont())
     {
-        if(font_8bit)
-        {
-            info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5);
-        }
-        else
-        {
-            char buf[2] = {0, ' '};
-            info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
-        }
+        font_8bit = dynamic_cast<Gfx8BitFont*>(font);
+        info.space_width = font_8bit->getWidth(' ');
+    }
+    else
+    {
+        font_cid = dynamic_cast<GfxCIDFont*>(font);
+        char buf[2] = {0, ' '};
+        info.space_width = font_cid->getWidth(buf, 2);
     }
 
     if(get_metric_only)
@@ -241,9 +240,8 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
      * for CID Truetype
      * same as 8bitTrueType, except for that we have to check 65536 charcodes
      */
-    if(!font->isCIDFont())
+    if(font_8bit)
     {
-        font_8bit = dynamic_cast<Gfx8BitFont*>(font);
         maxcode = 0xff;
         if(is_truetype_suffix(suffix))
         {
@@ -296,7 +294,6 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
     }
     else
     {
-        font_cid = dynamic_cast<GfxCIDFont*>(font);
         maxcode = 0xffff;
 
         if(is_truetype_suffix(suffix))
@@ -437,7 +434,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
         // Might be a problem if ' ' is in the font, but not empty
         if(!has_space)
         {
-            ffw_add_empty_char((int32_t)' ', info.space_width);
+            ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5));
         }
 
         if(ctu)

From 821a65ac73619d455f9a865dff55bcc614901370 Mon Sep 17 00:00:00 2001
From: Lu Wang <coolwanglu@gmail.com>
Date: Thu, 21 Mar 2013 12:27:07 +0800
Subject: [PATCH 3/4] fix space optimization

---
 src/HTMLRenderer/TextLineBuffer.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc
index f1c956f..cc2406f 100644
--- a/src/HTMLRenderer/TextLineBuffer.cc
+++ b/src/HTMLRenderer/TextLineBuffer.cc
@@ -313,7 +313,7 @@ void HTMLRenderer::TextLineBuffer::optimize(void)
                 // set new word_space
                 for(auto iter = states.begin(); iter != states.end(); ++iter)
                 {
-                    double new_word_space = avg_width - iter->single_space_offset();
+                    double new_word_space = avg_width - iter->single_space_offset() + iter->word_space;
 
                     // install new word_space
                     // we might introduce more variance here
@@ -420,7 +420,7 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
 
 double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const
 {
-    return letter_space + font_info->space_width * draw_font_size;
+    return word_space + letter_space + font_info->space_width * draw_font_size;
 }
 
 // the order should be the same as in the enum

From 7ea4f054bb2c46ef110db37ab6bc8311fe56c4cb Mon Sep 17 00:00:00 2001
From: Lu Wang <coolwanglu@gmail.com>
Date: Thu, 21 Mar 2013 12:30:45 +0800
Subject: [PATCH 4/4] todo

---
 TODO | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/TODO b/TODO
index d279edb..e0e5aab 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,7 @@
+non-trivial space optimization
+(For each state whose word_space is free, set a proper value such that it may cover most whitespaces)
+(Or just set word_space according to the first positive whitespace, but need to do this before the state inherits some value)
+
 == Future: ==
 
 Too difficult/complicated to implement: