working on space optimization

2024-07-05 01:28:39 +00:00 · 2013-03-21 12:18:26 +08:00 · 2013-03-21 12:18:26 +08:00 · 8ef466714e
commit 8ef466714e
parent b83611bd65
4 changed files with 147 additions and 36 deletions
--- a/src/HTMLRenderer/HTMLRenderer.h
+++ b/src/HTMLRenderer/HTMLRenderer.h
@ -36,7 +36,7 @@ public:
    long long id;
    bool use_tounicode;
    int em_size;
-    int space_width;
+    double space_width;
    double ascent, descent;
    bool is_type3;
 };
--- a/src/HTMLRenderer/TextLineBuffer.cc
+++ b/src/HTMLRenderer/TextLineBuffer.cc
@ -7,6 +7,7 @@
 */
 #include <vector>
 #include <cmath>
 #include <algorithm>
 #include "HTMLRenderer.h"
@ -26,6 +27,7 @@ using std::ostream;
 using std::cerr;
 using std::endl;
 using std::find;
 using std::abs;
 void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
 {
@ -74,14 +76,6 @@ void HTMLRenderer::TextLineBuffer::flush(void)
    optimize();
    for(auto iter = states.begin(); iter != states.end(); ++iter)
        iter->hash();
    states.resize(states.size() + 1);
    states.back().start_idx = text.size();
    offsets.push_back(Offset({text.size(), 0}));
    double max_ascent = 0;
    for(auto iter = states.begin(); iter != states.end(); ++iter)
    {
@ -89,6 +83,16 @@ void HTMLRenderer::TextLineBuffer::flush(void)
        max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
    }
    // append a dummy state for convenience
    states.resize(states.size() + 1);
    states.back().start_idx = text.size();
    for(auto iter = states.begin(); iter != states.end(); ++iter)
        iter->hash();
    // append a dummy offset for convenience
    offsets.push_back(Offset({text.size(), 0}));
    ostream & out = renderer->f_pages.fs;
    renderer->height_manager.install(max_ascent);
    renderer->left_manager  .install(x);
@ -153,21 +157,30 @@ void HTMLRenderer::TextLineBuffer::flush(void)
        {
            double target = cur_offset_iter->width + dx;
-            auto & wm = renderer->whitespace_manager;
+            if(equal(target, stack.back()->single_space_offset()))
-            wm.install(target);
+            {
-            auto wid = wm.get_id();
+                Unicode u = ' ';
-            double w = wm.get_actual_value();
+                outputUnicodes(out, &u, 1);
                dx = 0;
            }
            else
            {
                auto & wm = renderer->whitespace_manager;
                wm.install(target);
                auto wid = wm.get_id();
                double w = wm.get_actual_value();
-            if(w < 0)
+                if(w < 0)
-                last_text_pos_with_negative_offset = cur_text_idx;
+                    last_text_pos_with_negative_offset = cur_text_idx;
-            auto * p = stack.back();
+                auto * p = stack.back();
-            double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
+                double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
-            out << "<span class=\"" << CSS::WHITESPACE_CN
+                out << "<span class=\"" << CSS::WHITESPACE_CN
-                << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
+                    << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
-            dx = target - w;
+                dx = target - w;
            }
            ++ cur_offset_iter;
        }
@ -205,21 +218,114 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state)
    state.ids[State::RISE_ID] = renderer->rise_manager.get_id();
    state.font_info = renderer->cur_font_info;
-    state.draw_font_size = renderer->font_size_manager.get_value();
+    state.draw_font_size = renderer->font_size_manager.get_actual_value();
    state.letter_space = renderer->letter_space_manager.get_actual_value();
    state.word_space = renderer->word_space_manager.get_actual_value();
 }
 // Post-process the buffered line before it is flushed:
 //  1. mark word_space as irrelevant (via hash_umask) for every state whose
 //     text range contains no ' ';
 //  2. drop offsets whose width is (approximately) zero;
 //  3. if word_space is irrelevant for all states and the positive offsets are
 //     close enough to being equal, make them equi-width and install a
 //     word_space per state derived from that width.
 //
 // Iterator loops are kept on purpose: GCC 4.4.6 is supported, so lambdas
 // (and algorithms that would need them) are not available.
 void HTMLRenderer::TextLineBuffer::optimize(void)
 {
    assert(!states.empty());

    // TODO
    // set proper hash_umask

    // mask covering the byte that belongs to WORD_SPACE_ID
    const long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID));

    for(auto iter = states.begin(); iter != states.end(); ++iter)
    {
        // [text_begin, text_end) is the text owned by this state; the last
        // state owns everything up to the end of the buffer
        auto next_iter = iter;
        ++next_iter;
        auto text_begin = text.begin() + (iter->start_idx);
        auto text_end = (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx));

        if(find(text_begin, text_end, ' ') == text_end)
        {
            // if there's no space, word_space does not matter
            iter->hash_umask |= word_space_umask;
        }
    }

    // clean zero offsets (in-place compaction, order preserved)
    {
        auto write_iter = offsets.begin();
        for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
        {
            if(!equal(iter->width, 0))
            {
                *write_iter = *iter;
                ++write_iter;
            }
        }
        offsets.erase(write_iter, offsets.end());
    }

    // In some PDF files all spaces are converted into positioning shifts.
    // We may try to change them to ' ' and adjusted word_spaces.
    // This can also be applied when param->space_as_offset is set.
    // For now, we consider only the no-space scenario.
    if(offsets.size() > 0)
    {
        // Since GCC 4.4.6 is supported, I cannot use all_of + lambda here
        bool all_ws_umask = true;
        for(auto iter = states.begin(); iter != states.end(); ++iter)
        {
            if(!(iter->hash_umask & word_space_umask))
            {
                all_ws_umask = false;
                break;
            }
        }

        if(all_ws_umask)
        {
            // average width over the positive offsets only
            double avg_width = 0;
            int positive_offset_count = 0;
            for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
            {
                if(is_positive(iter->width))
                {
                    ++positive_offset_count;
                    avg_width += iter->width;
                }
            }

            // Without any positive offset there is nothing to turn into a
            // space; bail out instead of dividing by zero, which would
            // otherwise install a NaN word_space into every state.
            bool ok = (positive_offset_count > 0);

            if(ok)
            {
                avg_width /= positive_offset_count;

                // now check if the widths of the offsets are close enough
                // TODO: it might make more sense if the threshold is proportional to the font size
                // NOTE(review): accum_off advances by avg_width for every
                // offset, including the non-positive ones that are left
                // untouched below - confirm this is the intended drift measure.
                double accum_off = 0;
                double orig_accum_off = 0;
                for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
                {
                    orig_accum_off += iter->width;
                    accum_off += avg_width;
                    if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
                    {
                        ok = false;
                        break;
                    }
                }
            }

            if(ok)
            {
                // ok, make all positive offsets equi-width
                for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
                {
                    if(is_positive(iter->width))
                        iter->width = avg_width;
                }

                // set new word_space
                for(auto iter = states.begin(); iter != states.end(); ++iter)
                {
                    // the extra spacing needed on top of a plain ' ' to reach avg_width
                    double new_word_space = avg_width - iter->single_space_offset();

                    // install new word_space
                    // we might introduce more variance here
                    auto & wm = renderer->word_space_manager;
                    wm.install(new_word_space);
                    iter->ids[State::WORD_SPACE_ID] = wm.get_id();
                    iter->word_space = wm.get_actual_value();
                    // word_space matters again for this state
                    iter->hash_umask &= (~word_space_umask);
                }
            }
        }
    }
 }
 // this state will be converted to a child node of the node of prev_state
@ -312,6 +418,11 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
    return d;
 }
 // Horizontal offset produced by a single ' ' in this state: the font's
 // space width scaled by the drawing font size, plus letter_space.
 // word_space is not part of the result.
 double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const
 {
    const double scaled_space_width = font_info->space_width * draw_font_size;
    return letter_space + scaled_space_width;
 }
 // the order should be the same as in the enum
 const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {
    CSS::FONT_FAMILY_CN,
--- a/src/HTMLRenderer/TextLineBuffer.h
+++ b/src/HTMLRenderer/TextLineBuffer.h
@ -29,6 +29,8 @@ public:
            void hash(void);
            // calculate the difference between another State
            int diff(const State & s) const;
            // the offset caused by a single ' ' char
            double single_space_offset(void) const;
            enum {
                FONT_ID,
@ -46,6 +48,7 @@ public:
            const FontInfo * font_info;
            double draw_font_size;
            double letter_space;
            double word_space;
            size_t start_idx; // index of the first Text using this state
--- a/src/HTMLRenderer/font.cc
+++ b/src/HTMLRenderer/font.cc
@ -206,15 +206,14 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
    if(!font->isCIDFont())
    {
-        if(font_8bit)
+        font_8bit = dynamic_cast<Gfx8BitFont*>(font);
-        {
+        info.space_width = font_8bit->getWidth(' ');
-            info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5);
+    }
-        }
+    else
-        else
+    {
-        {
+        font_cid = dynamic_cast<GfxCIDFont*>(font);
-            char buf[2] = {0, ' '};
+        char buf[2] = {0, ' '};
-            info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
+        info.space_width = font_cid->getWidth(buf, 2);
        }
    }
    if(get_metric_only)
@ -241,9 +240,8 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
     * for CID Truetype
     * same as 8bitTrueType, except that we have to check 65536 charcodes
     */
-    if(!font->isCIDFont())
+    if(font_8bit)
    {
        font_8bit = dynamic_cast<Gfx8BitFont*>(font);
        maxcode = 0xff;
        if(is_truetype_suffix(suffix))
        {
@ -296,7 +294,6 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
    }
    else
    {
        font_cid = dynamic_cast<GfxCIDFont*>(font);
        maxcode = 0xffff;
        if(is_truetype_suffix(suffix))
@ -437,7 +434,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
        // Might be a problem if ' ' is in the font, but not empty
        if(!has_space)
        {
-            ffw_add_empty_char((int32_t)' ', info.space_width);
+            ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5));
        }
        if(ctu)