pdf2htmlEX/src/HTMLRenderer/TextLineBuffer.cc

/*
 * TextLineBuffer.cc
 *
 * Generate optimized HTML for one line
 *
 * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
 */

#include <vector>
#include <cmath>
#include <algorithm>

#include "HTMLRenderer.h"
#include "TextLineBuffer.h"
#include "util/namespace.h"
#include "util/unicode.h"
#include "util/math.h"
#include "util/css_const.h"
#include "util/encoding.h"

namespace pdf2htmlEX {

using std::min;
using std::max;
using std::vector;
using std::ostream;
using std::cerr;
using std::endl;
using std::find;
using std::abs;

/*
 * Start a new line at the current text position of `state`.
 *
 * The current position is passed through state->transform() and stored in x / y,
 * and the id currently held by the renderer's transform matrix manager is remembered
 * in tm_id (flush() writes it into the class list of the line's <div>).
 * The buffered text, states and offsets are not touched here.
 */
void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
{
    const auto cur_x = state->getCurX();
    const auto cur_y = state->getCurY();
    state->transform(cur_x, cur_y, &x, &y);

    tm_id = renderer->transform_matrix_manager.get_id();
}

void HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l)
{
    text.insert(text.end(), u, u+l);
}

/*
 * Record a horizontal shift of `width` at the current end of the text.
 *
 * Consecutive shifts at the same text position are merged into a single Offset
 * by summing their widths, so there is at most one Offset per text index.
 */
void HTMLRenderer::TextLineBuffer::append_offset(double width)
{
    const auto pos = text.size();

    // no offset recorded at this position yet: start a new one
    if(offsets.empty() || (offsets.back().start_idx != pos))
    {
        offsets.push_back(Offset({pos, width}));
        return;
    }

    // otherwise accumulate into the existing one
    offsets.back().width += width;
}

/*
 * Snapshot the renderer's current text style at the current end of the text.
 *
 * If the last recorded state already starts at this text position it is overwritten
 * in place; otherwise a new state is appended with its start index set to the current
 * text length and an empty hash_umask.
 */
void HTMLRenderer::TextLineBuffer::append_state(void)
{
    const auto pos = text.size();
    const bool reuse_last = (!states.empty()) && (states.back().start_idx == pos);

    if(!reuse_last)
    {
        states.resize(states.size() + 1);
        auto & fresh = states.back();
        fresh.start_idx = pos;
        fresh.hash_umask = 0;
    }

    // (re)fill the state at this position from the renderer
    set_state(states.back());
}

/*
 * Write the buffered line to the renderer's page stream and empty the buffer.
 *
 * Output shape: one <div> carrying the line / transform-matrix / left / height / bottom
 * classes, containing the text interleaved with
 *  - nested <span>s opened by State::begin() whenever a new state starts, and
 *  - whitespace <span>s (or a plain ' ') for the recorded horizontal offsets.
 *
 * A stack of currently open states is maintained (a nullptr sentinel sits at the bottom).
 * When a new state starts, the stack is searched from the top for the open state with the
 * smallest diff() to it; everything above that state is closed and the new state is opened
 * as its child.
 *
 * Nothing is written when the text is empty or when the text does not start with a state.
 */
void HTMLRenderer::TextLineBuffer::flush(void)
{
    /*
     * Each Line is an independent absolute positioned block
     * so even we have a few states or offsets, we may omit them
     */
    // NOTE(review): states/offsets recorded for an empty line stay in the buffer here - confirm callers rely on that
    if(text.empty()) return;

    if(states.empty() || (states[0].start_idx != 0))
    {
        cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;

        // Drop the unusable content. If it were kept, append_state() would record every
        // later state at a non-zero start index, so this check would fail on every
        // following flush as well and the text would keep growing.
        states.clear();
        offsets.clear();
        text.clear();
        return;
    }

    optimize();

    // the line height is derived from the tallest ascent among all states
    double max_ascent = 0;
    for(auto iter = states.begin(); iter != states.end(); ++iter)
    {
        const auto & s = *iter;
        max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
    }

    // append a dummy state for convenience
    // (its start_idx == text.size() terminates the main loop without bound checks)
    states.resize(states.size() + 1);
    states.back().start_idx = text.size();
    
    for(auto iter = states.begin(); iter != states.end(); ++iter)
        iter->hash();

    // append a dummy offset for convenience
    offsets.push_back(Offset({text.size(), 0}));

    ostream & out = renderer->f_pages.fs;
    renderer->height_manager.install(max_ascent);
    renderer->left_manager  .install(x);
    renderer->bottom_manager.install(y);

    out << "<div class=\"" << CSS::LINE_CN
        << " "             << CSS::TRANSFORM_MATRIX_CN << tm_id 
        << " "             << CSS::LEFT_CN             << renderer->left_manager  .get_id()
        << " "             << CSS::HEIGHT_CN           << renderer->height_manager.get_id()
        << " "             << CSS::BOTTOM_CN           << renderer->bottom_manager.get_id()
        << "\">";

    auto cur_state_iter = states.begin();
    auto cur_offset_iter = offsets.begin();

    //accumulated horizontal offset;
    // (the part of a requested offset that the installed whitespace value did not cover)
    double dx = 0;

    stack.clear();
    stack.push_back(nullptr);

    // whenever a negative offset appears, we should not pop out that <span>
    // otherwise the effect of negative margin-left would disappear
    size_t last_text_pos_with_negative_offset = 0;

    size_t cur_text_idx = 0;
    while(cur_text_idx < text.size())
    {
        if(cur_text_idx >= cur_state_iter->start_idx)
        {
            // greedy
            int best_cost = State::ID_COUNT;
            
            // we have a nullptr at the beginning, so no need to check for rend
            for(auto iter = stack.rbegin(); *iter; ++iter)
            {
                // Remember the candidate itself: pop_back() below invalidates iter,
                // whose base iterator refers to the element being removed.
                auto candidate = *iter;

                int cost = cur_state_iter->diff(*candidate);
                if(cost < best_cost)
                {
                    while(stack.back() != candidate)
                    {
                        stack.back()->end(out);
                        stack.pop_back();
                    }
                    // candidate is now the top of the stack; re-acquire a valid iterator to it
                    iter = stack.rbegin();
                    best_cost = cost;

                    if(best_cost == 0)
                        break;
                }

                // cannot go further
                if(candidate->start_idx <= last_text_pos_with_negative_offset)
                    break;
            }
            cur_state_iter->begin(out, stack.back());
            stack.push_back(&*cur_state_iter);

            ++ cur_state_iter;
        }

        if(cur_text_idx >= cur_offset_iter->start_idx)
        {
            double target = cur_offset_iter->width + dx;

            if(equal(target, 0))
            {
                // nothing to shift
                dx = 0;
            }
            else if(equal(target, stack.back()->single_space_offset()))
            {
                // the shift is exactly one space in the current state: emit a real space
                Unicode u = ' ';
                outputUnicodes(out, &u, 1);
                dx = 0;
            }
            else
            {
                auto & wm = renderer->whitespace_manager;
                wm.install(target);
                auto wid = wm.get_id();
                double w = wm.get_actual_value();

                if(w < 0)
                    last_text_pos_with_negative_offset = cur_text_idx;

                auto * p = stack.back();
                double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);

                // shifts at least as wide as the threshold also get a ' ' inside the span
                out << "<span class=\"" << CSS::WHITESPACE_CN
                    << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";

                // carry the difference between requested and installed width to the next offset
                dx = target - w;
            }

            ++ cur_offset_iter;
        }

        // output the text up to the next state change or offset, whichever comes first
        size_t next_text_idx = min<size_t>(cur_state_iter->start_idx, cur_offset_iter->start_idx);

        outputUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
        cur_text_idx = next_text_idx;
    }

    // we have a nullptr in the bottom
    while(stack.back())
    {
        stack.back()->end(out);
        stack.pop_back();
    }

    out << "</div>";


    states.clear();
    offsets.clear();
    text.clear();

}

/*
 * Fill `state` from the renderer's current text style:
 * the per-property class ids (font, font size, fill / stroke color, letter space,
 * word space, rise) plus the actual values that flush() and single_space_offset() need.
 * start_idx and hash_umask are left untouched.
 */
void HTMLRenderer::TextLineBuffer::set_state (State & state)
{
    auto & r = *renderer;
    auto & ids = state.ids;

    // ids of the CSS classes describing the current style
    ids[State::FONT_ID]         = r.cur_font_info->id;
    ids[State::FONT_SIZE_ID]    = r.font_size_manager.get_id();
    ids[State::FILL_COLOR_ID]   = r.fill_color_manager.get_id();
    ids[State::STROKE_COLOR_ID] = r.stroke_color_manager.get_id();
    ids[State::LETTER_SPACE_ID] = r.letter_space_manager.get_id();
    ids[State::WORD_SPACE_ID]   = r.word_space_manager.get_id();
    ids[State::RISE_ID]         = r.rise_manager.get_id();

    // actual values used for metric computations
    state.font_info      = r.cur_font_info;
    state.draw_font_size = r.font_size_manager.get_actual_value();
    state.letter_space   = r.letter_space_manager.get_actual_value();
    state.word_space     = r.word_space_manager.get_actual_value();
}

/*
 * Space optimization pass, called by flush() before anything is written.
 *
 * CURRENTLY DISABLED: the function returns immediately, the code below is kept for
 * future work and is never executed.
 *
 * What the disabled code does:
 *  1. marks word_space as "don't care" (hash_umask) for every state whose text contains no ' ';
 *  2. removes offsets whose width is zero;
 *  3. if no state cares about word_space and the positive offsets are close enough to
 *     their average, makes all positive offsets equally wide and installs a matching
 *     word_space for every state.
 */
void HTMLRenderer::TextLineBuffer::optimize(void)
{
    // this function needs more work
    return;

    assert(!states.empty());

    // set proper hash_umask
    long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID));
    for(auto iter = states.begin(); iter != states.end(); ++iter)
    {
        // [text_iter1, text_iter2) is the text covered by this state
        auto text_iter1 = text.begin() + (iter->start_idx);
        auto next_iter = iter;
        ++next_iter;
        auto text_iter2 =  (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx));
        if(find(text_iter1, text_iter2, ' ') == text_iter2)
        {
            // if there's no space, word_space does not matter;
            iter->hash_umask |= word_space_umask;
        }
    }

    // clean zero offsets
    {
        auto write_iter = offsets.begin();
        for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
        {
            if(!equal(iter->width, 0))
            {
                *write_iter = *iter;
                ++write_iter;
            }
        }
        offsets.erase(write_iter, offsets.end());
    }
    
    // In some PDF files all spaces are converted into positioning shifts
    // We may try to change them to ' ' and adjusted word_spaces
    // This can also be applied when param->space_as_offset is set

    // for now, we consider only the no-space scenario
    if(offsets.size() > 0)
    {
        // Since GCC 4.4.6 is supported, I cannot use all_of + lambda here
        bool all_ws_umask = true;
        for(auto iter = states.begin(); iter != states.end(); ++iter)
        {
            if(!(iter->hash_umask & word_space_umask))
            {
                all_ws_umask = false;
                break;
            }
        }
        if(all_ws_umask)
        {
            double avg_width = 0;
            int positive_offset_count = 0;
            for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
            {
                if(is_positive(iter->width))
                {
                    ++positive_offset_count;
                    avg_width += iter->width;
                }
            }

            // Without any positive offset there is nothing to average: dividing by zero
            // would turn avg_width into NaN, which would then be installed as word_space.
            bool ok = (positive_offset_count > 0);
            if(ok)
            {
                avg_width /= positive_offset_count;

                // now check if the width of offsets are close enough
                // TODO: it might make more sense if the threshold is proportion to the font size
                double accum_off = 0;
                double orig_accum_off = 0;
                for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
                {
                    orig_accum_off += iter->width;
                    accum_off += avg_width;
                    if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
                    {
                        ok = false;
                        break;
                    }
                }
            }
            if(ok)
            {
                // ok, make all offsets equi-width
                for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
                {
                    if(is_positive(iter->width))
                        iter->width = avg_width;
                }
                // set new word_space
                for(auto iter = states.begin(); iter != states.end(); ++iter)
                {
                    // chosen so that single_space_offset() with the new word_space equals avg_width
                    double new_word_space = avg_width - iter->single_space_offset() + iter->word_space;

                    // install new word_space
                    // we might introduce more variance here
                    auto & wm = renderer->word_space_manager;
                    wm.install(new_word_space);
                    iter->ids[State::WORD_SPACE_ID] = wm.get_id();
                    iter->word_space = wm.get_actual_value();
                    iter->hash_umask &= (~word_space_umask);
                }
            }
        }
    }
}

/*
 * Open this state as a child node of prev_state (nullptr when there is no parent).
 *
 * Only the difference to prev_state is written: one CSS class per id that this state
 * cares about and that the parent does not already provide with the same value.
 * Ids this state does not care about (bit set in hash_umask) are never written; if the
 * parent has a value for such an id, it is copied into this state and the id is marked
 * as cared-about from now on.
 *
 * need_close records whether a <span> was actually opened, for end().
 */
void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
{
    bool span_opened = false;
    long long field_mask = 0xff;

    for(int idx = 0; idx < ID_COUNT; ++idx, field_mask <<= 8)
    {
        // the parent exists and has a meaningful value for this id
        const bool parent_has_field = prev_state && (!(prev_state->hash_umask & field_mask));

        if(hash_umask & field_mask)
        {
            // we don't care about this id, but inside the parent we inherit whatever it has set
            if(parent_has_field)
            {
                ids[idx] = prev_state->ids[idx];
                hash_umask &= (~field_mask);
            }
            // either way there is nothing to output
            continue;
        }

        // we care about this id; skip it when the parent already provides the same value
        if(parent_has_field && (prev_state->ids[idx] == ids[idx]))
            continue;

        // the first class opens the tag, the following ones are separated by a blank
        out << (span_opened ? " " : "<span class=\"");
        span_opened = true;

        // out should have hex set
        out << css_class_names[idx];
        if(ids[idx] == -1)
            out << CSS::INVALID_ID;
        else
            out << ids[idx];
    }

    // without any class the whole parent state is inherited and no tag was written
    if(span_opened)
        out << "\">";
    need_close = span_opened;
}

/*
 * Close the <span> opened by begin(), if begin() opened one at all.
 */
void HTMLRenderer::TextLineBuffer::State::end(ostream & out) const
{
    if(!need_close)
        return;

    out << "</span>";
}

/*
 * Pack the low 8 bits of every id into hash_value, one byte per id.
 *
 * The id with index i occupies bits [8*i, 8*i + 8). This is the same layout the
 * hash_umask bits use (0xff << (8*i), see begin(), diff() and optimize()), so that
 * masking hash_value with a hash_umask-derived mask hides exactly the intended ids.
 * Packing in the opposite byte order would make such a mask hide a different id.
 */
void HTMLRenderer::TextLineBuffer::State::hash(void)
{
    hash_value = 0;
    for(int i = 0; i < ID_COUNT; ++i)
    {
        // widen before shifting: the highest ids are shifted beyond the width of int
        hash_value |= static_cast<long long>(ids[i] & 0xff) << (8 * i);
    }
}

int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
{
    /*
     * A quick check based on hash_value
     * it could be wrong when there are more then 256 classes, 
     * in which case the output may not be optimal, but still 'correct' in terms of HTML
     */
    long long common_mask = ~(hash_umask | s.hash_umask);
    if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0;

    long long cur_mask = 0xff;
    int d = 0;
    for(int i = 0; i < ID_COUNT; ++i)
    {
        if((common_mask & cur_mask) && (ids[i] != s.ids[i]))
            ++ d;
        cur_mask <<= 8;
    }
    return d;
}

/*
 * Horizontal advance of one ' ' in this state:
 * word spacing + letter spacing + the font's space width scaled by the font size.
 */
double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const
{
    const auto spacing = word_space + letter_space;
    const auto glyph_advance = font_info->space_width * draw_font_size;
    return spacing + glyph_advance;
}

// CSS class name prefix for each id kept in State::ids.
// State::begin() indexes this table with the same index it uses for ids[],
// so the order must be the same as in the enum (FONT_ID, FONT_SIZE_ID, ...).
const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {
    CSS::FONT_FAMILY_CN,
    CSS::FONT_SIZE_CN,
    CSS::FILL_COLOR_CN,
    CSS::STROKE_COLOR_CN,
    CSS::LETTER_SPACE_CN,
    CSS::WORD_SPACE_CN,
    CSS::RISE_CN
};

} //namespace pdf2htmlEX
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`/*`
separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`* TextLineBuffer.cc`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`*`
			`* Generate and optimized HTML for one line`
			`*`
FontSizeManager 2013-02-05 06:36:36 +00:00			`* Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`*/`

optimize states in HTML 2012-09-05 07:13:21 +00:00			`#include <vector>`
working on space optimization 2013-03-21 04:18:26 +00:00			`#include <cmath>`
working 2013-03-20 15:46:58 +00:00			`#include <algorithm>`
optimize states in HTML 2012-09-05 07:13:21 +00:00
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`#include "HTMLRenderer.h"`
separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`#include "TextLineBuffer.h"`
reorganize files 2012-11-29 09:28:05 +00:00			`#include "util/namespace.h"`
reorganizaing 2012-11-29 09:45:26 +00:00			`#include "util/unicode.h"`
split util.h 2012-11-29 10:28:07 +00:00			`#include "util/math.h"`
consistent css class names 2013-02-27 18:11:34 +00:00			`#include "util/css_const.h"`
url escaping in HTML 2013-02-15 05:07:00 +00:00			`#include "util/encoding.h"`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
add manifest & split-pages 2012-09-12 15:26:14 +00:00			`namespace pdf2htmlEX {`

first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`using std::min;`
			`using std::max;`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`using std::vector;`
successfully built with CYGWIN 2012-09-10 17:53:33 +00:00			`using std::ostream;`
split util.h 2012-11-29 10:28:07 +00:00			`using std::cerr;`
			`using std::endl;`
working 2013-03-20 15:46:58 +00:00			`using std::find;`
working on space optimization 2013-03-21 04:18:26 +00:00			`using std::abs;`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::reset(GfxState * state)`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
			`state->transform(state->getCurX(), state->getCurY(), &x, &y);`
fix matrix 2013-02-05 12:37:05 +00:00			`tm_id = renderer->transform_matrix_manager.get_id();`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l)`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
			`text.insert(text.end(), u, u+l);`
			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::append_offset(double width)`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
			`if((!offsets.empty()) && (offsets.back().start_idx == text.size()))`
			`offsets.back().width += width;`
			`else`
buildable with gcc 4.4.6 2012-09-09 06:48:10 +00:00			`offsets.push_back(Offset({text.size(), width}));`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::append_state(void)`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
			`if(states.empty() \|\| (states.back().start_idx != text.size()))`
			`{`
			`states.resize(states.size() + 1);`
			`states.back().start_idx = text.size();`
working 2013-03-20 15:46:58 +00:00			`states.back().hash_umask = 0;`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`}`

			`set_state(states.back());`
			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::flush(void)`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
			`/*`
.. 2012-12-11 12:52:36 +00:00			`* Each Line is an independent absolute positioned block`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`* so even we have a few states or offsets, we may omit them`
			`*/`
			`if(text.empty()) return;`

			`if(states.empty() \|\| (states[0].start_idx != 0))`
			`{`
			`cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;`
			`return;`
			`}`

working 2013-03-20 15:46:58 +00:00			`optimize();`

first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`double max_ascent = 0;`
buildable with gcc 4.4.6 2012-09-09 06:48:10 +00:00			`for(auto iter = states.begin(); iter != states.end(); ++iter)`
			`{`
			`const auto & s = *iter;`
working 2013-03-20 15:46:58 +00:00			`max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);`
buildable with gcc 4.4.6 2012-09-09 06:48:10 +00:00			`}`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
working on space optimization 2013-03-21 04:18:26 +00:00			`// append a dummy state for convenience`
			`states.resize(states.size() + 1);`
			`states.back().start_idx = text.size();`

			`for(auto iter = states.begin(); iter != states.end(); ++iter)`
			`iter->hash();`

			`// append a dummy offset for convenience`
			`offsets.push_back(Offset({text.size(), 0}));`

refactor file objects 2013-01-28 10:46:44 +00:00			`ostream & out = renderer->f_pages.fs;`
height 2013-02-05 06:51:00 +00:00			`renderer->height_manager.install(max_ascent);`
more state managers 2013-02-06 12:09:40 +00:00			`renderer->left_manager .install(x);`
			`renderer->bottom_manager.install(y);`

			`out << "<div class=\"" << CSS::LINE_CN`
			`<< " " << CSS::TRANSFORM_MATRIX_CN << tm_id`
			`<< " " << CSS::LEFT_CN << renderer->left_manager .get_id()`
			`<< " " << CSS::HEIGHT_CN << renderer->height_manager.get_id()`
			`<< " " << CSS::BOTTOM_CN << renderer->bottom_manager.get_id()`
add class for height 2012-09-16 07:53:41 +00:00			`<< "\">";`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
			`auto cur_state_iter = states.begin();`
			`auto cur_offset_iter = offsets.begin();`

optimize states in HTML 2012-09-05 07:13:21 +00:00			`//accumulated horizontal offset;`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`double dx = 0;`

removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`stack.clear();`
			`stack.push_back(nullptr);`

			`// whenever a negative offset appears, we should not pop out that <span>`
			`// otherwise the effect of negative margin-left would disappear`
fix state optimization 2012-09-06 11:05:49 +00:00			`size_t last_text_pos_with_negative_offset = 0;`
optimize states in HTML 2012-09-05 07:13:21 +00:00
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`size_t cur_text_idx = 0;`
			`while(cur_text_idx < text.size())`
			`{`
			`if(cur_text_idx >= cur_state_iter->start_idx)`
			`{`
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`// greedy`
			`int best_cost = State::ID_COUNT;`

			`// we have a nullptr at the beginning, so no need to check for rend`
			`for(auto iter = stack.rbegin(); *iter; ++iter)`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`{`
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`int cost = cur_state_iter->diff(**iter);`
			`if(cost < best_cost)`
			`{`
			`while(stack.back() != *iter)`
			`{`
			`stack.back()->end(out);`
			`stack.pop_back();`
			`}`
			`best_cost = cost;`

			`if(best_cost == 0)`
			`break;`
			`}`
optimize states in HTML 2012-09-05 07:13:21 +00:00
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`// cannot go further`
			`if((*iter)->start_idx <= last_text_pos_with_negative_offset)`
			`break;`
			`}`
			`cur_state_iter->begin(out, stack.back());`
			`stack.push_back(&*cur_state_iter);`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
			`++ cur_state_iter;`
			`}`

			`if(cur_text_idx >= cur_offset_iter->start_idx)`
			`{`
			`double target = cur_offset_iter->width + dx;`

working on line merging 2013-03-24 09:18:13 +00:00			`if(equal(target, 0))`
			`{`
			`dx = 0;`
			`}`
			`else if(equal(target, stack.back()->single_space_offset()))`
working on space optimization 2013-03-21 04:18:26 +00:00			`{`
			`Unicode u = ' ';`
			`outputUnicodes(out, &u, 1);`
			`dx = 0;`
			`}`
			`else`
			`{`
			`auto & wm = renderer->whitespace_manager;`
			`wm.install(target);`
			`auto wid = wm.get_id();`
			`double w = wm.get_actual_value();`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
working on space optimization 2013-03-21 04:18:26 +00:00			`if(w < 0)`
			`last_text_pos_with_negative_offset = cur_text_idx;`
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00
working on space optimization 2013-03-21 04:18:26 +00:00			`auto * p = stack.back();`
			`double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);`
option 'space-as-offset' 2012-09-07 00:39:21 +00:00
working on space optimization 2013-03-21 04:18:26 +00:00			`out << "<span class=\"" << CSS::WHITESPACE_CN`
			`<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
working on space optimization 2013-03-21 04:18:26 +00:00			`dx = target - w;`
			`}`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
			`++ cur_offset_iter;`
			`}`

working on linear gradient 2012-10-02 18:19:40 +00:00			`size_t next_text_idx = min<size_t>(cur_state_iter->start_idx, cur_offset_iter->start_idx);`
.. 2012-09-10 19:01:02 +00:00
			`outputUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`cur_text_idx = next_text_idx;`
			`}`

optimize states in HTML 2012-09-05 07:13:21 +00:00			`// we have a nullptr in the bottom`
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`while(stack.back())`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`{`
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`stack.back()->end(out);`
			`stack.pop_back();`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`}`

first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`out << "</div>";`


			`states.clear();`
			`offsets.clear();`
			`text.clear();`

			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::set_state (State & state)`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
			`state.ids[State::FONT_ID] = renderer->cur_font_info->id;`
FontSizeManager 2013-02-05 06:36:36 +00:00			`state.ids[State::FONT_SIZE_ID] = renderer->font_size_manager.get_id();`
color managers 2013-02-05 13:56:19 +00:00			`state.ids[State::FILL_COLOR_ID] = renderer->fill_color_manager.get_id();`
			`state.ids[State::STROKE_COLOR_ID] = renderer->stroke_color_manager.get_id();`
FontSizeManager 2013-02-05 06:36:36 +00:00			`state.ids[State::LETTER_SPACE_ID] = renderer->letter_space_manager.get_id();`
			`state.ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id();`
rise 2013-02-05 06:45:40 +00:00			`state.ids[State::RISE_ID] = renderer->rise_manager.get_id();`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00
working 2013-03-20 15:46:58 +00:00			`state.font_info = renderer->cur_font_info;`
working on space optimization 2013-03-21 04:18:26 +00:00			`state.draw_font_size = renderer->font_size_manager.get_actual_value();`
			`state.letter_space = renderer->letter_space_manager.get_actual_value();`
			`state.word_space = renderer->word_space_manager.get_actual_value();`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`}`

working 2013-03-20 15:46:58 +00:00			`void HTMLRenderer::TextLineBuffer::optimize(void)`
			`{`
temporarily disable the optimize function 2013-03-21 14:29:38 +00:00			`// this function needs more work`
			`return;`

working 2013-03-20 15:46:58 +00:00			`assert(!states.empty());`

			`// set proper hash_umask`
working on space optimization 2013-03-21 04:18:26 +00:00			`long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID));`
			`for(auto iter = states.begin(); iter != states.end(); ++iter)`
			`{`
			`auto text_iter1 = text.begin() + (iter->start_idx);`
			`auto next_iter = iter;`
			`++next_iter;`
			`auto text_iter2 = (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx));`
			`if(find(text_iter1, text_iter2, ' ') == text_iter2)`
			`{`
			`// if there's no space, word_space does not matter;`
			`iter->hash_umask \|= word_space_umask;`
			`}`
			`}`

			`// clean zero offsets`
			`{`
			`auto write_iter = offsets.begin();`
			`for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)`
			`{`
			`if(!equal(iter->width, 0))`
			`{`
			`write_iter = iter;`
			`++write_iter;`
			`}`
			`}`
			`offsets.erase(write_iter, offsets.end());`
			`}`
working 2013-03-20 15:46:58 +00:00
			`// In some PDF files all spaces are converted into positionig shifts`
			`// We may try to change them to ' ' and adjusted word_spaces`
			`// This can also be applied when param->space_as_offset is set`

working on space optimization 2013-03-21 04:18:26 +00:00			`// for now, we cosider only the no-space scenario`
			`if(offsets.size() > 0)`
			`{`
			`// Since GCC 4.4.6 is suported, I cannot use all_of + lambda here`
			`bool all_ws_umask = true;`
			`for(auto iter = states.begin(); iter != states.end(); ++iter)`
			`{`
			`if(!(iter->hash_umask & word_space_umask))`
			`{`
			`all_ws_umask = false;`
			`break;`
			`}`
			`}`
			`if(all_ws_umask)`
			`{`
			`double avg_width = 0;`
			`int posive_offset_count = 0;`
			`for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)`
			`{`
			`if(is_positive(iter->width))`
			`{`
			`++posive_offset_count;`
			`avg_width += iter->width;`
			`}`
			`}`
			`avg_width /= posive_offset_count;`

			`// now check if the width of offsets are close enough`
			`// TODO: it might make more sense if the threshold is proportion to the font size`
			`bool ok = true;`
			`double accum_off = 0;`
			`double orig_accum_off = 0;`
			`for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)`
			`{`
			`orig_accum_off += iter->width;`
			`accum_off += avg_width;`
			`if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)`
			`{`
			`ok = false;`
			`break;`
			`}`
			`}`
			`if(ok)`
			`{`
			`// ok, make all offsets equi-width`
			`for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)`
			`{`
			`if(is_positive(iter->width))`
			`iter->width = avg_width;`
			`}`
			`// set new word_space`
			`for(auto iter = states.begin(); iter != states.end(); ++iter)`
			`{`
fix space optimization 2013-03-21 04:27:07 +00:00			`double new_word_space = avg_width - iter->single_space_offset() + iter->word_space;`
working on space optimization 2013-03-21 04:18:26 +00:00
			`// install new word_space`
			`// we might introduce more variance here`
			`auto & wm = renderer->word_space_manager;`
			`wm.install(new_word_space);`
			`iter->ids[State::WORD_SPACE_ID] = wm.get_id();`
			`iter->word_space = wm.get_actual_value();`
			`iter->hash_umask &= (~word_space_umask);`
			`}`
			`}`
			`}`
			`}`
working 2013-03-20 15:46:58 +00:00			`}`

			`// this state will be converted to a child node of the node of prev_state`
			`// dump the difference between previous state`
			`// also clone corresponding states`
separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`{`
working 2013-03-20 15:46:58 +00:00			`long long cur_mask = 0xff;`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`bool first = true;`
working 2013-03-20 15:46:58 +00:00			`for(int i = 0; i < ID_COUNT; ++i, cur_mask<<=8)`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
working 2013-03-20 15:46:58 +00:00			`if(hash_umask & cur_mask) // we don't care about this ID`
			`{`
			`if (prev_state && (!(prev_state->hash_umask & cur_mask))) // if prev_state have it set`
			`{`
			`// we have to inherit it`
			`ids[i] = prev_state->ids[i];`
			`hash_umask &= (~cur_mask);`
			`}`
			`//anyway we don't have to output it`
			`continue;`
			`}`

			`// now we care about the ID`
			`if(prev_state && (!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`continue;`

			`if(first)`
			`{`
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`out << "<span class=\"";`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`first = false;`
			`}`
			`else`
			`{`
			`out << ' ';`
			`}`

stroked text 2013-01-31 22:21:57 +00:00			`// out should have hex set`
color managers 2013-02-05 13:56:19 +00:00			`out << css_class_names[i];`
stroked text 2013-01-31 22:21:57 +00:00			`if (ids[i] == -1)`
color managers 2013-02-05 13:56:19 +00:00			`out << CSS::INVALID_ID;`
stroked text 2013-01-31 22:21:57 +00:00			`else`
color managers 2013-02-05 13:56:19 +00:00			`out << ids[i];`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`}`
optimize states in HTML 2012-09-05 07:13:21 +00:00
working 2013-03-20 15:46:58 +00:00			`if(first) // we actually just inherit the whole prev_state`
removed 'optmized' option, use greedy method automatically 2012-09-06 06:37:09 +00:00			`{`
			`need_close = false;`
			`}`
			`else`
			`{`
			`out << "\">";`
			`need_close = true;`
			`}`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::State::end(ostream & out) const`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`{`
			`if(need_close)`
			`out << "</span>";`
			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`void HTMLRenderer::TextLineBuffer::State::hash(void)`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`{`
			`hash_value = 0;`
			`for(int i = 0; i < ID_COUNT; ++i)`
			`{`
			`hash_value = (hash_value << 8) \| (ids[i] & 0xff);`
			`}`
			`}`

separate TextLineBuffer 2012-12-11 12:48:01 +00:00			`int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`{`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`/*`
			`* A quick check based on hash_value`
			`* it could be wrong when there are more then 256 classes,`
add an option 'optimize' 2012-09-05 08:19:01 +00:00			`* in which case the output may not be optimal, but still 'correct' in terms of HTML`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`*/`
working 2013-03-20 15:46:58 +00:00			`long long common_mask = ~(hash_umask \| s.hash_umask);`
			`if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0;`
optimize states in HTML 2012-09-05 07:13:21 +00:00
working 2013-03-20 15:46:58 +00:00			`long long cur_mask = 0xff;`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`int d = 0;`
			`for(int i = 0; i < ID_COUNT; ++i)`
working 2013-03-20 15:46:58 +00:00			`{`
			`if((common_mask & cur_mask) && (ids[i] != s.ids[i]))`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`++ d;`
working 2013-03-20 15:46:58 +00:00			`cur_mask <<= 8;`
			`}`
optimize states in HTML 2012-09-05 07:13:21 +00:00			`return d;`
first implemenation of LineBuffer 2012-09-04 15:33:15 +00:00			`}`

working on space optimization 2013-03-21 04:18:26 +00:00			`double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const`
			`{`
fix space optimization 2013-03-21 04:27:07 +00:00			`return word_space + letter_space + font_info->space_width * draw_font_size;`
working on space optimization 2013-03-21 04:18:26 +00:00			`}`

css class name constants 2013-02-05 10:19:25 +00:00			`// the order should be the same as in the enum`
			`const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {`
change css names 2013-02-28 07:59:14 +00:00			`CSS::FONT_FAMILY_CN,`
css class name constants 2013-02-05 10:19:25 +00:00			`CSS::FONT_SIZE_CN,`
			`CSS::FILL_COLOR_CN,`
			`CSS::STROKE_COLOR_CN,`
			`CSS::LETTER_SPACE_CN,`
			`CSS::WORD_SPACE_CN,`
			`CSS::RISE_CN`
			`};`

add manifest & split-pages 2012-09-12 15:26:14 +00:00			`} //namespace pdf2htmlEX`