1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

improve space optimization

This commit is contained in:
Lu Wang 2013-03-31 01:00:04 +08:00
parent 495b04f046
commit b1121bf02b
3 changed files with 97 additions and 114 deletions

View File

@ -191,7 +191,6 @@ void HTMLRenderer::TextLineBuffer::flush(void)
out << "<span class=\"" << CSS::WHITESPACE_CN
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
}
}
}
@ -239,93 +238,77 @@ void HTMLRenderer::TextLineBuffer::optimize(void)
{
assert(!states.empty());
auto offset_iter = offsets.begin();
std::map<double, int> width_map;
// set proper hash_umask
long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID);
for(auto iter = states.begin(); iter != states.end(); ++iter)
for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++;
state_iter1 != states.end();
++state_iter1, ++state_iter2)
{
auto text_iter1 = text.begin() + (iter->start_idx);
auto next_iter = iter;
++next_iter;
auto text_iter2 = (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx));
if(find(text_iter1, text_iter2, ' ') == text_iter2)
{
// if there's no space, word_space does not matter;
iter->hash_umask |= word_space_umask;
}
}
// In some PDF files all spaces are converted into positioning shifts
// We may try to change them to ' ' and adjust word_spaces accordingly
// This can also be applied when param->space_as_offset is set
size_t text_idx1 = state_iter1->start_idx;
size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
// for now, we consider only the no-space scenario
if(offsets.size() > 0)
{
// Since GCC 4.4.6 is supported, I cannot use all_of + lambda here
bool all_ws_umask = true;
for(auto iter = states.begin(); iter != states.end(); ++iter)
// get the text segment covered by current state (*state_iter1)
auto text_iter1 = text.begin() + text_idx1;
auto text_iter2 = text.begin() + text_idx2;
// In some PDF files all spaces are converted into positioning shifts
// We may try to change (some of) them to ' ' and adjust word_space accordingly
// This can also be applied when param->space_as_offset is set
// for now, we consider only the no-space scenario
if(find(text_iter1, text_iter2, ' ') != text_iter2)
continue;
// if there is not any space, we may change the value of word_space arbitrarily
// collect widths
width_map.clear();
while((offset_iter != offsets.end()) && (offset_iter->start_idx < text_idx1))
++ offset_iter;
for(; (offset_iter != offsets.end()) && (offset_iter->start_idx < text_idx2); ++offset_iter)
{
if(!(iter->hash_umask & word_space_umask))
double target = offset_iter->width;
auto iter = width_map.lower_bound(target-EPS);
if((iter != width_map.end()) && (abs(iter->first - target) <= EPS))
{
all_ws_umask = false;
break;
++ iter->second;
}
else
{
width_map.insert(iter, std::make_pair(target, 1));
}
}
if(all_ws_umask)
if(width_map.empty())
{
double avg_width = 0;
int posive_offset_count = 0;
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
// if there is no offset at all
// we just free word_space
state_iter1->hash_umask |= word_space_umask;
continue;
}
// set word_space for the most frequently used offset
double most_used_width = 0;
int max_count = 0;
for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
{
if(iter->second > max_count)
{
if(is_positive(iter->width))
{
++posive_offset_count;
avg_width += iter->width;
}
}
if(posive_offset_count > 0)
{
avg_width /= posive_offset_count;
// now check if the width of offsets are close enough
// TODO: it might make more sense if the threshold is proportion to the font size
bool ok = true;
double accum_off = 0;
double orig_accum_off = 0;
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
orig_accum_off += iter->width;
accum_off += avg_width;
if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
{
ok = false;
break;
}
}
if(ok)
{
// ok, make all offsets equi-width
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
if(is_positive(iter->width))
iter->width = avg_width;
}
// set new word_space
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
iter->word_space = 0;
double new_word_space = avg_width - iter->single_space_offset();
// install new word_space
// we might introduce more variance here
iter->ids[State::WORD_SPACE_ID] = renderer->word_space_manager.install(new_word_space, &(iter->word_space));
iter->hash_umask &= (~word_space_umask);
}
}
max_count = iter->second;
most_used_width = iter->first;
}
}
}
state_iter1->word_space = 0;
double new_word_space = most_used_width - state_iter1->single_space_offset();
// install new word_space
state_iter1->ids[State::WORD_SPACE_ID] = renderer->word_space_manager.install(new_word_space, &(state_iter1->word_space));
// mark that the word_space is not free
state_iter1->hash_umask &= (~word_space_umask);
}
}
// this state will be converted to a child node of the node of prev_state

View File

@ -20,53 +20,53 @@ public:
TextLineBuffer (HTMLRenderer * renderer) : renderer(renderer) { }
class State {
public:
// before output
void begin(std::ostream & out, const State * prev_state);
// after output
void end(std::ostream & out) const;
// calculate the hash code
void hash(void);
// calculate the difference between another State
int diff(const State & s) const;
// the offset caused by a single ' ' char
double single_space_offset(void) const;
public:
// before output
void begin(std::ostream & out, const State * prev_state);
// after output
void end(std::ostream & out) const;
// calculate the hash code
void hash(void);
// calculate the difference between another State
int diff(const State & s) const;
// the offset caused by a single ' ' char
double single_space_offset(void) const;
enum {
FONT_ID,
FONT_SIZE_ID,
FILL_COLOR_ID,
STROKE_COLOR_ID,
LETTER_SPACE_ID,
WORD_SPACE_ID,
RISE_ID,
enum {
FONT_ID,
FONT_SIZE_ID,
FILL_COLOR_ID,
STROKE_COLOR_ID,
LETTER_SPACE_ID,
WORD_SPACE_ID,
RISE_ID,
ID_COUNT
};
ID_COUNT
};
static long long umask_by_id(int id);
static long long umask_by_id(int id);
long long ids[ID_COUNT];
long long ids[ID_COUNT];
const FontInfo * font_info;
double draw_font_size;
double letter_space;
double word_space;
const FontInfo * font_info;
double draw_font_size;
double letter_space;
double word_space;
size_t start_idx; // index of the first Text using this state
// for optimization
long long hash_value;
long long hash_umask; // some states may not be actually used
bool need_close;
size_t start_idx; // index of the first Text using this state
// for optimization
long long hash_value;
long long hash_umask; // some states may not be actually used
bool need_close;
static const char * const css_class_names []; // class names for each id
static const char * const css_class_names []; // class names for each id
};
class Offset {
public:
size_t start_idx; // should put this Offset right before text[start_idx];
double width;
public:
size_t start_idx; // should put this Offset right before text[start_idx];
double width;
};
void set_pos(GfxState * state);

View File

@ -66,7 +66,7 @@ public:
}
long long id = value_map.size();
double v = value_map.insert(std::make_pair(new_value, id)).first->first;
double v = value_map.insert(iter, std::make_pair(new_value, id))->first;
if(actual_value_ptr != nullptr)
*actual_value_ptr = v;
return id;
@ -158,7 +158,7 @@ protected:
}
long long id = value_map.size();
value_map.insert(std::make_pair(new_value, id));
value_map.insert(iter, std::make_pair(new_value, id));
return id;
}