diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index e90ed63..45f09ef 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -243,6 +243,10 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state) state.word_space = renderer->word_space_manager.get_actual_value(); } +/* + * Adjust letter space and word space in order to reduce the number of HTML elements + * May also unmask word space + */ void HTMLRenderer::TextLineBuffer::optimize() { if(!(renderer->param->optimize_text)) @@ -250,97 +254,172 @@ void HTMLRenderer::TextLineBuffer::optimize() assert(!states.empty()); + const long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID); + // for optimization, we need accurate values + auto & ls_manager = renderer->letter_space_manager; + double old_ls_eps = ls_manager.get_eps(); + ls_manager.set_eps(EPS); auto & ws_manager = renderer->word_space_manager; double old_ws_eps = ws_manager.get_eps(); ws_manager.set_eps(EPS); - auto offset_iter1 = offsets.begin(); + // statistics of widths std::map width_map; + // store optimized offsets + std::vector new_offsets; + new_offsets.reserve(offsets.size()); - // set proper hash_umask - long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID); + auto offset_iter1 = offsets.begin(); for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++; state_iter1 != states.end(); ++state_iter1, ++state_iter2) { - size_t text_idx1 = state_iter1->start_idx; - size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx; + const size_t text_idx1 = state_iter1->start_idx; + const size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx; // get the text segment covered by current state (*state_iter1) - auto text_iter1 = text.begin() + text_idx1; - auto text_iter2 = text.begin() + text_idx2; + const auto text_iter1 = text.begin() + text_idx1; + const auto text_iter2 = text.begin() + text_idx2; + size_t text_count = text_idx2 - text_idx1; while((offset_iter1 != offsets.end()) && (offset_iter1->start_idx <= text_idx1)) ++ offset_iter1; auto offset_iter2 = offset_iter1; for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { } + // There are `offset_count` 's, the target is to reduce this number + size_t offset_count = offset_iter2 - offset_iter1; + assert(text_count >= offset_count); + + double letter_space_diff = 0; // will be later used for optimizing word space + width_map.clear(); + // Optimize letter space // In some PDF files all letter spaces are implemented as position shifts between each letter // try to simplify it with a proper letter space - - // In some PDF files all spaces are converted into positionig shift - // We may try to change (some of) them to ' ' and adjust word_space accordingly - // This can also be applied when param->space_as_offset is set - // for now, we cosider only the no-space scenario - if(find(text_iter1, text_iter2, ' ') != text_iter2) - continue; - - // if there is not any space, we may change the value of word_space arbitrarily - // collect widths - width_map.clear(); - - - double threshold = (state_iter1->em_size()) * (renderer->param->space_threshold); - for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter) + if(offset_count > 0) { - double target = off_iter->width; - // we don't want to add spaces for tiny gaps, or even negative shifts - if(target < threshold - EPS) - continue; + // mark the current letter_space + if(text_count > offset_count) + width_map.insert(std::make_pair(0, text_count - offset_count)); - auto iter = width_map.lower_bound(target-EPS); - if((iter != width_map.end()) && (abs(iter->first - target) <= EPS)) + for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter) { - ++ iter->second; + const double target = off_iter->width; + auto iter = width_map.lower_bound(target-EPS); + if((iter != width_map.end()) && (abs(iter->first - target) <= EPS)) + { + ++ iter->second; + } + else + { + width_map.insert(iter, std::make_pair(target, 1)); + } + } + + double most_used_width = 0; + int max_count = 0; + for(auto iter = width_map.begin(); iter != width_map.end(); ++iter) + { + if(iter->second > max_count) + { + most_used_width = iter->first; + max_count = iter->second; + } + } + + // now we would like to adjust letter space to most_used width + if(equal(most_used_width, 0)) + { + // the old value is the best + // just copy copy offsets + new_offsets.insert(new_offsets.end(), offset_iter1, offset_iter2); } else { - width_map.insert(iter, std::make_pair(target, 1)); + // install new letter space + const double old_ls = state_iter1->letter_space; + state_iter1->ids[State::LETTER_SPACE_ID] = ls_manager.install(old_ls + most_used_width, &(state_iter1->letter_space)); + letter_space_diff = old_ls - state_iter1->letter_space; + // update offsets + auto off_iter = offset_iter1; + // re-count number of offsets + offset_count = 0; + for(size_t cur_text_idx = text_idx1; cur_text_idx < text_idx2; ++cur_text_idx) + { + double cur_width = 0; + if((off_iter != offset_iter2) && (off_iter->start_idx == cur_text_idx + 1)) + { + cur_width = off_iter->width + letter_space_diff; + ++off_iter; + } + else + { + cur_width = letter_space_diff ; + } + if(!equal(cur_width, 0)) + { + new_offsets.push_back(Offset({cur_text_idx+1, cur_width})); + ++ offset_count; + } + } } } - if(width_map.empty()) - { - // if there is no offset at all - // we just free word_space - state_iter1->hash_umask |= word_space_umask; - continue; - } - // set word_space for the most frequently used offset - double most_used_width = 0; - int max_count = 0; - for(auto iter = width_map.begin(); iter != width_map.end(); ++iter) - { - if(iter->second > max_count) - { - max_count = iter->second; - most_used_width = iter->first; - } - } + // Optimize word space - state_iter1->word_space = 0; - double new_word_space = most_used_width - state_iter1->single_space_offset(); - // install new word_space - state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); - // mark that the word_space is not free - state_iter1->hash_umask &= (~word_space_umask); + // In some PDF files all spaces are converted into positionig shift + // We may try to change (some of) them to ' ' by adjusting word_space + // for now, we cosider only the no-space scenario + // which also includes the case when param->space_as_offset is set + if(find(text_iter1, text_iter2, ' ') == text_iter2) + { + // if there is not any space, we may change the value of word_space arbitrarily + // note that we may only change word space, no offset will be affected + // The actual effect will emerge during flushing, where it could be detected that an offset can be optimized as a single space character + + if(offset_count > 0) + { + double threshold = (state_iter1->em_size()) * (renderer->param->space_threshold); + // set word_space for the most frequently used offset + double most_used_width = 0; + int max_count = 0; + // if offset_count > 0, we must have updated width_map in the previous step + // find the most frequent width, with new letter space applied + for(auto iter = width_map.begin(); iter != width_map.end(); ++iter) + { + double fixed_width = iter->first + letter_space_diff; + // we don't want to add spaces for tiny gaps, or even negative shifts + if((fixed_width >= threshold - EPS) && (iter->second > max_count)) + { + max_count = iter->second; + most_used_width = fixed_width; + } + } + state_iter1->word_space = 0; + double new_word_space = most_used_width - state_iter1->single_space_offset(); + // install new word_space + state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); + // mark that the word_space is not free + state_iter1->hash_umask &= (~word_space_umask); + } + else + { + // if there is no offset at all + // we just free word_space + state_iter1->hash_umask |= word_space_umask; + } + } offset_iter1 = offset_iter2; } + + // apply optimization + std::swap(offsets, new_offsets); // restore old eps + ls_manager.set_eps(old_ls_eps); ws_manager.set_eps(old_ws_eps); }