diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc index 383683d..e90ed63 100644 --- a/src/HTMLRenderer/TextLineBuffer.cc +++ b/src/HTMLRenderer/TextLineBuffer.cc @@ -255,10 +255,9 @@ void HTMLRenderer::TextLineBuffer::optimize() double old_ws_eps = ws_manager.get_eps(); ws_manager.set_eps(EPS); - auto offset_iter = offsets.begin(); + auto offset_iter1 = offsets.begin(); std::map width_map; - // optimize word space // set proper hash_umask long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID); for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++; @@ -272,6 +271,14 @@ void HTMLRenderer::TextLineBuffer::optimize() auto text_iter1 = text.begin() + text_idx1; auto text_iter2 = text.begin() + text_idx2; + while((offset_iter1 != offsets.end()) && (offset_iter1->start_idx <= text_idx1)) + ++ offset_iter1; + auto offset_iter2 = offset_iter1; + for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { } + + // In some PDF files all letter spaces are implemented as position shifts between each letter + // try to simplify it with a proper letter space + // In some PDF files all spaces are converted into positionig shift // We may try to change (some of) them to ' ' and adjust word_space accordingly // This can also be applied when param->space_as_offset is set @@ -283,13 +290,11 @@ void HTMLRenderer::TextLineBuffer::optimize() // collect widths width_map.clear(); - while((offset_iter != offsets.end()) && (offset_iter->start_idx <= text_idx1)) - ++ offset_iter; double threshold = (state_iter1->em_size()) * (renderer->param->space_threshold); - for(; (offset_iter != offsets.end()) && (offset_iter->start_idx <= text_idx2); ++offset_iter) + for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter) { - double target = offset_iter->width; + double target = off_iter->width; // we don't want to add spaces for tiny gaps, or even negative shifts if(target < threshold - EPS) continue; @@ -330,6 +335,9 @@ void HTMLRenderer::TextLineBuffer::optimize() state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); // mark that the word_space is not free state_iter1->hash_umask &= (~word_space_umask); + + + offset_iter1 = offset_iter2; } // restore old eps