mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
clean code
This commit is contained in:
parent
ae35bb24b6
commit
7cc6f1efcd
@ -42,17 +42,22 @@ void HTMLRenderer::TextLineBuffer::append_unicodes(const Unicode * u, int l)
|
||||
|
||||
void HTMLRenderer::TextLineBuffer::append_offset(double width)
|
||||
{
|
||||
/*
|
||||
* If the last offset is very thin, we can ignore it and directly use it
|
||||
* But this should not happen often, and we will also filter near-zero offsets when outputting them
|
||||
* So don't check it
|
||||
*/
|
||||
if((!offsets.empty()) && (offsets.back().start_idx == text.size()))
|
||||
offsets.back().width += width;
|
||||
else
|
||||
offsets.push_back(Offset({text.size(), width}));
|
||||
offsets.emplace_back(text.size(), width);
|
||||
}
|
||||
|
||||
void HTMLRenderer::TextLineBuffer::append_state(void)
|
||||
{
|
||||
if(states.empty() || (states.back().start_idx != text.size()))
|
||||
{
|
||||
states.resize(states.size() + 1);
|
||||
states.emplace_back();
|
||||
states.back().start_idx = text.size();
|
||||
states.back().hash_umask = 0;
|
||||
}
|
||||
@ -73,6 +78,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
return;
|
||||
}
|
||||
|
||||
// remove unuseful states in the end
|
||||
while((!states.empty()) && (states.back().start_idx >= text.size()))
|
||||
states.pop_back();
|
||||
|
||||
@ -84,24 +90,27 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
return;
|
||||
}
|
||||
|
||||
// optimize before output
|
||||
optimize();
|
||||
|
||||
double max_ascent = 0;
|
||||
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||
{
|
||||
const auto & s = *iter;
|
||||
max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
|
||||
}
|
||||
|
||||
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||
iter->hash();
|
||||
|
||||
// Start Output
|
||||
ostream & out = renderer->f_pages.fs;
|
||||
{
|
||||
// max_ascent determines the height of the div
|
||||
double max_ascent = 0;
|
||||
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||
{
|
||||
double cur_ascent = iter->font_info->ascent * iter->draw_font_size;
|
||||
if(cur_ascent > max_ascent)
|
||||
max_ascent = cur_ascent;
|
||||
iter->hash();
|
||||
}
|
||||
|
||||
long long hid = renderer->height_manager.install(max_ascent);
|
||||
long long lid = renderer->left_manager .install(x);
|
||||
long long bid = renderer->bottom_manager.install(y);
|
||||
|
||||
// open <div> for the current text line
|
||||
out << "<div class=\"" << CSS::LINE_CN
|
||||
<< " " << CSS::TRANSFORM_MATRIX_CN << tm_id
|
||||
<< " " << CSS::LEFT_CN << lid
|
||||
@ -110,6 +119,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
<< "\">";
|
||||
}
|
||||
|
||||
// a special safeguard in the bottom
|
||||
stack.clear();
|
||||
stack.push_back(nullptr);
|
||||
|
||||
@ -151,16 +161,19 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
if((*iter)->start_idx <= last_text_pos_with_negative_offset)
|
||||
break;
|
||||
}
|
||||
// export the diff between *state_iter1 and stack.back()
|
||||
state_iter1->begin(out, stack.back());
|
||||
stack.push_back(&*state_iter1);
|
||||
}
|
||||
|
||||
// [state_iter1->start_idx, text_idx2) are covered by the current state
|
||||
size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
|
||||
|
||||
// dump all text and offsets before next state
|
||||
while(true)
|
||||
{
|
||||
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx <= cur_text_idx))
|
||||
if((cur_offset_iter != offsets.end())
|
||||
&& (cur_offset_iter->start_idx <= cur_text_idx))
|
||||
{
|
||||
if(cur_offset_iter->start_idx > text_idx2)
|
||||
break;
|
||||
@ -168,6 +181,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
double target = cur_offset_iter->width + dx;
|
||||
double actual_offset = 0;
|
||||
|
||||
//ignore near-zero offsets
|
||||
if(abs(target) <= renderer->param->h_eps)
|
||||
{
|
||||
actual_offset = 0;
|
||||
@ -175,6 +189,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
else
|
||||
{
|
||||
bool done = false;
|
||||
// check if the offset is equivalent to a single ' '
|
||||
if(!(state_iter1->hash_umask & State::umask_by_id(State::WORD_SPACE_ID)))
|
||||
{
|
||||
double space_off = state_iter1->single_space_offset();
|
||||
@ -187,6 +202,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
}
|
||||
}
|
||||
|
||||
// finally, just dump it
|
||||
if(!done)
|
||||
{
|
||||
long long wid = renderer->whitespace_manager.install(target, &actual_offset);
|
||||
@ -236,6 +252,8 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||
|
||||
void HTMLRenderer::TextLineBuffer::set_state (State & state)
|
||||
{
|
||||
// TODO: as letter_space and word_space may be modified (optimization)
|
||||
// we should not install them so early
|
||||
state.ids[State::FONT_ID] = renderer->cur_font_info->id;
|
||||
state.ids[State::FONT_SIZE_ID] = renderer->font_size_manager.get_id();
|
||||
state.ids[State::FILL_COLOR_ID] = renderer->fill_color_manager.get_id();
|
||||
@ -284,25 +302,28 @@ void HTMLRenderer::TextLineBuffer::optimize()
|
||||
{
|
||||
const size_t text_idx1 = state_iter1->start_idx;
|
||||
const size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
|
||||
|
||||
// get the text segment covered by current state (*state_iter1)
|
||||
const auto text_iter1 = text.begin() + text_idx1;
|
||||
const auto text_iter2 = text.begin() + text_idx2;
|
||||
size_t text_count = text_idx2 - text_idx1;
|
||||
|
||||
while((offset_iter1 != offsets.end()) && (offset_iter1->start_idx <= text_idx1))
|
||||
// there might be some offsets before the first state
|
||||
while((offset_iter1 != offsets.end())
|
||||
&& (offset_iter1->start_idx <= text_idx1))
|
||||
{
|
||||
new_offsets.push_back(*(offset_iter1++));
|
||||
}
|
||||
|
||||
// find the last offset covered by the current state
|
||||
auto offset_iter2 = offset_iter1;
|
||||
for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { }
|
||||
|
||||
// There are `offset_count` <span>'s, the target is to reduce this number
|
||||
size_t offset_count = offset_iter2 - offset_iter1;
|
||||
assert(text_count >= offset_count);
|
||||
|
||||
double letter_space_diff = 0; // will be later used for optimizing word space
|
||||
width_map.clear();
|
||||
|
||||
// Optimize letter space
|
||||
// how much letter_space is changed
|
||||
// will be later used for optimizing word space
|
||||
double letter_space_diff = 0;
|
||||
width_map.clear();
|
||||
|
||||
// In some PDF files all letter spaces are implemented as position shifts between each letter
|
||||
// try to simplify it with a proper letter space
|
||||
@ -325,7 +346,9 @@ void HTMLRenderer::TextLineBuffer::optimize()
|
||||
width_map.insert(iter, std::make_pair(target, 1));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// TODO snapping the widths may result a better result
|
||||
// e.g. for (-0.7 0.6 -0.2 0.3 10 10), 0 is better than 10
|
||||
double most_used_width = 0;
|
||||
size_t max_count = 0;
|
||||
for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
|
||||
@ -342,7 +365,7 @@ void HTMLRenderer::TextLineBuffer::optimize()
|
||||
if(max_count <= text_count / 2)
|
||||
{
|
||||
// the old value is the best
|
||||
// just copy copy offsets
|
||||
// just copy old offsets
|
||||
new_offsets.insert(new_offsets.end(), offset_iter1, offset_iter2);
|
||||
}
|
||||
else
|
||||
@ -369,7 +392,7 @@ void HTMLRenderer::TextLineBuffer::optimize()
|
||||
}
|
||||
if(!equal(cur_width, 0))
|
||||
{
|
||||
new_offsets.push_back(Offset({cur_text_idx+1, cur_width}));
|
||||
new_offsets.emplace_back(cur_text_idx+1, cur_width);
|
||||
++ offset_count;
|
||||
}
|
||||
}
|
||||
@ -382,6 +405,10 @@ void HTMLRenderer::TextLineBuffer::optimize()
|
||||
// We may try to change (some of) them to ' ' by adjusting word_space
|
||||
// for now, we cosider only the no-space scenario
|
||||
// which also includes the case when param->space_as_offset is set
|
||||
|
||||
// get the text segment covered by current state (*state_iter1)
|
||||
const auto text_iter1 = text.begin() + text_idx1;
|
||||
const auto text_iter2 = text.begin() + text_idx2;
|
||||
if(find(text_iter1, text_iter2, ' ') == text_iter2)
|
||||
{
|
||||
// if there is not any space, we may change the value of word_space arbitrarily
|
||||
@ -399,7 +426,7 @@ void HTMLRenderer::TextLineBuffer::optimize()
|
||||
// find the most frequent width, with new letter space applied
|
||||
for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
|
||||
{
|
||||
double fixed_width = iter->first + letter_space_diff;
|
||||
double fixed_width = iter->first + letter_space_diff; // this is the actual offset in HTML
|
||||
// we don't want to add spaces for tiny gaps, or even negative shifts
|
||||
if((fixed_width >= threshold - EPS) && (iter->second > max_count))
|
||||
{
|
||||
@ -408,18 +435,14 @@ void HTMLRenderer::TextLineBuffer::optimize()
|
||||
}
|
||||
}
|
||||
|
||||
state_iter1->word_space = 0;
|
||||
state_iter1->word_space = 0; // clear word_space for single_space_offset
|
||||
double new_word_space = most_used_width - state_iter1->single_space_offset();
|
||||
// install new word_space
|
||||
state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space));
|
||||
// mark that the word_space is not free
|
||||
state_iter1->hash_umask &= (~word_space_umask);
|
||||
state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); // install new word_space
|
||||
state_iter1->hash_umask &= (~word_space_umask); // mark that the word_space is not free
|
||||
}
|
||||
else
|
||||
else // there is no offset at all
|
||||
{
|
||||
// if there is no offset at all
|
||||
// we just free word_space
|
||||
state_iter1->hash_umask |= word_space_umask;
|
||||
state_iter1->hash_umask |= word_space_umask; // we just free word_space
|
||||
}
|
||||
}
|
||||
offset_iter1 = offset_iter2;
|
||||
|
@ -67,6 +67,9 @@ public:
|
||||
|
||||
class Offset {
|
||||
public:
|
||||
Offset(size_t size_idx, double width)
|
||||
:start_idx(size_idx),width(width)
|
||||
{ }
|
||||
size_t start_idx; // should put this Offset right before text[start_idx];
|
||||
double width;
|
||||
};
|
||||
|
@ -74,8 +74,17 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
bool is_space = false;
|
||||
if (n == 1 && *p == ' ')
|
||||
{
|
||||
++nSpaces;
|
||||
/*
|
||||
* This is by standard
|
||||
* however some PDF will use ' ' as a normal encoding slot
|
||||
* such that it will be mapped to other unicodes
|
||||
* In that case, when sapce_as_offset is on, we will simply ignore that character...
|
||||
*
|
||||
* Checking mapped unicode may or may not work
|
||||
* There are always ugly PDF files with no usefull info at all.
|
||||
*/
|
||||
is_space = true;
|
||||
++nSpaces;
|
||||
}
|
||||
|
||||
if(is_space && (param->space_as_offset))
|
||||
|
Loading…
Reference in New Issue
Block a user