1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

fix space optimization

This commit is contained in:
Lu Wang 2013-03-30 22:37:20 +08:00
parent 0268a9d966
commit 495b04f046
8 changed files with 175 additions and 166 deletions

View File

@ -29,7 +29,7 @@ using std::endl;
using std::find;
using std::abs;
void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
void HTMLRenderer::TextLineBuffer::set_pos(GfxState * state)
{
state->transform(state->getCurX(), state->getCurY(), &x, &y);
tm_id = renderer->transform_matrix_manager.get_id();
@ -94,15 +94,15 @@ void HTMLRenderer::TextLineBuffer::flush(void)
offsets.push_back(Offset({text.size(), 0}));
ostream & out = renderer->f_pages.fs;
renderer->height_manager.install(max_ascent);
renderer->left_manager .install(x);
renderer->bottom_manager.install(y);
long long hid = renderer->height_manager.install(max_ascent);
long long lid = renderer->left_manager .install(x);
long long bid = renderer->bottom_manager.install(y);
out << "<div class=\"" << CSS::LINE_CN
<< " " << CSS::TRANSFORM_MATRIX_CN << tm_id
<< " " << CSS::LEFT_CN << renderer->left_manager .get_id()
<< " " << CSS::HEIGHT_CN << renderer->height_manager.get_id()
<< " " << CSS::BOTTOM_CN << renderer->bottom_manager.get_id()
<< " " << CSS::LEFT_CN << lid
<< " " << CSS::HEIGHT_CN << hid
<< " " << CSS::BOTTOM_CN << bid
<< "\">";
auto cur_state_iter = states.begin();
@ -180,10 +180,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
if(!done)
{
auto & wm = renderer->whitespace_manager;
wm.install(target);
auto wid = wm.get_id();
actual_offset = wm.get_actual_value();
long long wid = renderer->whitespace_manager.install(target, &actual_offset);
if(!equal(actual_offset, 0))
{
@ -217,11 +214,9 @@ void HTMLRenderer::TextLineBuffer::flush(void)
out << "</div>";
states.clear();
offsets.clear();
text.clear();
}
void HTMLRenderer::TextLineBuffer::set_state (State & state)
@ -242,9 +237,6 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state)
void HTMLRenderer::TextLineBuffer::optimize(void)
{
// need more work
return;
assert(!states.empty());
// set proper hash_umask
@ -291,43 +283,45 @@ void HTMLRenderer::TextLineBuffer::optimize(void)
avg_width += iter->width;
}
}
avg_width /= posive_offset_count;
// now check if the width of offsets are close enough
// TODO: it might make more sense if the threshold is proportional to the font size
bool ok = true;
double accum_off = 0;
double orig_accum_off = 0;
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
if(posive_offset_count > 0)
{
orig_accum_off += iter->width;
accum_off += avg_width;
if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
{
ok = false;
break;
}
}
if(ok)
{
// ok, make all offsets equi-width
avg_width /= posive_offset_count;
// now check if the width of offsets are close enough
// TODO: it might make more sense if the threshold is proportional to the font size
bool ok = true;
double accum_off = 0;
double orig_accum_off = 0;
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
if(is_positive(iter->width))
iter->width = avg_width;
orig_accum_off += iter->width;
accum_off += avg_width;
if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
{
ok = false;
break;
}
}
// set new word_space
for(auto iter = states.begin(); iter != states.end(); ++iter)
if(ok)
{
double new_word_space = avg_width - iter->single_space_offset() + iter->word_space;
// ok, make all offsets equi-width
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
{
if(is_positive(iter->width))
iter->width = avg_width;
}
// set new word_space
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
iter->word_space = 0;
double new_word_space = avg_width - iter->single_space_offset();
// install new word_space
// we might introduce more variance here
auto & wm = renderer->word_space_manager;
wm.install(new_word_space);
iter->ids[State::WORD_SPACE_ID] = wm.get_id();
iter->word_space = wm.get_actual_value();
iter->hash_umask &= (~word_space_umask);
// install new word_space
// we might introduce more variance here
iter->ids[State::WORD_SPACE_ID] = renderer->word_space_manager.install(new_word_space, &(iter->word_space));
iter->hash_umask &= (~word_space_umask);
}
}
}
}

View File

@ -69,7 +69,7 @@ public:
double width;
};
void reset(GfxState * state);
void set_pos(GfxState * state);
void append_unicodes(const Unicode * u, int l);
void append_offset(double width);
void append_state(void);

View File

@ -372,7 +372,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
}
}
transform_matrix_manager.install(new_tm);
transform_matrix_manager.update(new_tm);
f_pages.fs << "<div class=\"" << CSS::CSS_DRAW_CN
<< ' ' << CSS::TRANSFORM_MATRIX_CN << transform_matrix_manager.get_id()
<< "\" style=\"";

View File

@ -203,17 +203,15 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
const char * used_map = nullptr;
info.em_size = ffw_get_em_size();
info.space_width = 0;
if(!font->isCIDFont())
{
font_8bit = dynamic_cast<Gfx8BitFont*>(font);
info.space_width = font_8bit->getWidth(' ');
}
else
{
font_cid = dynamic_cast<GfxCIDFont*>(font);
char buf[2] = {0, ' '};
info.space_width = font_cid->getWidth(buf, 2);
}
if(get_metric_only)
@ -343,47 +341,44 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
* Traverse all possible codes
*/
bool retried = false; // avoid infinite loop
for(int i = 0; i <= maxcode; ++i)
for(int cur_code = 0; cur_code <= maxcode; ++cur_code)
{
if(!used_map[i])
if(!used_map[cur_code])
continue;
/*
* Skip glyphs without names (only for non-ttf fonts)
*/
if(!is_truetype && (font_8bit != nullptr)
&& (font_8bit->getCharName(i) == nullptr))
&& (font_8bit->getCharName(cur_code) == nullptr))
{
continue;
}
int k = i;
int mapped_code = cur_code;
if(code2GID)
{
// for fonts with GID (e.g. TTF) we need to map GIDs instead of codes
if((k = code2GID[i]) == 0) continue;
if((mapped_code = code2GID[cur_code]) == 0) continue;
}
if(k > max_key)
max_key = k;
if(mapped_code > max_key)
max_key = mapped_code;
Unicode u, *pu=&u;
if(info.use_tounicode)
{
int n = ctu ? (ctu->mapToUnicode(i, &pu)) : 0;
u = check_unicode(pu, n, i, font);
int n = ctu ? (ctu->mapToUnicode(cur_code, &pu)) : 0;
u = check_unicode(pu, n, cur_code, font);
}
else
{
u = unicode_from_font(i, font);
u = unicode_from_font(cur_code, font);
}
if(u == ' ')
has_space = true;
if(codeset.insert(u).second)
{
cur_mapping[k] = u;
cur_mapping[mapped_code] = u;
}
else
{
@ -400,7 +395,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
//TODO: constant for the length
memset(cur_mapping, -1, 0x10000 * sizeof(*cur_mapping));
memset(width_list, -1, 0x10000 * sizeof(*width_list));
i = -1;
cur_code = -1;
continue;
}
}
@ -412,16 +407,26 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
}
}
if(font_8bit)
{
width_list[k] = (int)floor(font_8bit->getWidth(i) * info.em_size + 0.5);
}
else
{
char buf[2];
buf[0] = (i >> 8) & 0xff;
buf[1] = (i & 0xff);
width_list[k] = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
double cur_width = 0;
if(font_8bit)
{
cur_width = font_8bit->getWidth(cur_code);
}
else
{
char buf[2];
buf[0] = (cur_code >> 8) & 0xff;
buf[1] = (cur_code & 0xff);
cur_width = font_cid->getWidth(buf, 2) ;
}
width_list[mapped_code] = (int)floor(cur_width * info.em_size + 0.5);
if(u == ' ')
{
has_space = true;
info.space_width = cur_width;
}
}
}
@ -434,6 +439,15 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
// Might be a problem if ' ' is in the font, but not empty
if(!has_space)
{
if(font_8bit)
{
info.space_width = font_8bit->getWidth(' ');
}
else
{
char buf[2] = {0, ' '};
info.space_width = font_cid->getWidth(buf, 2);
}
ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5));
}

View File

@ -164,12 +164,12 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
this->pageNum = pageNum;
width_manager.install(state->getPageWidth());
height_manager.install(state->getPageHeight());
long long wid = width_manager.install(state->getPageWidth());
long long hid = height_manager.install(state->getPageHeight());
f_pages.fs
<< "<div class=\"" << CSS::PAGE_DECORATION_CN
<< " " << CSS::WIDTH_CN << width_manager.get_id()
<< " " << CSS::HEIGHT_CN << height_manager.get_id()
<< " " << CSS::WIDTH_CN << wid
<< " " << CSS::HEIGHT_CN << hid
<< "\">"
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
<< "\" class=\"" << CSS::PAGE_FRAME_CN

View File

@ -205,7 +205,7 @@ void HTMLRenderer::processLink(AnnotLink * al)
f_pages.fs << ">";
}
transform_matrix_manager.install(default_ctm);
transform_matrix_manager.update(default_ctm);
f_pages.fs << "<div class=\"" << CSS::CSS_DRAW_CN << ' ' << CSS::TRANSFORM_MATRIX_CN
<< transform_matrix_manager.get_id()
<< "\" style=\"";

View File

@ -253,11 +253,11 @@ void HTMLRenderer::check_state_change(GfxState * state)
draw_text_scale = new_draw_text_scale;
}
if(font_size_manager.install(new_draw_font_size))
if(font_size_manager.update(new_draw_font_size))
{
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
}
if(transform_matrix_manager.install(new_draw_text_tm))
if(transform_matrix_manager.update(new_draw_text_tm))
{
new_line_state = max<NewLineState>(new_line_state, NLS_DIV);
}
@ -334,7 +334,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
// letter space
// depends: draw_text_scale
if((all_changed || letter_space_changed || draw_text_scale_changed)
&& (letter_space_manager.install(state->getCharSpace() * draw_text_scale)))
&& (letter_space_manager.update(state->getCharSpace() * draw_text_scale)))
{
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
}
@ -342,7 +342,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
// word space
// depends draw_text_scale
if((all_changed || word_space_changed || draw_text_scale_changed)
&& (word_space_manager.install(state->getWordSpace() * draw_text_scale)))
&& (word_space_manager.update(state->getWordSpace() * draw_text_scale)))
{
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
}
@ -360,11 +360,11 @@ void HTMLRenderer::check_state_change(GfxState * state)
{
GfxRGB new_color;
state->getFillRGB(&new_color);
changed = fill_color_manager.install(new_color);
changed = fill_color_manager.update(new_color);
}
else
{
changed = fill_color_manager.install_transparent();
changed = fill_color_manager.update_transparent();
}
if(changed)
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
@ -384,11 +384,11 @@ void HTMLRenderer::check_state_change(GfxState * state)
{
GfxRGB new_color;
state->getStrokeRGB(&new_color);
changed = stroke_color_manager.install(new_color);
changed = stroke_color_manager.update(new_color);
}
else
{
changed = stroke_color_manager.install_transparent();
changed = stroke_color_manager.update_transparent();
}
if(changed)
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
@ -397,7 +397,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
// rise
// depends draw_text_scale
if((all_changed || rise_changed || draw_text_scale_changed)
&& (rise_manager.install(state->getRise() * draw_text_scale)))
&& (rise_manager.update(state->getRise() * draw_text_scale)))
{
new_line_state = max<NewLineState>(new_line_state, NLS_SPAN);
}
@ -416,7 +416,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
{
close_text_line();
text_line_buf->reset(state);
text_line_buf->set_pos(state);
//resync position
draw_ty = cur_ty;

View File

@ -38,23 +38,44 @@ public:
// usually called at the beginning of a page
void reset(void) {
_install(imp->default_value());
cur_value = imp->default_value();
cur_id = install(cur_value, &cur_actual_value);
}
/*
* install new_value if changed (equal() should be faster than map::lower_bound)
* update the current state, which will be installed automatically
* return whether the state has indeed been changed
*/
bool install(double new_value) {
if(equal(new_value, value))
bool update(double new_value) {
if(equal(new_value, cur_value))
return false;
_install(new_value);
cur_value = new_value;
cur_id = install(cur_value, &cur_actual_value);
return true;
}
long long get_id (void) const { return id; }
double get_value (void) const { return value; }
double get_actual_value (void) const { return actual_value; }
// Look new_value up in value_map, treating any stored key within eps of it
// as a match, and add it to the map when no such key exists.
// The tracked current state (cur_id / cur_value / cur_actual_value) is not
// modified by this function.
// Returns the id associated with the matching or newly inserted key.
// When actual_value_ptr is non-null, the key actually stored in the map is
// written through it.
long long install(double new_value, double * actual_value_ptr = nullptr) {
    long long result_id;
    double stored_key;

    // first key that is not below (new_value - eps)
    auto candidate = value_map.lower_bound(new_value - eps);
    const bool reusable = (candidate != value_map.end())
                       && (abs(candidate->first - new_value) <= eps);

    if(reusable)
    {
        // an existing entry is close enough: reuse its key and id
        stored_key = candidate->first;
        result_id = candidate->second;
    }
    else
    {
        // ids are handed out sequentially, based on the map size before insertion
        result_id = value_map.size();
        stored_key = value_map.insert(std::make_pair(new_value, result_id)).first->first;
    }

    if(actual_value_ptr != nullptr)
        *actual_value_ptr = stored_key;
    return result_id;
}
// get current state
long long get_id (void) const { return cur_id; }
double get_value (void) const { return cur_value; }
double get_actual_value (void) const { return cur_actual_value; }
void dump_css(std::ostream & out) {
for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
@ -75,34 +96,19 @@ public:
}
protected:
// this version of install does not check if value has been updated
// return if a new entry has been created
bool _install(double new_value) {
value = new_value;
auto iter = value_map.lower_bound(new_value - eps);
if((iter != value_map.end()) && (abs(iter->first - value) <= eps))
{
actual_value = iter->first;
id = iter->second;
return false;
}
id = value_map.size();
actual_value = value_map.insert(std::make_pair(new_value, id)).first->first;
return true;
}
double eps;
Imp * imp;
long long id;
double value; // the value we are tracking
double actual_value; // the value we actually exported to HTML
long long cur_id;
double cur_value; // the value we are tracking
double cur_actual_value; // the value we actually exported to HTML
std::map<double, long long> value_map;
};
// Be careful about the mixed usage of Matrix and const double *
// the input is usually a double *, whose content might be changed, so we have to copy the content out
// in the map we use Matrix instead of double * so that the array is automatically released on destruction
// since the address of cur_value.m cannot be changed, we can export double * instead of Matrix
template <class Imp>
class StateManager<Matrix, Imp>
{
@ -112,21 +118,24 @@ public:
{ }
void reset(void) {
_install(imp->default_value());
memcpy(cur_value.m, imp->default_value(), sizeof(cur_value.m));
cur_id = install(cur_value);
}
// return if changed
bool install(const double * new_value) {
bool update(const double * new_value) {
// For a transform matrix m
// m[4] & m[5] have been taken care of
if(tm_equal(new_value, value.m, 4))
if(tm_equal(new_value, cur_value.m, 4))
return false;
_install(new_value);
memcpy(cur_value.m, new_value, sizeof(cur_value.m));
cur_id = install(cur_value);
return true;
}
long long get_id (void) const { return id; }
const Matrix & get_value (void) const { return value; }
long long get_id (void) const { return cur_id; }
const double * get_value (void) const { return cur_value.m; }
void dump_css(std::ostream & out) {
for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
@ -140,26 +149,23 @@ public:
void dump_print_css(std::ostream & out, double scale) {}
protected:
// return if a new entry has been created
bool _install(const double * new_value) {
memcpy(value.m, new_value, sizeof(value.m));
auto iter = value_map.lower_bound(value);
if((iter != value_map.end()) && (tm_equal(value.m, iter->first.m, 4)))
// return id
long long install(const Matrix & new_value) {
auto iter = value_map.lower_bound(new_value);
if((iter != value_map.end()) && (tm_equal(new_value.m, iter->first.m, 4)))
{
id = iter->second;
return false;
return iter->second;
}
id = value_map.size();
value_map.insert(std::make_pair(value, id));
return true;
long long id = value_map.size();
value_map.insert(std::make_pair(new_value, id));
return id;
}
Imp * imp;
long long id;
Matrix value;
long long cur_id;
Matrix cur_value;
class Matrix_less
{
@ -177,6 +183,7 @@ protected:
return false;
}
};
std::map<Matrix, long long, Matrix_less> value_map;
};
@ -189,28 +196,31 @@ public:
{ }
void reset(void) {
is_transparent = true;
id = -1;
cur_is_transparent = true;
cur_id = -1;
}
bool install(const GfxRGB & new_value) {
if((!is_transparent) && gfxrgb_equal_obj(new_value, value))
bool update(const GfxRGB & new_value) {
if((!cur_is_transparent) && gfxrgb_equal_obj(new_value, cur_value))
return false;
_install(new_value);
cur_value = new_value;
cur_is_transparent = false;
cur_id = install(cur_value);
return true;
}
bool install_transparent (void) {
if(is_transparent)
bool update_transparent (void) {
if(cur_is_transparent)
return false;
_install_transparent();
cur_is_transparent = true;
cur_id = -1;
return true;
}
long long get_id (void) const { return id; }
const GfxRGB & get_value (void) const { return value; }
bool get_is_transparent (void) const { return is_transparent; }
long long get_id (void) const { return cur_id; }
const GfxRGB & get_value (void) const { return cur_value; }
bool get_is_transparent (void) const { return cur_is_transparent; }
void dump_css(std::ostream & out) {
out << "." << imp->get_css_class_name() << CSS::INVALID_ID << "{";
@ -228,32 +238,23 @@ public:
void dump_print_css(std::ostream & out, double scale) {}
protected:
bool _install(const GfxRGB & new_value) {
is_transparent = false;
value = new_value;
long long install(const GfxRGB & new_value) {
auto iter = value_map.find(new_value);
if(iter != value_map.end())
{
id = iter->second;
return false;
return iter->second;
}
id = value_map.size();
value_map.insert(std::make_pair(value, id));
return true;
}
bool _install_transparent(void) {
is_transparent = true;
id = -1;
return false;
long long id = value_map.size();
value_map.insert(std::make_pair(new_value, id));
return id;
}
Imp * imp;
long long id;
GfxRGB value;
bool is_transparent;
long long cur_id;
GfxRGB cur_value;
bool cur_is_transparent;
class GfxRGB_hash
{