mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-07-05 01:28:39 +00:00
Merge branch 'master' of github.com:coolwanglu/pdf2htmlEX
This commit is contained in:
commit
c57bd1a655
4
TODO
4
TODO
|
@ -1,3 +1,7 @@
|
||||||
|
non-trivial space optimization
|
||||||
|
(For each state whose word_space is free, set a proper value such that it may cover most whitespaces)
|
||||||
|
(Or just set word_space according to the first positive whitespace, but this needs to be done before the state inherits some value)
|
||||||
|
|
||||||
== Future: ==
|
== Future: ==
|
||||||
|
|
||||||
Too difficult/complicated to implement:
|
Too difficult/complicated to implement:
|
||||||
|
|
|
@ -30,13 +30,13 @@
|
||||||
|
|
||||||
namespace pdf2htmlEX {
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
// we may need more info of a font in the future
|
|
||||||
class FontInfo
|
class FontInfo
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
long long id;
|
long long id;
|
||||||
bool use_tounicode;
|
bool use_tounicode;
|
||||||
int em_size;
|
int em_size;
|
||||||
|
double space_width;
|
||||||
double ascent, descent;
|
double ascent, descent;
|
||||||
bool is_type3;
|
bool is_type3;
|
||||||
};
|
};
|
||||||
|
|
|
@ -7,6 +7,8 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <cmath>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
#include "HTMLRenderer.h"
|
#include "HTMLRenderer.h"
|
||||||
#include "TextLineBuffer.h"
|
#include "TextLineBuffer.h"
|
||||||
|
@ -24,6 +26,8 @@ using std::vector;
|
||||||
using std::ostream;
|
using std::ostream;
|
||||||
using std::cerr;
|
using std::cerr;
|
||||||
using std::endl;
|
using std::endl;
|
||||||
|
using std::find;
|
||||||
|
using std::abs;
|
||||||
|
|
||||||
void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
|
void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
|
||||||
{
|
{
|
||||||
|
@ -50,6 +54,7 @@ void HTMLRenderer::TextLineBuffer::append_state(void)
|
||||||
{
|
{
|
||||||
states.resize(states.size() + 1);
|
states.resize(states.size() + 1);
|
||||||
states.back().start_idx = text.size();
|
states.back().start_idx = text.size();
|
||||||
|
states.back().hash_umask = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
set_state(states.back());
|
set_state(states.back());
|
||||||
|
@ -69,21 +74,25 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
optimize();
|
||||||
iter->hash();
|
|
||||||
|
|
||||||
states.resize(states.size() + 1);
|
|
||||||
states.back().start_idx = text.size();
|
|
||||||
|
|
||||||
offsets.push_back(Offset({text.size(), 0}));
|
|
||||||
|
|
||||||
double max_ascent = 0;
|
double max_ascent = 0;
|
||||||
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||||
{
|
{
|
||||||
const auto & s = *iter;
|
const auto & s = *iter;
|
||||||
max_ascent = max<double>(max_ascent, s.ascent * s.draw_font_size);
|
max_ascent = max<double>(max_ascent, s.font_info->ascent * s.draw_font_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// append a dummy state for convenience
|
||||||
|
states.resize(states.size() + 1);
|
||||||
|
states.back().start_idx = text.size();
|
||||||
|
|
||||||
|
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||||
|
iter->hash();
|
||||||
|
|
||||||
|
// append a dummy offset for convenience
|
||||||
|
offsets.push_back(Offset({text.size(), 0}));
|
||||||
|
|
||||||
ostream & out = renderer->f_pages.fs;
|
ostream & out = renderer->f_pages.fs;
|
||||||
renderer->height_manager.install(max_ascent);
|
renderer->height_manager.install(max_ascent);
|
||||||
renderer->left_manager .install(x);
|
renderer->left_manager .install(x);
|
||||||
|
@ -148,21 +157,30 @@ void HTMLRenderer::TextLineBuffer::flush(void)
|
||||||
{
|
{
|
||||||
double target = cur_offset_iter->width + dx;
|
double target = cur_offset_iter->width + dx;
|
||||||
|
|
||||||
auto & wm = renderer->whitespace_manager;
|
if(equal(target, stack.back()->single_space_offset()))
|
||||||
wm.install(target);
|
{
|
||||||
auto wid = wm.get_id();
|
Unicode u = ' ';
|
||||||
double w = wm.get_actual_value();
|
outputUnicodes(out, &u, 1);
|
||||||
|
dx = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
auto & wm = renderer->whitespace_manager;
|
||||||
|
wm.install(target);
|
||||||
|
auto wid = wm.get_id();
|
||||||
|
double w = wm.get_actual_value();
|
||||||
|
|
||||||
if(w < 0)
|
if(w < 0)
|
||||||
last_text_pos_with_negative_offset = cur_text_idx;
|
last_text_pos_with_negative_offset = cur_text_idx;
|
||||||
|
|
||||||
auto * p = stack.back();
|
auto * p = stack.back();
|
||||||
double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold);
|
double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
|
||||||
|
|
||||||
out << "<span class=\"" << CSS::WHITESPACE_CN
|
out << "<span class=\"" << CSS::WHITESPACE_CN
|
||||||
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
|
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
|
||||||
|
|
||||||
dx = target - w;
|
dx = target - w;
|
||||||
|
}
|
||||||
|
|
||||||
++ cur_offset_iter;
|
++ cur_offset_iter;
|
||||||
}
|
}
|
||||||
|
@ -199,18 +217,140 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state)
|
||||||
state.ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id();
|
state.ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id();
|
||||||
state.ids[State::RISE_ID] = renderer->rise_manager.get_id();
|
state.ids[State::RISE_ID] = renderer->rise_manager.get_id();
|
||||||
|
|
||||||
const FontInfo * info = renderer->cur_font_info;
|
state.font_info = renderer->cur_font_info;
|
||||||
state.ascent = info->ascent;
|
state.draw_font_size = renderer->font_size_manager.get_actual_value();
|
||||||
state.descent = info->descent;
|
state.letter_space = renderer->letter_space_manager.get_actual_value();
|
||||||
state.draw_font_size = renderer->font_size_manager.get_value();
|
state.word_space = renderer->word_space_manager.get_actual_value();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void HTMLRenderer::TextLineBuffer::optimize(void)
|
||||||
|
{
|
||||||
|
assert(!states.empty());
|
||||||
|
|
||||||
|
// set proper hash_umask
|
||||||
|
long long word_space_umask = ((long long)0xff) << (8*((int)State::WORD_SPACE_ID));
|
||||||
|
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||||
|
{
|
||||||
|
auto text_iter1 = text.begin() + (iter->start_idx);
|
||||||
|
auto next_iter = iter;
|
||||||
|
++next_iter;
|
||||||
|
auto text_iter2 = (next_iter == states.end()) ? (text.end()) : (text.begin() + (next_iter->start_idx));
|
||||||
|
if(find(text_iter1, text_iter2, ' ') == text_iter2)
|
||||||
|
{
|
||||||
|
// if there's no space, word_space does not matter;
|
||||||
|
iter->hash_umask |= word_space_umask;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// clean zero offsets
|
||||||
|
{
|
||||||
|
auto write_iter = offsets.begin();
|
||||||
|
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
|
||||||
|
{
|
||||||
|
if(!equal(iter->width, 0))
|
||||||
|
{
|
||||||
|
*write_iter = *iter;
|
||||||
|
++write_iter;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
offsets.erase(write_iter, offsets.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// In some PDF files all spaces are converted into positioning shifts
|
||||||
|
// We may try to change them to ' ' and adjusted word_spaces
|
||||||
|
// This can also be applied when param->space_as_offset is set
|
||||||
|
|
||||||
|
// for now, we consider only the no-space scenario
|
||||||
|
if(offsets.size() > 0)
|
||||||
|
{
|
||||||
|
// Since GCC 4.4.6 is supported, I cannot use all_of + lambda here
|
||||||
|
bool all_ws_umask = true;
|
||||||
|
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||||
|
{
|
||||||
|
if(!(iter->hash_umask & word_space_umask))
|
||||||
|
{
|
||||||
|
all_ws_umask = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(all_ws_umask)
|
||||||
|
{
|
||||||
|
double avg_width = 0;
|
||||||
|
int posive_offset_count = 0;
|
||||||
|
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
|
||||||
|
{
|
||||||
|
if(is_positive(iter->width))
|
||||||
|
{
|
||||||
|
++posive_offset_count;
|
||||||
|
avg_width += iter->width;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
avg_width /= posive_offset_count;
|
||||||
|
|
||||||
|
// now check if the width of offsets are close enough
|
||||||
|
// TODO: it might make more sense if the threshold is proportional to the font size
|
||||||
|
bool ok = true;
|
||||||
|
double accum_off = 0;
|
||||||
|
double orig_accum_off = 0;
|
||||||
|
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
|
||||||
|
{
|
||||||
|
orig_accum_off += iter->width;
|
||||||
|
accum_off += avg_width;
|
||||||
|
if(is_positive(iter->width) && abs(orig_accum_off - accum_off) >= renderer->param->h_eps)
|
||||||
|
{
|
||||||
|
ok = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(ok)
|
||||||
|
{
|
||||||
|
// ok, make all offsets equi-width
|
||||||
|
for(auto iter = offsets.begin(); iter != offsets.end(); ++iter)
|
||||||
|
{
|
||||||
|
if(is_positive(iter->width))
|
||||||
|
iter->width = avg_width;
|
||||||
|
}
|
||||||
|
// set new word_space
|
||||||
|
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||||
|
{
|
||||||
|
double new_word_space = avg_width - iter->single_space_offset() + iter->word_space;
|
||||||
|
|
||||||
|
// install new word_space
|
||||||
|
// we might introduce more variance here
|
||||||
|
auto & wm = renderer->word_space_manager;
|
||||||
|
wm.install(new_word_space);
|
||||||
|
iter->ids[State::WORD_SPACE_ID] = wm.get_id();
|
||||||
|
iter->word_space = wm.get_actual_value();
|
||||||
|
iter->hash_umask &= (~word_space_umask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// this state will be converted to a child node of the node of prev_state
|
||||||
|
// dump the difference between previous state
|
||||||
|
// also clone corresponding states
|
||||||
void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
|
void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
|
||||||
{
|
{
|
||||||
|
long long cur_mask = 0xff;
|
||||||
bool first = true;
|
bool first = true;
|
||||||
for(int i = 0; i < ID_COUNT; ++i)
|
for(int i = 0; i < ID_COUNT; ++i, cur_mask<<=8)
|
||||||
{
|
{
|
||||||
if(prev_state && (prev_state->ids[i] == ids[i]))
|
if(hash_umask & cur_mask) // we don't care about this ID
|
||||||
|
{
|
||||||
|
if (prev_state && (!(prev_state->hash_umask & cur_mask))) // if prev_state have it set
|
||||||
|
{
|
||||||
|
// we have to inherit it
|
||||||
|
ids[i] = prev_state->ids[i];
|
||||||
|
hash_umask &= (~cur_mask);
|
||||||
|
}
|
||||||
|
//anyway we don't have to output it
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// now we care about the ID
|
||||||
|
if(prev_state && (!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if(first)
|
if(first)
|
||||||
|
@ -231,7 +371,7 @@ void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * pr
|
||||||
out << ids[i];
|
out << ids[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
if(first)
|
if(first) // we actually just inherit the whole prev_state
|
||||||
{
|
{
|
||||||
need_close = false;
|
need_close = false;
|
||||||
}
|
}
|
||||||
|
@ -264,15 +404,25 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
|
||||||
* it could be wrong when there are more than 256 classes,
|
* it could be wrong when there are more than 256 classes,
|
||||||
* in which case the output may not be optimal, but still 'correct' in terms of HTML
|
* in which case the output may not be optimal, but still 'correct' in terms of HTML
|
||||||
*/
|
*/
|
||||||
if(hash_value == s.hash_value) return 0;
|
long long common_mask = ~(hash_umask | s.hash_umask);
|
||||||
|
if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0;
|
||||||
|
|
||||||
|
long long cur_mask = 0xff;
|
||||||
int d = 0;
|
int d = 0;
|
||||||
for(int i = 0; i < ID_COUNT; ++i)
|
for(int i = 0; i < ID_COUNT; ++i)
|
||||||
if(ids[i] != s.ids[i])
|
{
|
||||||
|
if((common_mask & cur_mask) && (ids[i] != s.ids[i]))
|
||||||
++ d;
|
++ d;
|
||||||
|
cur_mask <<= 8;
|
||||||
|
}
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double HTMLRenderer::TextLineBuffer::State::single_space_offset(void) const
|
||||||
|
{
|
||||||
|
return word_space + letter_space + font_info->space_width * draw_font_size;
|
||||||
|
}
|
||||||
|
|
||||||
// the order should be the same as in the enum
|
// the order should be the same as in the enum
|
||||||
const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {
|
const char * const HTMLRenderer::TextLineBuffer::State::css_class_names [] = {
|
||||||
CSS::FONT_FAMILY_CN,
|
CSS::FONT_FAMILY_CN,
|
||||||
|
|
|
@ -29,6 +29,8 @@ public:
|
||||||
void hash(void);
|
void hash(void);
|
||||||
// calculate the difference between another State
|
// calculate the difference between another State
|
||||||
int diff(const State & s) const;
|
int diff(const State & s) const;
|
||||||
|
// the offset caused by a single ' ' char
|
||||||
|
double single_space_offset(void) const;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
FONT_ID,
|
FONT_ID,
|
||||||
|
@ -44,13 +46,15 @@ public:
|
||||||
|
|
||||||
long long ids[ID_COUNT];
|
long long ids[ID_COUNT];
|
||||||
|
|
||||||
double ascent;
|
const FontInfo * font_info;
|
||||||
double descent;
|
|
||||||
double draw_font_size;
|
double draw_font_size;
|
||||||
|
double letter_space;
|
||||||
|
double word_space;
|
||||||
|
|
||||||
size_t start_idx; // index of the first Text using this state
|
size_t start_idx; // index of the first Text using this state
|
||||||
// for optimization
|
// for optimization
|
||||||
long long hash_value;
|
long long hash_value;
|
||||||
|
long long hash_umask; // some states may not be actually used
|
||||||
bool need_close;
|
bool need_close;
|
||||||
|
|
||||||
static const char * const css_class_names []; // class names for each id
|
static const char * const css_class_names []; // class names for each id
|
||||||
|
@ -73,6 +77,8 @@ private:
|
||||||
// retrieve state from renderer
|
// retrieve state from renderer
|
||||||
void set_state(State & state);
|
void set_state(State & state);
|
||||||
|
|
||||||
|
void optimize(void);
|
||||||
|
|
||||||
HTMLRenderer * renderer;
|
HTMLRenderer * renderer;
|
||||||
|
|
||||||
double x, y;
|
double x, y;
|
||||||
|
|
|
@ -204,6 +204,18 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
|
||||||
|
|
||||||
info.em_size = ffw_get_em_size();
|
info.em_size = ffw_get_em_size();
|
||||||
|
|
||||||
|
if(!font->isCIDFont())
|
||||||
|
{
|
||||||
|
font_8bit = dynamic_cast<Gfx8BitFont*>(font);
|
||||||
|
info.space_width = font_8bit->getWidth(' ');
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
font_cid = dynamic_cast<GfxCIDFont*>(font);
|
||||||
|
char buf[2] = {0, ' '};
|
||||||
|
info.space_width = font_cid->getWidth(buf, 2);
|
||||||
|
}
|
||||||
|
|
||||||
if(get_metric_only)
|
if(get_metric_only)
|
||||||
{
|
{
|
||||||
ffw_metric(&info.ascent, &info.descent);
|
ffw_metric(&info.ascent, &info.descent);
|
||||||
|
@ -228,9 +240,8 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
|
||||||
* for CID Truetype
|
* for CID Truetype
|
||||||
* same as 8bitTrueType, except for that we have to check 65536 charcodes
|
* same as 8bitTrueType, except for that we have to check 65536 charcodes
|
||||||
*/
|
*/
|
||||||
if(!font->isCIDFont())
|
if(font_8bit)
|
||||||
{
|
{
|
||||||
font_8bit = dynamic_cast<Gfx8BitFont*>(font);
|
|
||||||
maxcode = 0xff;
|
maxcode = 0xff;
|
||||||
if(is_truetype_suffix(suffix))
|
if(is_truetype_suffix(suffix))
|
||||||
{
|
{
|
||||||
|
@ -283,7 +294,6 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
font_cid = dynamic_cast<GfxCIDFont*>(font);
|
|
||||||
maxcode = 0xffff;
|
maxcode = 0xffff;
|
||||||
|
|
||||||
if(is_truetype_suffix(suffix))
|
if(is_truetype_suffix(suffix))
|
||||||
|
@ -424,17 +434,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
|
||||||
// Might be a problem if ' ' is in the font, but not empty
|
// Might be a problem if ' ' is in the font, but not empty
|
||||||
if(!has_space)
|
if(!has_space)
|
||||||
{
|
{
|
||||||
int space_width;
|
ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5));
|
||||||
if(font_8bit)
|
|
||||||
{
|
|
||||||
space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
char buf[2] = {0, ' '};
|
|
||||||
space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
|
|
||||||
}
|
|
||||||
ffw_add_empty_char((int32_t)' ', space_width);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(ctu)
|
if(ctu)
|
||||||
|
@ -525,6 +525,8 @@ const FontInfo * HTMLRenderer::install_font(GfxFont * font)
|
||||||
|
|
||||||
if(font == nullptr)
|
if(font == nullptr)
|
||||||
{
|
{
|
||||||
|
new_font_info.em_size = 0;
|
||||||
|
new_font_info.space_width = 0;
|
||||||
new_font_info.ascent = 0;
|
new_font_info.ascent = 0;
|
||||||
new_font_info.descent = 0;
|
new_font_info.descent = 0;
|
||||||
new_font_info.is_type3 = false;
|
new_font_info.is_type3 = false;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user