diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h
index b079800..131d824 100644
--- a/src/HTMLRenderer/HTMLRenderer.h
+++ b/src/HTMLRenderer/HTMLRenderer.h
@@ -30,13 +30,13 @@
namespace pdf2htmlEX {
-// we may need more info of a font in the future
class FontInfo
{
public:
long long id;
bool use_tounicode;
int em_size;
+ int space_width; // width of ' ' scaled by em_size, rounded to nearest int; NOTE(review): embed_font assigns it only when !font->isCIDFont() -- confirm it is initialized on other paths before use
double ascent, descent;
bool is_type3;
};
diff --git a/src/HTMLRenderer/TextLineBuffer.cc b/src/HTMLRenderer/TextLineBuffer.cc
index 2a678f8..f8ef903 100644
--- a/src/HTMLRenderer/TextLineBuffer.cc
+++ b/src/HTMLRenderer/TextLineBuffer.cc
@@ -7,6 +7,7 @@
*/
#include
+#include
#include "HTMLRenderer.h"
#include "TextLineBuffer.h"
@@ -24,6 +25,7 @@ using std::vector;
using std::ostream;
using std::cerr;
using std::endl;
+using std::find;
void HTMLRenderer::TextLineBuffer::reset(GfxState * state)
{
@@ -50,6 +52,7 @@ void HTMLRenderer::TextLineBuffer::append_state(void)
{
states.resize(states.size() + 1);
states.back().start_idx = text.size();
+ states.back().hash_umask = 0;
}
set_state(states.back());
@@ -69,6 +72,8 @@ void HTMLRenderer::TextLineBuffer::flush(void)
return;
}
+ optimize();
+
for(auto iter = states.begin(); iter != states.end(); ++iter)
iter->hash();
@@ -81,7 +86,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
const auto & s = *iter;
- max_ascent = max(max_ascent, s.ascent * s.draw_font_size);
+ max_ascent = max(max_ascent, s.font_info->ascent * s.draw_font_size);
}
ostream & out = renderer->f_pages.fs;
@@ -157,7 +162,7 @@ void HTMLRenderer::TextLineBuffer::flush(void)
last_text_pos_with_negative_offset = cur_text_idx;
auto * p = stack.back();
- double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold);
+ double threshold = p->draw_font_size * (p->font_info->ascent - p->font_info->descent) * (renderer->param->space_threshold);
out << "" << (target > (threshold - EPS) ? " " : "") << "";
@@ -199,18 +204,47 @@ void HTMLRenderer::TextLineBuffer::set_state (State & state)
state.ids[State::WORD_SPACE_ID] = renderer->word_space_manager.get_id();
state.ids[State::RISE_ID] = renderer->rise_manager.get_id();
- const FontInfo * info = renderer->cur_font_info;
- state.ascent = info->ascent;
- state.descent = info->descent;
+ state.font_info = renderer->cur_font_info;
state.draw_font_size = renderer->font_size_manager.get_value();
}
+void HTMLRenderer::TextLineBuffer::optimize(void)
+{
+ assert(!states.empty());
+
+ // TODO: not implemented yet; the notes below describe the planned work
+
+ // set a proper hash_umask for each state
+
+ // In some PDF files all spaces are converted into positioning shifts.
+ // We may try to change them back to ' ' and adjust the word spacing accordingly.
+ // This could also be applied when param->space_as_offset is set.
+
+}
+
+// this state will be converted to a child node of the node of prev_state,
+// so only the IDs that differ from prev_state are dumped;
+// IDs this state does not care about are inherited (copied) from prev_state
void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * prev_state)
{
+ long long cur_mask = 0xff;
bool first = true;
- for(int i = 0; i < ID_COUNT; ++i)
+ for(int i = 0; i < ID_COUNT; ++i, cur_mask<<=8)
{
- if(prev_state && (prev_state->ids[i] == ids[i]))
+ if(hash_umask & cur_mask) // we don't care about this ID
+ {
+ if (prev_state && (!(prev_state->hash_umask & cur_mask))) // if prev_state have it set
+ {
+ // we have to inherit it
+ ids[i] = prev_state->ids[i];
+ hash_umask &= (~cur_mask);
+ }
+ //anyway we don't have to output it
+ continue;
+ }
+
+ // now we care about the ID
+ if(prev_state && (!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))
continue;
if(first)
@@ -231,7 +265,7 @@ void HTMLRenderer::TextLineBuffer::State::begin (ostream & out, const State * pr
out << ids[i];
}
- if(first)
+ if(first) // we actually just inherit the whole prev_state
{
need_close = false;
}
@@ -264,12 +298,17 @@ int HTMLRenderer::TextLineBuffer::State::diff(const State & s) const
* it could be wrong when there are more then 256 classes,
* in which case the output may not be optimal, but still 'correct' in terms of HTML
*/
- if(hash_value == s.hash_value) return 0;
+ long long common_mask = ~(hash_umask | s.hash_umask);
+ if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0;
+ long long cur_mask = 0xff;
int d = 0;
for(int i = 0; i < ID_COUNT; ++i)
- if(ids[i] != s.ids[i])
+ {
+ if((common_mask & cur_mask) && (ids[i] != s.ids[i]))
++ d;
+ cur_mask <<= 8;
+ }
return d;
}
diff --git a/src/HTMLRenderer/TextLineBuffer.h b/src/HTMLRenderer/TextLineBuffer.h
index 2cc288a..7051e0b 100644
--- a/src/HTMLRenderer/TextLineBuffer.h
+++ b/src/HTMLRenderer/TextLineBuffer.h
@@ -44,13 +44,14 @@ public:
long long ids[ID_COUNT];
- double ascent;
- double descent;
+ const FontInfo * font_info;
double draw_font_size;
+ double word_space;
size_t start_idx; // index of the first Text using this state
// for optimzation
long long hash_value;
+ long long hash_umask; // some states may not be actually used
bool need_close;
static const char * const css_class_names []; // class names for each id
@@ -73,6 +74,8 @@ private:
// retrieve state from renderer
void set_state(State & state);
+ void optimize(void);
+
HTMLRenderer * renderer;
double x, y;
diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc
index ca4eaa0..f686871 100644
--- a/src/HTMLRenderer/font.cc
+++ b/src/HTMLRenderer/font.cc
@@ -204,6 +204,19 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
info.em_size = ffw_get_em_size();
+ if(!font->isCIDFont())
+ {
+ if(font_8bit)
+ {
+ info.space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5);
+ }
+ else
+ {
+ char buf[2] = {0, ' '};
+ info.space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
+ }
+ }
+
if(get_metric_only)
{
ffw_metric(&info.ascent, &info.descent);
@@ -424,17 +437,7 @@ void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo
// Might be a problem if ' ' is in the font, but not empty
if(!has_space)
{
- int space_width;
- if(font_8bit)
- {
- space_width = (int)floor(font_8bit->getWidth(' ') * info.em_size + 0.5);
- }
- else
- {
- char buf[2] = {0, ' '};
- space_width = (int)floor(font_cid->getWidth(buf, 2) * info.em_size + 0.5);
- }
- ffw_add_empty_char((int32_t)' ', space_width);
+ ffw_add_empty_char((int32_t)' ', info.space_width);
}
if(ctu)
@@ -525,6 +528,8 @@ const FontInfo * HTMLRenderer::install_font(GfxFont * font)
if(font == nullptr)
{
+ new_font_info.em_size = 0;
+ new_font_info.space_width = 0;
new_font_info.ascent = 0;
new_font_info.descent = 0;
new_font_info.is_type3 = false;