diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc
index 289f053..23c3af3 100644
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@@ -29,6 +29,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
// unscaled
double cur_letter_space = state->getCharSpace();
double cur_word_space = state->getWordSpace();
+ double cur_horiz_scaling = state->getHorizScaling();
+
// Writing mode fonts and Type 3 fonts are rendered as images
// I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly
@@ -89,15 +91,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
if(is_space && (param.space_as_offset))
{
- // ignore horiz_scaling, as it's merged in CTM
+ // ignore horiz_scaling, as it has been merged into CTM
html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
}
else
{
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
{
- html_text_page.get_cur_line()->append_unicodes(u, uLen);
- // TODO: decomposed characters may be not with the same width as the original ligature, need to fix it.
+ // TODO: why multiply cur_horiz_scaling here?
+ html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
}
else
{
@@ -110,7 +112,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
uu = unicode_from_font(code, font);
}
- html_text_page.get_cur_line()->append_unicodes(&uu, 1);
+ // TODO: why multiply cur_horiz_scaling here?
+ html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
/*
* In PDF, word_space is appended if (n == 1 and *p = ' ')
* but in HTML, word_space is appended if (uu == ' ')
@@ -131,12 +134,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
len -= n;
}
- double hs = state->getHorizScaling();
-
// horiz_scaling is merged into ctm now,
// so the coordinate system is ugly
- // TODO: why multiply hs here
- dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs;
+ // TODO: why multiply cur_horiz_scaling here
+ dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling;
dy *= cur_font_size;
diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc
index 4759e15..51cd2d8 100644
--- a/src/HTMLTextLine.cc
+++ b/src/HTMLTextLine.cc
@@ -31,11 +31,13 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
,line_state(line_state)
,clip_x1(0)
,clip_y1(0)
+ ,width(0)
{ }
-void HTMLTextLine::append_unicodes(const Unicode * u, int l)
+void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
{
text.insert(text.end(), u, u+l);
+ this->width += width; // add the caller-supplied width to this line's running total; the parameter shadows the member, hence this->
}
void HTMLTextLine::append_offset(double width)
@@ -49,6 +51,7 @@ void HTMLTextLine::append_offset(double width)
offsets.back().width += width;
else
offsets.emplace_back(text.size(), width);
+ this->width += width;
}
void HTMLTextLine::append_state(const HTMLTextState & text_state)
@@ -188,28 +191,17 @@ void HTMLTextLine::dump_text(ostream & out)
// finally, just dump it
if(!done)
{
- if(param.optimize_text < 3)
+ long long wid = all_manager.whitespace.install(target, &actual_offset);
+
+ if(!equal(actual_offset, 0))
{
- long long wid = all_manager.whitespace.install(target, &actual_offset);
+ if(is_positive(-actual_offset))
+ last_text_pos_with_negative_offset = cur_text_idx;
- if(!equal(actual_offset, 0))
- {
- if(is_positive(-actual_offset))
- last_text_pos_with_negative_offset = cur_text_idx;
-
- double threshold = state_iter1->em_size() * (param.space_threshold);
-
- out << "" << (target > (threshold - EPS) ? " " : "") << "";
- }
- }
- else
- {
- // aggressive optimization
double threshold = state_iter1->em_size() * (param.space_threshold);
- if(target > threshold)
- out << ' ';
- actual_offset = target;
+
+ out << "" << (target > (threshold - EPS) ? " " : "") << "";
}
}
}
@@ -255,9 +247,6 @@ void HTMLTextLine::clip(const HTMLClipState & clip_state)
void HTMLTextLine::prepare(void)
{
- if(param.optimize_text)
- optimize();
-
// max_ascent determines the height of the div
double accum_vertical_align = 0; // accumulated
ascent = 0;
@@ -285,11 +274,22 @@ void HTMLTextLine::prepare(void)
}
+void HTMLTextLine::optimize(std::vector & lines)
+{
+    // optimize_text == 3 selects the aggressive strategy,
+    // every other value goes through the normal one
+    if(param.optimize_text != 3)
+    {
+        optimize_normal(lines);
+        return;
+    }
+    optimize_aggressive(lines);
+}
/*
* Adjust letter space and word space in order to reduce the number of HTML elements
* May also unmask word space
*/
-void HTMLTextLine::optimize()
+void HTMLTextLine::optimize_normal(std::vector & lines)
{
// remove unuseful states in the end
while((!states.empty()) && (states.back().start_idx >= text.size()))
@@ -465,6 +465,32 @@ void HTMLTextLine::optimize()
// apply optimization
std::swap(offsets, new_offsets);
+
+ lines.push_back(this);
+}
+
+// for optimize-text == 3
+//
+// Intended behaviour: break this line wherever there is a large (positive or
+// negative) shift; letter space / word space are not taken into consideration (yet).
+// `lines` collects the resulting line(s); HTMLTextPage::dump_text swaps it in
+// as the page's new list of text lines.
+//
+// TODO: the splitting is not implemented yet. Until it is, emit the line unsplit,
+// exactly once. The earlier placeholder `while(true)` loop had no exit and kept
+// appending the same pointer to `lines` forever.
+void HTMLTextLine::optimize_aggressive(std::vector & lines)
+{
+    // hand the untouched line back to the caller
+    lines.push_back(this);
+
+    /*
+    // aggressive optimization
+    if(target > state_iter1->em_size() * (param.space_threshold) - EPS)
+        out << ' ';
+    dx = 0;
+    lines.push_back(this);
+    */
}
// this state will be converted to a child node of the node of prev_state
diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h
index c974c0f..782b491 100644
--- a/src/HTMLTextLine.h
+++ b/src/HTMLTextLine.h
@@ -73,7 +73,7 @@ public:
double width;
};
- void append_unicodes(const Unicode * u, int l);
+ void append_unicodes(const Unicode * u, int l, double width);
void append_offset(double width);
void append_state(const HTMLTextState & text_state);
void dump_text(std::ostream & out);
@@ -87,8 +87,10 @@ public:
* Optimize and calculate necessary values
*/
void prepare(void);
+ void optimize(std::vector &);
private:
- void optimize(void);
+ void optimize_normal(std::vector &);
+ void optimize_aggressive(std::vector &);
const Param & param;
AllStateManager & all_manager;
@@ -96,6 +98,7 @@ private:
HTMLLineState line_state;
double ascent, descent;
double clip_x1, clip_y1;
+ double width;
std::vector states;
std::vector offsets;
diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc
index 0f23d8b..4bc23d3 100644
--- a/src/HTMLTextPage.cc
+++ b/src/HTMLTextPage.cc
@@ -12,7 +12,6 @@
namespace pdf2htmlEX {
using std::ostream;
-using std::unique_ptr;
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
: param(param)
@@ -22,8 +21,24 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
, page_height(0)
{ }
+HTMLTextPage::~HTMLTextPage()
+{
+    // the page owns its lines through raw pointers (see open_new_line),
+    // so each one has to be released explicitly
+    for(auto * line : text_lines)
+        delete line;
+}
+
void HTMLTextPage::dump_text(ostream & out)
{
+ if(param.optimize_text)
+ {
+ // text lines may be split during optimization, collect them
+ std::vector new_text_lines;
+ for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
+ (*iter)->optimize(new_text_lines);
+ std::swap(text_lines, new_text_lines);
+ }
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
(*iter)->prepare();
if(param.optimize_text)
@@ -98,7 +113,7 @@ void HTMLTextPage::open_new_line(const HTMLLineState & line_state)
// do not reused the last text_line even if it's empty
// because the clip states may point to the next index
text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager));
- cur_line = text_lines.back().get();
+ cur_line = text_lines.back();
}
void HTMLTextPage::set_page_size(double width, double height)
diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h
index ec01e24..7bffec4 100644
--- a/src/HTMLTextPage.h
+++ b/src/HTMLTextPage.h
@@ -7,7 +7,6 @@
#define HTMLTEXTPAGE_H__
#include
-#include
#include
#include "Param.h"
@@ -26,6 +25,7 @@ class HTMLTextPage
{
public:
HTMLTextPage (const Param & param, AllStateManager & all_manager);
+ ~HTMLTextPage();
HTMLTextLine * get_cur_line(void) const { return cur_line; }
@@ -47,7 +47,7 @@ private:
HTMLTextLine * cur_line;
double page_width, page_height;
- std::vector> text_lines;
+ std::vector text_lines;
struct Clip {
HTMLClipState clip_state;