1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

WIP: HTMLTextLine

This commit is contained in:
Lu Wang 2013-12-22 16:59:59 +08:00
parent 11b488e236
commit aea00515fe
5 changed files with 83 additions and 38 deletions

View File

@ -29,6 +29,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
// unscaled // unscaled
double cur_letter_space = state->getCharSpace(); double cur_letter_space = state->getCharSpace();
double cur_word_space = state->getWordSpace(); double cur_word_space = state->getWordSpace();
double cur_horiz_scaling = state->getHorizScaling();
// Writing mode fonts and Type 3 fonts are rendered as images // Writing mode fonts and Type 3 fonts are rendered as images
// I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly // I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly
@ -89,15 +91,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
if(is_space && (param.space_as_offset)) if(is_space && (param.space_as_offset))
{ {
// ignore horiz_scaling, as it's merged in CTM // ignore horiz_scaling, as it has been merged into CTM
html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
} }
else else
{ {
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
{ {
html_text_page.get_cur_line()->append_unicodes(u, uLen); // TODO: why multiply cur_horiz_scaling here?
// TODO: decomposed characters may be not with the same width as the original ligature, need to fix it. html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
} }
else else
{ {
@ -110,7 +112,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
{ {
uu = unicode_from_font(code, font); uu = unicode_from_font(code, font);
} }
html_text_page.get_cur_line()->append_unicodes(&uu, 1); // TODO: why multiply cur_horiz_scaling here?
html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
/* /*
* In PDF, word_space is appended if (n == 1 and *p = ' ') * In PDF, word_space is appended if (n == 1 and *p = ' ')
* but in HTML, word_space is appended if (uu == ' ') * but in HTML, word_space is appended if (uu == ' ')
@ -131,12 +134,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
len -= n; len -= n;
} }
double hs = state->getHorizScaling();
// horiz_scaling is merged into ctm now, // horiz_scaling is merged into ctm now,
// so the coordinate system is ugly // so the coordinate system is ugly
// TODO: why multiply hs here // TODO: why multiply cur_horiz_scaling here
dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs; dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling;
dy *= cur_font_size; dy *= cur_font_size;

View File

@ -31,11 +31,13 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
,line_state(line_state) ,line_state(line_state)
,clip_x1(0) ,clip_x1(0)
,clip_y1(0) ,clip_y1(0)
,width(0)
{ } { }
void HTMLTextLine::append_unicodes(const Unicode * u, int l) void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
{ {
text.insert(text.end(), u, u+l); text.insert(text.end(), u, u+l);
this->width += width;
} }
void HTMLTextLine::append_offset(double width) void HTMLTextLine::append_offset(double width)
@ -49,6 +51,7 @@ void HTMLTextLine::append_offset(double width)
offsets.back().width += width; offsets.back().width += width;
else else
offsets.emplace_back(text.size(), width); offsets.emplace_back(text.size(), width);
this->width += width;
} }
void HTMLTextLine::append_state(const HTMLTextState & text_state) void HTMLTextLine::append_state(const HTMLTextState & text_state)
@ -188,28 +191,17 @@ void HTMLTextLine::dump_text(ostream & out)
// finally, just dump it // finally, just dump it
if(!done) if(!done)
{ {
if(param.optimize_text < 3) long long wid = all_manager.whitespace.install(target, &actual_offset);
if(!equal(actual_offset, 0))
{ {
long long wid = all_manager.whitespace.install(target, &actual_offset); if(is_positive(-actual_offset))
last_text_pos_with_negative_offset = cur_text_idx;
if(!equal(actual_offset, 0))
{
if(is_positive(-actual_offset))
last_text_pos_with_negative_offset = cur_text_idx;
double threshold = state_iter1->em_size() * (param.space_threshold);
out << "<span class=\"" << CSS::WHITESPACE_CN
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
}
}
else
{
// aggressive optimization
double threshold = state_iter1->em_size() * (param.space_threshold); double threshold = state_iter1->em_size() * (param.space_threshold);
if(target > threshold)
out << ' '; out << "<span class=\"" << CSS::WHITESPACE_CN
actual_offset = target; << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
} }
} }
} }
@ -255,9 +247,6 @@ void HTMLTextLine::clip(const HTMLClipState & clip_state)
void HTMLTextLine::prepare(void) void HTMLTextLine::prepare(void)
{ {
if(param.optimize_text)
optimize();
// max_ascent determines the height of the div // max_ascent determines the height of the div
double accum_vertical_align = 0; // accumulated double accum_vertical_align = 0; // accumulated
ascent = 0; ascent = 0;
@ -285,11 +274,22 @@ void HTMLTextLine::prepare(void)
} }
// Entry point of the text optimization pass.
// Chooses the strategy from param.optimize_text and forwards `lines` to it,
// so the chosen strategy can append the resulting line(s).
void HTMLTextLine::optimize(std::vector<HTMLTextLine*> & lines)
{
    // every level other than 3 uses the normal strategy
    if(param.optimize_text != 3)
    {
        optimize_normal(lines);
        return;
    }

    // optimize-text == 3: aggressive strategy
    optimize_aggressive(lines);
}
/* /*
* Adjust letter space and word space in order to reduce the number of HTML elements * Adjust letter space and word space in order to reduce the number of HTML elements
* May also unmask word space * May also unmask word space
*/ */
void HTMLTextLine::optimize() void HTMLTextLine::optimize_normal(std::vector<HTMLTextLine*> & lines)
{ {
// remove unuseful states in the end // remove unuseful states in the end
while((!states.empty()) && (states.back().start_idx >= text.size())) while((!states.empty()) && (states.back().start_idx >= text.size()))
@ -465,6 +465,32 @@ void HTMLTextLine::optimize()
// apply optimization // apply optimization
std::swap(offsets, new_offsets); std::swap(offsets, new_offsets);
lines.push_back(this);
}
// for optimize-text == 3
// Aggressive optimization pass (work in progress).
// Appends the line(s) produced from this line to `lines`. Splitting a line
// at large shifts is still a TODO, so at present this line is appended once,
// unchanged.
void HTMLTextLine::optimize_aggressive(std::vector<HTMLTextLine*> & lines)
{
    HTMLTextLine *cur_line = this;
    while(true)
    {
        lines.push_back(cur_line);

        // break the line if there is a large (positive or negative) shift
        // letter space / word space are not taken into consideration (yet)
        // TODO

        // No piece is split off yet, so there is nothing further to visit.
        // Without this exit the loop would keep appending cur_line forever.
        break;
    }
    /*
    // aggressive optimization
    if(target > state_iter1->em_size() * (param.space_threshold) - EPS)
    out << ' ';
    dx = 0;
    lines.push_back(this);
    */
}
// this state will be converted to a child node of the node of prev_state // this state will be converted to a child node of the node of prev_state

View File

@ -73,7 +73,7 @@ public:
double width; double width;
}; };
void append_unicodes(const Unicode * u, int l); void append_unicodes(const Unicode * u, int l, double width);
void append_offset(double width); void append_offset(double width);
void append_state(const HTMLTextState & text_state); void append_state(const HTMLTextState & text_state);
void dump_text(std::ostream & out); void dump_text(std::ostream & out);
@ -87,8 +87,10 @@ public:
* Optimize and calculate necessary values * Optimize and calculate necessary values
*/ */
void prepare(void); void prepare(void);
void optimize(std::vector<HTMLTextLine*> &);
private: private:
void optimize(void); void optimize_normal(std::vector<HTMLTextLine*> &);
void optimize_aggressive(std::vector<HTMLTextLine*> &);
const Param & param; const Param & param;
AllStateManager & all_manager; AllStateManager & all_manager;
@ -96,6 +98,7 @@ private:
HTMLLineState line_state; HTMLLineState line_state;
double ascent, descent; double ascent, descent;
double clip_x1, clip_y1; double clip_x1, clip_y1;
double width;
std::vector<State> states; std::vector<State> states;
std::vector<Offset> offsets; std::vector<Offset> offsets;

View File

@ -12,7 +12,6 @@
namespace pdf2htmlEX { namespace pdf2htmlEX {
using std::ostream; using std::ostream;
using std::unique_ptr;
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager) HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
: param(param) : param(param)
@ -22,8 +21,24 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
, page_height(0) , page_height(0)
{ } { }
// Destroys every HTMLTextLine referenced from text_lines.
// The vector holds raw pointers, so each one is deleted explicitly here.
HTMLTextPage::~HTMLTextPage()
{
    for(HTMLTextLine * line : text_lines)
        delete line;
}
void HTMLTextPage::dump_text(ostream & out) void HTMLTextPage::dump_text(ostream & out)
{ {
if(param.optimize_text)
{
// text lines may be split during optimization, collect them
std::vector<HTMLTextLine*> new_text_lines;
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
(*iter)->optimize(new_text_lines);
std::swap(text_lines, new_text_lines);
}
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
(*iter)->prepare(); (*iter)->prepare();
if(param.optimize_text) if(param.optimize_text)
@ -98,7 +113,7 @@ void HTMLTextPage::open_new_line(const HTMLLineState & line_state)
// do not reused the last text_line even if it's empty // do not reused the last text_line even if it's empty
// because the clip states may point to the next index // because the clip states may point to the next index
text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager)); text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager));
cur_line = text_lines.back().get(); cur_line = text_lines.back();
} }
void HTMLTextPage::set_page_size(double width, double height) void HTMLTextPage::set_page_size(double width, double height)

View File

@ -7,7 +7,6 @@
#define HTMLTEXTPAGE_H__ #define HTMLTEXTPAGE_H__
#include <vector> #include <vector>
#include <memory>
#include <ostream> #include <ostream>
#include "Param.h" #include "Param.h"
@ -26,6 +25,7 @@ class HTMLTextPage
{ {
public: public:
HTMLTextPage (const Param & param, AllStateManager & all_manager); HTMLTextPage (const Param & param, AllStateManager & all_manager);
~HTMLTextPage();
HTMLTextLine * get_cur_line(void) const { return cur_line; } HTMLTextLine * get_cur_line(void) const { return cur_line; }
@ -47,7 +47,7 @@ private:
HTMLTextLine * cur_line; HTMLTextLine * cur_line;
double page_width, page_height; double page_width, page_height;
std::vector<std::unique_ptr<HTMLTextLine>> text_lines; std::vector<HTMLTextLine*> text_lines;
struct Clip { struct Clip {
HTMLClipState clip_state; HTMLClipState clip_state;