mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
WIP: HTMLTextLine
This commit is contained in:
parent
11b488e236
commit
aea00515fe
@ -29,6 +29,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
|||||||
// unscaled
|
// unscaled
|
||||||
double cur_letter_space = state->getCharSpace();
|
double cur_letter_space = state->getCharSpace();
|
||||||
double cur_word_space = state->getWordSpace();
|
double cur_word_space = state->getWordSpace();
|
||||||
|
double cur_horiz_scaling = state->getHorizScaling();
|
||||||
|
|
||||||
|
|
||||||
// Writing mode fonts and Type 3 fonts are rendered as images
|
// Writing mode fonts and Type 3 fonts are rendered as images
|
||||||
// I haven't found a way to display writing mode fonts in HTML other than one div per character, which is too costly
|
// I haven't found a way to display writing mode fonts in HTML other than one div per character, which is too costly
|
||||||
@ -89,15 +91,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
|||||||
|
|
||||||
if(is_space && (param.space_as_offset))
|
if(is_space && (param.space_as_offset))
|
||||||
{
|
{
|
||||||
// ignore horiz_scaling, as it's merged in CTM
|
// ignore horiz_scaling, as it has been merged into CTM
|
||||||
html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
|
html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
|
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
|
||||||
{
|
{
|
||||||
html_text_page.get_cur_line()->append_unicodes(u, uLen);
|
// TODO: why multiply cur_horiz_scaling here?
|
||||||
// TODO: decomposed characters may be not with the same width as the original ligature, need to fix it.
|
html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -110,7 +112,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
|||||||
{
|
{
|
||||||
uu = unicode_from_font(code, font);
|
uu = unicode_from_font(code, font);
|
||||||
}
|
}
|
||||||
html_text_page.get_cur_line()->append_unicodes(&uu, 1);
|
// TODO: why multiply cur_horiz_scaling here?
|
||||||
|
html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space) * cur_horiz_scaling);
|
||||||
/*
|
/*
|
||||||
* In PDF, word_space is appended if (n == 1 and *p = ' ')
|
* In PDF, word_space is appended if (n == 1 and *p = ' ')
|
||||||
* but in HTML, word_space is appended if (uu == ' ')
|
* but in HTML, word_space is appended if (uu == ' ')
|
||||||
@ -131,12 +134,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
|||||||
len -= n;
|
len -= n;
|
||||||
}
|
}
|
||||||
|
|
||||||
double hs = state->getHorizScaling();
|
|
||||||
|
|
||||||
// horiz_scaling is merged into ctm now,
|
// horiz_scaling is merged into ctm now,
|
||||||
// so the coordinate system is ugly
|
// so the coordinate system is ugly
|
||||||
// TODO: why multiply hs here
|
// TODO: why multiply cur_horiz_scaling here
|
||||||
dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs;
|
dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling;
|
||||||
|
|
||||||
dy *= cur_font_size;
|
dy *= cur_font_size;
|
||||||
|
|
||||||
|
@ -31,11 +31,13 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
|
|||||||
,line_state(line_state)
|
,line_state(line_state)
|
||||||
,clip_x1(0)
|
,clip_x1(0)
|
||||||
,clip_y1(0)
|
,clip_y1(0)
|
||||||
|
,width(0)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
void HTMLTextLine::append_unicodes(const Unicode * u, int l)
|
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
|
||||||
{
|
{
|
||||||
text.insert(text.end(), u, u+l);
|
text.insert(text.end(), u, u+l);
|
||||||
|
this->width += width;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTextLine::append_offset(double width)
|
void HTMLTextLine::append_offset(double width)
|
||||||
@ -49,6 +51,7 @@ void HTMLTextLine::append_offset(double width)
|
|||||||
offsets.back().width += width;
|
offsets.back().width += width;
|
||||||
else
|
else
|
||||||
offsets.emplace_back(text.size(), width);
|
offsets.emplace_back(text.size(), width);
|
||||||
|
this->width += width;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTextLine::append_state(const HTMLTextState & text_state)
|
void HTMLTextLine::append_state(const HTMLTextState & text_state)
|
||||||
@ -188,28 +191,17 @@ void HTMLTextLine::dump_text(ostream & out)
|
|||||||
// finally, just dump it
|
// finally, just dump it
|
||||||
if(!done)
|
if(!done)
|
||||||
{
|
{
|
||||||
if(param.optimize_text < 3)
|
long long wid = all_manager.whitespace.install(target, &actual_offset);
|
||||||
|
|
||||||
|
if(!equal(actual_offset, 0))
|
||||||
{
|
{
|
||||||
long long wid = all_manager.whitespace.install(target, &actual_offset);
|
if(is_positive(-actual_offset))
|
||||||
|
last_text_pos_with_negative_offset = cur_text_idx;
|
||||||
|
|
||||||
if(!equal(actual_offset, 0))
|
|
||||||
{
|
|
||||||
if(is_positive(-actual_offset))
|
|
||||||
last_text_pos_with_negative_offset = cur_text_idx;
|
|
||||||
|
|
||||||
double threshold = state_iter1->em_size() * (param.space_threshold);
|
|
||||||
|
|
||||||
out << "<span class=\"" << CSS::WHITESPACE_CN
|
|
||||||
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// aggressive optimization
|
|
||||||
double threshold = state_iter1->em_size() * (param.space_threshold);
|
double threshold = state_iter1->em_size() * (param.space_threshold);
|
||||||
if(target > threshold)
|
|
||||||
out << ' ';
|
out << "<span class=\"" << CSS::WHITESPACE_CN
|
||||||
actual_offset = target;
|
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -255,9 +247,6 @@ void HTMLTextLine::clip(const HTMLClipState & clip_state)
|
|||||||
|
|
||||||
void HTMLTextLine::prepare(void)
|
void HTMLTextLine::prepare(void)
|
||||||
{
|
{
|
||||||
if(param.optimize_text)
|
|
||||||
optimize();
|
|
||||||
|
|
||||||
// max_ascent determines the height of the div
|
// max_ascent determines the height of the div
|
||||||
double accum_vertical_align = 0; // accumulated
|
double accum_vertical_align = 0; // accumulated
|
||||||
ascent = 0;
|
ascent = 0;
|
||||||
@ -285,11 +274,22 @@ void HTMLTextLine::prepare(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void HTMLTextLine::optimize(std::vector<HTMLTextLine*> & lines)
|
||||||
|
{
|
||||||
|
if(param.optimize_text == 3)
|
||||||
|
{
|
||||||
|
optimize_aggressive(lines);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
optimize_normal(lines);
|
||||||
|
}
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* Adjust letter space and word space in order to reduce the number of HTML elements
|
* Adjust letter space and word space in order to reduce the number of HTML elements
|
||||||
* May also unmask word space
|
* May also unmask word space
|
||||||
*/
|
*/
|
||||||
void HTMLTextLine::optimize()
|
void HTMLTextLine::optimize_normal(std::vector<HTMLTextLine*> & lines)
|
||||||
{
|
{
|
||||||
// remove useless states at the end
|
// remove useless states at the end
|
||||||
while((!states.empty()) && (states.back().start_idx >= text.size()))
|
while((!states.empty()) && (states.back().start_idx >= text.size()))
|
||||||
@ -465,6 +465,32 @@ void HTMLTextLine::optimize()
|
|||||||
|
|
||||||
// apply optimization
|
// apply optimization
|
||||||
std::swap(offsets, new_offsets);
|
std::swap(offsets, new_offsets);
|
||||||
|
|
||||||
|
lines.push_back(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
// for optimize-text == 3
|
||||||
|
void HTMLTextLine::optimize_aggressive(std::vector<HTMLTextLine*> & lines)
|
||||||
|
{
|
||||||
|
HTMLTextLine *cur_line = this;
|
||||||
|
while(true)
|
||||||
|
{
|
||||||
|
lines.push_back(cur_line);
|
||||||
|
|
||||||
|
// break the line if there is a large (positive or negative) shift
|
||||||
|
// letter space / word space are not taken into consideration (yet)
|
||||||
|
|
||||||
|
|
||||||
|
// TODO
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
// aggressive optimization
|
||||||
|
if(target > state_iter1->em_size() * (param.space_threshold) - EPS)
|
||||||
|
out << ' ';
|
||||||
|
dx = 0;
|
||||||
|
lines.push_back(this);
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
// this state will be converted to a child node of the node of prev_state
|
// this state will be converted to a child node of the node of prev_state
|
||||||
|
@ -73,7 +73,7 @@ public:
|
|||||||
double width;
|
double width;
|
||||||
};
|
};
|
||||||
|
|
||||||
void append_unicodes(const Unicode * u, int l);
|
void append_unicodes(const Unicode * u, int l, double width);
|
||||||
void append_offset(double width);
|
void append_offset(double width);
|
||||||
void append_state(const HTMLTextState & text_state);
|
void append_state(const HTMLTextState & text_state);
|
||||||
void dump_text(std::ostream & out);
|
void dump_text(std::ostream & out);
|
||||||
@ -87,8 +87,10 @@ public:
|
|||||||
* Optimize and calculate necessary values
|
* Optimize and calculate necessary values
|
||||||
*/
|
*/
|
||||||
void prepare(void);
|
void prepare(void);
|
||||||
|
void optimize(std::vector<HTMLTextLine*> &);
|
||||||
private:
|
private:
|
||||||
void optimize(void);
|
void optimize_normal(std::vector<HTMLTextLine*> &);
|
||||||
|
void optimize_aggressive(std::vector<HTMLTextLine*> &);
|
||||||
|
|
||||||
const Param & param;
|
const Param & param;
|
||||||
AllStateManager & all_manager;
|
AllStateManager & all_manager;
|
||||||
@ -96,6 +98,7 @@ private:
|
|||||||
HTMLLineState line_state;
|
HTMLLineState line_state;
|
||||||
double ascent, descent;
|
double ascent, descent;
|
||||||
double clip_x1, clip_y1;
|
double clip_x1, clip_y1;
|
||||||
|
double width;
|
||||||
|
|
||||||
std::vector<State> states;
|
std::vector<State> states;
|
||||||
std::vector<Offset> offsets;
|
std::vector<Offset> offsets;
|
||||||
|
@ -12,7 +12,6 @@
|
|||||||
namespace pdf2htmlEX {
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::ostream;
|
using std::ostream;
|
||||||
using std::unique_ptr;
|
|
||||||
|
|
||||||
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
|
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
|
||||||
: param(param)
|
: param(param)
|
||||||
@ -22,8 +21,24 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
|
|||||||
, page_height(0)
|
, page_height(0)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
|
HTMLTextPage::~HTMLTextPage()
|
||||||
|
{
|
||||||
|
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
||||||
|
{
|
||||||
|
delete (*iter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void HTMLTextPage::dump_text(ostream & out)
|
void HTMLTextPage::dump_text(ostream & out)
|
||||||
{
|
{
|
||||||
|
if(param.optimize_text)
|
||||||
|
{
|
||||||
|
// text lines may be split during optimization, collect them
|
||||||
|
std::vector<HTMLTextLine*> new_text_lines;
|
||||||
|
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
||||||
|
(*iter)->optimize(new_text_lines);
|
||||||
|
std::swap(text_lines, new_text_lines);
|
||||||
|
}
|
||||||
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
||||||
(*iter)->prepare();
|
(*iter)->prepare();
|
||||||
if(param.optimize_text)
|
if(param.optimize_text)
|
||||||
@ -98,7 +113,7 @@ void HTMLTextPage::open_new_line(const HTMLLineState & line_state)
|
|||||||
// do not reuse the last text_line even if it's empty
|
// do not reuse the last text_line even if it's empty
|
||||||
// because the clip states may point to the next index
|
// because the clip states may point to the next index
|
||||||
text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager));
|
text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager));
|
||||||
cur_line = text_lines.back().get();
|
cur_line = text_lines.back();
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTextPage::set_page_size(double width, double height)
|
void HTMLTextPage::set_page_size(double width, double height)
|
||||||
|
@ -7,7 +7,6 @@
|
|||||||
#define HTMLTEXTPAGE_H__
|
#define HTMLTEXTPAGE_H__
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <memory>
|
|
||||||
#include <ostream>
|
#include <ostream>
|
||||||
|
|
||||||
#include "Param.h"
|
#include "Param.h"
|
||||||
@ -26,6 +25,7 @@ class HTMLTextPage
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
HTMLTextPage (const Param & param, AllStateManager & all_manager);
|
HTMLTextPage (const Param & param, AllStateManager & all_manager);
|
||||||
|
~HTMLTextPage();
|
||||||
|
|
||||||
HTMLTextLine * get_cur_line(void) const { return cur_line; }
|
HTMLTextLine * get_cur_line(void) const { return cur_line; }
|
||||||
|
|
||||||
@ -47,7 +47,7 @@ private:
|
|||||||
HTMLTextLine * cur_line;
|
HTMLTextLine * cur_line;
|
||||||
double page_width, page_height;
|
double page_width, page_height;
|
||||||
|
|
||||||
std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
|
std::vector<HTMLTextLine*> text_lines;
|
||||||
|
|
||||||
struct Clip {
|
struct Clip {
|
||||||
HTMLClipState clip_state;
|
HTMLClipState clip_state;
|
||||||
|
Loading…
Reference in New Issue
Block a user