1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-03 08:38:39 +00:00

clean code; add support for letter-spacing, word-spacing, horizontal-scale and rise

This commit is contained in:
Lu Wang 2012-08-15 18:48:11 +08:00
parent 29d41573ef
commit 87204a9e9e
8 changed files with 287 additions and 129 deletions

View File

@ -21,7 +21,7 @@
position:absolute;
white-space:pre;
}
.l > .w {
.l > ._ {
display:inline-block;
font-family: monospace;
}

View File

@ -39,16 +39,18 @@
*
* p - Page
* l - Line
* w - White space
* _ - white space
* i - Image
*
* Reusable CSS classes
*
* t<hex> - Transform matrix
* f<hex> - Font (also for font names)
* s<hex> - font Size
* w<hex> - White space
* t<hex> - Transform matrix
* l<hex> - Letter spacing
* w<hex> - Word spacing
* c<hex> - Color
* _<hex> - white space
*
*/
@ -93,13 +95,26 @@ class HTMLRenderer : public OutputDev
* We just mark as changed, and recheck if they have been changed when we are about to output a new string
*/
virtual void updateAll(GfxState * state);
virtual void updateFont(GfxState * state);
virtual void updateTextMat(GfxState * state);
virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
virtual void updateRise(GfxState * state);
virtual void updateTextPos(GfxState * state);
virtual void updateTextShift(GfxState * state, double shift);
virtual void updateFont(GfxState * state);
virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
virtual void updateTextMat(GfxState * state);
virtual void updateHorizScaling(GfxState * state);
virtual void updateCharSpace(GfxState * state);
virtual void updateWordSpace(GfxState * state);
virtual void updateFillColor(GfxState * state);
/*
* Rendering
*/
virtual void drawString(GfxState * state, GooString * s);
virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
@ -121,9 +136,11 @@ class HTMLRenderer : public OutputDev
void install_external_font (GfxFont * font, long long fn_id);
long long install_font_size(double font_size);
long long install_whitespace(double ws_width, double & actual_width);
long long install_transform_matrix(const double * tm);
long long install_letter_space(double letter_space);
long long install_word_space(double word_space);
long long install_color(const GfxRGB * rgb);
long long install_whitespace(double ws_width, double & actual_width);
////////////////////////////////////////////////////
// export css styles
@ -136,9 +153,11 @@ class HTMLRenderer : public OutputDev
void export_remote_default_font(long long fn_id);
void export_local_font(long long fn_id, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
void export_font_size(long long fs_id, double font_size);
void export_whitespace(long long ws_id, double ws_width);
void export_transform_matrix(long long tm_id, const double * tm);
void export_letter_space(long long ls_id, double letter_space);
void export_word_space(long long ws_id, double word_space);
void export_color(long long color_id, const GfxRGB * rgb);
void export_whitespace(long long ws_id, double ws_width);
////////////////////////////////////////////////////
// state tracking
@ -165,26 +184,46 @@ class HTMLRenderer : public OutputDev
////////////////////////////////////////////////////
// if we have a pending opened line
bool line_opened;
// The order is according to the appearance in check_state_change
// any state changed
bool all_changed;
// rise
double cur_rise;
bool rise_changed;
// current position
double cur_tx, cur_ty; // real text position, in text coords
bool text_pos_changed;
// font & size
long long cur_fn_id;
double cur_font_size;
long long cur_fs_id;
bool font_changed;
// transform matrix
long long cur_tm_id;
bool ctm_changed;
bool text_mat_changed;
// horizontal scaling
bool hori_scale_changed;
// this is CTM * TextMAT in PDF, not only CTM
// [4] and [5] are ignored, we'll calculate the position of the origin separately
// [4] and [5] are ignored,
// as we'll calculate the position of the origin separately
// TODO: change this for images
double cur_ctm[6]; // unscaled
// letter spacing
long long cur_ls_id;
double cur_letter_space;
bool letter_space_changed;
// word spacing
long long cur_ws_id;
double cur_word_space;
bool word_space_changed;
// color
long long cur_color_id;
GfxRGB cur_color;
bool color_changed;
@ -208,10 +247,16 @@ class HTMLRenderer : public OutputDev
std::unordered_map<long long, FontInfo> font_name_map;
std::map<double, long long> font_size_map;
std::map<double, long long> whitespace_map;
std::map<TM, long long> transform_matrix_map;
std::map<double, long long> letter_space_map;
std::map<double, long long> word_space_map;
std::map<GfxRGB, long long> color_map;
std::map<double, long long> whitespace_map;
int image_count;
const Param * param;

View File

@ -94,11 +94,6 @@ void HTMLRenderer::export_font_size (long long fs_id, double font_size)
allcss_fout << format(".s%|1$x|{font-size:%2%px;}") % fs_id % font_size << endl;
}
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
{
allcss_fout << format(".w%|1$x|{width:%2%px;}") % ws_id % ws_width << endl;
}
void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
{
allcss_fout << format(".t%|1$x|{") % tm_id;
@ -128,10 +123,25 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
allcss_fout << "}" << endl;
}
void HTMLRenderer::export_color(long long color_id, const GfxRGB * rgb)
void HTMLRenderer::export_letter_space (long long ls_id, double letter_space)
{
allcss_fout << format(".l%|1$x|{letter-spacing:%2%px;}") % ls_id % letter_space << endl;
}
void HTMLRenderer::export_word_space (long long ws_id, double word_space)
{
allcss_fout << format(".w%|1$x|{word-spacing:%2%px;}") % ws_id % word_space << endl;
}
void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb)
{
allcss_fout << format(".c%|1$x|{color:rgb(%2%,%3%,%4%);}")
% color_id % (int)colToByte(rgb->r) % (int)colToByte(rgb->g) % (int)colToByte(rgb->b)
<< endl;
}
void HTMLRenderer::export_whitespace (long long ws_id, double ws_width)
{
allcss_fout << format("._%|1$x|{width:%2%px;}") % ws_id % ws_width << endl;
}

View File

@ -30,6 +30,9 @@ HTMLRenderer::HTMLRenderer(const Param * param)
install_font_size(0);
install_transform_matrix(id_matrix);
install_letter_space(0);
install_word_space(0);
GfxRGB black;
black.r = black.g = black.b = 0;
@ -158,18 +161,27 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
html_fout << format(");background-position:0 0;background-size:%1%px %2%px;background-repeat:no-repeat;\">") % pageWidth % pageHeight;
cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0;
cur_tx = cur_ty = 0;
cur_font_size = 0;
cur_rise = 0;
cur_fn_id = cur_fs_id = 0;
cur_font_size = 0;
cur_tm_id = 0;
memcpy(cur_ctm, id_matrix, sizeof(cur_ctm));
cur_ls_id = cur_ws_id = 0;
cur_letter_space = cur_word_space = 0;
cur_color_id = 0;
cur_color.r = cur_color.g = cur_color.b = 0;
cur_tx = cur_ty = 0;
memcpy(draw_ctm, id_matrix, sizeof(draw_ctm));
draw_font_size = 0;
draw_scale = 1.0;
draw_tx = draw_ty = 0;
cur_color.r = cur_color.g = cur_color.b = 0;
reset_state_track();
}

View File

@ -237,22 +237,6 @@ long long HTMLRenderer::install_font_size(double font_size)
return new_fs_id;
}
long long HTMLRenderer::install_whitespace(double ws_width, double & actual_width)
{
auto iter = whitespace_map.lower_bound(ws_width - param->h_eps);
if((iter != whitespace_map.end()) && (abs(iter->first - ws_width) < param->h_eps))
{
actual_width = iter->first;
return iter->second;
}
actual_width = ws_width;
long long new_ws_id = whitespace_map.size();
whitespace_map.insert(make_pair(ws_width, new_ws_id));
export_whitespace(new_ws_id, ws_width);
return new_ws_id;
}
long long HTMLRenderer::install_transform_matrix(const double * tm)
{
TM m(tm);
@ -266,6 +250,30 @@ long long HTMLRenderer::install_transform_matrix(const double * tm)
return new_tm_id;
}
// Returns the id of the CSS class for the given letter spacing.
// An already registered value that compares _equal to letter_space is reused;
// otherwise a new id is allocated, recorded in letter_space_map and exported.
long long HTMLRenderer::install_letter_space(double letter_space)
{
    // first registered value that is not below (letter_space - EPS)
    const auto candidate = letter_space_map.lower_bound(letter_space - EPS);
    const bool reusable = (candidate != letter_space_map.end())
                       && _equal(candidate->first, letter_space);
    if(reusable)
        return candidate->second;

    // ids are handed out sequentially: the next id is the current map size
    const long long fresh_id = letter_space_map.size();
    letter_space_map.insert(make_pair(letter_space, fresh_id));
    export_letter_space(fresh_id, letter_space);
    return fresh_id;
}
// Returns the id of the CSS class for the given word spacing.
// An already registered value that compares _equal to word_space is reused;
// otherwise a new id is allocated, recorded in word_space_map and exported.
long long HTMLRenderer::install_word_space(double word_space)
{
    // first registered value that is not below (word_space - EPS)
    const auto hit = word_space_map.lower_bound(word_space - EPS);
    if((hit == word_space_map.end()) || !_equal(hit->first, word_space))
    {
        // nothing close enough is registered: take the next sequential id
        // (the current map size) and publish the matching CSS rule
        const long long fresh_id = word_space_map.size();
        word_space_map.insert(make_pair(word_space, fresh_id));
        export_word_space(fresh_id, word_space);
        return fresh_id;
    }
    return hit->second;
}
long long HTMLRenderer::install_color(const GfxRGB * rgb)
{
const GfxRGB & c = *rgb;
@ -279,3 +287,20 @@ long long HTMLRenderer::install_color(const GfxRGB * rgb)
return new_color_id;
}
// Returns the id of the CSS class for a white-space span of width ws_width.
//
// ws_width     - requested width; already multiplied by draw_scale
// actual_width - out: the width the returned class really has. This is the
//                registered width when an existing class within param->h_eps
//                is reused, and ws_width itself when a new class is created.
long long HTMLRenderer::install_whitespace(double ws_width, double & actual_width)
{
    // first registered width that is not below (ws_width - h_eps)
    auto iter = whitespace_map.lower_bound(ws_width - param->h_eps);
    if(iter != whitespace_map.end())
    {
        // Compare in double precision explicitly. An unqualified abs() may
        // resolve to the C int overload, which truncates sub-pixel differences
        // to 0 and would wrongly merge widths that differ by up to a pixel.
        const double diff = iter->first - ws_width;
        if((diff > -param->h_eps) && (diff < param->h_eps))
        {
            actual_width = iter->first;
            return iter->second;
        }
    }

    // nothing close enough: register a new class, ids are sequential
    actual_width = ws_width;
    long long new_ws_id = whitespace_map.size();
    whitespace_map.insert(make_pair(ws_width, new_ws_id));
    export_whitespace(new_ws_id, ws_width);
    return new_ws_id;
}

View File

@ -11,34 +11,97 @@
#include "HTMLRenderer.h"
#include "namespace.h"
// State callback: flags every tracked state as dirty (all_changed) and
// refreshes the tracked text position through updateTextPos().
// check_state_change() tests all_changed alongside each individual flag.
void HTMLRenderer::updateAll(GfxState * state)
{
all_changed = true;
updateTextPos(state);
}
// State callback: flags the text rise as dirty. The value itself is not read
// here; check_state_change() fetches it with state->getRise() later.
void HTMLRenderer::updateRise(GfxState * state)
{
rise_changed = true;
}
// State callback: records the line origin reported by the graphics state as
// the current text position (cur_tx, cur_ty) and flags the position as dirty.
void HTMLRenderer::updateTextPos(GfxState * state)
{
    const double line_x = state->getLineX();
    const double line_y = state->getLineY();

    cur_tx = line_x;
    cur_ty = line_y;
    text_pos_changed = true;
}
// State callback: moves the tracked horizontal text position back by `shift`,
// scaled by 0.001, the current font size and the horizontal scaling, and
// flags the position as dirty.
void HTMLRenderer::updateTextShift(GfxState * state, double shift)
{
    const double font_size = state->getFontSize();
    const double hori_scale = state->getHorizScaling();

    // keep the multiplication order: ((shift * 0.001) * font_size) * hori_scale
    cur_tx -= shift * 0.001 * font_size * hori_scale;
    text_pos_changed = true;
}
// State callback: flags the font as dirty. check_state_change() later
// re-reads the font and the font size from the graphics state.
void HTMLRenderer::updateFont(GfxState * state)
{
font_changed = true;
}
// State callback: flags the CTM as dirty. The m11..m32 arguments are not used
// here; check_state_change() re-reads the matrix with state->getCTM().
void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
{
ctm_changed = true;
}
// State callback: flags the text matrix as dirty. check_state_change()
// re-reads it with state->getTextMat() when it rebuilds the combined matrix.
void HTMLRenderer::updateTextMat(GfxState * state)
{
text_mat_changed = true;
}
// State callback: flags the horizontal scaling as dirty. check_state_change()
// folds state->getHorizScaling() into the combined matrix when this is set.
void HTMLRenderer::updateHorizScaling(GfxState * state)
{
hori_scale_changed = true;
}
// State callback: flags the letter spacing as dirty. check_state_change()
// later reads the new value with state->getCharSpace().
void HTMLRenderer::updateCharSpace(GfxState * state)
{
letter_space_changed = true;
}
// State callback: flags the word spacing as dirty. check_state_change()
// later reads the new value with state->getWordSpace().
void HTMLRenderer::updateWordSpace(GfxState * state)
{
word_space_changed = true;
}
// State callback: flags the fill color as dirty. check_state_change()
// later reads the new color with state->getFillRGB().
void HTMLRenderer::updateFillColor(GfxState * state)
{
color_changed = true;
}
void HTMLRenderer::check_state_change(GfxState * state)
{
// DEPENDENCY WARNING
// don't adjust the order of state checking
bool close_line = false;
bool need_recheck_position = false;
bool need_rescale_font = false;
// rise
if(all_changed || rise_changed)
{
double new_rise = state->getRise();
if(!_equal(cur_rise, new_rise))
{
need_recheck_position = true;
cur_rise = new_rise;
}
}
// text position
// we've been tracking the text position positively in update... function
if(all_changed || text_pos_changed)
{
if(!(abs(cur_ty - draw_ty) * draw_scale < param->v_eps))
{
close_line = true;
draw_ty = cur_ty;
draw_tx = cur_tx;
}
need_rescale_font = true;
}
// TODO, we may use nested span if only color has been changed
if(all_changed || color_changed)
// draw_tx, draw_ty
// depends: rise & text position
if(need_recheck_position)
{
GfxRGB new_color;
state->getFillRGB(&new_color);
if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b)))
// it's ok to use the old draw_scale
// should draw_scale be updated, we'll close the line anyway
if(!(abs((cur_ty + cur_rise) - draw_ty) * draw_scale < param->v_eps))
{
close_line = true;
cur_color = new_color;
cur_color_id = install_color(&new_color);
}
}
bool need_rescale_font = false;
// font name & size
if(all_changed || font_changed)
{
long long new_fn_id = install_font(state->getFont());
@ -49,22 +112,25 @@ void HTMLRenderer::check_state_change(GfxState * state)
cur_fn_id = new_fn_id;
}
if(!_equal(cur_font_size, state->getFontSize()))
double new_font_size = state->getFontSize();
if(!_equal(cur_font_size, new_font_size))
{
cur_font_size = state->getFontSize();
need_rescale_font = true;
cur_font_size = new_font_size;
}
}
// TODO
// Rise, HorizScale etc
if(all_changed || text_mat_changed || ctm_changed)
// ctm & text ctm & hori scale
if(all_changed || ctm_changed || text_mat_changed || hori_scale_changed)
{
double new_ctm[6];
double * m1 = state->getCTM();
double * m2 = state->getTextMat();
new_ctm[0] = m1[0] * m2[0] + m1[2] * m2[1];
new_ctm[1] = m1[1] * m2[0] + m1[3] * m2[1];
const double * m1 = state->getCTM();
const double * m2 = state->getTextMat();
double hori_scale = state->getHorizScaling();
new_ctm[0] = (m1[0] * m2[0] + m1[2] * m2[1]) * hori_scale;
new_ctm[1] = (m1[1] * m2[0] + m1[3] * m2[1]) * hori_scale;
new_ctm[2] = m1[0] * m2[2] + m1[2] * m2[3];
new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3];
new_ctm[4] = new_ctm[5] = 0;
@ -76,6 +142,8 @@ void HTMLRenderer::check_state_change(GfxState * state)
}
}
// draw_ctm, draw_scale
// depends: font size & ctm & text_ctm & hori scale
if(need_rescale_font)
{
double new_draw_ctm[6];
@ -97,24 +165,58 @@ void HTMLRenderer::check_state_change(GfxState * state)
if(!(_equal(new_draw_font_size, draw_font_size)))
{
close_line = true;
draw_font_size = new_draw_font_size;
cur_fs_id = install_font_size(draw_font_size);
close_line = true;
}
if(!(_tm_equal(new_draw_ctm, draw_ctm)))
{
close_line = true;
memcpy(draw_ctm, new_draw_ctm, sizeof(draw_ctm));
cur_tm_id = install_transform_matrix(draw_ctm);
close_line = true;
}
}
// TODO: track these
/*
if(!(_equal(s1->getCharSpace(), s2->getCharSpace()) && _equal(s1->getWordSpace(), s2->getWordSpace())
&& _equal(s1->getHorizScaling(), s2->getHorizScaling())))
return false;
*/
// letter space
// depends: draw_scale
if(all_changed || letter_space_changed)
{
double new_letter_space = state->getCharSpace();
if(!_equal(cur_letter_space, new_letter_space))
{
close_line = true;
cur_letter_space = new_letter_space;
cur_ls_id = install_letter_space(cur_letter_space * draw_scale);
}
}
// word space
// depends draw_scale
if(all_changed || word_space_changed)
{
double new_word_space = state->getWordSpace();
if(!_equal(cur_word_space, new_word_space))
{
close_line = true;
cur_word_space = new_word_space;
cur_ws_id = install_word_space(cur_word_space * draw_scale);
}
}
// TODO, we may use nested span if only color is changed
// color
if(all_changed || color_changed)
{
GfxRGB new_color;
state->getFillRGB(&new_color);
if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b)))
{
close_line = true;
cur_color = new_color;
cur_color_id = install_color(&new_color);
}
}
reset_state_track();
@ -124,10 +226,18 @@ void HTMLRenderer::check_state_change(GfxState * state)
// Clears every "changed" flag. Called at the end of check_state_change(),
// once the flags have been acted upon, and when a page is started.
// The flags are listed in the order check_state_change() examines them.
void HTMLRenderer::reset_state_track()
{
    all_changed = false;

    rise_changed = false;
    text_pos_changed = false;

    // font_changed used to be cleared twice in this function; once is enough
    font_changed = false;
    ctm_changed = false;
    text_mat_changed = false;
    hori_scale_changed = false;

    letter_space_changed = false;
    word_space_changed = false;

    color_changed = false;
}
void HTMLRenderer::close_cur_line()
@ -137,43 +247,8 @@ void HTMLRenderer::close_cur_line()
html_fout << "</div>" << endl;
line_opened = false;
}
draw_ty = cur_ty + cur_rise;
draw_tx = cur_tx;
}
void HTMLRenderer::updateAll(GfxState * state)
{
all_changed = true;
updateTextPos(state);
}
void HTMLRenderer::updateFont(GfxState * state)
{
font_changed = true;
}
void HTMLRenderer::updateTextMat(GfxState * state)
{
text_mat_changed = true;
}
void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
{
ctm_changed = true;
}
void HTMLRenderer::updateTextPos(GfxState * state)
{
text_pos_changed = true;
cur_tx = state->getLineX();
cur_ty = state->getLineY();
}
void HTMLRenderer::updateTextShift(GfxState * state, double shift)
{
text_pos_changed = true;
cur_tx -= shift * 0.001 * state->getFontSize() * state->getHorizScaling();
}
void HTMLRenderer::updateFillColor(GfxState * state)
{
color_changed = true;
}

View File

@ -166,7 +166,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
double w;
auto wid = install_whitespace(target, w);
html_fout << format("<span class=\"w w%|1$x|\"> </span>") % wid;
html_fout << format("<span class=\"_ _%|1$x|\"> </span>") % wid;
draw_tx += w / draw_scale;
}
}
@ -188,6 +188,13 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
// "t0" is the id_matrix
if(cur_tm_id != 0)
html_fout << format(" t%|1$x|") % cur_tm_id;
if(cur_ls_id != 0)
html_fout << format(" l%|1$x|") % cur_ls_id;
if(cur_ws_id != 0)
html_fout << format(" w%|1$x|") % cur_ws_id;
{
double x,y; // in user space
@ -199,16 +206,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
<< "left:" << x << "px;"
;
}
// TODO: tracking
// letter & word spacing
if(_is_positive(state->getCharSpace()))
html_fout << "letter-spacing:" << state->getCharSpace() << "px;";
if(_is_positive(state->getWordSpace()))
html_fout << "word-spacing:" << state->getWordSpace() << "px;";
//debug
//real pos & hori_scale
if(0)
{
#if 0
@ -223,11 +222,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
html_fout << "\">";
line_opened = true;
draw_tx = cur_tx;
}
// Now ready to output
// get the unicodes
char *p = s->getCString();

View File

@ -122,13 +122,8 @@ class base64stream
{
public:
base64stream(istream & in)
: in(&in)
{ }
base64stream(istream && in)
: in(&in)
{ }
base64stream(istream & in) : in(&in) { }
base64stream(istream && in) : in(&in) { }
ostream & dumpto(ostream & out)
{