1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

Improve covered text handling: 1. take care of chars that correspond to zero or more than one Unicode code point;

2. merge sibling invisible spans; 3. improve interfaces of HTMLLineState and HTMLRenderer;
This commit is contained in:
Duan Yao 2014-06-26 12:39:35 +08:00
parent 65e82028bb
commit 39e171a737
8 changed files with 101 additions and 28 deletions

View File

@ -66,7 +66,7 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
// If a char is treated as image, it is not subject to cover test // If a char is treated as image, it is not subject to cover test
// (see HTMLRenderer::drawString), so don't increase drawn_char_count. // (see HTMLRenderer::drawString), so don't increase drawn_char_count.
else if (param.process_covered_text) { else if (param.process_covered_text) {
if (html_renderer->get_chars_covered()[drawn_char_count]) if (html_renderer->is_char_covered(drawn_char_count))
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
drawn_char_count++; drawn_char_count++;
} }

View File

@ -91,7 +91,7 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
// If a char is treated as image, it is not subject to cover test // If a char is treated as image, it is not subject to cover test
// (see HTMLRenderer::drawString), so don't increase drawn_char_count. // (see HTMLRenderer::drawString), so don't increase drawn_char_count.
else if (param.process_covered_text) { else if (param.process_covered_text) {
if (html_renderer->get_chars_covered()[drawn_char_count]) if (html_renderer->is_char_covered(drawn_char_count))
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
drawn_char_count++; drawn_char_count++;
} }

View File

@ -150,7 +150,14 @@ public:
bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); } bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); }
bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); } bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); }
const std::vector<bool> & get_chars_covered() { return covered_text_handler.get_chars_covered(); } /*
* Covered text handling.
*/
// Whether a char (actually a glyph) is covered by non-char objects. The index is in drawing order within the current page.
// Does not fail on an out-of-bounds index; returns false instead.
bool is_char_covered(int index);
// Currently drawn char (glyph) count in current page.
int get_char_count() { return (int)covered_text_handler.get_chars_covered().size(); }
protected: protected:
//////////////////////////////////////////////////// ////////////////////////////////////////////////////

View File

@ -123,8 +123,7 @@ void HTMLRenderer::reset_state()
cur_line_state.y = 0; cur_line_state.y = 0;
memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
if (param.process_covered_text) cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);};
cur_line_state.chars_covered = &covered_text_handler.get_chars_covered();
cur_clip_state.xmin = 0; cur_clip_state.xmin = 0;
cur_clip_state.xmax = 0; cur_clip_state.xmax = 0;
@ -510,7 +509,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y);
state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y);
if (param.process_covered_text) if (param.process_covered_text)
cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size(); cur_line_state.first_char_index = get_char_count();
html_text_page.open_new_line(cur_line_state); html_text_page.open_new_line(cur_line_state);
cur_text_state.vertical_align = 0; cur_text_state.vertical_align = 0;

View File

@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
while (len > 0) while (len > 0)
{ {
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%d\n", u[0])); HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
if(!(equal(ox, 0) && equal(oy, 0))) if(!(equal(ox, 0) && equal(oy, 0)))
{ {
@ -101,6 +101,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
if(is_space && (param.space_as_offset)) if(is_space && (param.space_as_offset))
{ {
html_text_page.get_cur_line()->append_padding_char();
// ignore horiz_scaling, as it has been merged into CTM // ignore horiz_scaling, as it has been merged into CTM
html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
} }
@ -150,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
draw_ty += dy; draw_ty += dy;
} }
// Report whether the char (glyph) at `index` is marked as covered.
// `index` counts glyphs in drawing order within the current page.
// An out-of-range index is not an error: a warning is written to stderr
// and false is returned.
bool HTMLRenderer::is_char_covered(int index)
{
    // Bind by reference: plain `auto` would copy the whole vector on every
    // call, and this function is queried once per glyph.
    const auto & covered = covered_text_handler.get_chars_covered();
    const int covered_count = static_cast<int>(covered.size());
    if (index < 0 || index >= covered_count)
    {
        std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
                  << index << ", size: " << covered.size() << std::endl;
        return false;
    }
    return covered[index];
}
} // namespace pdf2htmlEX } // namespace pdf2htmlEX

View File

@ -5,6 +5,8 @@
#ifndef HTMLSTATE_H__ #ifndef HTMLSTATE_H__
#define HTMLSTATE_H__ #define HTMLSTATE_H__
#include <functional>
#include "Color.h" #include "Color.h"
namespace pdf2htmlEX { namespace pdf2htmlEX {
@ -64,9 +66,10 @@ struct HTMLLineState
double transform_matrix[4]; double transform_matrix[4];
// The page-cope char index(in drawing order) of the first char in this line. // The page-cope char index(in drawing order) of the first char in this line.
int first_char_index; int first_char_index;
const std::vector<bool> * chars_covered; // A function to determine whether a char is covered at a given index.
std::function<bool(int)> is_char_covered;
HTMLLineState(): first_char_index(-1), chars_covered(nullptr) { } HTMLLineState(): first_char_index(-1) { }
}; };
struct HTMLClipState struct HTMLClipState

View File

@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
{ {
text.insert(text.end(), u, u+l); if (l == 1)
text.push_back(min(u[0], (unsigned)INT_MAX));
else
{
text.push_back(- decomposed_text.size() - 1);
decomposed_text.emplace_back();
decomposed_text.back().assign(u, u + l);
}
this->width += width; this->width += width;
} }
@ -69,30 +76,54 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state)
last_state.font_size *= last_state.font_info->font_size_scale; last_state.font_size *= last_state.font_info->font_size_scale;
} }
void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen) void HTMLTextLine::dump_char(std::ostream & out, int pos)
{ {
if (!line_state.chars_covered) int c = text[pos];
if (c > 0)
{ {
writeUnicodes(out, u, uLen); Unicode u = c;
writeUnicodes(out, &u, 1);
}
else if (c < 0)
{
auto dt = decomposed_text[- c - 1];
writeUnicodes(out, &dt.front(), dt.size());
}
}
void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
if (line_state.first_char_index < 0)
{
for (int i = 0; i < len; i++)
dump_char(out, begin + i);
return; return;
} }
//TODO merge sibling invisiable spans bool invisible_group_open = false;
int start = this->line_state.first_char_index + dumped_char_count; for(int i = 0; i < len; i++)
for(int i = 0; i < uLen; i++)
{ {
if (!(*line_state.chars_covered)[start + i]) //visible if (!line_state.is_char_covered(line_state.first_char_index + begin + i)) //visible
{ {
writeUnicodes(out, u + i, 1); if (invisible_group_open)
{
invisible_group_open = false;
out << "</span>";
}
dump_char(out, begin + i);
} }
else else
{ {
out << "<span style=\"color:transparent\">"; if (!invisible_group_open)
writeUnicodes(out, u + i, 1); {
out << "</span>"; out << "<span style=\"color:transparent\">";
invisible_group_open = true;
}
dump_char(out, begin + i);
} }
} }
dumped_char_count += uLen; if (invisible_group_open)
out << "</span>";
} }
void HTMLTextLine::dump_text(ostream & out) void HTMLTextLine::dump_text(ostream & out)
@ -110,8 +141,6 @@ void HTMLTextLine::dump_text(ostream & out)
return; return;
} }
dumped_char_count = 0;
// Start Output // Start Output
{ {
// open <div> for the current text line // open <div> for the current text line
@ -244,7 +273,7 @@ void HTMLTextLine::dump_text(ostream & out)
size_t next_text_idx = text_idx2; size_t next_text_idx = text_idx2;
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx) if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
next_text_idx = cur_offset_iter->start_idx; next_text_idx = cur_offset_iter->start_idx;
dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
cur_text_idx = next_text_idx; cur_text_idx = next_text_idx;
} }
} }

View File

@ -73,7 +73,16 @@ public:
double width; double width;
}; };
/**
* Append the Unicode code points of one drawn char (glyph). l > 1 means this glyph corresponds to
* multiple code points.
*/
void append_unicodes(const Unicode * u, int l, double width); void append_unicodes(const Unicode * u, int l, double width);
/**
* Append a special padding char with 0 width, in order to keep char index consistent.
* The padding char is ignored during output.
*/
void append_padding_char() { text.push_back(0); }
void append_offset(double width); void append_offset(double width);
void append_state(const HTMLTextState & text_state); void append_state(const HTMLTextState & text_state);
void dump_text(std::ostream & out); void dump_text(std::ostream & out);
@ -91,7 +100,13 @@ public:
private: private:
void optimize_normal(std::vector<HTMLTextLine*> &); void optimize_normal(std::vector<HTMLTextLine*> &);
void optimize_aggressive(std::vector<HTMLTextLine*> &); void optimize_aggressive(std::vector<HTMLTextLine*> &);
void dump_chars(std::ostream & out, const Unicode * u, int uLen);
/**
* Dump chars' unicode to output stream.
* begin/pos is the index in 'text'.
*/
void dump_chars(std::ostream & out, int begin, int len);
void dump_char(std::ostream & out, int pos);
const Param & param; const Param & param;
AllStateManager & all_manager; AllStateManager & all_manager;
@ -103,9 +118,16 @@ private:
std::vector<State> states; std::vector<State> states;
std::vector<Offset> offsets; std::vector<Offset> offsets;
std::vector<Unicode> text;
int dumped_char_count; /**
* Drawn chars (glyphs) in this line are stored in 'text'. For each element c in 'text':
* - If c > 0, it is the Unicode code point that corresponds to the glyph;
* - If c == 0, it is a padding char, and is ignored during output (TODO some bad PDFs utilize 0?);
* - If c < 0, this glyph does not correspond to exactly one Unicode code point (zero, or more than one);
* its code points are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
*/
std::vector<int> text;
std::vector<std::vector<Unicode> > decomposed_text;
}; };
} // namespace pdf2htmlEX } // namespace pdf2htmlEX