1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

Improve covered text handling: 1. take care of chars that correspond to 0 or more than one unicode code point;

2. merge sibling invisible spans; 3. improve interfaces of HTMLLineState and HTMLRenderer;
This commit is contained in:
Duan Yao 2014-06-26 12:39:35 +08:00
parent 65e82028bb
commit 39e171a737
8 changed files with 101 additions and 28 deletions

View File

@ -66,7 +66,7 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
// If a char is treated as image, it is not subject to cover test
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
else if (param.process_covered_text) {
if (html_renderer->get_chars_covered()[drawn_char_count])
if (html_renderer->is_char_covered(drawn_char_count))
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
drawn_char_count++;
}

View File

@ -91,7 +91,7 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
// If a char is treated as image, it is not subject to cover test
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
else if (param.process_covered_text) {
if (html_renderer->get_chars_covered()[drawn_char_count])
if (html_renderer->is_char_covered(drawn_char_count))
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
drawn_char_count++;
}

View File

@ -150,7 +150,14 @@ public:
bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); }
bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); }
const std::vector<bool> & get_chars_covered() { return covered_text_handler.get_chars_covered(); }
/*
* Covered text handling.
*/
// Whether a char (actually a glyph) is covered by non-char content. Index is in drawing order within the current page.
// Does not fail on out-of-bound indices; returns false instead.
bool is_char_covered(int index);
// Currently drawn char (glyph) count in current page.
int get_char_count() { return (int)covered_text_handler.get_chars_covered().size(); }
protected:
////////////////////////////////////////////////////

View File

@ -123,8 +123,7 @@ void HTMLRenderer::reset_state()
cur_line_state.y = 0;
memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
if (param.process_covered_text)
cur_line_state.chars_covered = &covered_text_handler.get_chars_covered();
cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);};
cur_clip_state.xmin = 0;
cur_clip_state.xmax = 0;
@ -510,7 +509,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y);
state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y);
if (param.process_covered_text)
cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size();
cur_line_state.first_char_index = get_char_count();
html_text_page.open_new_line(cur_line_state);
cur_text_state.vertical_align = 0;

View File

@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
while (len > 0)
{
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%d\n", u[0]));
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
if(!(equal(ox, 0) && equal(oy, 0)))
{
@ -101,6 +101,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
if(is_space && (param.space_as_offset))
{
html_text_page.get_cur_line()->append_padding_char();
// ignore horiz_scaling, as it has been merged into CTM
html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
}
@ -150,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
draw_ty += dy;
}
// Reports whether the char (glyph) at `index` is marked as covered.
// `index` is the glyph's position in drawing order within the current page.
// An out-of-range index is not treated as an error: a warning is written to
// stderr and false is returned.
bool HTMLRenderer::is_char_covered(int index)
{
    // Bind by const reference: plain `auto` deduces std::vector<bool> by value,
    // which would copy the whole per-page vector on every single lookup.
    const auto & covered = covered_text_handler.get_chars_covered();
    if (index < 0 || index >= static_cast<int>(covered.size()))
    {
        std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
                  << index << ", size: " << covered.size() << std::endl;
        return false;
    }
    return covered[index];
}
} // namespace pdf2htmlEX

View File

@ -5,6 +5,8 @@
#ifndef HTMLSTATE_H__
#define HTMLSTATE_H__
#include <functional>
#include "Color.h"
namespace pdf2htmlEX {
@ -64,9 +66,10 @@ struct HTMLLineState
double transform_matrix[4];
// The page-scope char index (in drawing order) of the first char in this line.
int first_char_index;
const std::vector<bool> * chars_covered;
// A function to determine whether a char is covered at a given index.
std::function<bool(int)> is_char_covered;
HTMLLineState(): first_char_index(-1), chars_covered(nullptr) { }
HTMLLineState(): first_char_index(-1) { }
};
struct HTMLClipState

View File

@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
{
text.insert(text.end(), u, u+l);
if (l == 1)
text.push_back(min(u[0], (unsigned)INT_MAX));
else
{
text.push_back(- decomposed_text.size() - 1);
decomposed_text.emplace_back();
decomposed_text.back().assign(u, u + l);
}
this->width += width;
}
@ -69,30 +76,54 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state)
last_state.font_size *= last_state.font_info->font_size_scale;
}
void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen)
void HTMLTextLine::dump_char(std::ostream & out, int pos)
{
if (!line_state.chars_covered)
int c = text[pos];
if (c > 0)
{
writeUnicodes(out, u, uLen);
Unicode u = c;
writeUnicodes(out, &u, 1);
}
else if (c < 0)
{
auto dt = decomposed_text[- c - 1];
writeUnicodes(out, &dt.front(), dt.size());
}
}
void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
if (line_state.first_char_index < 0)
{
for (int i = 0; i < len; i++)
dump_char(out, begin + i);
return;
}
//TODO merge sibling invisiable spans
int start = this->line_state.first_char_index + dumped_char_count;
for(int i = 0; i < uLen; i++)
bool invisible_group_open = false;
for(int i = 0; i < len; i++)
{
if (!(*line_state.chars_covered)[start + i]) //visible
if (!line_state.is_char_covered(line_state.first_char_index + begin + i)) //visible
{
writeUnicodes(out, u + i, 1);
if (invisible_group_open)
{
invisible_group_open = false;
out << "</span>";
}
dump_char(out, begin + i);
}
else
{
out << "<span style=\"color:transparent\">";
writeUnicodes(out, u + i, 1);
out << "</span>";
if (!invisible_group_open)
{
out << "<span style=\"color:transparent\">";
invisible_group_open = true;
}
dump_char(out, begin + i);
}
}
dumped_char_count += uLen;
if (invisible_group_open)
out << "</span>";
}
void HTMLTextLine::dump_text(ostream & out)
@ -110,8 +141,6 @@ void HTMLTextLine::dump_text(ostream & out)
return;
}
dumped_char_count = 0;
// Start Output
{
// open <div> for the current text line
@ -244,7 +273,7 @@ void HTMLTextLine::dump_text(ostream & out)
size_t next_text_idx = text_idx2;
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
next_text_idx = cur_offset_iter->start_idx;
dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
cur_text_idx = next_text_idx;
}
}

View File

@ -73,7 +73,16 @@ public:
double width;
};
/**
* Append the unicode of a drawn char (glyph). l != 1 means this glyph does not
* correspond to exactly one code point (it maps to zero or to multiple code points).
*/
void append_unicodes(const Unicode * u, int l, double width);
/**
* Append a special padding char with 0 width, in order to keep char index consistent.
* The padding char is ignored during output.
*/
void append_padding_char() { text.push_back(0); }
void append_offset(double width);
void append_state(const HTMLTextState & text_state);
void dump_text(std::ostream & out);
@ -91,7 +100,13 @@ public:
private:
void optimize_normal(std::vector<HTMLTextLine*> &);
void optimize_aggressive(std::vector<HTMLTextLine*> &);
void dump_chars(std::ostream & out, const Unicode * u, int uLen);
/**
* Dump chars' unicode to output stream.
* begin/pos is the index in 'text'.
*/
void dump_chars(std::ostream & out, int begin, int len);
void dump_char(std::ostream & out, int pos);
const Param & param;
AllStateManager & all_manager;
@ -103,9 +118,16 @@ private:
std::vector<State> states;
std::vector<Offset> offsets;
std::vector<Unicode> text;
int dumped_char_count;
/**
* Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
* - If c > 0, it is the unicode code point corresponding to the glyph;
* - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
* - If c < 0, this glyph does not correspond to exactly one unicode code point (zero or several);
* its code points are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
*/
std::vector<int> text;
std::vector<std::vector<Unicode> > decomposed_text;
};
} // namespace pdf2htmlEX