mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
Improve covered text handling: 1. take care of chars that correspond to zero or more than one Unicode code point;
2. merge sibling invisible spans; 3. improve the interfaces of HTMLLineState and HTMLRenderer;
This commit is contained in:
parent
65e82028bb
commit
39e171a737
@ -66,7 +66,7 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
|
||||
// If a char is treated as image, it is not subject to cover test
|
||||
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
|
||||
else if (param.process_covered_text) {
|
||||
if (html_renderer->get_chars_covered()[drawn_char_count])
|
||||
if (html_renderer->is_char_covered(drawn_char_count))
|
||||
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
|
||||
drawn_char_count++;
|
||||
}
|
||||
|
@ -91,7 +91,7 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
|
||||
// If a char is treated as image, it is not subject to cover test
|
||||
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
|
||||
else if (param.process_covered_text) {
|
||||
if (html_renderer->get_chars_covered()[drawn_char_count])
|
||||
if (html_renderer->is_char_covered(drawn_char_count))
|
||||
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
|
||||
drawn_char_count++;
|
||||
}
|
||||
|
@ -150,7 +150,14 @@ public:
|
||||
bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); }
|
||||
bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); }
|
||||
|
||||
const std::vector<bool> & get_chars_covered() { return covered_text_handler.get_chars_covered(); }
|
||||
/*
|
||||
* Covered text handling.
|
||||
*/
|
||||
// Whether a char (actually a glyph) is covered by non-char objects. The index is in drawing order within the current page.
|
||||
// Does not fail on out-of-bounds indices; returns false instead.
|
||||
bool is_char_covered(int index);
|
||||
// Currently drawn char (glyph) count in current page.
|
||||
int get_char_count() { return (int)covered_text_handler.get_chars_covered().size(); }
|
||||
|
||||
protected:
|
||||
////////////////////////////////////////////////////
|
||||
|
@ -123,8 +123,7 @@ void HTMLRenderer::reset_state()
|
||||
cur_line_state.y = 0;
|
||||
memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
|
||||
|
||||
if (param.process_covered_text)
|
||||
cur_line_state.chars_covered = &covered_text_handler.get_chars_covered();
|
||||
cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);};
|
||||
|
||||
cur_clip_state.xmin = 0;
|
||||
cur_clip_state.xmax = 0;
|
||||
@ -510,7 +509,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
|
||||
state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y);
|
||||
state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y);
|
||||
if (param.process_covered_text)
|
||||
cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size();
|
||||
cur_line_state.first_char_index = get_char_count();
|
||||
html_text_page.open_new_line(cur_line_state);
|
||||
|
||||
cur_text_state.vertical_align = 0;
|
||||
|
@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
while (len > 0)
|
||||
{
|
||||
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
|
||||
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%d\n", u[0]));
|
||||
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
|
||||
|
||||
if(!(equal(ox, 0) && equal(oy, 0)))
|
||||
{
|
||||
@ -101,6 +101,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
|
||||
if(is_space && (param.space_as_offset))
|
||||
{
|
||||
html_text_page.get_cur_line()->append_padding_char();
|
||||
// ignore horiz_scaling, as it has been merged into CTM
|
||||
html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
|
||||
}
|
||||
@ -150,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
draw_ty += dy;
|
||||
}
|
||||
|
||||
bool HTMLRenderer::is_char_covered(int index)
|
||||
{
|
||||
auto covered = covered_text_handler.get_chars_covered();
|
||||
if (index < 0 || index >= (int)covered.size())
|
||||
{
|
||||
std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
|
||||
<< index << ", size: " << covered.size() <<endl;
|
||||
return false;
|
||||
}
|
||||
return covered[index];
|
||||
}
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -5,6 +5,8 @@
|
||||
#ifndef HTMLSTATE_H__
|
||||
#define HTMLSTATE_H__
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "Color.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
@ -64,9 +66,10 @@ struct HTMLLineState
|
||||
double transform_matrix[4];
|
||||
// The page-scope char index (in drawing order) of the first char in this line.
|
||||
int first_char_index;
|
||||
const std::vector<bool> * chars_covered;
|
||||
// A function to determine whether a char is covered at a given index.
|
||||
std::function<bool(int)> is_char_covered;
|
||||
|
||||
HTMLLineState(): first_char_index(-1), chars_covered(nullptr) { }
|
||||
HTMLLineState(): first_char_index(-1) { }
|
||||
};
|
||||
|
||||
struct HTMLClipState
|
||||
|
@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
|
||||
|
||||
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
|
||||
{
|
||||
text.insert(text.end(), u, u+l);
|
||||
if (l == 1)
|
||||
text.push_back(min(u[0], (unsigned)INT_MAX));
|
||||
else
|
||||
{
|
||||
text.push_back(- decomposed_text.size() - 1);
|
||||
decomposed_text.emplace_back();
|
||||
decomposed_text.back().assign(u, u + l);
|
||||
}
|
||||
this->width += width;
|
||||
}
|
||||
|
||||
@ -69,30 +76,54 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state)
|
||||
last_state.font_size *= last_state.font_info->font_size_scale;
|
||||
}
|
||||
|
||||
void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen)
|
||||
void HTMLTextLine::dump_char(std::ostream & out, int pos)
|
||||
{
|
||||
if (!line_state.chars_covered)
|
||||
int c = text[pos];
|
||||
if (c > 0)
|
||||
{
|
||||
writeUnicodes(out, u, uLen);
|
||||
Unicode u = c;
|
||||
writeUnicodes(out, &u, 1);
|
||||
}
|
||||
else if (c < 0)
|
||||
{
|
||||
auto dt = decomposed_text[- c - 1];
|
||||
writeUnicodes(out, &dt.front(), dt.size());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Write the glyphs text[begin] .. text[begin + len - 1] to `out`.
 *
 * When line_state.first_char_index is negative, the glyphs are written
 * as-is. Otherwise each glyph is looked up through
 * line_state.is_char_covered at index (first_char_index + position in
 * 'text'), and every maximal run of covered glyphs is wrapped in one
 * <span style="color:transparent"> element.
 *
 * @param out    destination stream.
 * @param begin  index of the first glyph in 'text'.
 * @param len    number of glyphs to write.
 */
void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
    if (line_state.first_char_index < 0)
    {
        for (int i = 0; i < len; i++)
            dump_char(out, begin + i);
        return;
    }

    // True while a transparent <span> is open in the output.
    bool invisible_group_open = false;
    for (int i = 0; i < len; i++)
    {
        const bool covered = line_state.is_char_covered(line_state.first_char_index + begin + i);
        // Emit markup only where visibility flips, so neighbouring covered
        // glyphs share a single span.
        if (covered != invisible_group_open)
        {
            if (covered)
                out << "<span style=\"color:transparent\">";
            else
                out << "</span>";
            invisible_group_open = covered;
        }
        dump_char(out, begin + i);
    }

    // Close a span left open by a trailing run of covered glyphs.
    if (invisible_group_open)
        out << "</span>";
}
|
||||
|
||||
void HTMLTextLine::dump_text(ostream & out)
|
||||
@ -110,8 +141,6 @@ void HTMLTextLine::dump_text(ostream & out)
|
||||
return;
|
||||
}
|
||||
|
||||
dumped_char_count = 0;
|
||||
|
||||
// Start Output
|
||||
{
|
||||
// open <div> for the current text line
|
||||
@ -244,7 +273,7 @@ void HTMLTextLine::dump_text(ostream & out)
|
||||
size_t next_text_idx = text_idx2;
|
||||
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
|
||||
next_text_idx = cur_offset_iter->start_idx;
|
||||
dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
|
||||
dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
|
||||
cur_text_idx = next_text_idx;
|
||||
}
|
||||
}
|
||||
|
@ -73,7 +73,16 @@ public:
|
||||
double width;
|
||||
};
|
||||
|
||||
/**
|
||||
* Append a drawn char (glyph)'s unicode. l > 1 means this glyph corresponds to
|
||||
* multiple code points.
|
||||
*/
|
||||
void append_unicodes(const Unicode * u, int l, double width);
|
||||
/**
|
||||
* Append a special padding char with 0 width, in order to keep char index consistent.
|
||||
* The padding char is ignored during output.
|
||||
*/
|
||||
void append_padding_char() { text.push_back(0); }
|
||||
void append_offset(double width);
|
||||
void append_state(const HTMLTextState & text_state);
|
||||
void dump_text(std::ostream & out);
|
||||
@ -91,7 +100,13 @@ public:
|
||||
private:
|
||||
void optimize_normal(std::vector<HTMLTextLine*> &);
|
||||
void optimize_aggressive(std::vector<HTMLTextLine*> &);
|
||||
void dump_chars(std::ostream & out, const Unicode * u, int uLen);
|
||||
|
||||
/**
|
||||
* Dump chars' unicode to output stream.
|
||||
* begin/pos is the index in 'text'.
|
||||
*/
|
||||
void dump_chars(std::ostream & out, int begin, int len);
|
||||
void dump_char(std::ostream & out, int pos);
|
||||
|
||||
const Param & param;
|
||||
AllStateManager & all_manager;
|
||||
@ -103,9 +118,16 @@ private:
|
||||
|
||||
std::vector<State> states;
|
||||
std::vector<Offset> offsets;
|
||||
std::vector<Unicode> text;
|
||||
|
||||
int dumped_char_count;
|
||||
/**
|
||||
* Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
|
||||
* - If c > 0, it is the unicode code point corresponding to the glyph;
|
||||
* - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
|
||||
* - If c < 0, this glyph does not correspond to exactly one unicode code point; its code points,
|
||||
* which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
|
||||
*/
|
||||
std::vector<int> text;
|
||||
std::vector<std::vector<Unicode> > decomposed_text;
|
||||
};
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
Loading…
Reference in New Issue
Block a user