1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

Implement covered text handling.

This commit is contained in:
Duan Yao 2014-06-15 03:44:28 +08:00
parent 9c0b2a8a74
commit ce28c00a49
19 changed files with 368 additions and 8 deletions

View File

@ -161,6 +161,8 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC}
src/Base64Stream.cc
src/Color.h
src/Color.cc
src/CoveredTextHandler.h
src/CoveredTextHandler.cc
src/HTMLState.h
src/HTMLTextLine.h
src/HTMLTextLine.cc

View File

@ -242,6 +242,11 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above.
.B --optimize-text <0|1> (Default: 0)
If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong.
.TP
.B --process-covered-text <0|1> (Default: 0)
If set to 1, pdf2htmlEX will try to detect text covered by other graphics and arrange it properly,
i.e. covered text is made transparent in the text layer and is drawn on the background layer.
.SS Background Image
.TP

View File

@ -63,6 +63,13 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
{
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
}
// If a char is treated as image, it is not subject to cover test
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
else if (param.process_covered_text) {
if (html_renderer->get_chars_covered()[drawn_char_count])
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
drawn_char_count++;
}
}
void CairoBackgroundRenderer::beginTextObject(GfxState *state)
@ -97,6 +104,7 @@ static GBool annot_cb(Annot *, void * pflag) {
bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
{
drawn_char_count = 0;
double page_width;
double page_height;
if(param.use_cropbox)

View File

@ -66,6 +66,7 @@ private:
std::unordered_map<int, int> bitmaps_ref_count;
// id of bitmaps' stream used by current page
std::vector<int> bitmaps_in_current_page;
int drawn_char_count;
};
}

55
src/CoveredTextHandler.cc Normal file
View File

@ -0,0 +1,55 @@
/*
* CoveredTextHandler.cc
*
* Created on: 2014-6-14
* Author: duanyao
*/
#include "CoveredTextHandler.h"
#include "util/math.h"
namespace pdf2htmlEX {
CoveredTextHandler::CoveredTextHandler()
{
    // Nothing to set up here: the member vectors are default-constructed empty.
}
CoveredTextHandler::~CoveredTextHandler()
{
    // Nothing to release explicitly: the member vectors clean up after themselves.
}
// Forget every character recorded so far, so that the next page starts
// with no bounding boxes and no covered flags.
void CoveredTextHandler::reset()
{
    chars_covered.clear();
    char_bboxes.clear();
}
// Record one drawn character.
// bbox points at four doubles (x0, y0, x1, y1); they are appended to the flat
// char_bboxes array, and the character starts out flagged as not covered.
void CoveredTextHandler::add_char_bbox(double * bbox)
{
    char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4);
    chars_covered.push_back(false);
}
void CoveredTextHandler::add_non_char_bbox(double * bbox, int index)
{
if (index < 0)
index = chars_covered.size();
for (int i = 0; i < index; i++)
{
if (chars_covered[i])
continue;
double * cbbox = &char_bboxes[i * 4];
if (bbox_intersect(cbbox, bbox))
{
chars_covered[i] = true;
add_non_char_bbox(cbbox, i);
}
}
}
}

62
src/CoveredTextHandler.h Normal file
View File

@ -0,0 +1,62 @@
/*
* CoveredTextHandler.h
*
* Created on: 2014-6-14
* Author: duanyao
*/
#ifndef COVEREDTEXTHANDLER_H__
#define COVEREDTEXTHANDLER_H__
#include <vector>
namespace pdf2htmlEX {
/**
 * Detect characters that are covered by non-char graphics on a page.
 *
 * Intended use per page: call reset(), then report every drawn character with
 * add_char_bbox() and every drawn non-char graphic with add_non_char_bbox(),
 * and finally read the per-character flags through get_chars_covered().
 */
class CoveredTextHandler
{
public:
    CoveredTextHandler();
    virtual ~CoveredTextHandler();

    /**
     * Reset to initial state. Should be called when starting to draw a page.
     */
    void reset();

    /**
     * Add a drawn character's bounding box.
     * @param bbox (x0, y0, x1, y1)
     */
    void add_char_bbox(double * bbox);

    /**
     * Add a drawn non-char graphic's bounding box.
     * If it intersects any previously drawn char's bbox, the char is marked as covered
     * and treated as a non-char.
     * @param bbox (x0, y0, x1, y1)
     * @param index this graphic's drawing order: assume it is drawn after the (index-1)th
     *     char. -1 means after the last char.
     */
    void add_non_char_bbox(double * bbox, int index = -1);

    /**
     * An array of flags indicating whether a char is covered by any non-char graphics.
     * Indexed by the order in which the chars were added.
     * This vector grows as add_char_bbox() is called, so its size is the count
     * of currently drawn chars.
     * The accessor is const so it can also be used through a const handler.
     */
    const std::vector<bool> & get_chars_covered() const { return chars_covered; }

private:
    // One flag per recorded char: true once the char is known to be covered.
    std::vector<bool> chars_covered;
    // Flat list of char bboxes, four doubles per char:
    // x00, y00, x01, y01; x10, y10, x11, y11;...
    std::vector<double> char_bboxes;
};
}
#endif /* COVEREDTEXTHANDLER_H__ */

View File

@ -31,6 +31,7 @@
#include "HTMLTextPage.h"
#include "BackgroundRenderer/BackgroundRenderer.h"
#include "CoveredTextHandler.h"
#include "util/const.h"
#include "util/misc.h"
@ -125,6 +126,15 @@ public:
virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
int width, int height,
GfxImageColorMap *colorMap,
GBool interpolate,
Stream *maskStr,
int maskWidth, int maskHeight,
GfxImageColorMap *maskColorMap,
GBool maskInterpolate);
virtual void stroke(GfxState *state) { css_do_path(state, false); }
virtual void fill(GfxState *state) { css_do_path(state, true); }
virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax);
@ -135,6 +145,8 @@ public:
bool can_stroke(GfxState *state) { return css_do_path(state, false, true); }
bool can_fill(GfxState *state) { return css_do_path(state, true, true); }
const std::vector<bool> & get_chars_covered() { return covered_text_handler.get_chars_covered(); }
protected:
////////////////////////////////////////////////////
// misc
@ -215,6 +227,19 @@ protected:
const GfxRGB * line_color, const GfxRGB * fill_color,
void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr );
////////////////////////////////////////////////////
// Covered text handling
////////////////////////////////////////////////////
/*
* Cue CoveredTextHandler that a character is drawn
* x, y: glyph-drawing position, in PDF text object space.
* ax, ay: glyph advance, in glyph space.
*/
void add_char_bbox(GfxState *state, double x, double y, double ax, double ay);
/*
* Cue CoveredTextHandler that an image is drawn
*/
void add_image_bbox(GfxState *state);
////////////////////////////////////////////////////
// PDF stuffs
@ -338,6 +363,8 @@ protected:
std::string cur_page_filename;
static const std::string MANIFEST_FILENAME;
CoveredTextHandler covered_text_handler;
};
} //namespace pdf2htmlEX

View File

@ -133,13 +133,10 @@ void HTMLRenderer::process(PDFDoc *doc)
cur_page_filename = filled_template_filename;
}
if(param.process_nontext)
{
fallback_bg_required = !bg_renderer->render_page(doc, i);
if (fallback_bg_required && fallback_bg_renderer != nullptr)
fallback_bg_renderer->render_page(doc, i);
}
// We handle covered texts during doc->displayPage(this...),
// and bg_renderer->render_page() depends on the result, so it must be called after
// doc->displayPage(this...).
covered_text_handler.reset();
doc->displayPage(this, i,
text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
0,
@ -148,6 +145,13 @@ void HTMLRenderer::process(PDFDoc *doc)
false, // printing
nullptr, nullptr, nullptr, nullptr);
if(param.process_nontext)
{
fallback_bg_required = !bg_renderer->render_page(doc, i);
if (fallback_bg_required && fallback_bg_renderer != nullptr)
fallback_bg_renderer->render_page(doc, i);
}
if(param.split_pages)
{
delete f_curpage;

View File

@ -14,6 +14,8 @@ namespace pdf2htmlEX {
void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
{
add_image_bbox(state);
return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
#if 0
@ -62,4 +64,30 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w
#endif
}
// Soft-masked images take part in covered-text detection: report the image's
// bounding box first, then hand all arguments unchanged to the OutputDev
// implementation.
void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
        int width, int height,
        GfxImageColorMap *colorMap,
        GBool interpolate,
        Stream *maskStr,
        int maskWidth, int maskHeight,
        GfxImageColorMap *maskColorMap,
        GBool maskInterpolate)
{
    add_image_bbox(state);

    OutputDev::drawSoftMaskedImage(state, ref, str,
            width, height, colorMap, interpolate,
            maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate);
}
// Tell the covered-text handler that an image has been drawn.
// The image is taken to occupy the square (0,0)-(1,1) before the CTM is
// applied; the transformed bounding box is reported as a non-char graphic.
// Does nothing unless param.process_covered_text is set.
void HTMLRenderer::add_image_bbox(GfxState *state)
{
    if (param.process_covered_text)
    {
        double image_bbox[4] {0, 0, 1, 1};
        tm_transform_bbox(state->getCTM(), image_bbox);
        covered_text_handler.add_non_char_bbox(image_bbox);
    }
}
} // namespace pdf2htmlEX

View File

@ -119,6 +119,9 @@ void HTMLRenderer::reset_state()
cur_line_state.y = 0;
memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
if (param.process_covered_text)
cur_line_state.chars_covered = &covered_text_handler.get_chars_covered();
cur_clip_state.xmin = 0;
cur_clip_state.xmax = 0;
cur_clip_state.ymin = 0;
@ -502,6 +505,8 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
double rise_x, rise_y;
state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y);
state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y);
if (param.process_covered_text)
cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size();
html_text_page.open_new_line(cur_line_state);
cur_text_state.vertical_align = 0;

View File

@ -72,6 +72,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
cerr << "TODO: non-zero origins" << endl;
}
add_char_bbox(state, dx, dy, dx1, dy1);
bool is_space = false;
if (n == 1 && *p == ' ')
{
@ -143,4 +145,43 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
draw_ty += dy;
}
// Tell the covered-text handler that a character has been drawn.
// x, y:   position passed in by the caller, added to the current text position
//         after that position has been mapped back through the inverse text matrix.
// ax, ay: extent of the character box before ascent/descent are applied.
// Does nothing unless param.process_covered_text is set.
void HTMLRenderer::add_char_bbox(GfxState *state, double x, double y, double ax, double ay)
{
    if (!param.process_covered_text)
        return;

    // Private copies of the renderer's current text transform and of the
    // text matrix held by the graphics state.
    Matrix text_ctm, text_mat, inv_text_mat;
    memcpy(text_ctm.m, this->cur_text_tm, sizeof(text_ctm.m));
    memcpy(text_mat.m, state->getTextMat(), sizeof(text_mat.m));

    double font_size = state->getFontSize();
    double cur_x = state->getCurX();
    double cur_y = state->getCurY();
    double rise = state->getRise();
    double horiz_scale = state->getHorizScaling();

    // The current position has already been transformed by the text matrix;
    // undo that transform before adding the per-char offsets.
    text_mat.invertTo(&inv_text_mat);
    double origin_x, origin_y;
    inv_text_mat.transform(cur_x, cur_y, &origin_x, &origin_y);

    //TODO Vertical? Currently vertical/type3 chars are treated as non-chars.
    // Per-char matrix: scale by the font size (and horizontal scaling on x),
    // translate to the char origin shifted by the text rise.
    double glyph_tm[6] {font_size * horiz_scale, 0, 0, font_size,
                        origin_x + x, origin_y + y + rise};
    double final_tm[6];
    tm_multiply(final_tm, text_ctm.m, glyph_tm);

    auto font = state->getFont();
    double bbox[4] {0, 0, ax, ay};
    double descent = font->getDescent();
    double ascent = font->getAscent();
    // Only writing mode 0 gets the box extended by descent/ascent.
    //TODO Vertical? Other writing modes keep the plain (0, 0, ax, ay) box.
    if (font->getWMode() == 0)
    {
        bbox[1] += descent;
        bbox[3] += ascent;
    }

    tm_transform_bbox(final_tm, bbox);
    covered_text_handler.add_char_bbox(bbox);
}
} // namespace pdf2htmlEX

View File

@ -62,6 +62,9 @@ struct HTMLLineState
{
double x,y;
double transform_matrix[4];
// The page-scope char index (in drawing order) of the first char in this line.
int first_char_index = -1;
const std::vector<bool> * chars_covered = nullptr;
};
struct HTMLClipState

View File

@ -69,6 +69,32 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state)
last_state.font_size *= last_state.font_info->font_size_scale;
}
// Write uLen code points starting at u to out.
//
// When the line carries no covered-char flags (line_state.chars_covered is
// null) the text is written unchanged. Otherwise each code point is looked up
// in the flags at position first_char_index + (code points already dumped for
// this line + offset within this call); covered ones are wrapped in a
// transparent <span>, the others are written as they are.
//
// A position that falls outside the flags vector (negative, or past its end)
// is treated as not covered instead of indexing out of range.
void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen)
{
    if (!line_state.chars_covered)
    {
        writeUnicodes(out, u, uLen);
        return;
    }

    const std::vector<bool> & covered = *line_state.chars_covered;

    //TODO merge sibling invisible spans
    const int start = this->line_state.first_char_index + dumped_char_count;
    for(int i = 0; i < uLen; i++)
    {
        const int char_index = start + i;
        const bool is_covered = char_index >= 0
            && static_cast<size_t>(char_index) < covered.size()
            && covered[char_index];

        if (!is_covered) //visible
        {
            writeUnicodes(out, u + i, 1);
        }
        else
        {
            out << "<span style=\"color:transparent\">";
            writeUnicodes(out, u + i, 1);
            out << "</span>";
        }
    }
    dumped_char_count += uLen;
}
void HTMLTextLine::dump_text(ostream & out)
{
/*
@ -84,6 +110,8 @@ void HTMLTextLine::dump_text(ostream & out)
return;
}
dumped_char_count = 0;
// Start Output
{
// open <div> for the current text line
@ -216,7 +244,7 @@ void HTMLTextLine::dump_text(ostream & out)
size_t next_text_idx = text_idx2;
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
next_text_idx = cur_offset_iter->start_idx;
writeUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
cur_text_idx = next_text_idx;
}
}

View File

@ -91,6 +91,7 @@ public:
private:
void optimize_normal(std::vector<HTMLTextLine*> &);
void optimize_aggressive(std::vector<HTMLTextLine*> &);
void dump_chars(std::ostream & out, const Unicode * u, int uLen);
const Param & param;
AllStateManager & all_manager;
@ -103,6 +104,8 @@ private:
std::vector<State> states;
std::vector<Offset> offsets;
std::vector<Unicode> text;
int dumped_char_count;
};
} // namespace pdf2htmlEX

View File

@ -38,6 +38,7 @@ struct Param
int process_nontext;
int process_outline;
int process_annotation;
int process_covered_text;
int printing;
int fallback;
int tmp_file_size_limit;

View File

@ -187,6 +187,7 @@ void parse_options (int argc, char **argv)
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
.add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
.add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
.add("process-covered-text", &param.process_covered_text, 0, "try to detect texts covered by other graphics and properly arrange them")
// background image
.add("bg-format", &param.bg_format, "png", "specify background image format")

72
src/util/math.bk.cc Normal file
View File

@ -0,0 +1,72 @@
#include <cstring>
#include <limits>
#include <algorithm>
#include "math.h"
using std::min;
using std::max;
namespace pdf2htmlEX {
// Apply the transform matrix tm (six doubles) to the point (x, y) in place.
// The linear part uses tm[0..3]; the translation tm[4], tm[5] is added only
// when is_delta is false.
void tm_transform(const double * tm, double & x, double & y, bool is_delta)
{
    const double src_x = x;
    const double src_y = y;

    x = tm[0] * src_x + tm[2] * src_y;
    y = tm[1] * src_x + tm[3] * src_y;

    if(is_delta)
        return;

    x += tm[4];
    y += tm[5];
}
// Compose two transform matrices: tm_result = tm_left * tm_right, using the
// same six-double layout as tm_transform(), so that transforming a point with
// tm_result equals transforming it with tm_right first and tm_left second.
//
// tm_left, tm_right: input matrices, six doubles each.
// tm_result:         output matrix, six doubles. It does not need to be
//                    initialized and may alias either input.
//
// NOTE(review): the header shown with this change declares only a
// two-argument tm_multiply, and HTMLRenderer::add_char_bbox passes the output
// array as the first argument - confirm which parameter order is intended.
void tm_multiply(const double * tm_left, const double * tm_right, double * tm_result)
{
    // Compute into locals first so that an aliased output cannot overwrite
    // values that are still needed.
    const double r0 = tm_left[0] * tm_right[0] + tm_left[2] * tm_right[1];
    const double r1 = tm_left[1] * tm_right[0] + tm_left[3] * tm_right[1];
    const double r2 = tm_left[0] * tm_right[2] + tm_left[2] * tm_right[3];
    const double r3 = tm_left[1] * tm_right[2] + tm_left[3] * tm_right[3];
    // The translation of the product includes the left matrix's own translation.
    const double r4 = tm_left[0] * tm_right[4] + tm_left[2] * tm_right[5] + tm_left[4];
    const double r5 = tm_left[1] * tm_right[4] + tm_left[3] * tm_right[5] + tm_left[5];

    tm_result[0] = r0;
    tm_result[1] = r1;
    tm_result[2] = r2;
    tm_result[3] = r3;
    tm_result[4] = r4;
    tm_result[5] = r5;
}
// Replace bbox (x1, y1, x2, y2) in place with the axis-aligned bounding box
// of its four corners after they have been transformed by tm.
//
// tm:   transform matrix, six doubles, same layout as in tm_transform().
// bbox: four doubles; on return bbox[0], bbox[1] hold the minimum x, y and
//       bbox[2], bbox[3] hold the maximum x, y of the transformed corners.
void tm_transform_bbox(const double * tm, double * bbox)
{
    const double xs[2] = { bbox[0], bbox[2] };
    const double ys[2] = { bbox[1], bbox[3] };

    double min_x = std::numeric_limits<double>::max();
    double min_y = std::numeric_limits<double>::max();
    // lowest() is the most negative finite double. min() is the smallest
    // positive one and would be a wrong starting point for a maximum whenever
    // all transformed coordinates are zero or negative.
    double max_x = std::numeric_limits<double>::lowest();
    double max_y = std::numeric_limits<double>::lowest();

    for(int i = 0; i < 2; ++i)
    {
        for(int j = 0; j < 2; ++j)
        {
            // Same mapping as tm_transform() with is_delta == false.
            const double tx = tm[0] * xs[i] + tm[2] * ys[j] + tm[4];
            const double ty = tm[1] * xs[i] + tm[3] * ys[j] + tm[5];

            if(tx < min_x) min_x = tx;
            if(tx > max_x) max_x = tx;
            if(ty < min_y) min_y = ty;
            if(ty > max_y) max_y = ty;
        }
    }

    bbox[0] = min_x;
    bbox[1] = min_y;
    bbox[2] = max_x;
    bbox[3] = max_y;
}
// Tell whether two boxes, each given as four doubles (x0, y0, x1, y1), overlap.
// The corners may be given in either order. The comparisons are strict, so
// boxes that merely touch along an edge or at a corner do not intersect.
bool bbox_intersect(double * bbox1, double * bbox2)
{
    const double left1 = std::min(bbox1[0], bbox1[2]);
    const double right1 = std::max(bbox1[0], bbox1[2]);
    const double left2 = std::min(bbox2[0], bbox2[2]);
    const double right2 = std::max(bbox2[0], bbox2[2]);
    if (!(left1 < right2 && right1 > left2))
        return false;

    const double bottom1 = std::min(bbox1[1], bbox1[3]);
    const double top1 = std::max(bbox1[1], bbox1[3]);
    const double bottom2 = std::min(bbox2[1], bbox2[3]);
    const double top2 = std::max(bbox2[1], bbox2[3]);
    return bottom1 < top2 && top1 > bottom2;
}
} //namespace pdf2htmlEX

View File

@ -1,8 +1,12 @@
#include <cstring>
#include <limits>
#include <algorithm>
#include "math.h"
using std::min;
using std::max;
namespace pdf2htmlEX {
void tm_transform(const double * tm, double & x, double & y, bool is_delta)
@ -56,5 +60,13 @@ void tm_transform_bbox(const double * tm, double * bbox)
}
}
// Tell whether two boxes, each given as four doubles (x0, y0, x1, y1), overlap.
// The corners may be given in either order. The comparisons are strict, so
// boxes that merely touch along an edge or at a corner do not intersect.
bool bbox_intersect(double * bbox1, double * bbox2)
{
    // Horizontal extents must overlap...
    const bool overlap_x =
        std::min(bbox1[0], bbox1[2]) < std::max(bbox2[0], bbox2[2]) &&
        std::max(bbox1[0], bbox1[2]) > std::min(bbox2[0], bbox2[2]);
    if (!overlap_x)
        return false;

    // ...and so must the vertical extents.
    const bool overlap_y =
        std::min(bbox1[1], bbox1[3]) < std::max(bbox2[1], bbox2[3]) &&
        std::max(bbox1[1], bbox1[3]) > std::min(bbox2[1], bbox2[3]);
    return overlap_y;
}
} //namespace pdf2htmlEX

View File

@ -40,5 +40,7 @@ void tm_transform(const double * tm, double & x, double & y, bool is_delta = fal
void tm_multiply(double * tm_left, const double * tm_right);
void tm_transform_bbox(const double * tm, double * bbox);
bool bbox_intersect(double * bbox1, double * bbox2);
} //namespace pdf2htmlEX
#endif //MATH_H__