mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 04:50:09 +00:00
Merge pull request #365 from duanyao/covered_text_handling
Covered text handling
This commit is contained in:
commit
80b8e1f5de
@ -161,6 +161,10 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC}
|
||||
src/Base64Stream.cc
|
||||
src/Color.h
|
||||
src/Color.cc
|
||||
src/CoveredTextDetector.h
|
||||
src/CoveredTextDetector.cc
|
||||
src/DrawingTracer.h
|
||||
src/DrawingTracer.cc
|
||||
src/HTMLState.h
|
||||
src/HTMLTextLine.h
|
||||
src/HTMLTextLine.cc
|
||||
|
@ -242,6 +242,11 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above.
|
||||
.B \-\-optimize\-text <0|1> (Default: 0)
|
||||
If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong.
|
||||
|
||||
.TP
|
||||
.B \-\-correct\-text\-visibility <0|1> (Default: 0)
|
||||
If set to 1, pdf2htmlEX will try to detect text covered by other graphics and arrange it properly,
|
||||
i.e. covered text is made transparent in the text layer and is drawn on the background layer instead.
|
||||
|
||||
.SS Background Image
|
||||
|
||||
.TP
|
||||
|
@ -63,6 +63,13 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
|
||||
{
|
||||
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
|
||||
}
|
||||
// If a char is treated as image, it is not subject to cover test
|
||||
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
|
||||
else if (param.correct_text_visibility) {
|
||||
if (html_renderer->is_char_covered(drawn_char_count))
|
||||
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
|
||||
drawn_char_count++;
|
||||
}
|
||||
}
|
||||
|
||||
void CairoBackgroundRenderer::beginTextObject(GfxState *state)
|
||||
@ -104,6 +111,7 @@ static GBool annot_cb(Annot *, void * pflag) {
|
||||
|
||||
bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
|
||||
{
|
||||
drawn_char_count = 0;
|
||||
double page_width;
|
||||
double page_height;
|
||||
if(param.use_cropbox)
|
||||
|
@ -67,6 +67,7 @@ private:
|
||||
std::unordered_map<int, int> bitmaps_ref_count;
|
||||
// id of bitmaps' stream used by current page
|
||||
std::vector<int> bitmaps_in_current_page;
|
||||
int drawn_char_count;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -88,6 +88,13 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
|
||||
{
|
||||
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
|
||||
}
|
||||
// If a char is treated as image, it is not subject to cover test
|
||||
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
|
||||
else if (param.correct_text_visibility) {
|
||||
if (html_renderer->is_char_covered(drawn_char_count))
|
||||
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
|
||||
drawn_char_count++;
|
||||
}
|
||||
}
|
||||
|
||||
void SplashBackgroundRenderer::beginTextObject(GfxState *state)
|
||||
@ -129,6 +136,7 @@ static GBool annot_cb(Annot *, void * pflag) {
|
||||
|
||||
bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
|
||||
{
|
||||
drawn_char_count = 0;
|
||||
bool process_annotation = param.process_annotation;
|
||||
doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
|
||||
0,
|
||||
|
@ -71,6 +71,7 @@ protected:
|
||||
HTMLRenderer * html_renderer;
|
||||
const Param & param;
|
||||
std::string format;
|
||||
int drawn_char_count;
|
||||
};
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
51
src/CoveredTextDetector.cc
Normal file
51
src/CoveredTextDetector.cc
Normal file
@ -0,0 +1,51 @@
|
||||
/*
|
||||
* CoveredTextDetector.cc
|
||||
*
|
||||
* Created on: 2014-6-14
|
||||
* Author: duanyao
|
||||
*/
|
||||
|
||||
#include "CoveredTextDetector.h"
|
||||
|
||||
#include "util/math.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
void CoveredTextDetector::reset()
|
||||
{
|
||||
char_bboxes.clear();
|
||||
chars_covered.clear();
|
||||
}
|
||||
|
||||
void CoveredTextDetector::add_char_bbox(double * bbox)
|
||||
{
|
||||
char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4);
|
||||
chars_covered.push_back(false);
|
||||
}
|
||||
|
||||
void CoveredTextDetector::add_char_bbox_clipped(double * bbox, bool patially)
|
||||
{
|
||||
char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4);
|
||||
chars_covered.push_back(true);
|
||||
if (patially)
|
||||
add_non_char_bbox(bbox, chars_covered.size() - 1);
|
||||
}
|
||||
|
||||
void CoveredTextDetector::add_non_char_bbox(double * bbox, int index)
|
||||
{
|
||||
if (index < 0)
|
||||
index = chars_covered.size();
|
||||
for (int i = 0; i < index; i++)
|
||||
{
|
||||
if (chars_covered[i])
|
||||
continue;
|
||||
double * cbbox = &char_bboxes[i * 4];
|
||||
if (bbox_intersect(cbbox, bbox))
|
||||
{
|
||||
chars_covered[i] = true;
|
||||
add_non_char_bbox(cbbox, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
61
src/CoveredTextDetector.h
Normal file
61
src/CoveredTextDetector.h
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* CoveredTextDetector.h
|
||||
*
|
||||
* Created on: 2014-6-14
|
||||
* Author: duanyao
|
||||
*/
|
||||
|
||||
#ifndef COVEREDTEXTDETECTOR_H__
|
||||
#define COVEREDTEXTDETECTOR_H__
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
/**
 * Detect characters that are covered by non-char graphics on a page.
 */
class CoveredTextDetector
{
public:

    /**
     * Reset to initial state. Should be called when start drawing a page.
     */
    void reset();

    /**
     * Add a drawn character's bounding box.
     * The char is recorded as not covered.
     * @param bbox (x0, y0, x1, y1)
     */
    void add_char_bbox(double * bbox);

    /**
     * Add the bounding box of a character that was clipped.
     * The char is recorded as covered immediately.
     * @param bbox (x0, y0, x1, y1)
     * @param patially true if the char is only partially clipped; in that case its
     *        bbox is also passed to add_non_char_bbox() with the char's own index.
     */
    void add_char_bbox_clipped(double * bbox, bool patially);

    /**
     * Add a drawn non-char graphics' bounding box.
     * If it intersects any previously drawn char's bbox, the char is marked as covered
     * and treated as a non-char.
     * @param bbox (x0, y0, x1, y1)
     * @param index this graphics' drawing order: assume it is drawn after (index-1)th
     *     char. -1 means after the last char.
     */
    void add_non_char_bbox(double * bbox, int index = -1);

    /**
     * An array of flags indicating whether a char is covered by any non-char graphics.
     * Index by the order that these chars are added.
     * This vector grows as add_char_bbox() or add_char_bbox_clipped() is called,
     * so its size is the count of currently drawn chars.
     * Declared const so it can be queried through a const reference.
     */
    const std::vector<bool> & get_chars_covered() const { return chars_covered; }

private:
    // One flag per added char, in the order the chars were added.
    std::vector<bool> chars_covered;
    // x00, y00, x01, y01; x10, y10, x11, y11;...
    std::vector<double> char_bboxes;
};
|
||||
|
||||
}
|
||||
|
||||
#endif /* COVEREDTEXTDETECTOR_H__ */
|
360
src/DrawingTracer.cc
Normal file
360
src/DrawingTracer.cc
Normal file
@ -0,0 +1,360 @@
|
||||
/*
|
||||
* DrawingTracer.cc
|
||||
*
|
||||
* Created on: 2014-6-15
|
||||
* Author: duanyao
|
||||
*/
|
||||
|
||||
#include "GfxFont.h"
|
||||
|
||||
#include "util/math.h"
|
||||
#include "DrawingTracer.h"
|
||||
|
||||
//#define DT_DEBUG(x) (x)
|
||||
#define DT_DEBUG(x)
|
||||
|
||||
#if !ENABLE_SVG
|
||||
#warning "Cairo is disabled because ENABLE_SVG is off, --correct-text-visibility has limited functionality."
|
||||
#endif
|
||||
|
||||
namespace pdf2htmlEX
|
||||
{
|
||||
|
||||
// Keep a reference to the parameters; the cairo context (only present when
// ENABLE_SVG is on) starts out null and is created in reset().
DrawingTracer::DrawingTracer(const Param & param)
    : param(param)
#if ENABLE_SVG
    , cairo(nullptr)
#endif
{}
|
||||
|
||||
// Release whatever finish() releases.
DrawingTracer::~DrawingTracer()
{
    finish();
}
|
||||
|
||||
// Prepare for tracing a new page.
// Does nothing unless --correct-text-visibility is enabled. Otherwise any
// previous cairo context is released via finish() and, when cairo is
// available, a fresh context is created on a recording surface whose extents
// are (0, 0, page width, page height) as reported by `state`.
void DrawingTracer::reset(GfxState *state)
{
    if (!param.correct_text_visibility)
        return;
    finish();

#if ENABLE_SVG
    // Field-by-field assignment instead of the GNU-only "field: value" initializer.
    cairo_rectangle_t page_box;
    page_box.x = 0;
    page_box.y = 0;
    page_box.width = state->getPageWidth();
    page_box.height = state->getPageHeight();

    cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box);
    cairo = cairo_create(surface);
    // cairo_create() takes its own reference on the surface, so drop ours here;
    // otherwise the surface would outlive cairo_destroy() in finish() and leak.
    cairo_surface_destroy(surface);
#endif
}
|
||||
|
||||
// Destroy the cairo context, if one exists, and forget it.
// Without cairo (ENABLE_SVG off) there is nothing to do.
void DrawingTracer::finish()
{
#if ENABLE_SVG
    if (cairo == nullptr)
        return;

    cairo_destroy(cairo);
    cairo = nullptr;
#endif
}
|
||||
|
||||
// Poppler won't inform us its initial CTM, and the initial CTM is affected by zoom level.
|
||||
// OutputDev::clip() may be called before OutputDev::updateCTM(), so we can't rely on GfxState::getCTM(),
|
||||
// and should trace ctm changes ourself (via cairo).
|
||||
void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32)
|
||||
{
|
||||
if (!param.correct_text_visibility)
|
||||
return;
|
||||
|
||||
#if ENABLE_SVG
|
||||
cairo_matrix_t matrix;
|
||||
matrix.xx = m11;
|
||||
matrix.yx = m12;
|
||||
matrix.xy = m21;
|
||||
matrix.yy = m22;
|
||||
matrix.x0 = m31;
|
||||
matrix.y0 = m32;
|
||||
cairo_transform(cairo, &matrix);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Intersect the traced clip area with the current path of `state`,
// using the even-odd rule when `even_odd` is set and the winding rule otherwise.
void DrawingTracer::clip(GfxState * state, bool even_odd)
{
    if (!param.correct_text_visibility)
        return;
#if ENABLE_SVG
    const cairo_fill_rule_t rule = even_odd ? CAIRO_FILL_RULE_EVEN_ODD
                                            : CAIRO_FILL_RULE_WINDING;
    do_path(state, state->getPath());
    cairo_set_fill_rule(cairo, rule);
    cairo_clip(cairo);
#endif
}
|
||||
|
||||
// Clipping to a stroked path is not traced yet; this is currently a no-op.
void DrawingTracer::clip_to_stroke_path(GfxState * state)
{
    if (!param.correct_text_visibility)
        return;

    // TODO cairo_stroke_to_path() ?
}
|
||||
|
||||
void DrawingTracer::save()
|
||||
{
|
||||
if (!param.correct_text_visibility)
|
||||
return;
|
||||
#if ENABLE_SVG
|
||||
cairo_save(cairo);
|
||||
#endif
|
||||
}
|
||||
void DrawingTracer::restore()
|
||||
{
|
||||
if (!param.correct_text_visibility)
|
||||
return;
|
||||
#if ENABLE_SVG
|
||||
cairo_restore(cairo);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Rebuild `path` as the current path of the cairo context
// (adapted from CairoOutputDev::doPath). Does nothing without cairo.
void DrawingTracer::do_path(GfxState * state, GfxPath * path)
{
#if ENABLE_SVG
    cairo_new_path(cairo);

    for (int s = 0; s < path->getNumSubpaths(); ++s) {
        GfxSubpath * sp = path->getSubpath(s);
        if (sp->getNumPoints() <= 0)
            continue;

        cairo_move_to(cairo, sp->getX(0), sp->getY(0));

        for (int k = 1; k < sp->getNumPoints(); ) {
            if (sp->getCurve(k)) {
                // A curve consumes three points: two control points and the end point.
                cairo_curve_to(cairo,
                               sp->getX(k),     sp->getY(k),
                               sp->getX(k + 1), sp->getY(k + 1),
                               sp->getX(k + 2), sp->getY(k + 2));
                k += 3;
            } else {
                cairo_line_to(cairo, sp->getX(k), sp->getY(k));
                k += 1;
            }
        }

        if (sp->isClosed())
            cairo_close_path(cairo);
    }
#endif
}
|
||||
|
||||
// Trace a stroke of the current path of `state` for the covering test.
// The path is broken into single segments (one line or one curve each); the
// stroke extents of every segment are reported through draw_non_char_bbox().
// Does nothing without cairo or when --correct-text-visibility is off.
void DrawingTracer::stroke(GfxState * state)
{
#if ENABLE_SVG
    if (!param.correct_text_visibility)
        return;

    DT_DEBUG(printf("DrawingTracer::stroke\n"));

    cairo_set_line_width(cairo, state->getLineWidth());

    // GfxPath is broken into steps, each step makes up a cairo path and its bbox is used for covering test.
    // TODO
    // 1. path steps that are not vertical or horizontal lines may still falsely "cover" many chars,
    //    can we slice those steps further?
    // 2. if the line width is small, can we just ignore the path?
    // 3. line join feature can't be retained. We use line-cap-square to minimize the problem that
    //    some chars actually covered by a line join are missed. However chars covered by an acute angle
    //    with line-join-miter may be still recognized as not covered.
    cairo_set_line_cap(cairo, CAIRO_LINE_CAP_SQUARE);
    GfxPath * path = state->getPath();
    for (int i = 0; i < path->getNumSubpaths(); ++i) {
        GfxSubpath * subpath = path->getSubpath(i);
        int n = subpath->getNumPoints();
        // At least two points are needed to form a segment. With fewer, the loop
        // below would query point 1, which does not exist, and could report a
        // garbage box.
        if (n < 2)
            continue;
        double x = subpath->getX(0);
        double y = subpath->getY(0);
        //p: loop cursor; j: next point index
        int p = 1, j = 1;
        while (p <= n) {
            cairo_new_path(cairo);
            cairo_move_to(cairo, x, y);
            if (subpath->getCurve(j)) {
                x = subpath->getX(j+2);
                y = subpath->getY(j+2);
                cairo_curve_to(cairo,
                    subpath->getX(j), subpath->getY(j),
                    subpath->getX(j+1), subpath->getY(j+1),
                    x, y);
                p += 3;
            } else {
                x = subpath->getX(j);
                y = subpath->getY(j);
                cairo_line_to(cairo, x, y);
                ++p;
            }

            DT_DEBUG(printf("DrawingTracer::stroke:new box:\n"));
            double sbox[4];
            cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3);
            // Only boxes with non-zero width and height take part in the covering test.
            if (sbox[0] != sbox[2] && sbox[1] != sbox[3])
                draw_non_char_bbox(state, sbox);
            else
                DT_DEBUG(printf("DrawingTracer::stroke:zero box!\n"));

            if (p == n)
            {
                if (subpath->isClosed())
                    j = 0; // if sub path is closed, go back to starting point
                else
                    break;
            }
            else
                j = p;
        }
    }
#endif
}
|
||||
|
||||
// Trace a fill of the current path of `state`: the fill extents of the whole
// path are reported through draw_non_char_bbox(). `even_odd` is currently unused.
void DrawingTracer::fill(GfxState * state, bool even_odd)
{
    if (!param.correct_text_visibility)
        return;

#if ENABLE_SVG
    do_path(state, state->getPath());

    // cairo_fill_extents doesn't take the fill rule into account, so it is not set:
    //cairo_set_fill_rule (cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING);
    double x0, y0, x1, y1;
    cairo_fill_extents(cairo, &x0, &y0, &x1, &y1);

    double fill_box[4] = {x0, y0, x1, y1};
    draw_non_char_bbox(state, fill_box);
#endif
}
|
||||
|
||||
// Report a non-char graphics' bbox (given in user space) to on_non_char_drawn.
// With cairo, the bbox is first intersected with the clip extents (the result
// is written back into `bbox`) and nothing is reported when they do not
// intersect. The bbox is transformed by the CTM, in place, before the callback runs.
void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox)
{
#if ENABLE_SVG
    double clip_box[4];
    cairo_clip_extents(cairo, clip_box, clip_box + 1, clip_box + 2, clip_box + 3);
    const bool visible = bbox_intersect(clip_box, bbox, bbox);
#else
    const bool visible = true;
#endif

    if (visible)
    {
        transform_bbox_by_ctm(bbox, state);
        DT_DEBUG(printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3]));
        if (on_non_char_drawn)
            on_non_char_drawn(bbox);
    }
}
|
||||
|
||||
// Report a char's bbox (given in user space) to the matching callback.
// With cairo, the four bbox corners are tested against the clip area:
//   - no corner inside:   on_char_clipped(bbox, false)
//   - some corners inside: bbox is intersected with the clip extents, then
//                          on_char_clipped(bbox, true)
//   - all corners inside: on_char_drawn(bbox)
// Without cairo the char is always reported through on_char_drawn.
// In every case the bbox is transformed by the CTM, in place, before the callback runs.
void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox)
{
#if ENABLE_SVG
    // Note: even if 4 corners of the char are all in or all out of the clip area,
    // it could still be partially clipped.
    // TODO better solution?
    const double corners[4][2] = {
        {bbox[0], bbox[1]},
        {bbox[2], bbox[3]},
        {bbox[2], bbox[1]},
        {bbox[0], bbox[3]},
    };
    int inside = 0;
    for (const auto & corner : corners)
    {
        if (cairo_in_clip(cairo, corner[0], corner[1]))
            ++inside;
    }

    const bool fully_outside = (inside == 0);
    const bool fully_inside = (inside == 4);

    if (!fully_outside && !fully_inside)
    {
        // Partially clipped: keep only the part within the clip extents.
        double clip_box[4];
        cairo_clip_extents(cairo, clip_box, clip_box + 1, clip_box + 2, clip_box + 3);
        bbox_intersect(clip_box, bbox, bbox);
    }

    transform_bbox_by_ctm(bbox);

    if (fully_inside)
    {
        if (on_char_drawn)
            on_char_drawn(bbox);
    }
    else if (on_char_clipped)
    {
        on_char_clipped(bbox, !fully_outside);
    }
#else
    transform_bbox_by_ctm(bbox, state);
    if (on_char_drawn)
        on_char_drawn(bbox);
#endif
    DT_DEBUG(printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3]));
}
|
||||
|
||||
// Trace an image as a non-char graphics covering the box (0, 0)-(1, 1)
// in user space.
void DrawingTracer::draw_image(GfxState *state)
{
    if (!param.correct_text_visibility)
        return;

    double unit_box[4] = {0, 0, 1, 1};
    draw_non_char_bbox(state, unit_box);
}
|
||||
|
||||
// Trace a char that is being drawn.
//   x, y:   displacement added to the current text position
//   ax, ay: glyph advance, used as the far corner of the glyph box
// The glyph box is built from the advance and, for horizontal writing mode,
// the font's descent/ascent; it is mapped through a per-char matrix and the
// text matrix, then handed to draw_char_bbox().
void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, double ay)
{
    if (!param.correct_text_visibility)
        return;

    Matrix text_mat, inv_text_mat;
    memcpy(text_mat.m, state->getTextMat(), sizeof(text_mat.m));

    const double cur_x = state->getCurX();
    const double cur_y = state->getCurY();
    const double font_size = state->getFontSize();
    const double rise = state->getRise();
    const double horiz_scaling = state->getHorizScaling();

    // The current position has already been transformed by the text matrix;
    // undo that to get it back in text space.
    text_mat.invertTo(&inv_text_mat);
    double char_cx, char_cy;
    inv_text_mat.transform(cur_x, cur_y, &char_cx, &char_cy);

    //TODO Vertical? Currently vertical/type3 chars are treated as non-chars.
    double char_m[6] = {
        font_size * horiz_scaling, 0,
        0, font_size,
        char_cx + x, char_cy + y + rise
    };

    double final_m[6];
    tm_multiply(final_m, text_mat.m, char_m);

    auto font = state->getFont();
    double glyph_box[4] = {0, 0, ax, ay};
    const double descent = font->getDescent();
    const double ascent = font->getAscent();
    if (font->getWMode() == 0)
    {
        // Horizontal writing: extend the box from descent to ascent.
        glyph_box[1] += descent;
        glyph_box[3] += ascent;
    }
    //TODO Vertical? (other writing modes leave the box as is)

    tm_transform_bbox(final_m, glyph_box);
    draw_char_bbox(state, glyph_box);
}
|
||||
|
||||
|
||||
void DrawingTracer::transform_bbox_by_ctm(double * bbox, GfxState * state)
|
||||
{
|
||||
#if ENABLE_SVG
|
||||
cairo_matrix_t mat;
|
||||
cairo_get_matrix(cairo, &mat);
|
||||
double mat_a[6] {mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0};
|
||||
tm_transform_bbox(mat_a, bbox);
|
||||
#else
|
||||
tm_transform_bbox(state->getCTM(), bbox);
|
||||
#endif
|
||||
}
|
||||
|
||||
} /* namespace pdf2htmlEX */
|
79
src/DrawingTracer.h
Normal file
79
src/DrawingTracer.h
Normal file
@ -0,0 +1,79 @@
|
||||
/*
|
||||
* DrawingTracer.h
|
||||
*
|
||||
* Created on: 2014-6-15
|
||||
* Author: duanyao
|
||||
*/
|
||||
|
||||
#ifndef DRAWINGTRACER_H__
|
||||
#define DRAWINGTRACER_H__
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include <GfxState.h>
|
||||
|
||||
#include "pdf2htmlEX-config.h"
|
||||
|
||||
#if ENABLE_SVG
|
||||
#include <cairo.h>
|
||||
#endif
|
||||
|
||||
#include "Param.h"
|
||||
|
||||
namespace pdf2htmlEX
|
||||
{
|
||||
|
||||
class DrawingTracer
|
||||
{
|
||||
public:
|
||||
/*
|
||||
* The callback to receive drawn event.
|
||||
* bbox in device space.
|
||||
*/
|
||||
// a non-char graphics is drawn
|
||||
std::function<void(double * bbox)> on_non_char_drawn;
|
||||
// a char is drawn in the clip area
|
||||
std::function<void(double * bbox)> on_char_drawn;
|
||||
// a char is drawn out of/partially in the clip area
|
||||
std::function<void(double * bbox, bool patially)> on_char_clipped;
|
||||
|
||||
DrawingTracer(const Param & param);
|
||||
virtual ~DrawingTracer();
|
||||
void reset(GfxState * state);
|
||||
|
||||
/*
|
||||
* A character is drawing
|
||||
* x, y: glyph-drawing position, in PDF text object space.
|
||||
* ax, ay: glyph advance, in glyph space.
|
||||
*/
|
||||
void draw_char(GfxState * state, double x, double y, double ax, double ay);
|
||||
/*
|
||||
* An image is drawing
|
||||
*/
|
||||
void draw_image(GfxState * state);
|
||||
void update_ctm(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
|
||||
void clip(GfxState * state, bool even_odd = false);
|
||||
void clip_to_stroke_path(GfxState * state);
|
||||
void fill(GfxState * state, bool even_odd = false);
|
||||
void stroke(GfxState * state);
|
||||
void save();
|
||||
void restore();
|
||||
|
||||
private:
|
||||
void finish();
|
||||
// Following methods operate in user space (just before CTM is applied)
|
||||
void do_path(GfxState * state, GfxPath * path);
|
||||
void draw_non_char_bbox(GfxState * state, double * bbox);
|
||||
void draw_char_bbox(GfxState * state, double * bbox);
|
||||
// If cairo is available, parameter state is ignored
|
||||
void transform_bbox_by_ctm(double * bbox, GfxState * state = nullptr);
|
||||
|
||||
const Param & param;
|
||||
|
||||
#if ENABLE_SVG
|
||||
cairo_t * cairo;
|
||||
#endif
|
||||
};
|
||||
|
||||
} /* namespace pdf2htmlEX */
|
||||
#endif /* DRAWINGTRACER_H__ */
|
@ -31,10 +31,13 @@
|
||||
#include "HTMLTextPage.h"
|
||||
|
||||
#include "BackgroundRenderer/BackgroundRenderer.h"
|
||||
#include "CoveredTextDetector.h"
|
||||
#include "DrawingTracer.h"
|
||||
|
||||
#include "util/const.h"
|
||||
#include "util/misc.h"
|
||||
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
class HTMLRenderer : public OutputDev
|
||||
@ -89,7 +92,9 @@ public:
|
||||
* We just mark as changed, and recheck if they have been changed when we are about to output a new string
|
||||
*/
|
||||
|
||||
virtual void restoreState(GfxState * state) { updateAll(state); }
|
||||
virtual void restoreState(GfxState * state);
|
||||
|
||||
virtual void saveState(GfxState *state);
|
||||
|
||||
virtual void updateAll(GfxState * state);
|
||||
|
||||
@ -125,15 +130,34 @@ public:
|
||||
|
||||
virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
|
||||
|
||||
virtual void stroke(GfxState *state) { css_do_path(state, false); }
|
||||
virtual void fill(GfxState *state) { css_do_path(state, true); }
|
||||
virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
|
||||
int width, int height,
|
||||
GfxImageColorMap *colorMap,
|
||||
GBool interpolate,
|
||||
Stream *maskStr,
|
||||
int maskWidth, int maskHeight,
|
||||
GfxImageColorMap *maskColorMap,
|
||||
GBool maskInterpolate);
|
||||
|
||||
virtual void stroke(GfxState *state); ////{ css_do_path(state, false); }
|
||||
virtual void fill(GfxState *state); ////{ css_do_path(state, true); }
|
||||
virtual void eoFill(GfxState *state);
|
||||
virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax);
|
||||
|
||||
virtual void processLink(AnnotLink * al);
|
||||
|
||||
/* capacity test */
|
||||
bool can_stroke(GfxState *state) { return css_do_path(state, false, true); }
|
||||
bool can_fill(GfxState *state) { return css_do_path(state, true, true); }
|
||||
bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); }
|
||||
bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); }
|
||||
|
||||
/*
|
||||
* Covered text handling.
|
||||
*/
|
||||
// Is a char (actually a glyph) covered by non-char's. Index in drawing order in current page.
|
||||
// Does not fail on out-of-bound conditions, but return false.
|
||||
bool is_char_covered(int index);
|
||||
// Currently drawn char (glyph) count in current page.
|
||||
int get_char_count() { return (int)covered_text_detecor.get_chars_covered().size(); }
|
||||
|
||||
protected:
|
||||
////////////////////////////////////////////////////
|
||||
@ -195,6 +219,7 @@ protected:
|
||||
// make sure the current HTML style consistent with PDF
|
||||
void prepare_text_line(GfxState * state);
|
||||
|
||||
#if 0 //disable CSS drawing
|
||||
////////////////////////////////////////////////////
|
||||
// CSS drawing
|
||||
////////////////////////////////////////////////////
|
||||
@ -214,6 +239,7 @@ protected:
|
||||
double * line_width_array, int line_width_count,
|
||||
const GfxRGB * line_color, const GfxRGB * fill_color,
|
||||
void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr );
|
||||
#endif //disable CSS drawing
|
||||
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
@ -328,7 +354,6 @@ protected:
|
||||
#endif
|
||||
BackgroundRenderer * bg_renderer;
|
||||
BackgroundRenderer * fallback_bg_renderer;
|
||||
bool fallback_bg_required;
|
||||
|
||||
struct {
|
||||
std::ofstream fs;
|
||||
@ -338,6 +363,9 @@ protected:
|
||||
std::string cur_page_filename;
|
||||
|
||||
static const std::string MANIFEST_FILENAME;
|
||||
|
||||
CoveredTextDetector covered_text_detecor;
|
||||
DrawingTracer tracer;
|
||||
};
|
||||
|
||||
} //namespace pdf2htmlEX
|
||||
|
@ -30,6 +30,39 @@ using std::sqrt;
|
||||
using std::vector;
|
||||
using std::ostream;
|
||||
|
||||
// On a graphics-state restore: mark everything as changed via updateAll(),
// then let the tracer pop its own saved state.
void HTMLRenderer::restoreState(GfxState * state)
{
    updateAll(state);

    tracer.restore();
}
|
||||
|
||||
// On a graphics-state save: let the tracer push its state. `state` is unused.
void HTMLRenderer::saveState(GfxState *state)
{
    tracer.save();
}
|
||||
|
||||
// Strokes are only traced for the covered-text test.
void HTMLRenderer::stroke(GfxState * state)
{
    tracer.stroke(state);
}
|
||||
|
||||
// Fills are only traced for the covered-text test (default fill rule).
void HTMLRenderer::fill(GfxState * state)
{
    tracer.fill(state);
}
|
||||
|
||||
// Even-odd fills are only traced for the covered-text test.
void HTMLRenderer::eoFill(GfxState * state)
{
    const bool even_odd = true;
    tracer.fill(state, even_odd);
}
|
||||
|
||||
// Trace an axial shading as a plain fill of the current path and always
// return true. `shading`, `tMin` and `tMax` are not used.
GBool HTMLRenderer::axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax)
{
    //TODO correct?
    tracer.fill(state);

    return true;
}
|
||||
|
||||
#if 0 //disable css drawing
|
||||
static bool is_horizontal_line(GfxSubpath * path)
|
||||
{
|
||||
return ((path->getNumPoints() == 2)
|
||||
@ -415,6 +448,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
|
||||
|
||||
(*f_curpage) << "\"></div>";
|
||||
}
|
||||
#endif //disable css drawing
|
||||
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
|
||||
#include <GlobalParams.h>
|
||||
|
||||
@ -46,6 +47,7 @@ HTMLRenderer::HTMLRenderer(const Param & param)
|
||||
,html_text_page(param, all_manager)
|
||||
,preprocessor(param)
|
||||
,tmp_files(param)
|
||||
,tracer(param)
|
||||
{
|
||||
if(!(param.debug))
|
||||
{
|
||||
@ -76,6 +78,13 @@ HTMLRenderer::HTMLRenderer(const Param & param)
|
||||
all_manager.height .set_eps(EPS);
|
||||
all_manager.width .set_eps(EPS);
|
||||
all_manager.bottom .set_eps(EPS);
|
||||
|
||||
tracer.on_char_drawn =
|
||||
[this](double * box) { covered_text_detecor.add_char_bbox(box); };
|
||||
tracer.on_char_clipped =
|
||||
[this](double * box, bool partial) { covered_text_detecor.add_char_bbox_clipped(box, partial); };
|
||||
tracer.on_non_char_drawn =
|
||||
[this](double * box) { covered_text_detecor.add_non_char_bbox(box); };
|
||||
}
|
||||
|
||||
HTMLRenderer::~HTMLRenderer()
|
||||
@ -133,13 +142,6 @@ void HTMLRenderer::process(PDFDoc *doc)
|
||||
cur_page_filename = filled_template_filename;
|
||||
}
|
||||
|
||||
if(param.process_nontext)
|
||||
{
|
||||
fallback_bg_required = !bg_renderer->render_page(doc, i);
|
||||
if (fallback_bg_required && fallback_bg_renderer != nullptr)
|
||||
fallback_bg_renderer->render_page(doc, i);
|
||||
}
|
||||
|
||||
doc->displayPage(this, i,
|
||||
text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
|
||||
0,
|
||||
@ -190,15 +192,20 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
||||
#endif
|
||||
{
|
||||
covered_text_detecor.reset();
|
||||
tracer.reset(state);
|
||||
|
||||
this->pageNum = pageNum;
|
||||
|
||||
double pageWidth = state->getPageWidth();
|
||||
double pageHeight = state->getPageHeight();
|
||||
html_text_page.set_page_size(state->getPageWidth(), state->getPageHeight());
|
||||
|
||||
html_text_page.set_page_size(pageWidth, pageHeight);
|
||||
reset_state();
|
||||
}
|
||||
|
||||
void HTMLRenderer::endPage() {
|
||||
long long wid = all_manager.width.install(html_text_page.get_width());
|
||||
long long hid = all_manager.height.install(html_text_page.get_height());
|
||||
|
||||
long long wid = all_manager.width.install(pageWidth);
|
||||
long long hid = all_manager.height.install(pageHeight);
|
||||
(*f_curpage)
|
||||
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
|
||||
<< "\" class=\"" << CSS::PAGE_FRAME_CN
|
||||
@ -231,16 +238,15 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
||||
|
||||
if(param.process_nontext)
|
||||
{
|
||||
if (!fallback_bg_required)
|
||||
if (bg_renderer->render_page(cur_doc, pageNum))
|
||||
bg_renderer->embed_image(pageNum);
|
||||
else if (fallback_bg_renderer != nullptr)
|
||||
{
|
||||
if (fallback_bg_renderer->render_page(cur_doc, pageNum))
|
||||
fallback_bg_renderer->embed_image(pageNum);
|
||||
}
|
||||
|
||||
reset_state();
|
||||
}
|
||||
|
||||
void HTMLRenderer::endPage() {
|
||||
// dump all text
|
||||
html_text_page.dump_text(*f_curpage);
|
||||
html_text_page.dump_css(f_css.fs);
|
||||
|
@ -14,6 +14,8 @@ namespace pdf2htmlEX {
|
||||
|
||||
void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
|
||||
{
|
||||
tracer.draw_image(state);
|
||||
|
||||
return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
|
||||
|
||||
#if 0
|
||||
@ -62,4 +64,20 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w
|
||||
#endif
|
||||
}
|
||||
|
||||
// Trace a soft-masked image for the covered-text test, then forward the call
// unchanged to the OutputDev base implementation.
void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
                   int width, int height,
                   GfxImageColorMap *colorMap,
                   GBool interpolate,
                   Stream *maskStr,
                   int maskWidth, int maskHeight,
                   GfxImageColorMap *maskColorMap,
                   GBool maskInterpolate)
{
    tracer.draw_image(state);

    // TODO really required?
    OutputDev::drawSoftMaskedImage(state, ref, str,
                                   width, height, colorMap, interpolate,
                                   maskStr, maskWidth, maskHeight,
                                   maskColorMap, maskInterpolate);
}
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -46,6 +46,7 @@ void HTMLRenderer::updateFont(GfxState * state)
|
||||
// Flag the CTM as changed and pass the change on to the tracer.
void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
{
    ctm_changed = true;

    tracer.update_ctm(state, m11, m12, m21, m22, m31, m32);
}
|
||||
void HTMLRenderer::updateTextMat(GfxState * state)
|
||||
{
|
||||
@ -89,14 +90,17 @@ void HTMLRenderer::updateStrokeColor(GfxState * state)
|
||||
// Flag the clip as changed and let the tracer apply the clip (default fill rule).
void HTMLRenderer::clip(GfxState * state)
{
    clip_changed = true;

    tracer.clip(state);
}
|
||||
// Flag the clip as changed and let the tracer apply the clip with the even-odd rule.
void HTMLRenderer::eoClip(GfxState * state)
{
    clip_changed = true;

    const bool even_odd = true;
    tracer.clip(state, even_odd);
}
|
||||
// Flag the clip as changed and notify the tracer of the clip-to-stroke-path.
void HTMLRenderer::clipToStrokePath(GfxState * state)
{
    clip_changed = true;

    tracer.clip_to_stroke_path(state);
}
|
||||
void HTMLRenderer::reset_state()
|
||||
{
|
||||
@ -119,6 +123,8 @@ void HTMLRenderer::reset_state()
|
||||
cur_line_state.y = 0;
|
||||
memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
|
||||
|
||||
cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);};
|
||||
|
||||
cur_clip_state.xmin = 0;
|
||||
cur_clip_state.xmax = 0;
|
||||
cur_clip_state.ymin = 0;
|
||||
@ -502,6 +508,10 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
|
||||
double rise_x, rise_y;
|
||||
state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y);
|
||||
state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y);
|
||||
|
||||
if (param.correct_text_visibility)
|
||||
cur_line_state.first_char_index = get_char_count();
|
||||
|
||||
html_text_page.open_new_line(cur_line_state);
|
||||
|
||||
cur_text_state.vertical_align = 0;
|
||||
|
@ -14,6 +14,9 @@
|
||||
#include "util/namespace.h"
|
||||
#include "util/unicode.h"
|
||||
|
||||
//#define HR_DEBUG(x) (x)
|
||||
#define HR_DEBUG(x)
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::all_of;
|
||||
@ -51,26 +54,35 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
char *p = s->getCString();
|
||||
int len = s->getLength();
|
||||
|
||||
//accumulated displacement of chars in this string, in text object space
|
||||
double dx = 0;
|
||||
double dy = 0;
|
||||
double dx1,dy1;
|
||||
//displacement of current char, in text object space, including letter space but not word space.
|
||||
double ddx, ddy;
|
||||
//advance of current char, in glyph space
|
||||
double ax, ay;
|
||||
//origin of current char, in glyph space
|
||||
double ox, oy;
|
||||
|
||||
int nChars = 0;
|
||||
int nSpaces = 0;
|
||||
int uLen;
|
||||
|
||||
CharCode code;
|
||||
Unicode *u = nullptr;
|
||||
|
||||
HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", len));
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy);
|
||||
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
|
||||
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
|
||||
|
||||
if(!(equal(ox, 0) && equal(oy, 0)))
|
||||
{
|
||||
cerr << "TODO: non-zero origins" << endl;
|
||||
}
|
||||
ddx = ax * cur_font_size + cur_letter_space;
|
||||
ddy = ay * cur_font_size;
|
||||
tracer.draw_char(state, dx, dy, ax, ay);
|
||||
|
||||
bool is_space = false;
|
||||
if (n == 1 && *p == ' ')
|
||||
@ -85,19 +97,19 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
* There are always ugly PDF files with no useful info at all.
|
||||
*/
|
||||
is_space = true;
|
||||
++nSpaces;
|
||||
}
|
||||
|
||||
if(is_space && (param.space_as_offset))
|
||||
{
|
||||
html_text_page.get_cur_line()->append_padding_char();
|
||||
// ignore horiz_scaling, as it has been merged into CTM
|
||||
html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
|
||||
html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
|
||||
}
|
||||
else
|
||||
{
|
||||
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
|
||||
{
|
||||
html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space));
|
||||
html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -110,7 +122,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
{
|
||||
uu = unicode_from_font(code, font);
|
||||
}
|
||||
html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space));
|
||||
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
|
||||
/*
|
||||
* In PDF, word_space is appended if (n == 1 and *p = ' ')
|
||||
* but in HTML, word_space is appended if (uu == ' ')
|
||||
@ -123,19 +135,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
}
|
||||
}
|
||||
|
||||
dx += dx1;
|
||||
dy += dy1;
|
||||
dx += ddx * cur_horiz_scaling;
|
||||
dy += ddy;
|
||||
if (is_space)
|
||||
dx += cur_word_space * cur_horiz_scaling;
|
||||
|
||||
++nChars;
|
||||
p += n;
|
||||
len -= n;
|
||||
}
|
||||
|
||||
// horiz_scaling is merged into ctm now,
|
||||
// so the coordinate system is ugly
|
||||
dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling;
|
||||
dy *= cur_font_size;
|
||||
|
||||
cur_tx += dx;
|
||||
cur_ty += dy;
|
||||
|
||||
@ -143,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
draw_ty += dy;
|
||||
}
|
||||
|
||||
bool HTMLRenderer::is_char_covered(int index)
|
||||
{
|
||||
auto covered = covered_text_detecor.get_chars_covered();
|
||||
if (index < 0 || index >= (int)covered.size())
|
||||
{
|
||||
std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
|
||||
<< index << ", size: " << covered.size() <<endl;
|
||||
return false;
|
||||
}
|
||||
return covered[index];
|
||||
}
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -5,6 +5,8 @@
|
||||
#ifndef HTMLSTATE_H__
|
||||
#define HTMLSTATE_H__
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "Color.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
@ -62,6 +64,12 @@ struct HTMLLineState
|
||||
{
|
||||
double x,y;
|
||||
double transform_matrix[4];
|
||||
// The page-scope char index (in drawing order) of the first char in this line.
|
||||
int first_char_index;
|
||||
// A function to determine whether a char is covered at a given index.
|
||||
std::function<bool(int)> is_char_covered;
|
||||
|
||||
HTMLLineState(): first_char_index(-1) { }
|
||||
};
|
||||
|
||||
struct HTMLClipState
|
||||
|
@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
|
||||
|
||||
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
|
||||
{
|
||||
text.insert(text.end(), u, u+l);
|
||||
if (l == 1)
|
||||
text.push_back(min(u[0], (unsigned)INT_MAX));
|
||||
else if (l > 1)
|
||||
{
|
||||
text.push_back(- decomposed_text.size() - 1);
|
||||
decomposed_text.emplace_back();
|
||||
decomposed_text.back().assign(u, u + l);
|
||||
}
|
||||
this->width += width;
|
||||
}
|
||||
|
||||
@ -69,6 +76,60 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state)
|
||||
last_state.font_size *= last_state.font_info->font_size_scale;
|
||||
}
|
||||
|
||||
void HTMLTextLine::dump_char(std::ostream & out, int pos)
|
||||
{
|
||||
int c = text[pos];
|
||||
if (c > 0)
|
||||
{
|
||||
Unicode u = c;
|
||||
writeUnicodes(out, &u, 1);
|
||||
}
|
||||
else if (c < 0)
|
||||
{
|
||||
auto dt = decomposed_text[- c - 1];
|
||||
writeUnicodes(out, &dt.front(), dt.size());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Write len glyphs of 'text', starting at index begin, to out.
 *
 * When the line has no first_char_index (< 0) the glyphs are written as-is.
 * Otherwise each run of consecutive glyphs reported as covered by
 * line_state.is_char_covered is wrapped in a <span> carrying the CSS classes
 * of a transparent fill color and a transparent stroke color.
 */
void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
    static const Color transparent(0, 0, 0, true);

    const int end = begin + len;

    // No page-scope index recorded for this line: nothing to look up.
    if (line_state.first_char_index < 0)
    {
        for (int pos = begin; pos < end; ++pos)
            dump_char(out, pos);
        return;
    }

    bool span_open = false;
    for (int pos = begin; pos < end; ++pos)
    {
        const bool covered = line_state.is_char_covered(line_state.first_char_index + pos);

        // Open or close the transparent span only where the covered state flips.
        if (covered != span_open)
        {
            if (covered)
            {
                out << "<span class=\"" << all_manager.fill_color.get_css_class_name()
                    << all_manager.fill_color.install(transparent) << " " << all_manager.stroke_color.get_css_class_name()
                    << all_manager.stroke_color.install(transparent) << "\">";
            }
            else
            {
                out << "</span>";
            }
            span_open = covered;
        }

        dump_char(out, pos);
    }

    // Close a span left open by a trailing covered run.
    if (span_open)
        out << "</span>";
}
|
||||
|
||||
void HTMLTextLine::dump_text(ostream & out)
|
||||
{
|
||||
/*
|
||||
@ -216,7 +277,7 @@ void HTMLTextLine::dump_text(ostream & out)
|
||||
size_t next_text_idx = text_idx2;
|
||||
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
|
||||
next_text_idx = cur_offset_iter->start_idx;
|
||||
writeUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
|
||||
dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
|
||||
cur_text_idx = next_text_idx;
|
||||
}
|
||||
}
|
||||
|
@ -73,7 +73,16 @@ public:
|
||||
double width;
|
||||
};
|
||||
|
||||
/**
|
||||
* Append a drawn char (glyph)'s unicode. l > 1 means this glyph corresponds to
|
||||
* multiple code points.
|
||||
*/
|
||||
void append_unicodes(const Unicode * u, int l, double width);
|
||||
/**
|
||||
* Append a special padding char with 0 width, in order to keep char index consistent.
|
||||
* The padding char is ignored during output.
|
||||
*/
|
||||
void append_padding_char() { text.push_back(0); }
|
||||
void append_offset(double width);
|
||||
void append_state(const HTMLTextState & text_state);
|
||||
void dump_text(std::ostream & out);
|
||||
@ -92,6 +101,13 @@ private:
|
||||
void optimize_normal(std::vector<HTMLTextLine*> &);
|
||||
void optimize_aggressive(std::vector<HTMLTextLine*> &);
|
||||
|
||||
/**
|
||||
* Dump chars' unicode to output stream.
|
||||
* begin/pos is the index in 'text'.
|
||||
*/
|
||||
void dump_chars(std::ostream & out, int begin, int len);
|
||||
void dump_char(std::ostream & out, int pos);
|
||||
|
||||
const Param & param;
|
||||
AllStateManager & all_manager;
|
||||
|
||||
@ -102,7 +118,16 @@ private:
|
||||
|
||||
std::vector<State> states;
|
||||
std::vector<Offset> offsets;
|
||||
std::vector<Unicode> text;
|
||||
|
||||
/**
|
||||
* Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
|
||||
* - If c > 0, it is the unicode code point corresponding to the glyph;
|
||||
* - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
|
||||
* - If c < 0, this glyph corresponds to more than one unicode code point,
|
||||
* which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
|
||||
*/
|
||||
std::vector<int> text;
|
||||
std::vector<std::vector<Unicode> > decomposed_text;
|
||||
};
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
@ -39,6 +39,9 @@ public:
|
||||
void set_page_size(double width, double height);
|
||||
void clip(const HTMLClipState & clip_state);
|
||||
|
||||
double get_width() { return page_width; }
|
||||
double get_height() { return page_height; }
|
||||
|
||||
private:
|
||||
void optimize(void);
|
||||
|
||||
|
@ -38,6 +38,7 @@ struct Param
|
||||
int process_nontext;
|
||||
int process_outline;
|
||||
int process_annotation;
|
||||
int correct_text_visibility;
|
||||
int printing;
|
||||
int fallback;
|
||||
int tmp_file_size_limit;
|
||||
|
@ -187,6 +187,7 @@ void parse_options (int argc, char **argv)
|
||||
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
|
||||
.add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
|
||||
.add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
|
||||
.add("correct-text-visibility", &param.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them")
|
||||
|
||||
// background image
|
||||
.add("bg-format", &param.bg_format, "png", "specify background image format")
|
||||
|
@ -1,8 +1,12 @@
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
|
||||
#include "math.h"
|
||||
|
||||
using std::min;
|
||||
using std::max;
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
void tm_transform(const double * tm, double & x, double & y, bool is_delta)
|
||||
@ -56,5 +60,31 @@ void tm_transform_bbox(const double * tm, double * bbox)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Intersect two axis-aligned boxes given as (x0, y0, x1, y1); the two corners
 * of a box may be given in either order.
 *
 * Returns false, leaving result untouched, when the overlap is empty or has
 * zero width or height. Otherwise returns true and, if result is non-null,
 * stores the overlap in it as (xmin, ymin, xmax, ymax). All values are
 * computed before result is written, so result may alias bbox1 or bbox2.
 */
bool bbox_intersect(const double * bbox1, const double * bbox2, double * result)
{
    // Horizontal overlap: the larger of the left edges to the smaller of the right edges.
    const double left  = std::max(std::min(bbox1[0], bbox1[2]), std::min(bbox2[0], bbox2[2]));
    const double right = std::min(std::max(bbox1[0], bbox1[2]), std::max(bbox2[0], bbox2[2]));
    if (left >= right)
        return false;

    // Vertical overlap, computed the same way.
    const double low  = std::max(std::min(bbox1[1], bbox1[3]), std::min(bbox2[1], bbox2[3]));
    const double high = std::min(std::max(bbox1[1], bbox1[3]), std::max(bbox2[1], bbox2[3]));
    if (low >= high)
        return false;

    if (result != nullptr)
    {
        result[0] = left;
        result[1] = low;
        result[2] = right;
        result[3] = high;
    }
    return true;
}
|
||||
|
||||
} //namespace pdf2htmlEX
|
||||
|
||||
|
@ -24,6 +24,13 @@ static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Fill the 6-element matrix tm with (1, 0, 0, 1, 0, 0).
static inline void tm_init(double * tm)
{
    for (int i = 0; i < 6; ++i)
        tm[i] = (i == 0 || i == 3) ? 1 : 0;
}
|
||||
|
||||
static inline void tm_multiply(double * result, const double * m1, const double * m2)
|
||||
{
|
||||
result[0] = m1[0] * m2[0] + m1[2] * m2[1];
|
||||
@ -39,6 +46,14 @@ static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); }
|
||||
void tm_transform(const double * tm, double & x, double & y, bool is_delta = false);
|
||||
void tm_multiply(double * tm_left, const double * tm_right);
|
||||
void tm_transform_bbox(const double * tm, double * bbox);
|
||||
/**
|
||||
* Calculate the intersection of 2 boxes.
|
||||
* If they are intersecting, store the result to result (if not null) and return true.
|
||||
* Otherwise return false, and result is not touched.
|
||||
* Param result can be same as one of bbox1 and bbox2.
|
||||
* Data in boxes are expected in the order of (x0, y0, x1, y1).
|
||||
*/
|
||||
bool bbox_intersect(const double * bbox1, const double * bbox2, double * result = nullptr);
|
||||
|
||||
} //namespace pdf2htmlEX
|
||||
#endif //MATH_H__
|
||||
|
Loading…
Reference in New Issue
Block a user