1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-03 00:35:40 +00:00

Merge pull request #365 from duanyao/covered_text_handling

Covered text handling
This commit is contained in:
Lu Wang 2014-07-13 17:41:31 -07:00
commit 80b8e1f5de
24 changed files with 881 additions and 43 deletions

View File

@ -161,6 +161,10 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC}
src/Base64Stream.cc
src/Color.h
src/Color.cc
src/CoveredTextDetector.h
src/CoveredTextDetector.cc
src/DrawingTracer.h
src/DrawingTracer.cc
src/HTMLState.h
src/HTMLTextLine.h
src/HTMLTextLine.cc

View File

@ -242,6 +242,11 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above.
.B \-\-optimize\-text <0|1> (Default: 0)
If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong.
.TP
.B \-\-correct\-text\-visibility <0|1> (Default: 0)
If set to 1, pdf2htmlEX will try to detect text that is covered by other graphics and arrange it properly,
i.e. covered text is made transparent in the text layer and is drawn on the background layer instead.
.SS Background Image
.TP

View File

@ -63,6 +63,13 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
{
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
}
// If a char is treated as image, it is not subject to cover test
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
else if (param.correct_text_visibility) {
if (html_renderer->is_char_covered(drawn_char_count))
CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
drawn_char_count++;
}
}
void CairoBackgroundRenderer::beginTextObject(GfxState *state)
@ -104,6 +111,7 @@ static GBool annot_cb(Annot *, void * pflag) {
bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
{
drawn_char_count = 0;
double page_width;
double page_height;
if(param.use_cropbox)

View File

@ -67,6 +67,7 @@ private:
std::unordered_map<int, int> bitmaps_ref_count;
// id of bitmaps' stream used by current page
std::vector<int> bitmaps_in_current_page;
int drawn_char_count;
};
}

View File

@ -88,6 +88,13 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
{
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
}
// If a char is treated as image, it is not subject to cover test
// (see HTMLRenderer::drawString), so don't increase drawn_char_count.
else if (param.correct_text_visibility) {
if (html_renderer->is_char_covered(drawn_char_count))
SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
drawn_char_count++;
}
}
void SplashBackgroundRenderer::beginTextObject(GfxState *state)
@ -129,6 +136,7 @@ static GBool annot_cb(Annot *, void * pflag) {
bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
{
drawn_char_count = 0;
bool process_annotation = param.process_annotation;
doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
0,

View File

@ -71,6 +71,7 @@ protected:
HTMLRenderer * html_renderer;
const Param & param;
std::string format;
int drawn_char_count;
};
} // namespace pdf2htmlEX

View File

@ -0,0 +1,51 @@
/*
* CoveredTextDetector.cc
*
* Created on: 2014-6-14
* Author: duanyao
*/
#include "CoveredTextDetector.h"
#include "util/math.h"
namespace pdf2htmlEX {
// Drop every recorded char (flags and bounding boxes) so that the detector
// can be reused for the next page.
void CoveredTextDetector::reset()
{
    chars_covered.clear();
    char_bboxes.clear();
}
// Record a newly drawn, fully visible char: its four bbox coordinates
// (x0, y0, x1, y1) are appended to the flat coordinate array and its
// "covered" flag starts out as false.
void CoveredTextDetector::add_char_bbox(double * bbox)
{
    chars_covered.push_back(false);
    for (int k = 0; k < 4; ++k)
        char_bboxes.push_back(bbox[k]);
}
// Record a char that is clipped: it is stored as covered right away.
// When `patially` is true the char's bbox is additionally treated as a
// non-char graphic drawn at the char's own position, so that chars drawn
// before it and intersecting it get marked as covered too.
void CoveredTextDetector::add_char_bbox_clipped(double * bbox, bool patially)
{
    // Index this char is about to occupy (== number of chars recorded so far).
    const int own_index = static_cast<int>(chars_covered.size());

    char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4);
    chars_covered.push_back(true);

    if (!patially)
        return;
    add_non_char_bbox(bbox, own_index);
}
void CoveredTextDetector::add_non_char_bbox(double * bbox, int index)
{
if (index < 0)
index = chars_covered.size();
for (int i = 0; i < index; i++)
{
if (chars_covered[i])
continue;
double * cbbox = &char_bboxes[i * 4];
if (bbox_intersect(cbbox, bbox))
{
chars_covered[i] = true;
add_non_char_bbox(cbbox, i);
}
}
}
}

61
src/CoveredTextDetector.h Normal file
View File

@ -0,0 +1,61 @@
/*
* CoveredTextDetector.h
*
* Created on: 2014-6-14
* Author: duanyao
*/
#ifndef COVEREDTEXTDETECTOR_H__
#define COVEREDTEXTDETECTOR_H__
#include <vector>
namespace pdf2htmlEX {
/**
* Detect characters that are covered by non-char graphics on a page.
*/
/**
 * Detect characters that are covered by non-char graphics on a page.
 *
 * Chars are registered in drawing order; each one gets a "covered" flag that
 * is set when a later non-char graphic (or a later clipped/covered char)
 * intersects its bounding box.
 */
class CoveredTextDetector
{
public:
    /**
     * Reset to initial state. Should be called when start drawing a page.
     */
    void reset();

    /**
     * Add a drawn character's bounding box.
     * @param bbox (x0, y0, x1, y1)
     */
    void add_char_bbox(double * bbox);

    /**
     * Add the bounding box of a drawn character that is affected by clipping.
     * The char is recorded as covered immediately.
     * @param bbox (x0, y0, x1, y1)
     * @param patially if true, the bbox is additionally handled like a non-char
     *        graphic drawn at this char's position, i.e. previously drawn chars
     *        intersecting it are marked as covered as well.
     */
    void add_char_bbox_clipped(double * bbox, bool patially);

    /**
     * Add a drawn non-char graphics' bounding box.
     * If it intersects any previously drawn char's bbox, the char is marked as covered
     * and treated as an non-char.
     * @param bbox (x0, y0, x1, y1)
     * @param index this graphics' drawing order: assume it is drawn after (index-1)th
     *  char. -1 means after the last char.
     */
    void add_non_char_bbox(double * bbox, int index = -1);

    /**
     * An array of flags indicating whether a char is covered by any non-char graphics.
     * Index by the order that these chars are added.
     * This vector grows as add_char_bbox() is called, so its size is the count
     * of currently drawn chars.
     *
     * Returned by reference; bind the result to a reference to avoid copying it.
     */
    const std::vector<bool> & get_chars_covered() const { return chars_covered; }

private:
    // One flag per recorded char, in drawing order.
    std::vector<bool> chars_covered;
    // x00, y00, x01, y01; x10, y10, x11, y11;...
    std::vector<double> char_bboxes;
};
}
#endif /* COVEREDTEXTDETECTOR_H__ */

360
src/DrawingTracer.cc Normal file
View File

@ -0,0 +1,360 @@
/*
* DrawingTracer.cc
*
* Created on: 2014-6-15
* Author: duanyao
*/
#include "GfxFont.h"
#include "util/math.h"
#include "DrawingTracer.h"
//#define DT_DEBUG(x) (x)
#define DT_DEBUG(x)
#if !ENABLE_SVG
#warning "Cairo is disabled because ENABLE_SVG is off, --correct-text-visibility has limited functionality."
#endif
namespace pdf2htmlEX
{
// Keep a reference to the conversion parameters. When built with ENABLE_SVG the
// cairo context starts out as nullptr; it is created by reset().
DrawingTracer::DrawingTracer(const Param & param): param(param)
#if ENABLE_SVG
, cairo(nullptr)
#endif
{
}
// Release the cairo context (if any) via finish().
DrawingTracer::~DrawingTracer()
{
finish();
}
// Start tracing a new page: discard the previous cairo context and create a
// fresh one on a recording surface sized to the page described by `state`.
// Does nothing unless --correct-text-visibility is enabled.
void DrawingTracer::reset(GfxState *state)
{
    if (!param.correct_text_visibility)
        return;
    finish();

#if ENABLE_SVG
    // Plain member assignment instead of the GNU-only `width:` initializer syntax.
    cairo_rectangle_t page_box;
    page_box.x = 0;
    page_box.y = 0;
    page_box.width = state->getPageWidth();
    page_box.height = state->getPageHeight();

    cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box);
    cairo = cairo_create(surface);
    // cairo_create() holds its own reference to the target surface, so our
    // reference must be dropped here; otherwise one surface leaks per page.
    cairo_surface_destroy(surface);
#endif
}
// Destroy the current cairo context, if there is one, and forget it.
void DrawingTracer::finish()
{
#if ENABLE_SVG
    if (cairo == nullptr)
        return;
    cairo_destroy(cairo);
    cairo = nullptr;
#endif
}
// Poppler does not report its initial CTM, and that initial CTM depends on the zoom level.
// OutputDev::clip() may also be called before OutputDev::updateCTM(), so GfxState::getCTM()
// cannot be relied upon; CTM changes are therefore accumulated here (via cairo).
//
// (m11, m12, m21, m22, m31, m32) is the matrix that is concatenated onto the
// traced transformation.
void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32)
{
    if (param.correct_text_visibility)
    {
#if ENABLE_SVG
        cairo_matrix_t delta;
        // Argument order of cairo_matrix_init: xx, yx, xy, yy, x0, y0.
        cairo_matrix_init(&delta, m11, m12, m21, m22, m31, m32);
        cairo_transform(cairo, &delta);
#endif
    }
}
// Intersect the traced clip region with the current path of `state`,
// using the even-odd rule when `even_odd` is set and the winding rule otherwise.
void DrawingTracer::clip(GfxState * state, bool even_odd)
{
    if (param.correct_text_visibility)
    {
#if ENABLE_SVG
        const cairo_fill_rule_t rule =
            even_odd ? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING;

        do_path(state, state->getPath());
        cairo_set_fill_rule(cairo, rule);
        cairo_clip(cairo);
#endif
    }
}
// Placeholder for clipping to the stroked outline of the current path.
// Not implemented yet: the traced clip region is left unchanged.
void DrawingTracer::clip_to_stroke_path(GfxState * state)
{
if (!param.correct_text_visibility)
return;
// TODO cairo_stroke_to_path() ?
}
// Push the traced graphics state (cairo_save) so that a later restore() can
// bring it back. No-op unless --correct-text-visibility is on.
void DrawingTracer::save()
{
if (!param.correct_text_visibility)
return;
#if ENABLE_SVG
cairo_save(cairo);
#endif
}
// Pop the traced graphics state saved by the matching save() (cairo_restore).
// No-op unless --correct-text-visibility is on.
void DrawingTracer::restore()
{
if (!param.correct_text_visibility)
return;
#if ENABLE_SVG
cairo_restore(cairo);
#endif
}
// Replace cairo's current path with `path`, subpath by subpath.
// Adapted from CairoOutputDev::doPath.
void DrawingTracer::do_path(GfxState * state, GfxPath * path)
{
#if ENABLE_SVG
    cairo_new_path(cairo);

    const int subpath_count = path->getNumSubpaths();
    for (int s = 0; s < subpath_count; ++s)
    {
        GfxSubpath * sub = path->getSubpath(s);
        const int point_count = sub->getNumPoints();
        if (point_count <= 0)
            continue;

        cairo_move_to(cairo, sub->getX(0), sub->getY(0));

        for (int k = 1; k < point_count; )
        {
            if (sub->getCurve(k))
            {
                // A curve segment consumes three points: two control points and the end point.
                cairo_curve_to(cairo,
                    sub->getX(k), sub->getY(k),
                    sub->getX(k + 1), sub->getY(k + 1),
                    sub->getX(k + 2), sub->getY(k + 2));
                k += 3;
            }
            else
            {
                cairo_line_to(cairo, sub->getX(k), sub->getY(k));
                ++k;
            }
        }

        if (sub->isClosed())
            cairo_close_path(cairo);
    }
#endif
}
// Trace a stroke of the current path of `state`.
// Every segment of every subpath is turned into its own cairo path; the stroke
// extents of that segment are reported as a non-char bounding box.
void DrawingTracer::stroke(GfxState * state)
{
#if ENABLE_SVG
    if (!param.correct_text_visibility)
        return;

    DT_DEBUG(printf("DrawingTracer::stroke\n"));

    cairo_set_line_width(cairo, state->getLineWidth());

    // GfxPath is broken into steps, each step makes up a cairo path and its bbox is used for covering test.
    // TODO
    // 1. path steps that are not vertical or horizontal lines may still falsely "cover" many chars,
    //    can we slice those steps further?
    // 2. if the line width is small, can we just ignore the path?
    // 3. line join feature can't be retained. We use line-cap-square to minimize the problem that
    //    some chars actually covered by a line join are missed. However chars covered by a acute angle
    //    with line-join-miter may be still recognized as not covered.
    cairo_set_line_cap(cairo, CAIRO_LINE_CAP_SQUARE);
    GfxPath * path = state->getPath();
    for (int i = 0; i < path->getNumSubpaths(); ++i) {
        GfxSubpath * subpath = path->getSubpath(i);
        const int n = subpath->getNumPoints();
        // A segment needs two points. A subpath holding only its starting point
        // (e.g. a move followed directly by a close) has no segment, and the
        // loop below would read point index 1, which does not exist.
        if (n < 2)
            continue;
        double x = subpath->getX(0);
        double y = subpath->getY(0);
        //p: loop cursor; j: next point index
        int p = 1, j = 1;
        while (p <= n) {
            cairo_new_path(cairo);
            cairo_move_to(cairo, x, y);
            if (subpath->getCurve(j)) {
                x = subpath->getX(j+2);
                y = subpath->getY(j+2);
                cairo_curve_to(cairo,
                    subpath->getX(j), subpath->getY(j),
                    subpath->getX(j+1), subpath->getY(j+1),
                    x, y);
                p += 3;
            } else {
                x = subpath->getX(j);
                y = subpath->getY(j);
                cairo_line_to(cairo, x, y);
                ++p;
            }

            DT_DEBUG(printf("DrawingTracer::stroke:new box:\n"));
            double sbox[4];
            cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3);
            // Skip boxes that are degenerate in x or y.
            if (sbox[0] != sbox[2] && sbox[1] != sbox[3])
                draw_non_char_bbox(state, sbox);
            else
                DT_DEBUG(printf("DrawingTracer::stroke:zero box!\n"));

            if (p == n)
            {
                if (subpath->isClosed())
                    j = 0; // if sub path is closed, go back to starting point
                else
                    break;
            }
            else
                j = p;
        }
    }
#endif
}
// Trace a fill of the current path of `state`: the fill extents of the path
// are reported as a non-char bounding box.
// `even_odd` is currently not used; per the original author's note,
// cairo_fill_extents does not take the fill rule into account.
void DrawingTracer::fill(GfxState * state, bool even_odd)
{
    if (param.correct_text_visibility)
    {
#if ENABLE_SVG
        do_path(state, state->getPath());

        double extents[4];
        cairo_fill_extents(cairo, &extents[0], &extents[1], &extents[2], &extents[3]);
        draw_non_char_bbox(state, extents);
#endif
    }
}
// Report a non-char graphic whose bounding box is `bbox` (modified in place).
// With cairo available, bbox is first intersected with the extents of the
// current clip region; nothing is reported if they do not intersect.
// Without cairo the box is always reported.
// The box is then transformed by the CTM and passed to on_non_char_drawn, if set.
void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox)
{
#if ENABLE_SVG
double cbox[4];
cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3);
// The intersection is written back into bbox.
if(bbox_intersect(cbox, bbox, bbox))
#endif
{
transform_bbox_by_ctm(bbox, state);
DT_DEBUG(printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3]));
if (on_non_char_drawn)
on_non_char_drawn(bbox);
}
}
// Report a char whose bounding box is `bbox` (modified in place).
// With cairo available the four corners of the box are tested against the clip
// region:
//   - no corner inside   -> on_char_clipped(bbox, false)
//   - some corners inside -> bbox is cut to the clip extents, on_char_clipped(bbox, true)
//   - all corners inside -> on_char_drawn(bbox)
// Without cairo the char is always reported through on_char_drawn.
// In every case bbox is transformed by the CTM before the callback is invoked.
void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox)
{
#if ENABLE_SVG
    // Note: even if 4 corners of the char are all in or all out of the clip area,
    // it could still be partially clipped.
    // TODO better solution?
    const double corners[4][2] = {
        { bbox[0], bbox[1] },
        { bbox[2], bbox[3] },
        { bbox[2], bbox[1] },
        { bbox[0], bbox[3] },
    };
    int pt_in = 0;
    for (int k = 0; k < 4; ++k)
    {
        if (cairo_in_clip(cairo, corners[k][0], corners[k][1]))
            ++pt_in;
    }

    const bool fully_outside = (pt_in == 0);
    const bool fully_inside = (pt_in == 4);

    if (!fully_outside && !fully_inside)
    {
        // Partially visible: keep only the part inside the clip extents.
        double cbox[4];
        cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3);
        bbox_intersect(cbox, bbox, bbox);
    }

    transform_bbox_by_ctm(bbox);

    if (fully_inside)
    {
        if (on_char_drawn)
            on_char_drawn(bbox);
    }
    else if (on_char_clipped)
    {
        on_char_clipped(bbox, !fully_outside);
    }
#else
    transform_bbox_by_ctm(bbox, state);
    if (on_char_drawn)
        on_char_drawn(bbox);
#endif
    DT_DEBUG(printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3]));
}
// Trace an image: it is reported as a non-char graphic occupying the box
// (0,0)-(1,1), which draw_non_char_bbox then clips and maps through the CTM.
void DrawingTracer::draw_image(GfxState *state)
{
    if (param.correct_text_visibility)
    {
        double unit_box[4] = {0, 0, 1, 1};
        draw_non_char_bbox(state, unit_box);
    }
}
// Trace one char (glyph).
// x, y:   displacement of this glyph within the string being drawn.
// ax, ay: glyph advance.
// A bounding box is built from the advance and the font's ascent/descent,
// mapped through the text matrix combined with a per-char matrix, and then
// handed to draw_char_bbox(). No-op unless --correct-text-visibility is on.
void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, double ay)
{
if (!param.correct_text_visibility)
return;
Matrix tm, itm;
memcpy(tm.m, state->getTextMat(), sizeof(tm.m));
double cx = state->getCurX(), cy = state->getCurY(), fs = state->getFontSize(),
ry = state->getRise(), h = state->getHorizScaling();
//cx and cy has been transformed by text matrix, we need to reverse them.
tm.invertTo(&itm);
double char_cx, char_cy;
itm.transform(cx, cy, &char_cx, &char_cy);
//TODO Vertical? Currently vertical/type3 chars are treated as non-chars.
// Per-char matrix: scale by font size (and horizontal scaling in x), then
// translate to the char position, including the text rise in y.
double char_m[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry};
double final_m[6];
tm_multiply(final_m, tm.m, char_m);
auto font = state->getFont();
// Start from the advance box and extend it vertically below.
double bbox[4] {0, 0, ax, ay};
double desc = font->getDescent(), asc = font->getAscent();
if (font->getWMode() == 0)
{
// Horizontal writing mode: span from descent to ascent.
bbox[1] += desc;
bbox[3] += asc;
}
else
{//TODO Vertical?
}
tm_transform_bbox(final_m, bbox);
draw_char_bbox(state, bbox);
}
// Transform `bbox` in place by the current transformation matrix.
// With cairo available the matrix traced in the cairo context is used and
// `state` is ignored; otherwise the CTM is taken from `state`.
void DrawingTracer::transform_bbox_by_ctm(double * bbox, GfxState * state)
{
#if ENABLE_SVG
    cairo_matrix_t traced;
    cairo_get_matrix(cairo, &traced);
    double ctm[6] = { traced.xx, traced.yx, traced.xy, traced.yy, traced.x0, traced.y0 };
    tm_transform_bbox(ctm, bbox);
#else
    tm_transform_bbox(state->getCTM(), bbox);
#endif
}
} /* namespace pdf2htmlEX */

79
src/DrawingTracer.h Normal file
View File

@ -0,0 +1,79 @@
/*
* DrawingTracer.h
*
* Created on: 2014-6-15
* Author: duanyao
*/
#ifndef DRAWINGTRACER_H__
#define DRAWINGTRACER_H__
#include <functional>
#include <GfxState.h>
#include "pdf2htmlEX-config.h"
#if ENABLE_SVG
#include <cairo.h>
#endif
#include "Param.h"
namespace pdf2htmlEX
{
/*
 * Follows the drawing operations of a page (CTM changes, clipping, save/restore,
 * fills, strokes, images and chars) and reports the bounding boxes of drawn
 * chars and non-char graphics through the callbacks below.
 * Most entry points do nothing unless param.correct_text_visibility is set.
 */
class DrawingTracer
{
public:
/*
* Callbacks that receive "drawn" events.
* bbox is in device space.
*/
// a non-char graphic has been drawn
std::function<void(double * bbox)> on_non_char_drawn;
// a char has been drawn inside the clip area
std::function<void(double * bbox)> on_char_drawn;
// a char has been drawn outside of (patially == false) or partially inside
// (patially == true) the clip area
std::function<void(double * bbox, bool patially)> on_char_clipped;
DrawingTracer(const Param & param);
virtual ~DrawingTracer();
// Start tracing a new page; the page size is taken from state.
void reset(GfxState * state);
/*
* A character is being drawn
* x, y: glyph-drawing position, in PDF text object space.
* ax, ay: glyph advance, in glyph space.
*/
void draw_char(GfxState * state, double x, double y, double ax, double ay);
/*
* An image is being drawn
*/
void draw_image(GfxState * state);
// Concatenate the given matrix onto the traced CTM.
void update_ctm(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
// Clip to the current path of state.
void clip(GfxState * state, bool even_odd = false);
// Not implemented yet (see the TODO in the definition).
void clip_to_stroke_path(GfxState * state);
// The current path of state is filled / stroked.
void fill(GfxState * state, bool even_odd = false);
void stroke(GfxState * state);
// Save / restore the traced graphics state.
void save();
void restore();
private:
// Release the cairo context, if any.
void finish();
// Following methods operate in user space (just before CTM is applied)
void do_path(GfxState * state, GfxPath * path);
void draw_non_char_bbox(GfxState * state, double * bbox);
void draw_char_bbox(GfxState * state, double * bbox);
// If cairo is available, parameter state is ignored
void transform_bbox_by_ctm(double * bbox, GfxState * state = nullptr);
const Param & param;
#if ENABLE_SVG
// Context used to track the CTM and the clip region; created by reset().
cairo_t * cairo;
#endif
};
} /* namespace pdf2htmlEX */
#endif /* DRAWINGTRACER_H__ */

View File

@ -31,10 +31,13 @@
#include "HTMLTextPage.h"
#include "BackgroundRenderer/BackgroundRenderer.h"
#include "CoveredTextDetector.h"
#include "DrawingTracer.h"
#include "util/const.h"
#include "util/misc.h"
namespace pdf2htmlEX {
class HTMLRenderer : public OutputDev
@ -89,7 +92,9 @@ public:
* We just mark as changed, and recheck if they have been changed when we are about to output a new string
*/
virtual void restoreState(GfxState * state) { updateAll(state); }
virtual void restoreState(GfxState * state);
virtual void saveState(GfxState *state);
virtual void updateAll(GfxState * state);
@ -125,15 +130,34 @@ public:
virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
virtual void stroke(GfxState *state) { css_do_path(state, false); }
virtual void fill(GfxState *state) { css_do_path(state, true); }
virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
int width, int height,
GfxImageColorMap *colorMap,
GBool interpolate,
Stream *maskStr,
int maskWidth, int maskHeight,
GfxImageColorMap *maskColorMap,
GBool maskInterpolate);
virtual void stroke(GfxState *state); ////{ css_do_path(state, false); }
virtual void fill(GfxState *state); ////{ css_do_path(state, true); }
virtual void eoFill(GfxState *state);
virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax);
virtual void processLink(AnnotLink * al);
/* capacity test */
bool can_stroke(GfxState *state) { return css_do_path(state, false, true); }
bool can_fill(GfxState *state) { return css_do_path(state, true, true); }
bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); }
bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); }
/*
* Covered text handling.
*/
// Is a char (actually a glyph) covered by non-char's. Index in drawing order in current page.
// Does not fail on out-of-bound conditions, but return false.
bool is_char_covered(int index);
// Currently drawn char (glyph) count in current page.
int get_char_count() { return (int)covered_text_detecor.get_chars_covered().size(); }
protected:
////////////////////////////////////////////////////
@ -195,6 +219,7 @@ protected:
// make sure the current HTML style consistent with PDF
void prepare_text_line(GfxState * state);
#if 0 //disable CSS drawing
////////////////////////////////////////////////////
// CSS drawing
////////////////////////////////////////////////////
@ -214,6 +239,7 @@ protected:
double * line_width_array, int line_width_count,
const GfxRGB * line_color, const GfxRGB * fill_color,
void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr );
#endif //disable CSS drawing
////////////////////////////////////////////////////
@ -328,7 +354,6 @@ protected:
#endif
BackgroundRenderer * bg_renderer;
BackgroundRenderer * fallback_bg_renderer;
bool fallback_bg_required;
struct {
std::ofstream fs;
@ -338,6 +363,9 @@ protected:
std::string cur_page_filename;
static const std::string MANIFEST_FILENAME;
CoveredTextDetector covered_text_detecor;
DrawingTracer tracer;
};
} //namespace pdf2htmlEX

View File

@ -30,6 +30,39 @@ using std::sqrt;
using std::vector;
using std::ostream;
// Graphics state is being restored: mark all state as changed (it is
// re-checked later) and pop the tracer's saved state as well.
void HTMLRenderer::restoreState(GfxState * state)
{
updateAll(state);
tracer.restore();
}
// Graphics state is being saved: let the tracer save its state too,
// so that it stays in step with the matching restoreState().
void HTMLRenderer::saveState(GfxState *state)
{
tracer.save();
}
// A path is stroked: only forwarded to the tracer for covered-text detection.
void HTMLRenderer::stroke(GfxState * state)
{
tracer.stroke(state);
}
// A path is filled: only forwarded to the tracer for covered-text detection.
void HTMLRenderer::fill(GfxState * state)
{
tracer.fill(state);
}
// A path is filled with the even-odd rule: forwarded to the tracer with even_odd = true.
void HTMLRenderer::eoFill(GfxState * state)
{
tracer.fill(state, true);
}
// An axial shading fills the current path: traced like an ordinary fill.
// Always returns true.
// NOTE(review): returning true presumably tells the caller the shading was handled here — confirm against the OutputDev contract.
GBool HTMLRenderer::axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax)
{
tracer.fill(state); //TODO correct?
return true;
}
#if 0 //disable css drawing
static bool is_horizontal_line(GfxSubpath * path)
{
return ((path->getNumPoints() == 2)
@ -415,6 +448,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co
(*f_curpage) << "\"></div>";
}
#endif //disable css drawing
} // namespace pdf2htmlEX

View File

@ -11,6 +11,7 @@
#include <cmath>
#include <algorithm>
#include <vector>
#include <functional>
#include <GlobalParams.h>
@ -46,6 +47,7 @@ HTMLRenderer::HTMLRenderer(const Param & param)
,html_text_page(param, all_manager)
,preprocessor(param)
,tmp_files(param)
,tracer(param)
{
if(!(param.debug))
{
@ -76,6 +78,13 @@ HTMLRenderer::HTMLRenderer(const Param & param)
all_manager.height .set_eps(EPS);
all_manager.width .set_eps(EPS);
all_manager.bottom .set_eps(EPS);
tracer.on_char_drawn =
[this](double * box) { covered_text_detecor.add_char_bbox(box); };
tracer.on_char_clipped =
[this](double * box, bool partial) { covered_text_detecor.add_char_bbox_clipped(box, partial); };
tracer.on_non_char_drawn =
[this](double * box) { covered_text_detecor.add_non_char_bbox(box); };
}
HTMLRenderer::~HTMLRenderer()
@ -133,13 +142,6 @@ void HTMLRenderer::process(PDFDoc *doc)
cur_page_filename = filled_template_filename;
}
if(param.process_nontext)
{
fallback_bg_required = !bg_renderer->render_page(doc, i);
if (fallback_bg_required && fallback_bg_renderer != nullptr)
fallback_bg_renderer->render_page(doc, i);
}
doc->displayPage(this, i,
text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
0,
@ -190,15 +192,20 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
#endif
{
covered_text_detecor.reset();
tracer.reset(state);
this->pageNum = pageNum;
double pageWidth = state->getPageWidth();
double pageHeight = state->getPageHeight();
html_text_page.set_page_size(state->getPageWidth(), state->getPageHeight());
html_text_page.set_page_size(pageWidth, pageHeight);
reset_state();
}
void HTMLRenderer::endPage() {
long long wid = all_manager.width.install(html_text_page.get_width());
long long hid = all_manager.height.install(html_text_page.get_height());
long long wid = all_manager.width.install(pageWidth);
long long hid = all_manager.height.install(pageHeight);
(*f_curpage)
<< "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
<< "\" class=\"" << CSS::PAGE_FRAME_CN
@ -231,16 +238,15 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
if(param.process_nontext)
{
if (!fallback_bg_required)
if (bg_renderer->render_page(cur_doc, pageNum))
bg_renderer->embed_image(pageNum);
else if (fallback_bg_renderer != nullptr)
fallback_bg_renderer->embed_image(pageNum);
{
if (fallback_bg_renderer->render_page(cur_doc, pageNum))
fallback_bg_renderer->embed_image(pageNum);
}
}
reset_state();
}
void HTMLRenderer::endPage() {
// dump all text
html_text_page.dump_text(*f_curpage);
html_text_page.dump_css(f_css.fs);

View File

@ -14,6 +14,8 @@ namespace pdf2htmlEX {
void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
{
tracer.draw_image(state);
return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
#if 0
@ -62,4 +64,20 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w
#endif
}
// A soft-masked image is drawn: report it to the tracer as an image, then
// delegate to the base class implementation.
void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
int width, int height,
GfxImageColorMap *colorMap,
GBool interpolate,
Stream *maskStr,
int maskWidth, int maskHeight,
GfxImageColorMap *maskColorMap,
GBool maskInterpolate)
{
tracer.draw_image(state);
return OutputDev::drawSoftMaskedImage(state,ref,str, // TODO really required?
width,height,colorMap,interpolate,
maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate);
}
} // namespace pdf2htmlEX

View File

@ -46,6 +46,7 @@ void HTMLRenderer::updateFont(GfxState * state)
// The CTM changed: flag it for the text renderer and pass the matrix on to
// the tracer, which keeps its own copy of the transformation.
void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32)
{
ctm_changed = true;
tracer.update_ctm(state, m11, m12, m21, m22, m31, m32);
}
void HTMLRenderer::updateTextMat(GfxState * state)
{
@ -89,14 +90,17 @@ void HTMLRenderer::updateStrokeColor(GfxState * state)
// The clip region changed: flag it and let the tracer clip to the current path.
void HTMLRenderer::clip(GfxState * state)
{
clip_changed = true;
tracer.clip(state);
}
// The clip region changed (even-odd rule): flag it and let the tracer clip
// to the current path with even_odd = true.
void HTMLRenderer::eoClip(GfxState * state)
{
clip_changed = true;
tracer.clip(state, true);
}
// The clip region is set from a stroked path: flag the change and notify the tracer.
void HTMLRenderer::clipToStrokePath(GfxState * state)
{
clip_changed = true;
tracer.clip_to_stroke_path(state);
}
void HTMLRenderer::reset_state()
{
@ -119,6 +123,8 @@ void HTMLRenderer::reset_state()
cur_line_state.y = 0;
memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);};
cur_clip_state.xmin = 0;
cur_clip_state.xmax = 0;
cur_clip_state.ymin = 0;
@ -502,6 +508,10 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
double rise_x, rise_y;
state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y);
state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y);
if (param.correct_text_visibility)
cur_line_state.first_char_index = get_char_count();
html_text_page.open_new_line(cur_line_state);
cur_text_state.vertical_align = 0;

View File

@ -14,6 +14,9 @@
#include "util/namespace.h"
#include "util/unicode.h"
//#define HR_DEBUG(x) (x)
#define HR_DEBUG(x)
namespace pdf2htmlEX {
using std::all_of;
@ -51,26 +54,35 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
char *p = s->getCString();
int len = s->getLength();
//accumulated displacement of chars in this string, in text object space
double dx = 0;
double dy = 0;
double dx1,dy1;
//displacement of current char, in text object space, including letter space but not word space.
double ddx, ddy;
//advance of current char, in glyph space
double ax, ay;
//origin of current char, in glyph space
double ox, oy;
int nChars = 0;
int nSpaces = 0;
int uLen;
CharCode code;
Unicode *u = nullptr;
HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", len));
while (len > 0)
{
auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy);
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
if(!(equal(ox, 0) && equal(oy, 0)))
{
cerr << "TODO: non-zero origins" << endl;
}
ddx = ax * cur_font_size + cur_letter_space;
ddy = ay * cur_font_size;
tracer.draw_char(state, dx, dy, ax, ay);
bool is_space = false;
if (n == 1 && *p == ' ')
@ -85,19 +97,19 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
* There are always ugly PDF files with no useful info at all.
*/
is_space = true;
++nSpaces;
}
if(is_space && (param.space_as_offset))
{
html_text_page.get_cur_line()->append_padding_char();
// ignore horiz_scaling, as it has been merged into CTM
html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
}
else
{
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
{
html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space));
html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
}
else
{
@ -110,7 +122,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
uu = unicode_from_font(code, font);
}
html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space));
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
/*
* In PDF, word_space is appended if (n == 1 and *p = ' ')
* but in HTML, word_space is appended if (uu == ' ')
@ -123,19 +135,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
}
}
dx += dx1;
dy += dy1;
dx += ddx * cur_horiz_scaling;
dy += ddy;
if (is_space)
dx += cur_word_space * cur_horiz_scaling;
++nChars;
p += n;
len -= n;
}
// horiz_scaling is merged into ctm now,
// so the coordinate system is ugly
dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling;
dy *= cur_font_size;
cur_tx += dx;
cur_ty += dy;
@ -143,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
draw_ty += dy;
}
// Tell whether the char (glyph) with the given page-wide drawing index is
// covered by non-char graphics.
// An out-of-range index is not an error: a warning is printed to stderr and
// false is returned.
bool HTMLRenderer::is_char_covered(int index)
{
    // Bind by reference: a plain `auto` would copy the whole flag vector on
    // every call, and this function is queried once per char.
    const auto & covered = covered_text_detecor.get_chars_covered();
    if (index < 0 || index >= (int)covered.size())
    {
        std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
            << index << ", size: " << covered.size() <<endl;
        return false;
    }
    return covered[index];
}
} // namespace pdf2htmlEX

View File

@ -5,6 +5,8 @@
#ifndef HTMLSTATE_H__
#define HTMLSTATE_H__
#include <functional>
#include "Color.h"
namespace pdf2htmlEX {
@ -62,6 +64,12 @@ struct HTMLLineState
{
double x,y;
double transform_matrix[4];
// The page-scope char index (in drawing order) of the first char in this line.
int first_char_index;
// A function to determine whether a char is covered at a given index.
std::function<bool(int)> is_char_covered;
HTMLLineState(): first_char_index(-1) { }
};
struct HTMLClipState

View File

@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
{
text.insert(text.end(), u, u+l);
if (l == 1)
text.push_back(min(u[0], (unsigned)INT_MAX));
else if (l > 1)
{
text.push_back(- decomposed_text.size() - 1);
decomposed_text.emplace_back();
decomposed_text.back().assign(u, u + l);
}
this->width += width;
}
@ -69,6 +76,60 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state)
last_state.font_size *= last_state.font_info->font_size_scale;
}
// Write the char stored at text[pos] to `out`.
//   - positive value: a single unicode code point;
//   - negative value: (-value - 1) is the index into decomposed_text, whose
//     code points are all written;
//   - zero: a padding char, nothing is written.
void HTMLTextLine::dump_char(std::ostream & out, int pos)
{
    int c = text[pos];
    if (c > 0)
    {
        Unicode u = c;
        writeUnicodes(out, &u, 1);
    }
    else if (c < 0)
    {
        // Take a reference: copying the code-point vector for every
        // decomposed glyph that is written would be wasted work.
        auto & dt = decomposed_text[- c - 1];
        writeUnicodes(out, &dt.front(), dt.size());
    }
}
// Write `len` chars of 'text', starting at index `begin`, to `out`.
// If the line has no valid first_char_index (< 0) the chars are written as is.
// Otherwise every run of covered chars is wrapped in a <span> carrying the
// transparent fill and stroke color classes, so the run is not visible in the
// text layer.
void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
    static const Color transparent(0, 0, 0, true);

    const int end = begin + len;
    const int first = line_state.first_char_index;

    if (first < 0)
    {
        for (int pos = begin; pos < end; ++pos)
            dump_char(out, pos);
        return;
    }

    bool span_open = false;
    for (int pos = begin; pos < end; ++pos)
    {
        const bool covered = line_state.is_char_covered(first + pos);
        if (covered != span_open)
        {
            // Visibility changes at this char: open or close the wrapper.
            if (covered)
            {
                out << "<span class=\"" << all_manager.fill_color.get_css_class_name()
                    << all_manager.fill_color.install(transparent) << " " << all_manager.stroke_color.get_css_class_name()
                    << all_manager.stroke_color.install(transparent) << "\">";
            }
            else
            {
                out << "</span>";
            }
            span_open = covered;
        }
        dump_char(out, pos);
    }

    if (span_open)
        out << "</span>";
}
void HTMLTextLine::dump_text(ostream & out)
{
/*
@ -216,7 +277,7 @@ void HTMLTextLine::dump_text(ostream & out)
size_t next_text_idx = text_idx2;
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
next_text_idx = cur_offset_iter->start_idx;
writeUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx);
dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
cur_text_idx = next_text_idx;
}
}

View File

@ -73,7 +73,16 @@ public:
double width;
};
/**
* Append a drawn char (glyph)'s unicode. l > 1 mean this glyph correspond to
* multiple code points.
*/
void append_unicodes(const Unicode * u, int l, double width);
/**
* Append a special padding char with 0 width, in order to keep char index consistent.
* The padding char is ignored during output.
*/
void append_padding_char() { text.push_back(0); }
void append_offset(double width);
void append_state(const HTMLTextState & text_state);
void dump_text(std::ostream & out);
@ -92,6 +101,13 @@ private:
void optimize_normal(std::vector<HTMLTextLine*> &);
void optimize_aggressive(std::vector<HTMLTextLine*> &);
/**
* Dump chars' unicode to output stream.
* begin/pos is the index in 'text'.
*/
void dump_chars(std::ostream & out, int begin, int len);
void dump_char(std::ostream & out, int pos);
const Param & param;
AllStateManager & all_manager;
@ -102,7 +118,16 @@ private:
std::vector<State> states;
std::vector<Offset> offsets;
std::vector<Unicode> text;
/**
* Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
* - If c > 0, it is the unicode code point corresponds to the glyph;
* - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
* - If c < 0, this glyph corresponds to more than one unicode code point,
* which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
*/
std::vector<int> text;
std::vector<std::vector<Unicode> > decomposed_text;
};
} // namespace pdf2htmlEX

View File

@ -39,6 +39,9 @@ public:
void set_page_size(double width, double height);
void clip(const HTMLClipState & clip_state);
double get_width() { return page_width; }
double get_height() { return page_height; }
private:
void optimize(void);

View File

@ -38,6 +38,7 @@ struct Param
int process_nontext;
int process_outline;
int process_annotation;
int correct_text_visibility;
int printing;
int fallback;
int tmp_file_size_limit;

View File

@ -187,6 +187,7 @@ void parse_options (int argc, char **argv)
.add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
.add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
.add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
.add("correct-text-visibility", &param.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them")
// background image
.add("bg-format", &param.bg_format, "png", "specify background image format")

View File

@ -1,8 +1,12 @@
#include <cstring>
#include <limits>
#include <algorithm>
#include "math.h"
using std::min;
using std::max;
namespace pdf2htmlEX {
void tm_transform(const double * tm, double & x, double & y, bool is_delta)
@ -56,5 +60,31 @@ void tm_transform_bbox(const double * tm, double * bbox)
}
}
// Compute the intersection of two boxes given as (x0, y0, x1, y1); the two
// corners of a box may come in either order.
// Returns false, leaving `result` untouched, when the boxes do not overlap
// with a positive width and height. Otherwise returns true and, if `result`
// is not null, stores the normalized intersection in it. `result` may alias
// either input, because all values are computed before anything is written.
bool bbox_intersect(const double * bbox1, const double * bbox2, double * result)
{
    const double left   = std::max(std::min(bbox1[0], bbox1[2]), std::min(bbox2[0], bbox2[2]));
    const double right  = std::min(std::max(bbox1[0], bbox1[2]), std::max(bbox2[0], bbox2[2]));
    const double bottom = std::max(std::min(bbox1[1], bbox1[3]), std::min(bbox2[1], bbox2[3]));
    const double top    = std::min(std::max(bbox1[1], bbox1[3]), std::max(bbox2[1], bbox2[3]));

    // Touching boxes (zero-width or zero-height overlap) do not count.
    if (left >= right || bottom >= top)
        return false;

    if (result != nullptr)
    {
        result[0] = left;
        result[1] = bottom;
        result[2] = right;
        result[3] = top;
    }
    return true;
}
} //namespace pdf2htmlEX

View File

@ -24,6 +24,13 @@ static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6
return false;
return true;
}
// Set the 6-element transformation matrix `tm` to the identity
// (1, 0, 0, 1, 0, 0).
static inline void tm_init(double * tm)
{
    tm[0] = 1;
    tm[1] = 0;
    tm[2] = 0;
    tm[3] = 1;
    tm[4] = 0;
    tm[5] = 0;
}
static inline void tm_multiply(double * result, const double * m1, const double * m2)
{
result[0] = m1[0] * m2[0] + m1[2] * m2[1];
@ -39,6 +46,14 @@ static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); }
void tm_transform(const double * tm, double & x, double & y, bool is_delta = false);
void tm_multiply(double * tm_left, const double * tm_right);
void tm_transform_bbox(const double * tm, double * bbox);
/**
* Calculate the intersection of 2 boxes.
* If they are intersecting, store the result to result (if not null) and return true.
* Otherwise return false, and result is not touched.
* Param result can be same as one of bbox1 and bbox2.
* Data in boxes are expected in the order of (x0, y0, x1, y1).
*/
bool bbox_intersect(const double * bbox1, const double * bbox2, double * result = nullptr);
} //namespace pdf2htmlEX
#endif //MATH_H__