diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ce16cc..8f0aad4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,6 +161,10 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC} src/Base64Stream.cc src/Color.h src/Color.cc + src/CoveredTextDetector.h + src/CoveredTextDetector.cc + src/DrawingTracer.h + src/DrawingTracer.cc src/HTMLState.h src/HTMLTextLine.h src/HTMLTextLine.cc diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 8835269..1f44984 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -242,6 +242,11 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above. .B \-\-optimize\-text <0|1> (Default: 0) If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong. +.TP +.B \-\-correct\-text\-visibility <0|1> (Default: 0) +If set to 1, pdf2htmlEX will try to detect text covered by other graphics and arrange it properly, +i.e. covered text is made transparent in the text layer and is drawn on the background layer. + .SS Background Image .TP diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc index 7debc03..d7c48b9 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -63,6 +63,13 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, { CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); } + // If a char is treated as image, it is not subject to cover test + // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
+ else if (param.correct_text_visibility) { + if (html_renderer->is_char_covered(drawn_char_count)) + CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + drawn_char_count++; + } } void CairoBackgroundRenderer::beginTextObject(GfxState *state) @@ -104,6 +111,7 @@ static GBool annot_cb(Annot *, void * pflag) { bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) { + drawn_char_count = 0; double page_width; double page_height; if(param.use_cropbox) diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h index 2d3667d..4ed9c86 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.h +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.h @@ -67,6 +67,7 @@ private: std::unordered_map bitmaps_ref_count; // id of bitmaps' stream used by current page std::vector bitmaps_in_current_page; + int drawn_char_count; }; } diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc index 6b7f68c..7a9f79b 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -88,6 +88,13 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, { SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); } + // If a char is treated as image, it is not subject to cover test + // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
+ else if (param.correct_text_visibility) { + if (html_renderer->is_char_covered(drawn_char_count)) + SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + drawn_char_count++; + } } void SplashBackgroundRenderer::beginTextObject(GfxState *state) @@ -129,6 +136,7 @@ static GBool annot_cb(Annot *, void * pflag) { bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno) { + drawn_char_count = 0; bool process_annotation = param.process_annotation; doc->displayPage(this, pageno, param.h_dpi, param.v_dpi, 0, diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h index 8e064e7..aafc1c2 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.h +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h @@ -71,6 +71,7 @@ protected: HTMLRenderer * html_renderer; const Param & param; std::string format; + int drawn_char_count; }; } // namespace pdf2htmlEX diff --git a/src/CoveredTextDetector.cc b/src/CoveredTextDetector.cc new file mode 100644 index 0000000..e109b3f --- /dev/null +++ b/src/CoveredTextDetector.cc @@ -0,0 +1,51 @@ +/* + * CoveredTextDetector.cc + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#include "CoveredTextDetector.h" + +#include "util/math.h" + +namespace pdf2htmlEX { + +void CoveredTextDetector::reset() +{ + char_bboxes.clear(); + chars_covered.clear(); +} + +void CoveredTextDetector::add_char_bbox(double * bbox) +{ + char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); + chars_covered.push_back(false); +} + +void CoveredTextDetector::add_char_bbox_clipped(double * bbox, bool patially) +{ + char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); + chars_covered.push_back(true); + if (patially) + add_non_char_bbox(bbox, chars_covered.size() - 1); +} + +void CoveredTextDetector::add_non_char_bbox(double * bbox, int index) +{ + if (index < 0) + index = chars_covered.size(); + for (int i = 0; i < index; i++) + { + if (chars_covered[i]) + continue; 
+ double * cbbox = &char_bboxes[i * 4]; + if (bbox_intersect(cbbox, bbox)) + { + chars_covered[i] = true; + add_non_char_bbox(cbbox, i); + } + } +} + +} diff --git a/src/CoveredTextDetector.h b/src/CoveredTextDetector.h new file mode 100644 index 0000000..bee6c17 --- /dev/null +++ b/src/CoveredTextDetector.h @@ -0,0 +1,61 @@ +/* + * CoveredTextDetector.h + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#ifndef COVEREDTEXTDETECTOR_H__ +#define COVEREDTEXTDETECTOR_H__ + +#include + +namespace pdf2htmlEX { + +/** + * Detect characters that are covered by non-char graphics on a page. + */ +class CoveredTextDetector +{ +public: + + /** + * Reset to initial state. Should be called when start drawing a page. + */ + void reset(); + + /** + * Add a drawn character's bounding box. + * @param bbox (x0, y0, x1, y1) + */ + void add_char_bbox(double * bbox); + + void add_char_bbox_clipped(double * bbox, bool patially); + + /** + * Add a drawn non-char graphics' bounding box. + * If it intersects any previously drawn char's bbox, the char is marked as covered + * and treated as an non-char. + * @param bbox (x0, y0, x1, y1) + * @param index this graphics' drawing order: assume it is drawn after (index-1)th + * char. -1 means after the last char. + */ + void add_non_char_bbox(double * bbox, int index = -1); + + /** + * An array of flags indicating whether a char is covered by any non-char graphics. + * Index by the order that these chars are added. + * This vector grows as add_char_bbox() is called, so its size is the count + * of currently drawn chars. + */ + const std::vector & get_chars_covered() { return chars_covered; } + +private: + std::vector chars_covered; + // x00, y00, x01, y01; x10, y10, x11, y11;... 
+ std::vector char_bboxes; +}; + +} + +#endif /* COVEREDTEXTDETECTOR_H__ */ diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc new file mode 100644 index 0000000..8db1437 --- /dev/null +++ b/src/DrawingTracer.cc @@ -0,0 +1,360 @@ +/* + * DrawingTracer.cc + * + * Created on: 2014-6-15 + * Author: duanyao + */ + +#include "GfxFont.h" + +#include "util/math.h" +#include "DrawingTracer.h" + +//#define DT_DEBUG(x) (x) +#define DT_DEBUG(x) + +#if !ENABLE_SVG +#warning "Cairo is disabled because ENABLE_SVG is off, --correct-text-visibility has limited functionality." +#endif + +namespace pdf2htmlEX +{ + +DrawingTracer::DrawingTracer(const Param & param): param(param) +#if ENABLE_SVG +, cairo(nullptr) +#endif +{ +} + +DrawingTracer::~DrawingTracer() +{ + finish(); +} + +void DrawingTracer::reset(GfxState *state) +{ + if (!param.correct_text_visibility) + return; + finish(); + +#if ENABLE_SVG + cairo_rectangle_t page_box {0, 0, width:state->getPageWidth(), height:state->getPageHeight()}; + cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box); + cairo = cairo_create(surface); +#endif +} + +void DrawingTracer::finish() +{ +#if ENABLE_SVG + if (cairo) + { + cairo_destroy(cairo); + cairo = nullptr; + } +#endif +} + +// Poppler won't inform us its initial CTM, and the initial CTM is affected by zoom level. +// OutputDev::clip() may be called before OutputDev::updateCTM(), so we can't rely on GfxState::getCTM(), +// and should trace ctm changes ourself (via cairo). 
+void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32) +{ + if (!param.correct_text_visibility) + return; + +#if ENABLE_SVG + cairo_matrix_t matrix; + matrix.xx = m11; + matrix.yx = m12; + matrix.xy = m21; + matrix.yy = m22; + matrix.x0 = m31; + matrix.y0 = m32; + cairo_transform(cairo, &matrix); +#endif +} + +void DrawingTracer::clip(GfxState * state, bool even_odd) +{ + if (!param.correct_text_visibility) + return; +#if ENABLE_SVG + do_path(state, state->getPath()); + cairo_set_fill_rule(cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); + cairo_clip (cairo); +#endif +} + +void DrawingTracer::clip_to_stroke_path(GfxState * state) +{ + if (!param.correct_text_visibility) + return; + // TODO cairo_stroke_to_path() ? +} + +void DrawingTracer::save() +{ + if (!param.correct_text_visibility) + return; +#if ENABLE_SVG + cairo_save(cairo); +#endif +} +void DrawingTracer::restore() +{ + if (!param.correct_text_visibility) + return; +#if ENABLE_SVG + cairo_restore(cairo); +#endif +} + +void DrawingTracer::do_path(GfxState * state, GfxPath * path) +{ +#if ENABLE_SVG + //copy from CairoOutputDev::doPath + GfxSubpath *subpath; + int i, j; + double x, y; + cairo_new_path(cairo); + for (i = 0; i < path->getNumSubpaths(); ++i) { + subpath = path->getSubpath(i); + if (subpath->getNumPoints() > 0) { + x = subpath->getX(0); + y = subpath->getY(0); + cairo_move_to(cairo, x, y); + j = 1; + while (j < subpath->getNumPoints()) { + if (subpath->getCurve(j)) { + x = subpath->getX(j+2); + y = subpath->getY(j+2); + cairo_curve_to(cairo, + subpath->getX(j), subpath->getY(j), + subpath->getX(j+1), subpath->getY(j+1), + x, y); + j += 3; + } else { + x = subpath->getX(j); + y = subpath->getY(j); + cairo_line_to(cairo, x, y); + ++j; + } + } + if (subpath->isClosed()) { + cairo_close_path (cairo); + } + } + } +#endif +} + +void DrawingTracer::stroke(GfxState * state) +{ +#if ENABLE_SVG + if 
(!param.correct_text_visibility) + return; + + DT_DEBUG(printf("DrawingTracer::stroke\n")); + + cairo_set_line_width(cairo, state->getLineWidth()); + + // GfxPath is broken into steps, each step makes up a cairo path and its bbox is used for covering test. + // TODO + // 1. path steps that are not vertical or horizontal lines may still falsely "cover" many chars, + // can we slice those steps further? + // 2. if the line width is small, can we just ignore the path? + // 3. line join feature can't be retained. We use line-cap-square to minimize the problem that + // some chars actually covered by a line join are missed. However chars covered by a acute angle + // with line-join-miter may be still recognized as not covered. + cairo_set_line_cap(cairo, CAIRO_LINE_CAP_SQUARE); + GfxPath * path = state->getPath(); + for (int i = 0; i < path->getNumSubpaths(); ++i) { + GfxSubpath * subpath = path->getSubpath(i); + if (subpath->getNumPoints() <= 0) + continue; + double x = subpath->getX(0); + double y = subpath->getY(0); + //p: loop cursor; j: next point index + int p =1, j = 1; + int n = subpath->getNumPoints(); + while (p <= n) { + cairo_new_path(cairo); + cairo_move_to(cairo, x, y); + if (subpath->getCurve(j)) { + x = subpath->getX(j+2); + y = subpath->getY(j+2); + cairo_curve_to(cairo, + subpath->getX(j), subpath->getY(j), + subpath->getX(j+1), subpath->getY(j+1), + x, y); + p += 3; + } else { + x = subpath->getX(j); + y = subpath->getY(j); + cairo_line_to(cairo, x, y); + ++p; + } + + DT_DEBUG(printf("DrawingTracer::stroke:new box:\n")); + double sbox[4]; + cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3); + if (sbox[0] != sbox[2] && sbox[1] != sbox[3]) + draw_non_char_bbox(state, sbox); + else + DT_DEBUG(printf("DrawingTracer::stroke:zero box!\n")); + + if (p == n) + { + if (subpath->isClosed()) + j = 0; // if sub path is closed, go back to starting point + else + break; + } + else + j = p; + } + } +#endif +} + +void DrawingTracer::fill(GfxState * 
state, bool even_odd) +{ + if (!param.correct_text_visibility) + return; + +#if ENABLE_SVG + do_path(state, state->getPath()); + //cairo_fill_extents don't take fill rule into account. + //cairo_set_fill_rule (cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); + double fbox[4]; + cairo_fill_extents(cairo, fbox, fbox + 1, fbox + 2, fbox + 3); + draw_non_char_bbox(state, fbox); +#endif +} + +void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) +{ +#if ENABLE_SVG + double cbox[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + if(bbox_intersect(cbox, bbox, bbox)) +#endif + { + transform_bbox_by_ctm(bbox, state); + DT_DEBUG(printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3])); + if (on_non_char_drawn) + on_non_char_drawn(bbox); + } +} + +void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) +{ +#if ENABLE_SVG + // Note: even if 4 corners of the char are all in or all out of the clip area, + // it could still be partially clipped. + // TODO better solution? 
+ int pt_in = 0; + if (cairo_in_clip(cairo, bbox[0], bbox[1])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[2], bbox[3])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[2], bbox[1])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[0], bbox[3])) + ++pt_in; + + if (pt_in == 0) + { + transform_bbox_by_ctm(bbox); + if(on_char_clipped) + on_char_clipped(bbox, false); + } + else + { + if (pt_in < 4) + { + double cbox[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + bbox_intersect(cbox, bbox, bbox); + } + transform_bbox_by_ctm(bbox); + if (pt_in < 4) + { + if(on_char_clipped) + on_char_clipped(bbox, true); + } + else + { + if (on_char_drawn) + on_char_drawn(bbox); + } + } +#else + transform_bbox_by_ctm(bbox, state); + if (on_char_drawn) + on_char_drawn(bbox); +#endif + DT_DEBUG(printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3])); +} + +void DrawingTracer::draw_image(GfxState *state) +{ + if (!param.correct_text_visibility) + return; + double bbox[4] {0, 0, 1, 1}; + draw_non_char_bbox(state, bbox); +} + +void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, double ay) +{ + if (!param.correct_text_visibility) + return; + + Matrix tm, itm; + memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); + + double cx = state->getCurX(), cy = state->getCurY(), fs = state->getFontSize(), + ry = state->getRise(), h = state->getHorizScaling(); + + //cx and cy has been transformed by text matrix, we need to reverse them. + tm.invertTo(&itm); + double char_cx, char_cy; + itm.transform(cx, cy, &char_cx, &char_cy); + + //TODO Vertical? Currently vertical/type3 chars are treated as non-chars. 
+ double char_m[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; + + double final_m[6]; + tm_multiply(final_m, tm.m, char_m); + + auto font = state->getFont(); + double bbox[4] {0, 0, ax, ay}; + double desc = font->getDescent(), asc = font->getAscent(); + if (font->getWMode() == 0) + { + bbox[1] += desc; + bbox[3] += asc; + } + else + {//TODO Vertical? + } + tm_transform_bbox(final_m, bbox); + draw_char_bbox(state, bbox); +} + + +void DrawingTracer::transform_bbox_by_ctm(double * bbox, GfxState * state) +{ +#if ENABLE_SVG + cairo_matrix_t mat; + cairo_get_matrix(cairo, &mat); + double mat_a[6] {mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0}; + tm_transform_bbox(mat_a, bbox); +#else + tm_transform_bbox(state->getCTM(), bbox); +#endif +} + +} /* namespace pdf2htmlEX */ diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h new file mode 100644 index 0000000..2e3159d --- /dev/null +++ b/src/DrawingTracer.h @@ -0,0 +1,79 @@ +/* + * DrawingTracer.h + * + * Created on: 2014-6-15 + * Author: duanyao + */ + +#ifndef DRAWINGTRACER_H__ +#define DRAWINGTRACER_H__ + +#include + +#include + +#include "pdf2htmlEX-config.h" + +#if ENABLE_SVG +#include +#endif + +#include "Param.h" + +namespace pdf2htmlEX +{ + +class DrawingTracer +{ +public: + /* + * The callback to receive drawn event. + * bbox in device space. + */ + // a non-char graphics is drawn + std::function on_non_char_drawn; + // a char is drawn in the clip area + std::function on_char_drawn; + // a char is drawn out of/partially in the clip area + std::function on_char_clipped; + + DrawingTracer(const Param & param); + virtual ~DrawingTracer(); + void reset(GfxState * state); + + /* + * A character is drawing + * x, y: glyph-drawing position, in PDF text object space. + * ax, ay: glyph advance, in glyph space. 
+ */ + void draw_char(GfxState * state, double x, double y, double ax, double ay); + /* + * An image is drawing + */ + void draw_image(GfxState * state); + void update_ctm(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); + void clip(GfxState * state, bool even_odd = false); + void clip_to_stroke_path(GfxState * state); + void fill(GfxState * state, bool even_odd = false); + void stroke(GfxState * state); + void save(); + void restore(); + +private: + void finish(); + // Following methods operate in user space (just before CTM is applied) + void do_path(GfxState * state, GfxPath * path); + void draw_non_char_bbox(GfxState * state, double * bbox); + void draw_char_bbox(GfxState * state, double * bbox); + // If cairo is available, parameter state is ignored + void transform_bbox_by_ctm(double * bbox, GfxState * state = nullptr); + + const Param & param; + +#if ENABLE_SVG + cairo_t * cairo; +#endif +}; + +} /* namespace pdf2htmlEX */ +#endif /* DRAWINGTRACER_H__ */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 8ea408c..45e6e23 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -31,10 +31,13 @@ #include "HTMLTextPage.h" #include "BackgroundRenderer/BackgroundRenderer.h" +#include "CoveredTextDetector.h" +#include "DrawingTracer.h" #include "util/const.h" #include "util/misc.h" + namespace pdf2htmlEX { class HTMLRenderer : public OutputDev @@ -89,7 +92,9 @@ public: * We just mark as changed, and recheck if they have been changed when we are about to output a new string */ - virtual void restoreState(GfxState * state) { updateAll(state); } + virtual void restoreState(GfxState * state); + + virtual void saveState(GfxState *state); virtual void updateAll(GfxState * state); @@ -125,15 +130,34 @@ public: virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, 
GBool inlineImg); - virtual void stroke(GfxState *state) { css_do_path(state, false); } - virtual void fill(GfxState *state) { css_do_path(state, true); } + virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate); + + virtual void stroke(GfxState *state); ////{ css_do_path(state, false); } + virtual void fill(GfxState *state); ////{ css_do_path(state, true); } + virtual void eoFill(GfxState *state); virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax); virtual void processLink(AnnotLink * al); /* capacity test */ - bool can_stroke(GfxState *state) { return css_do_path(state, false, true); } - bool can_fill(GfxState *state) { return css_do_path(state, true, true); } + bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); } + bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); } + + /* + * Covered text handling. + */ + // Is a char (actually a glyph) covered by non-char's. Index in drawing order in current page. + // Does not fail on out-of-bound conditions, but return false. + bool is_char_covered(int index); + // Currently drawn char (glyph) count in current page. 
+ int get_char_count() { return (int)covered_text_detecor.get_chars_covered().size(); } protected: //////////////////////////////////////////////////// @@ -195,6 +219,7 @@ protected: // make sure the current HTML style consistent with PDF void prepare_text_line(GfxState * state); +#if 0 //disable CSS drawing //////////////////////////////////////////////////// // CSS drawing //////////////////////////////////////////////////// @@ -214,6 +239,7 @@ protected: double * line_width_array, int line_width_count, const GfxRGB * line_color, const GfxRGB * fill_color, void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr ); +#endif //disable CSS drawing //////////////////////////////////////////////////// @@ -328,7 +354,6 @@ protected: #endif BackgroundRenderer * bg_renderer; BackgroundRenderer * fallback_bg_renderer; - bool fallback_bg_required; struct { std::ofstream fs; @@ -338,6 +363,9 @@ protected: std::string cur_page_filename; static const std::string MANIFEST_FILENAME; + + CoveredTextDetector covered_text_detecor; + DrawingTracer tracer; }; } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index 7a84b1a..9b3f1bd 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -30,6 +30,39 @@ using std::sqrt; using std::vector; using std::ostream; +void HTMLRenderer::restoreState(GfxState * state) +{ + updateAll(state); + tracer.restore(); +} + +void HTMLRenderer::saveState(GfxState *state) +{ + tracer.save(); +} + +void HTMLRenderer::stroke(GfxState * state) +{ + tracer.stroke(state); +} + +void HTMLRenderer::fill(GfxState * state) +{ + tracer.fill(state); +} + +void HTMLRenderer::eoFill(GfxState * state) +{ + tracer.fill(state, true); +} + +GBool HTMLRenderer::axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax) +{ + tracer.fill(state); //TODO correct? 
+ return true; +} + +#if 0 //disable css drawing static bool is_horizontal_line(GfxSubpath * path) { return ((path->getNumPoints() == 2) @@ -415,6 +448,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co (*f_curpage) << "\">"; } +#endif //disable css drawing } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 907c350..a7d4e4e 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -46,6 +47,7 @@ HTMLRenderer::HTMLRenderer(const Param & param) ,html_text_page(param, all_manager) ,preprocessor(param) ,tmp_files(param) + ,tracer(param) { if(!(param.debug)) { @@ -76,6 +78,13 @@ HTMLRenderer::HTMLRenderer(const Param & param) all_manager.height .set_eps(EPS); all_manager.width .set_eps(EPS); all_manager.bottom .set_eps(EPS); + + tracer.on_char_drawn = + [this](double * box) { covered_text_detecor.add_char_bbox(box); }; + tracer.on_char_clipped = + [this](double * box, bool partial) { covered_text_detecor.add_char_bbox_clipped(box, partial); }; + tracer.on_non_char_drawn = + [this](double * box) { covered_text_detecor.add_non_char_bbox(box); }; } HTMLRenderer::~HTMLRenderer() @@ -133,13 +142,6 @@ void HTMLRenderer::process(PDFDoc *doc) cur_page_filename = filled_template_filename; } - if(param.process_nontext) - { - fallback_bg_required = !bg_renderer->render_page(doc, i); - if (fallback_bg_required && fallback_bg_renderer != nullptr) - fallback_bg_renderer->render_page(doc, i); - } - doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, 0, @@ -190,15 +192,20 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) #endif { + covered_text_detecor.reset(); + tracer.reset(state); + this->pageNum = pageNum; - double pageWidth = state->getPageWidth(); - double pageHeight = 
state->getPageHeight(); + html_text_page.set_page_size(state->getPageWidth(), state->getPageHeight()); - html_text_page.set_page_size(pageWidth, pageHeight); + reset_state(); +} + +void HTMLRenderer::endPage() { + long long wid = all_manager.width.install(html_text_page.get_width()); + long long hid = all_manager.height.install(html_text_page.get_height()); - long long wid = all_manager.width.install(pageWidth); - long long hid = all_manager.height.install(pageHeight); (*f_curpage) << "
render_page(cur_doc, pageNum)) bg_renderer->embed_image(pageNum); else if (fallback_bg_renderer != nullptr) - fallback_bg_renderer->embed_image(pageNum); + { + if (fallback_bg_renderer->render_page(cur_doc, pageNum)) + fallback_bg_renderer->embed_image(pageNum); + } } - reset_state(); -} - -void HTMLRenderer::endPage() { // dump all text html_text_page.dump_text(*f_curpage); html_text_page.dump_css(f_css.fs); diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc index 9c3da52..91ca767 100644 --- a/src/HTMLRenderer/image.cc +++ b/src/HTMLRenderer/image.cc @@ -14,6 +14,8 @@ namespace pdf2htmlEX { void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg) { + tracer.draw_image(state); + return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg); #if 0 @@ -62,4 +64,20 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w #endif } +void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate) +{ + tracer.draw_image(state); + + return OutputDev::drawSoftMaskedImage(state,ref,str, // TODO really required? 
+ width,height,colorMap,interpolate, + maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate); +} + } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 23f505f..f26b17f 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -46,6 +46,7 @@ void HTMLRenderer::updateFont(GfxState * state) void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) { ctm_changed = true; + tracer.update_ctm(state, m11, m12, m21, m22, m31, m32); } void HTMLRenderer::updateTextMat(GfxState * state) { @@ -89,14 +90,17 @@ void HTMLRenderer::updateStrokeColor(GfxState * state) void HTMLRenderer::clip(GfxState * state) { clip_changed = true; + tracer.clip(state); } void HTMLRenderer::eoClip(GfxState * state) { clip_changed = true; + tracer.clip(state, true); } void HTMLRenderer::clipToStrokePath(GfxState * state) { clip_changed = true; + tracer.clip_to_stroke_path(state); } void HTMLRenderer::reset_state() { @@ -119,6 +123,8 @@ void HTMLRenderer::reset_state() cur_line_state.y = 0; memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); + cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);}; + cur_clip_state.xmin = 0; cur_clip_state.xmax = 0; cur_clip_state.ymin = 0; @@ -502,6 +508,10 @@ void HTMLRenderer::prepare_text_line(GfxState * state) double rise_x, rise_y; state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); + + if (param.correct_text_visibility) + cur_line_state.first_char_index = get_char_count(); + html_text_page.open_new_line(cur_line_state); cur_text_state.vertical_align = 0; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index dafe510..5d5ecd9 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -14,6 +14,9 @@ #include 
"util/namespace.h" #include "util/unicode.h" +//#define HR_DEBUG(x) (x) +#define HR_DEBUG(x) + namespace pdf2htmlEX { using std::all_of; @@ -51,26 +54,35 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) char *p = s->getCString(); int len = s->getLength(); + //accumulated displacement of chars in this string, in text object space double dx = 0; double dy = 0; - double dx1,dy1; + //displacement of current char, in text object space, including letter space but not word space. + double ddx, ddy; + //advance of current char, in glyph space + double ax, ay; + //origin of current char, in glyph space double ox, oy; - int nChars = 0; - int nSpaces = 0; int uLen; CharCode code; Unicode *u = nullptr; + HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", len)); + while (len > 0) { - auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); + auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); + HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0])); if(!(equal(ox, 0) && equal(oy, 0))) { cerr << "TODO: non-zero origins" << endl; } + ddx = ax * cur_font_size + cur_letter_space; + ddy = ay * cur_font_size; + tracer.draw_char(state, dx, dy, ax, ay); bool is_space = false; if (n == 1 && *p == ' ') @@ -85,19 +97,19 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) * There are always ugly PDF files with no useful info at all. 
*/ is_space = true; - ++nSpaces; } if(is_space && (param.space_as_offset)) { + html_text_page.get_cur_line()->append_padding_char(); // ignore horiz_scaling, as it has been merged into CTM - html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); + html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } else { if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { - html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space)); + html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx); } else { @@ -110,7 +122,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { uu = unicode_from_font(code, font); } - html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space)); + html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx); /* * In PDF, word_space is appended if (n == 1 and *p = ' ') * but in HTML, word_space is appended if (uu == ' ') @@ -123,19 +135,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) } } - dx += dx1; - dy += dy1; + dx += ddx * cur_horiz_scaling; + dy += ddy; + if (is_space) + dx += cur_word_space * cur_horiz_scaling; - ++nChars; p += n; len -= n; } - // horiz_scaling is merged into ctm now, - // so the coordinate system is ugly - dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling; - dy *= cur_font_size; - cur_tx += dx; cur_ty += dy; @@ -143,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) draw_ty += dy; } +bool HTMLRenderer::is_char_covered(int index) +{ + auto covered = covered_text_detecor.get_chars_covered(); + if (index < 0 || index >= (int)covered.size()) + { + std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: " + << index << ", size: " << covered.size() < + #include 
"Color.h" namespace pdf2htmlEX { @@ -62,6 +64,12 @@ struct HTMLLineState { double x,y; double transform_matrix[4]; + // The page-cope char index(in drawing order) of the first char in this line. + int first_char_index; + // A function to determine whether a char is covered at a given index. + std::function is_char_covered; + + HTMLLineState(): first_char_index(-1) { } }; struct HTMLClipState diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index 4397070..ee73934 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) { - text.insert(text.end(), u, u+l); + if (l == 1) + text.push_back(min(u[0], (unsigned)INT_MAX)); + else if (l > 1) + { + text.push_back(- decomposed_text.size() - 1); + decomposed_text.emplace_back(); + decomposed_text.back().assign(u, u + l); + } this->width += width; } @@ -69,6 +76,60 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state) last_state.font_size *= last_state.font_info->font_size_scale; } +void HTMLTextLine::dump_char(std::ostream & out, int pos) +{ + int c = text[pos]; + if (c > 0) + { + Unicode u = c; + writeUnicodes(out, &u, 1); + } + else if (c < 0) + { + auto dt = decomposed_text[- c - 1]; + writeUnicodes(out, &dt.front(), dt.size()); + } +} + +void HTMLTextLine::dump_chars(ostream & out, int begin, int len) +{ + static const Color transparent(0, 0, 0, true); + + if (line_state.first_char_index < 0) + { + for (int i = 0; i < len; i++) + dump_char(out, begin + i); + return; + } + + bool invisible_group_open = false; + for(int i = 0; i < len; i++) + { + if (!line_state.is_char_covered(line_state.first_char_index + begin + i)) //visible + { + if (invisible_group_open) + { + invisible_group_open = false; + out << ""; + } + dump_char(out, begin + i); + } + else + { + if (!invisible_group_open) + { + out << ""; + invisible_group_open = true; 
+ } + dump_char(out, begin + i); + } + } + if (invisible_group_open) + out << ""; +} + void HTMLTextLine::dump_text(ostream & out) { /* @@ -216,7 +277,7 @@ void HTMLTextLine::dump_text(ostream & out) size_t next_text_idx = text_idx2; if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx) next_text_idx = cur_offset_iter->start_idx; - writeUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); + dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx); cur_text_idx = next_text_idx; } } diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h index 5378f24..fcce811 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -73,7 +73,16 @@ public: double width; }; + /** + * Append a drawn char (glyph)'s unicode. l > 1 means this glyph corresponds to + * multiple code points. + */ void append_unicodes(const Unicode * u, int l, double width); + /** + * Append a special padding char with 0 width, in order to keep char index consistent. + * The padding char is ignored during output. + */ + void append_padding_char() { text.push_back(0); } void append_offset(double width); void append_state(const HTMLTextState & text_state); void dump_text(std::ostream & out); @@ -92,6 +101,13 @@ private: void optimize_normal(std::vector &); void optimize_aggressive(std::vector &); + /** + * Dump chars' unicode to output stream. + * begin/pos is the index in 'text'. + */ + void dump_chars(std::ostream & out, int begin, int len); + void dump_char(std::ostream & out, int pos); + const Param & param; AllStateManager & all_manager; @@ -102,7 +118,16 @@ private: std::vector states; std::vector offsets; - std::vector text; + + /** + * Drawn chars (glyph) in this line are stored in 'text'. 
For each element c in 'text': + * - If c > 0, it is the unicode code point corresponding to the glyph; + * - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?); + * - If c < 0, this glyph corresponds to more than one unicode code point, + * which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'. + */ + std::vector text; + std::vector > decomposed_text; }; } // namespace pdf2htmlEX diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h index 7bffec4..ccaa564 100644 --- a/src/HTMLTextPage.h +++ b/src/HTMLTextPage.h @@ -39,6 +39,9 @@ public: void set_page_size(double width, double height); void clip(const HTMLClipState & clip_state); + double get_width() { return page_width; } + double get_height() { return page_height; } + private: void optimize(void); diff --git a/src/Param.h b/src/Param.h index 84a2f55..314ea9b 100644 --- a/src/Param.h +++ b/src/Param.h @@ -38,6 +38,7 @@ struct Param int process_nontext; int process_outline; int process_annotation; + int correct_text_visibility; int printing; int fallback; int tmp_file_size_limit; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index f84d719..39525a4 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -187,6 +187,7 @@ void parse_options (int argc, char **argv) .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets") .add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)") .add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text") + .add("correct-text-visibility", &param.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them") // background image .add("bg-format", &param.bg_format, "png", "specify background image format") diff --git a/src/util/math.cc b/src/util/math.cc index fd8e77c..1ddabce 100644 --- a/src/util/math.cc +++ b/src/util/math.cc @@ -1,8 +1,12 @@ #include #include +#include 
#include "math.h" +using std::min; +using std::max; + namespace pdf2htmlEX { void tm_transform(const double * tm, double & x, double & y, bool is_delta) @@ -56,5 +60,31 @@ void tm_transform_bbox(const double * tm, double * bbox) } } +bool bbox_intersect(const double * bbox1, const double * bbox2, double * result) +{ + double x0, y0, x1, y1; + + x0 = max(min(bbox1[0], bbox1[2]), min(bbox2[0], bbox2[2])); + x1 = min(max(bbox1[0], bbox1[2]), max(bbox2[0], bbox2[2])); + + if (x0 >= x1) + return false; + + y0 = max(min(bbox1[1], bbox1[3]), min(bbox2[1], bbox2[3])); + y1 = min(max(bbox1[1], bbox1[3]), max(bbox2[1], bbox2[3])); + + if (y0 >= y1) + return false; + + if (result) + { + result[0] = x0; + result[1] = y0; + result[2] = x1; + result[3] = y1; + } + return true; +} + } //namespace pdf2htmlEX diff --git a/src/util/math.h b/src/util/math.h index 759bbcc..8302a93 100644 --- a/src/util/math.h +++ b/src/util/math.h @@ -24,6 +24,13 @@ static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6 return false; return true; } + +static inline void tm_init(double * tm) +{ + tm[0] = tm[3] = 1; + tm[1] = tm[2] = tm[4] = tm[5] = 0; +} + static inline void tm_multiply(double * result, const double * m1, const double * m2) { result[0] = m1[0] * m2[0] + m1[2] * m2[1]; @@ -39,6 +46,14 @@ static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); } void tm_transform(const double * tm, double & x, double & y, bool is_delta = false); void tm_multiply(double * tm_left, const double * tm_right); void tm_transform_bbox(const double * tm, double * bbox); +/** + * Calculate the intersection of 2 boxes. + * If they are intersecting, store the result to result (if not null) and return true. + * Otherwise return false, and result is not touched. + * Param result can be same as one of bbox1 and bbox2. + * Data in boxes are expected in the order of (x0, y0, x1, y1). 
+ */ +bool bbox_intersect(const double * bbox1, const double * bbox2, double * result = nullptr); } //namespace pdf2htmlEX #endif //MATH_H__