From ce28c00a494ae46d11e77a11af7e63f2d3ffbf85 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sun, 15 Jun 2014 03:44:28 +0800 Subject: [PATCH 01/23] Implement covered text handling. --- CMakeLists.txt | 2 + pdf2htmlEX.1.in | 5 ++ .../CairoBackgroundRenderer.cc | 8 +++ .../CairoBackgroundRenderer.h | 1 + src/CoveredTextHandler.cc | 55 ++++++++++++++ src/CoveredTextHandler.h | 62 ++++++++++++++++ src/HTMLRenderer/HTMLRenderer.h | 27 +++++++ src/HTMLRenderer/general.cc | 18 +++-- src/HTMLRenderer/image.cc | 28 ++++++++ src/HTMLRenderer/state.cc | 5 ++ src/HTMLRenderer/text.cc | 41 +++++++++++ src/HTMLState.h | 3 + src/HTMLTextLine.cc | 30 +++++++- src/HTMLTextLine.h | 3 + src/Param.h | 1 + src/pdf2htmlEX.cc | 1 + src/util/math.bk.cc | 72 +++++++++++++++++++ src/util/math.cc | 12 ++++ src/util/math.h | 2 + 19 files changed, 368 insertions(+), 8 deletions(-) create mode 100644 src/CoveredTextHandler.cc create mode 100644 src/CoveredTextHandler.h create mode 100644 src/util/math.bk.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ce16cc..bb47b10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,6 +161,8 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC} src/Base64Stream.cc src/Color.h src/Color.cc + src/CoveredTextHandler.h + src/CoveredTextHandler.cc src/HTMLState.h src/HTMLTextLine.h src/HTMLTextLine.cc diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 3d1d3bb..49266b0 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -242,6 +242,11 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above. .B --optimize-text <0|1> (Default: 0) If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong. +.TP +.B --process-covered-text <0|1> (Default: 0) +If set to 1, pdf2htmlEX will try to detect texts covered by other graphics and properly arrange them, +i.e. covered texts are made transparent in text layer, and are drawn on background layer. 
+ .SS Background Image .TP diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc index b2733c2..19d5795 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -63,6 +63,13 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, { CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); } + // If a char is treated as image, it is not subject to cover test + // (see HTMLRenderer::drawString), so don't increase drawn_char_count. + else if (param.process_covered_text) { + if (html_renderer->get_chars_covered()[drawn_char_count]) + CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + drawn_char_count++; + } } void CairoBackgroundRenderer::beginTextObject(GfxState *state) @@ -97,6 +104,7 @@ static GBool annot_cb(Annot *, void * pflag) { bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) { + drawn_char_count = 0; double page_width; double page_height; if(param.use_cropbox) diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h index b2d2f14..65164e6 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.h +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.h @@ -66,6 +66,7 @@ private: std::unordered_map bitmaps_ref_count; // id of bitmaps' stream used by current page std::vector bitmaps_in_current_page; + int drawn_char_count; }; } diff --git a/src/CoveredTextHandler.cc b/src/CoveredTextHandler.cc new file mode 100644 index 0000000..8557b57 --- /dev/null +++ b/src/CoveredTextHandler.cc @@ -0,0 +1,55 @@ +/* + * CoveredTextHandler.cc + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#include "CoveredTextHandler.h" + +#include "util/math.h" + +namespace pdf2htmlEX { + +CoveredTextHandler::CoveredTextHandler() +{ + // TODO Auto-generated constructor stub + +} + 
+CoveredTextHandler::~CoveredTextHandler() +{ + // TODO Auto-generated destructor stub +} + +void CoveredTextHandler::reset() +{ + char_bboxes.clear(); + chars_covered.clear(); +} + +void CoveredTextHandler::add_char_bbox(double * bbox) +{ + for (int i = 0; i < 4; i++) + char_bboxes.push_back(bbox[i]); + chars_covered.push_back(false); +} + +void CoveredTextHandler::add_non_char_bbox(double * bbox, int index) +{ + if (index < 0) + index = chars_covered.size(); + for (int i = 0; i < index; i++) + { + if (chars_covered[i]) + continue; + double * cbbox = &char_bboxes[i * 4]; + if (bbox_intersect(cbbox, bbox)) + { + chars_covered[i] = true; + add_non_char_bbox(cbbox, i); + } + } +} + +} diff --git a/src/CoveredTextHandler.h b/src/CoveredTextHandler.h new file mode 100644 index 0000000..34decf6 --- /dev/null +++ b/src/CoveredTextHandler.h @@ -0,0 +1,62 @@ +/* + * CoveredTextHandler.h + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#ifndef COVEREDTEXTHANDLER_H__ +#define COVEREDTEXTHANDLER_H__ + +#include + +namespace pdf2htmlEX { + +/** + * Detect characters that are covered by non-char graphics on a page. + */ +class CoveredTextHandler +{ +public: + CoveredTextHandler(); + virtual ~CoveredTextHandler(); + + /** + * Reset to initial state. Should be called when start drawing a page. + */ + void reset(); + + /** + * Add a drawn character's bounding box. + * @param bbox (x0, y0, x1, y1) + */ + void add_char_bbox(double * bbox); + + /** + * Add a drawn non-char graphics' bounding box. + * If it intersects any previously drawn char's bbox, the char is marked as covered + * and treated as an non-char. + * @param bbox (x0, y0, x1, y1) + * @param index this graphics' drawing order: assume it is drawn after (index-1)th + * char. -1 means after the last char. + */ + void add_non_char_bbox(double * bbox, int index = -1); + + /** + * An array of flags indicating whether a char is covered by any non-char graphics. + * Index by the order that these chars are added. 
+ * This vector grows as add_char_bbox() is called, so its size is the count + * of currently drawn chars. + */ + const std::vector & get_chars_covered() { return chars_covered; } + +private: + //covered text test + std::vector chars_covered; + // x00, y00, x01, y01; x10, y10, x11, y11;... + std::vector char_bboxes; +}; + +} + +#endif /* COVEREDTEXTHANDLER_H__ */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 73929ab..75293f8 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -31,6 +31,7 @@ #include "HTMLTextPage.h" #include "BackgroundRenderer/BackgroundRenderer.h" +#include "CoveredTextHandler.h" #include "util/const.h" #include "util/misc.h" @@ -125,6 +126,15 @@ public: virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg); + virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate); + virtual void stroke(GfxState *state) { css_do_path(state, false); } virtual void fill(GfxState *state) { css_do_path(state, true); } virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax); @@ -135,6 +145,8 @@ public: bool can_stroke(GfxState *state) { return css_do_path(state, false, true); } bool can_fill(GfxState *state) { return css_do_path(state, true, true); } + const std::vector & get_chars_covered() { return covered_text_handler.get_chars_covered(); } + protected: //////////////////////////////////////////////////// // misc @@ -215,6 +227,19 @@ protected: const GfxRGB * line_color, const GfxRGB * fill_color, void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr ); + 
//////////////////////////////////////////////////// + // Covered text handling + //////////////////////////////////////////////////// + /* + * Cue CoveredTextHandler that a character is drawn + * x, y: glyph-drawing position, in PDF text object space. + * ax, ay: glyph advance, in glyph space. + */ + void add_char_bbox(GfxState *state, double x, double y, double ax, double ay); + /* + * Cue CoveredTextHandler that an image is drawn + */ + void add_image_bbox(GfxState *state); //////////////////////////////////////////////////// // PDF stuffs @@ -338,6 +363,8 @@ protected: std::string cur_page_filename; static const std::string MANIFEST_FILENAME; + + CoveredTextHandler covered_text_handler; }; } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 803bc2d..3d43ede 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -133,13 +133,10 @@ void HTMLRenderer::process(PDFDoc *doc) cur_page_filename = filled_template_filename; } - if(param.process_nontext) - { - fallback_bg_required = !bg_renderer->render_page(doc, i); - if (fallback_bg_required && fallback_bg_renderer != nullptr) - fallback_bg_renderer->render_page(doc, i); - } - + // We handle covered texts during doc->displayPage(this...), + // and bg_renderer->render_page() depends on the result, so it must be called after + // doc->displayPage(this...). 
+ covered_text_handler.reset(); doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, 0, @@ -148,6 +145,13 @@ void HTMLRenderer::process(PDFDoc *doc) false, // printing nullptr, nullptr, nullptr, nullptr); + if(param.process_nontext) + { + fallback_bg_required = !bg_renderer->render_page(doc, i); + if (fallback_bg_required && fallback_bg_renderer != nullptr) + fallback_bg_renderer->render_page(doc, i); + } + if(param.split_pages) { delete f_curpage; diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc index 9c3da52..3e4f8d0 100644 --- a/src/HTMLRenderer/image.cc +++ b/src/HTMLRenderer/image.cc @@ -14,6 +14,8 @@ namespace pdf2htmlEX { void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg) { + add_image_bbox(state); + return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg); #if 0 @@ -62,4 +64,30 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w #endif } +void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate) +{ + add_image_bbox(state); + + return OutputDev::drawSoftMaskedImage(state,ref,str, + width,height,colorMap,interpolate, + maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate); +} + +void HTMLRenderer::add_image_bbox(GfxState *state) +{ + if (!param.process_covered_text) + return; + auto ctm = state->getCTM(); + double bbox[4] {0, 0, 1, 1}; + tm_transform_bbox(ctm, bbox); + covered_text_handler.add_non_char_bbox(bbox); +} + } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 320fd81..8df9700 100644 --- a/src/HTMLRenderer/state.cc +++ 
b/src/HTMLRenderer/state.cc @@ -119,6 +119,9 @@ void HTMLRenderer::reset_state() cur_line_state.y = 0; memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); + if (param.process_covered_text) + cur_line_state.chars_covered = &covered_text_handler.get_chars_covered(); + cur_clip_state.xmin = 0; cur_clip_state.xmax = 0; cur_clip_state.ymin = 0; @@ -502,6 +505,8 @@ void HTMLRenderer::prepare_text_line(GfxState * state) double rise_x, rise_y; state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); + if (param.process_covered_text) + cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size(); html_text_page.open_new_line(cur_line_state); cur_text_state.vertical_align = 0; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index dafe510..0122356 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -72,6 +72,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) cerr << "TODO: non-zero origins" << endl; } + add_char_bbox(state, dx, dy, dx1, dy1); + bool is_space = false; if (n == 1 && *p == ' ') { @@ -143,4 +145,43 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) draw_ty += dy; } +void HTMLRenderer::add_char_bbox(GfxState *state, double x, double y, double ax, double ay) +{ + if (!param.process_covered_text) + return; + + Matrix tm_ctm, tm, itm; + memcpy(tm_ctm.m, this->cur_text_tm, sizeof(tm_ctm.m)); + memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); + double fs = state->getFontSize(); + + double cx = state->getCurX(), cy = state->getCurY(), + ry = state->getRise(), h = state->getHorizScaling(); + + //cx and cy has been transformed by text matrix, we need to reverse them. + tm.invertTo(&itm); + double char_cx, char_cy; + itm.transform(cx, cy, &char_cx, &char_cy); + + //TODO Vertical? 
Currently vertical/type3 chars are treated as non-chars. + double tchar[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; + + double tfinal[6]; + tm_multiply(tfinal, tm_ctm.m, tchar); + + auto font = state->getFont(); + double bbox[4] {0, 0, ax, ay}; + double desc = font->getDescent(), asc = font->getAscent(); + if (font->getWMode() == 0) + { + bbox[1] += desc; + bbox[3] += asc; + } + else + {//TODO Vertical? + } + tm_transform_bbox(tfinal, bbox); + covered_text_handler.add_char_bbox(bbox); +} + } // namespace pdf2htmlEX diff --git a/src/HTMLState.h b/src/HTMLState.h index b8a470b..9327e5c 100644 --- a/src/HTMLState.h +++ b/src/HTMLState.h @@ -62,6 +62,9 @@ struct HTMLLineState { double x,y; double transform_matrix[4]; + // The page-cope char index(in drawing order) of the first char in this line. + int first_char_index = -1; + const std::vector * chars_covered = nullptr; }; struct HTMLClipState diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index f2320c5..ba32209 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -69,6 +69,32 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state) last_state.font_size *= last_state.font_info->font_size_scale; } +void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen) +{ + if (!line_state.chars_covered) + { + writeUnicodes(out, u, uLen); + return; + } + + //TODO merge sibling invisiable spans + int start = this->line_state.first_char_index + dumped_char_count; + for(int i = 0; i < uLen; i++) + { + if (!(*line_state.chars_covered)[start + i]) //visible + { + writeUnicodes(out, u + i, 1); + } + else + { + out << ""; + writeUnicodes(out, u + i, 1); + out << ""; + } + } + dumped_char_count += uLen; +} + void HTMLTextLine::dump_text(ostream & out) { /* @@ -84,6 +110,8 @@ void HTMLTextLine::dump_text(ostream & out) return; } + dumped_char_count = 0; + // Start Output { // open
for the current text line @@ -216,7 +244,7 @@ void HTMLTextLine::dump_text(ostream & out) size_t next_text_idx = text_idx2; if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx) next_text_idx = cur_offset_iter->start_idx; - writeUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); + dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); cur_text_idx = next_text_idx; } } diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h index 782b491..8fa814f 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -91,6 +91,7 @@ public: private: void optimize_normal(std::vector &); void optimize_aggressive(std::vector &); + void dump_chars(std::ostream & out, const Unicode * u, int uLen); const Param & param; AllStateManager & all_manager; @@ -103,6 +104,8 @@ private: std::vector states; std::vector offsets; std::vector text; + + int dumped_char_count; }; } // namespace pdf2htmlEX diff --git a/src/Param.h b/src/Param.h index 84a2f55..d3d67b2 100644 --- a/src/Param.h +++ b/src/Param.h @@ -38,6 +38,7 @@ struct Param int process_nontext; int process_outline; int process_annotation; + int process_covered_text; int printing; int fallback; int tmp_file_size_limit; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index cd1ae46..d338ab9 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -187,6 +187,7 @@ void parse_options (int argc, char **argv) .add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets") .add("tounicode", ¶m.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)") .add("optimize-text", ¶m.optimize_text, 0, "try to reduce the number of HTML elements used for text") + .add("process-covered-text", ¶m.process_covered_text, 0, "try to detect texts covered by other graphics and properly arrange them") // background image .add("bg-format", ¶m.bg_format, "png", "specify background image format") diff --git a/src/util/math.bk.cc 
b/src/util/math.bk.cc new file mode 100644 index 0000000..0abafaf --- /dev/null +++ b/src/util/math.bk.cc @@ -0,0 +1,72 @@ +#include +#include +#include + +#include "math.h" + +using std::min; +using std::max; + +namespace pdf2htmlEX { + +void tm_transform(const double * tm, double & x, double & y, bool is_delta) +{ + double xx = x, yy = y; + x = tm[0] * xx + tm[2] * yy; + y = tm[1] * xx + tm[3] * yy; + if(!is_delta) + { + x += tm[4]; + y += tm[5]; + } +} + +void tm_multiply(const double * tm_left, const double * tm_right, double * tm_result) +{ + double old[4]; + memcpy(old, tm_left, sizeof(old)); + + tm_result[0] = tm_left[0] * tm_right[0] + tm_left[2] * tm_right[1]; + tm_result[1] = tm_left[1] * tm_right[0] + tm_left[3] * tm_right[1]; + tm_result[2] = tm_left[0] * tm_right[2] + tm_left[2] * tm_right[3]; + tm_result[3] = tm_left[1] * tm_right[2] + tm_left[3] * tm_right[3]; + tm_result[4] += tm_left[0] * tm_right[4] + tm_left[2] * tm_right[5]; + tm_result[5] += tm_left[1] * tm_right[4] + tm_left[3] * tm_right[5]; +} + +void tm_transform_bbox(const double * tm, double * bbox) +{ + double & x1 = bbox[0]; + double & y1 = bbox[1]; + double & x2 = bbox[2]; + double & y2 = bbox[3]; + double _[4][2]; + _[0][0] = _[1][0] = x1; + _[0][1] = _[2][1] = y1; + _[2][0] = _[3][0] = x2; + _[1][1] = _[3][1] = y2; + + x1 = y1 = std::numeric_limits::max(); + x2 = y2 = std::numeric_limits::min(); + for(int i = 0; i < 4; ++i) + { + auto & x = _[i][0]; + auto & y = _[i][1]; + tm_transform(tm, x, y); + if(x < x1) x1 = x; + if(x > x2) x2 = x; + if(y < y1) y1 = y; + if(y > y2) y2 = y; + } +} + +bool bbox_intersect(double * bbox1, double * bbox2) +{ + return min(bbox1[0], bbox1[2]) < max(bbox2[0], bbox2[2]) + && max(bbox1[0], bbox1[2]) > min(bbox2[0], bbox2[2]) + && min(bbox1[1], bbox1[3]) < max(bbox2[1], bbox2[3]) + && max(bbox1[1], bbox1[3]) > min(bbox2[1], bbox2[3]); +} + +} //namespace pdf2htmlEX + diff --git a/src/util/math.cc b/src/util/math.cc index fd8e77c..fb898c6 100644 --- 
a/src/util/math.cc +++ b/src/util/math.cc @@ -1,8 +1,12 @@ #include #include +#include #include "math.h" +using std::min; +using std::max; + namespace pdf2htmlEX { void tm_transform(const double * tm, double & x, double & y, bool is_delta) @@ -56,5 +60,13 @@ void tm_transform_bbox(const double * tm, double * bbox) } } +bool bbox_intersect(double * bbox1, double * bbox2) +{ + return min(bbox1[0], bbox1[2]) < max(bbox2[0], bbox2[2]) + && max(bbox1[0], bbox1[2]) > min(bbox2[0], bbox2[2]) + && min(bbox1[1], bbox1[3]) < max(bbox2[1], bbox2[3]) + && max(bbox1[1], bbox1[3]) > min(bbox2[1], bbox2[3]); +} + } //namespace pdf2htmlEX diff --git a/src/util/math.h b/src/util/math.h index 759bbcc..fcdebc4 100644 --- a/src/util/math.h +++ b/src/util/math.h @@ -40,5 +40,7 @@ void tm_transform(const double * tm, double & x, double & y, bool is_delta = fal void tm_multiply(double * tm_left, const double * tm_right); void tm_transform_bbox(const double * tm, double * bbox); +bool bbox_intersect(double * bbox1, double * bbox2); + } //namespace pdf2htmlEX #endif //MATH_H__ From bd3f165ae2d3c70045c1a1bfbf1d39a5258f1e34 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sun, 15 Jun 2014 13:35:24 +0800 Subject: [PATCH 02/23] Handle texts covered by paths; concentrate drawing tracing codes to DrawingTracer calss; disable CSS drawing. 
--- CMakeLists.txt | 2 + src/DrawingTracer.cc | 218 ++++++++++++++++++++++++++++++++ src/DrawingTracer.h | 66 ++++++++++ src/HTMLRenderer/HTMLRenderer.h | 33 +++-- src/HTMLRenderer/draw.cc | 33 +++++ src/HTMLRenderer/general.cc | 13 +- src/HTMLRenderer/image.cc | 16 +-- src/HTMLRenderer/state.cc | 4 + src/HTMLRenderer/text.cc | 41 +----- 9 files changed, 354 insertions(+), 72 deletions(-) create mode 100644 src/DrawingTracer.cc create mode 100644 src/DrawingTracer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index bb47b10..8d75405 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,8 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC} src/Color.cc src/CoveredTextHandler.h src/CoveredTextHandler.cc + src/DrawingTracer.h + src/DrawingTracer.cc src/HTMLState.h src/HTMLTextLine.h src/HTMLTextLine.cc diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc new file mode 100644 index 0000000..15b9f20 --- /dev/null +++ b/src/DrawingTracer.cc @@ -0,0 +1,218 @@ +/* + * DrawingTracer.cc + * + * Created on: 2014-6-15 + * Author: duanyao + */ + +#include "GfxFont.h" + +#include "util/math.h" +#include "DrawingTracer.h" + +namespace pdf2htmlEX +{ + +DrawingTracer::DrawingTracer(const Param & param):param(param) +{ +} + +DrawingTracer::~DrawingTracer() +{ + finish(); +} + +void DrawingTracer::reset(GfxState *state) +{ + if (!param.process_covered_text) + return; + finish(); + cairo_rectangle_t page_box {0, 0, width:state->getPageWidth(), height:state->getPageHeight()}; + cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box); + cairo = cairo_create(surface); +} + +void DrawingTracer::finish() +{ + if (cairo) + { + cairo_destroy(cairo); + cairo = nullptr; + } +} + +void DrawingTracer::set_ctm(GfxState *state) +{ + if (!param.process_covered_text) + return; + double * ctm = state->getCTM(); + cairo_matrix_t matrix; + matrix.xx = ctm[0]; + matrix.yx = ctm[1]; + matrix.xy = ctm[2]; + matrix.yy = ctm[3]; + matrix.x0 = ctm[4]; + matrix.y0 = 
ctm[5]; + cairo_set_matrix (cairo, &matrix); +} + +void DrawingTracer::clip(GfxState * state, bool even_odd) +{ + if (!param.process_covered_text) + return; + do_path (state, state->getPath()); + cairo_set_fill_rule (cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); + cairo_clip (cairo); +} + +void DrawingTracer::clip_to_stroke_path(GfxState * state) +{ + if (!param.process_covered_text) + return; + // TODO cairo_stroke_to_path() ? +} + +void DrawingTracer::save() +{ + if (!param.process_covered_text) + return; + cairo_save(cairo); +} +void DrawingTracer::restore() +{ + if (!param.process_covered_text) + return; + cairo_restore(cairo); +} + +void DrawingTracer::do_path(GfxState * state, GfxPath * path) +{ + //copy from CairoOutputDev::doPath + GfxSubpath *subpath; + int i, j; + double x, y; + cairo_new_path (cairo); + for (i = 0; i < path->getNumSubpaths(); ++i) { + subpath = path->getSubpath(i); + if (subpath->getNumPoints() > 0) { + x = subpath->getX(0); + y = subpath->getY(0); + cairo_move_to (cairo, x, y); + j = 1; + while (j < subpath->getNumPoints()) { + if (subpath->getCurve(j)) { + x = subpath->getX(j+2); + y = subpath->getY(j+2); + cairo_curve_to(cairo, + subpath->getX(j), subpath->getY(j), + subpath->getX(j+1), subpath->getY(j+1), + x, y); + j += 3; + } else { + x = subpath->getX(j); + y = subpath->getY(j); + cairo_line_to (cairo, x, y); + ++j; + } + } + if (subpath->isClosed()) { + cairo_close_path (cairo); + } + } + } +} + +void DrawingTracer::stroke(GfxState * state) +{ + if (!param.process_covered_text) + return; + // TODO + // 1. if stroke extents is large, break the path into pieces and handle each of them; + // 2. if the line width is small, could just ignore the path? 
+ do_path(state, state->getPath()); + cairo_set_line_width(cairo, state->getLineWidth()); + double sbox[4]; + cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3); + draw_non_char_bbox(state, sbox); +} + +void DrawingTracer::fill(GfxState * state, bool even_odd) +{ + if (!param.process_covered_text) + return; + do_path(state, state->getPath()); + //cairo_fill_extents don't take fill rule into account. + //cairo_set_fill_rule (cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); + double fbox[4]; + cairo_fill_extents(cairo, fbox, fbox + 1, fbox + 2, fbox + 3); + draw_non_char_bbox(state, fbox); +} + +void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) +{ + double cbox[4], result[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + // TODO intersect + tm_transform_bbox(state->getCTM(), bbox); + if (on_non_char_drawn) + on_non_char_drawn(bbox); +} + +void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) +{ + double cbox[4], result[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + // TODO intersect + tm_transform_bbox(state->getCTM(), bbox); + if (on_char_drawn) + on_char_drawn(bbox); +} + +void DrawingTracer::draw_image(GfxState *state) +{ + if (!param.process_covered_text) + return; + double bbox[4] {0, 0, 1, 1}; + draw_non_char_bbox(state, bbox); +} + +void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, double ay) +{ + if (!param.process_covered_text) + return; + + Matrix tm, itm; + //memcpy(tm_ctm.m, this->cur_text_tm, sizeof(tm_ctm.m)); + memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); + double fs = state->getFontSize(); + + double cx = state->getCurX(), cy = state->getCurY(), + ry = state->getRise(), h = state->getHorizScaling(); + + //cx and cy has been transformed by text matrix, we need to reverse them. + tm.invertTo(&itm); + double char_cx, char_cy; + itm.transform(cx, cy, &char_cx, &char_cy); + + //TODO Vertical? 
Currently vertical/type3 chars are treated as non-chars. + double tchar[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; + + double tfinal[6]; + tm_multiply(tfinal, tm.m, tchar); + + auto font = state->getFont(); + double bbox[4] {0, 0, ax, ay}; + double desc = font->getDescent(), asc = font->getAscent(); + if (font->getWMode() == 0) + { + bbox[1] += desc; + bbox[3] += asc; + } + else + {//TODO Vertical? + } + tm_transform_bbox(tfinal, bbox); + draw_char_bbox(state, bbox); +} + +} /* namespace pdf2htmlEX */ diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h new file mode 100644 index 0000000..81fd4b0 --- /dev/null +++ b/src/DrawingTracer.h @@ -0,0 +1,66 @@ +/* + * DrawingTracer.h + * + * Created on: 2014-6-15 + * Author: duanyao + */ + +#ifndef DRAWINGTRACER_H__ +#define DRAWINGTRACER_H__ + +#include + +#include +#include + +#include "Param.h" + +namespace pdf2htmlEX +{ + +class DrawingTracer +{ +public: + /* + * The callback to receive drawn event. + * bbox in device space. + */ + std::function on_non_char_drawn; + std::function on_char_drawn; + std::function on_char_clipped; + + DrawingTracer(const Param & param); + virtual ~DrawingTracer(); + void reset(GfxState * state); + + /* + * A character is drawing + * x, y: glyph-drawing position, in PDF text object space. + * ax, ay: glyph advance, in glyph space. 
+ */ + void draw_char(GfxState * state, double x, double y, double ax, double ay); + /* + * An image is drawing + */ + void draw_image(GfxState * state); + void set_ctm(GfxState * state); + void clip(GfxState * state, bool even_odd = false); + void clip_to_stroke_path(GfxState * state); + void fill(GfxState * state, bool even_odd = false); + void stroke(GfxState * state); + void save(); + void restore(); + +private: + void finish(); + // Following methods operate in user space (just before CTM is applied) + void do_path(GfxState * state, GfxPath * path); + void draw_non_char_bbox(GfxState * state, double * bbox); + void draw_char_bbox(GfxState * state, double * bbox); + + const Param & param; + cairo_t * cairo = nullptr; +}; + +} /* namespace pdf2htmlEX */ +#endif /* DRAWINGTRACER_H__ */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 75293f8..8aa0a0f 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -20,6 +20,8 @@ #include #include +#include + #include "pdf2htmlEX-config.h" #include "Param.h" @@ -32,10 +34,12 @@ #include "BackgroundRenderer/BackgroundRenderer.h" #include "CoveredTextHandler.h" +#include "DrawingTracer.h" #include "util/const.h" #include "util/misc.h" + namespace pdf2htmlEX { class HTMLRenderer : public OutputDev @@ -90,7 +94,9 @@ public: * We just mark as changed, and recheck if they have been changed when we are about to output a new string */ - virtual void restoreState(GfxState * state) { updateAll(state); } + virtual void restoreState(GfxState * state); + + virtual void saveState(GfxState *state); virtual void updateAll(GfxState * state); @@ -135,15 +141,16 @@ public: GfxImageColorMap *maskColorMap, GBool maskInterpolate); - virtual void stroke(GfxState *state) { css_do_path(state, false); } - virtual void fill(GfxState *state) { css_do_path(state, true); } + virtual void stroke(GfxState *state); ////{ css_do_path(state, false); } + virtual void fill(GfxState *state); 
////{ css_do_path(state, true); } + virtual void eoFill(GfxState *state); virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax); virtual void processLink(AnnotLink * al); /* capacity test */ - bool can_stroke(GfxState *state) { return css_do_path(state, false, true); } - bool can_fill(GfxState *state) { return css_do_path(state, true, true); } + bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); } + bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); } const std::vector & get_chars_covered() { return covered_text_handler.get_chars_covered(); } @@ -207,6 +214,7 @@ protected: // make sure the current HTML style consistent with PDF void prepare_text_line(GfxState * state); +#if 0 //disable CSS drawing //////////////////////////////////////////////////// // CSS drawing //////////////////////////////////////////////////// @@ -226,20 +234,8 @@ protected: double * line_width_array, int line_width_count, const GfxRGB * line_color, const GfxRGB * fill_color, void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr ); +#endif //disable CSS drawing - //////////////////////////////////////////////////// - // Covered text handling - //////////////////////////////////////////////////// - /* - * Cue CoveredTextHandler that a character is drawn - * x, y: glyph-drawing position, in PDF text object space. - * ax, ay: glyph advance, in glyph space. 
- */ - void add_char_bbox(GfxState *state, double x, double y, double ax, double ay); - /* - * Cue CoveredTextHandler that an image is drawn - */ - void add_image_bbox(GfxState *state); //////////////////////////////////////////////////// // PDF stuffs @@ -365,6 +361,7 @@ protected: static const std::string MANIFEST_FILENAME; CoveredTextHandler covered_text_handler; + DrawingTracer tracer; }; } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index 7a84b1a..b87c897 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -30,6 +30,38 @@ using std::sqrt; using std::vector; using std::ostream; +void HTMLRenderer::restoreState(GfxState * state) +{ + updateAll(state); tracer.restore(); +} + +void HTMLRenderer::saveState(GfxState *state) +{ + tracer.save(); +} + +void HTMLRenderer::stroke(GfxState * state) +{ + tracer.stroke(state); +} + +void HTMLRenderer::fill(GfxState * state) +{ + tracer.fill(state); +} + +void HTMLRenderer::eoFill(GfxState * state) +{ + tracer.fill(state, true); +} + +GBool HTMLRenderer::axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax) +{ + tracer.fill(state); //TODO correct? + return true; +} + +#if 0 //disable css drawing static bool is_horizontal_line(GfxSubpath * path) { return ((path->getNumPoints() == 2) @@ -415,6 +447,7 @@ void HTMLRenderer::css_draw_rectangle(double x, double y, double w, double h, co (*f_curpage) << "\">
"; } +#endif //disable css drawing } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 3d43ede..d784f3b 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -46,6 +47,7 @@ HTMLRenderer::HTMLRenderer(const Param & param) ,html_text_page(param, all_manager) ,preprocessor(param) ,tmp_files(param) + ,tracer(param) { if(!(param.debug)) { @@ -76,6 +78,13 @@ HTMLRenderer::HTMLRenderer(const Param & param) all_manager.height .set_eps(EPS); all_manager.width .set_eps(EPS); all_manager.bottom .set_eps(EPS); + + tracer.on_char_drawn = + [this](double * box) { covered_text_handler.add_char_bbox(box); }; + tracer.on_char_clipped = + [this](double * box) { covered_text_handler.add_char_bbox(box); }; //TODO + tracer.on_non_char_drawn = + [this](double * box) { covered_text_handler.add_non_char_bbox(box); }; } HTMLRenderer::~HTMLRenderer() @@ -136,7 +145,6 @@ void HTMLRenderer::process(PDFDoc *doc) // We handle covered texts during doc->displayPage(this...), // and bg_renderer->render_page() depends on the result, so it must be called after // doc->displayPage(this...). 
- covered_text_handler.reset(); doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, 0, @@ -194,6 +202,9 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) #endif { + covered_text_handler.reset(); + tracer.reset(state); + this->pageNum = pageNum; double pageWidth = state->getPageWidth(); diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc index 3e4f8d0..91ca767 100644 --- a/src/HTMLRenderer/image.cc +++ b/src/HTMLRenderer/image.cc @@ -14,7 +14,7 @@ namespace pdf2htmlEX { void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg) { - add_image_bbox(state); + tracer.draw_image(state); return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg); @@ -73,21 +73,11 @@ void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str GfxImageColorMap *maskColorMap, GBool maskInterpolate) { - add_image_bbox(state); + tracer.draw_image(state); - return OutputDev::drawSoftMaskedImage(state,ref,str, + return OutputDev::drawSoftMaskedImage(state,ref,str, // TODO really required? 
width,height,colorMap,interpolate, maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate); } -void HTMLRenderer::add_image_bbox(GfxState *state) -{ - if (!param.process_covered_text) - return; - auto ctm = state->getCTM(); - double bbox[4] {0, 0, 1, 1}; - tm_transform_bbox(ctm, bbox); - covered_text_handler.add_non_char_bbox(bbox); -} - } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 8df9700..498655a 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -46,6 +46,7 @@ void HTMLRenderer::updateFont(GfxState * state) void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) { ctm_changed = true; + tracer.set_ctm(state); } void HTMLRenderer::updateTextMat(GfxState * state) { @@ -89,14 +90,17 @@ void HTMLRenderer::updateStrokeColor(GfxState * state) void HTMLRenderer::clip(GfxState * state) { clip_changed = true; + tracer.clip(state); } void HTMLRenderer::eoClip(GfxState * state) { clip_changed = true; + tracer.clip(state, true); } void HTMLRenderer::clipToStrokePath(GfxState * state) { clip_changed = true; + tracer.clip_to_stroke_path(state); } void HTMLRenderer::reset_state() { diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 0122356..13f2065 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -72,7 +72,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) cerr << "TODO: non-zero origins" << endl; } - add_char_bbox(state, dx, dy, dx1, dy1); + tracer.draw_char(state, dx, dy, dx1, dy1); //TODO dx dy seems not correct? 
bool is_space = false; if (n == 1 && *p == ' ') @@ -145,43 +145,4 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) draw_ty += dy; } -void HTMLRenderer::add_char_bbox(GfxState *state, double x, double y, double ax, double ay) -{ - if (!param.process_covered_text) - return; - - Matrix tm_ctm, tm, itm; - memcpy(tm_ctm.m, this->cur_text_tm, sizeof(tm_ctm.m)); - memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); - double fs = state->getFontSize(); - - double cx = state->getCurX(), cy = state->getCurY(), - ry = state->getRise(), h = state->getHorizScaling(); - - //cx and cy has been transformed by text matrix, we need to reverse them. - tm.invertTo(&itm); - double char_cx, char_cy; - itm.transform(cx, cy, &char_cx, &char_cy); - - //TODO Vertical? Currently vertical/type3 chars are treated as non-chars. - double tchar[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; - - double tfinal[6]; - tm_multiply(tfinal, tm_ctm.m, tchar); - - auto font = state->getFont(); - double bbox[4] {0, 0, ax, ay}; - double desc = font->getDescent(), asc = font->getAscent(); - if (font->getWMode() == 0) - { - bbox[1] += desc; - bbox[3] += asc; - } - else - {//TODO Vertical? - } - tm_transform_bbox(tfinal, bbox); - covered_text_handler.add_char_bbox(bbox); -} - } // namespace pdf2htmlEX From 1d95b73eee696c045840bf3ebb9b78bf07bfc8fd Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sun, 15 Jun 2014 16:42:34 +0800 Subject: [PATCH 03/23] Handle clips during processing of covered text. 
--- src/CoveredTextHandler.cc | 11 ++++-- src/CoveredTextHandler.h | 3 +- src/DrawingTracer.cc | 68 +++++++++++++++++++++++++++---------- src/DrawingTracer.h | 5 ++- src/HTMLRenderer/draw.cc | 3 +- src/HTMLRenderer/general.cc | 2 +- src/util/math.cc | 28 ++++++++++++--- src/util/math.h | 10 ++++-- 8 files changed, 100 insertions(+), 30 deletions(-) diff --git a/src/CoveredTextHandler.cc b/src/CoveredTextHandler.cc index 8557b57..9f8074a 100644 --- a/src/CoveredTextHandler.cc +++ b/src/CoveredTextHandler.cc @@ -30,11 +30,18 @@ void CoveredTextHandler::reset() void CoveredTextHandler::add_char_bbox(double * bbox) { - for (int i = 0; i < 4; i++) - char_bboxes.push_back(bbox[i]); + char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); chars_covered.push_back(false); } +void CoveredTextHandler::add_char_bbox_clipped(double * bbox, bool patially) +{ + char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); + chars_covered.push_back(true); + if (patially) + add_non_char_bbox(bbox, chars_covered.size() - 1); +} + void CoveredTextHandler::add_non_char_bbox(double * bbox, int index) { if (index < 0) diff --git a/src/CoveredTextHandler.h b/src/CoveredTextHandler.h index 34decf6..0531236 100644 --- a/src/CoveredTextHandler.h +++ b/src/CoveredTextHandler.h @@ -32,6 +32,8 @@ public: */ void add_char_bbox(double * bbox); + void add_char_bbox_clipped(double * bbox, bool patially); + /** * Add a drawn non-char graphics' bounding box. * If it intersects any previously drawn char's bbox, the char is marked as covered @@ -51,7 +53,6 @@ public: const std::vector & get_chars_covered() { return chars_covered; } private: - //covered text test std::vector chars_covered; // x00, y00, x01, y01; x10, y10, x11, y11;... 
std::vector char_bboxes; diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index 15b9f20..7c55f2e 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -53,15 +53,15 @@ void DrawingTracer::set_ctm(GfxState *state) matrix.yy = ctm[3]; matrix.x0 = ctm[4]; matrix.y0 = ctm[5]; - cairo_set_matrix (cairo, &matrix); + cairo_set_matrix(cairo, &matrix); } void DrawingTracer::clip(GfxState * state, bool even_odd) { if (!param.process_covered_text) return; - do_path (state, state->getPath()); - cairo_set_fill_rule (cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); + do_path(state, state->getPath()); + cairo_set_fill_rule(cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); cairo_clip (cairo); } @@ -91,13 +91,13 @@ void DrawingTracer::do_path(GfxState * state, GfxPath * path) GfxSubpath *subpath; int i, j; double x, y; - cairo_new_path (cairo); + cairo_new_path(cairo); for (i = 0; i < path->getNumSubpaths(); ++i) { subpath = path->getSubpath(i); if (subpath->getNumPoints() > 0) { x = subpath->getX(0); y = subpath->getY(0); - cairo_move_to (cairo, x, y); + cairo_move_to(cairo, x, y); j = 1; while (j < subpath->getNumPoints()) { if (subpath->getCurve(j)) { @@ -111,7 +111,7 @@ void DrawingTracer::do_path(GfxState * state, GfxPath * path) } else { x = subpath->getX(j); y = subpath->getY(j); - cairo_line_to (cairo, x, y); + cairo_line_to(cairo, x, y); ++j; } } @@ -150,22 +150,56 @@ void DrawingTracer::fill(GfxState * state, bool even_odd) void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) { - double cbox[4], result[4]; + double cbox[4]; cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); - // TODO intersect - tm_transform_bbox(state->getCTM(), bbox); - if (on_non_char_drawn) - on_non_char_drawn(bbox); + if(bbox_intersect(cbox, bbox, bbox)) + { + tm_transform_bbox(state->getCTM(), bbox); + if (on_non_char_drawn) + on_non_char_drawn(bbox); + } } void DrawingTracer::draw_char_bbox(GfxState * state, 
double * bbox) { - double cbox[4], result[4]; - cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); - // TODO intersect - tm_transform_bbox(state->getCTM(), bbox); - if (on_char_drawn) - on_char_drawn(bbox); + // Note: even if 4 corner of the char are all in the clip area, + // it still could be partially clipped. + // TODO better solution? + int pt_in = 0; + if (cairo_in_clip(cairo, bbox[0], bbox[1])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[2], bbox[3])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[2], bbox[1])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[0], bbox[3])) + ++pt_in; + + if (pt_in == 0) + { + if(on_char_clipped) + on_char_clipped(bbox, false); + } + else + { + if (pt_in < 4) + { + double cbox[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + bbox_intersect(cbox, bbox, bbox); + } + tm_transform_bbox(state->getCTM(), bbox); + if (pt_in < 4) + { + if(on_char_clipped) + on_char_clipped(bbox, true); + } + else + { + if (on_char_drawn) + on_char_drawn(bbox); + } + } } void DrawingTracer::draw_image(GfxState *state) diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h index 81fd4b0..032c511 100644 --- a/src/DrawingTracer.h +++ b/src/DrawingTracer.h @@ -25,9 +25,12 @@ public: * The callback to receive drawn event. * bbox in device space. 
*/ + // a non-char graphics is drawn std::function on_non_char_drawn; + // a char is drawn in the clip area std::function on_char_drawn; - std::function on_char_clipped; + // a char is drawn out of/partially in the clip area + std::function on_char_clipped; DrawingTracer(const Param & param); virtual ~DrawingTracer(); diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc index b87c897..9b3f1bd 100644 --- a/src/HTMLRenderer/draw.cc +++ b/src/HTMLRenderer/draw.cc @@ -32,7 +32,8 @@ using std::ostream; void HTMLRenderer::restoreState(GfxState * state) { - updateAll(state); tracer.restore(); + updateAll(state); + tracer.restore(); } void HTMLRenderer::saveState(GfxState *state) diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index d784f3b..c032e0e 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -82,7 +82,7 @@ HTMLRenderer::HTMLRenderer(const Param & param) tracer.on_char_drawn = [this](double * box) { covered_text_handler.add_char_bbox(box); }; tracer.on_char_clipped = - [this](double * box) { covered_text_handler.add_char_bbox(box); }; //TODO + [this](double * box, bool partial) { covered_text_handler.add_char_bbox_clipped(box, partial); }; tracer.on_non_char_drawn = [this](double * box) { covered_text_handler.add_non_char_bbox(box); }; } diff --git a/src/util/math.cc b/src/util/math.cc index fb898c6..1ddabce 100644 --- a/src/util/math.cc +++ b/src/util/math.cc @@ -60,12 +60,30 @@ void tm_transform_bbox(const double * tm, double * bbox) } } -bool bbox_intersect(double * bbox1, double * bbox2) +bool bbox_intersect(const double * bbox1, const double * bbox2, double * result) { - return min(bbox1[0], bbox1[2]) < max(bbox2[0], bbox2[2]) - && max(bbox1[0], bbox1[2]) > min(bbox2[0], bbox2[2]) - && min(bbox1[1], bbox1[3]) < max(bbox2[1], bbox2[3]) - && max(bbox1[1], bbox1[3]) > min(bbox2[1], bbox2[3]); + double x0, y0, x1, y1; + + x0 = max(min(bbox1[0], bbox1[2]), min(bbox2[0], bbox2[2])); + x1 = 
min(max(bbox1[0], bbox1[2]), max(bbox2[0], bbox2[2])); + + if (x0 >= x1) + return false; + + y0 = max(min(bbox1[1], bbox1[3]), min(bbox2[1], bbox2[3])); + y1 = min(max(bbox1[1], bbox1[3]), max(bbox2[1], bbox2[3])); + + if (y0 >= y1) + return false; + + if (result) + { + result[0] = x0; + result[1] = y0; + result[2] = x1; + result[3] = y1; + } + return true; } } //namespace pdf2htmlEX diff --git a/src/util/math.h b/src/util/math.h index fcdebc4..75997f9 100644 --- a/src/util/math.h +++ b/src/util/math.h @@ -39,8 +39,14 @@ static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); } void tm_transform(const double * tm, double & x, double & y, bool is_delta = false); void tm_multiply(double * tm_left, const double * tm_right); void tm_transform_bbox(const double * tm, double * bbox); - -bool bbox_intersect(double * bbox1, double * bbox2); +/** + * Calculate the intersection of 2 boxes. + * If they are intersecting, store the result to result (if not null) and return true. + * Otherwise return false, and result is not touched. + * Param result can be same as one of bbox1 and bbox2. + * Data in boxes are expected in the order of (x0, y0, x1, y1). 
+ */ +bool bbox_intersect(const double * bbox1, const double * bbox2, double * result = nullptr); } //namespace pdf2htmlEX #endif //MATH_H__ From 55d45acd32b9c2acde393ee53b211d4ac544a768 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Tue, 17 Jun 2014 11:19:24 +0800 Subject: [PATCH 04/23] Code cleaning --- src/DrawingTracer.cc | 16 +++++++--------- src/HTMLRenderer/HTMLRenderer.h | 2 -- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index 7c55f2e..cb16c21 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -162,8 +162,8 @@ void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) { - // Note: even if 4 corner of the char are all in the clip area, - // it still could be partially clipped. + // Note: even if 4 corners of the char are all in or all out of the clip area, + // it could still be partially clipped. // TODO better solution? int pt_in = 0; if (cairo_in_clip(cairo, bbox[0], bbox[1])) @@ -216,11 +216,9 @@ void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, do return; Matrix tm, itm; - //memcpy(tm_ctm.m, this->cur_text_tm, sizeof(tm_ctm.m)); memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); - double fs = state->getFontSize(); - double cx = state->getCurX(), cy = state->getCurY(), + double cx = state->getCurX(), cy = state->getCurY(), fs = state->getFontSize(), ry = state->getRise(), h = state->getHorizScaling(); //cx and cy has been transformed by text matrix, we need to reverse them. @@ -229,10 +227,10 @@ void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, do itm.transform(cx, cy, &char_cx, &char_cy); //TODO Vertical? Currently vertical/type3 chars are treated as non-chars. 
- double tchar[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; + double char_m[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; - double tfinal[6]; - tm_multiply(tfinal, tm.m, tchar); + double final_m[6]; + tm_multiply(final_m, tm.m, char_m); auto font = state->getFont(); double bbox[4] {0, 0, ax, ay}; @@ -245,7 +243,7 @@ void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, do else {//TODO Vertical? } - tm_transform_bbox(tfinal, bbox); + tm_transform_bbox(final_m, bbox); draw_char_bbox(state, bbox); } diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 8aa0a0f..4c77c6d 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -20,8 +20,6 @@ #include #include -#include - #include "pdf2htmlEX-config.h" #include "Param.h" From a5ac6b4d0d7a5c19dfbad44cdef50dffab5bebdb Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Tue, 17 Jun 2014 17:05:01 +0800 Subject: [PATCH 05/23] Make SplashBackgroundRenderer working with --process-covered-text --- .../SplashBackgroundRenderer.cc | 8 +++++ .../SplashBackgroundRenderer.h | 1 + src/HTMLRenderer/HTMLRenderer.h | 1 - src/HTMLRenderer/general.cc | 32 ++++++++----------- src/HTMLTextPage.h | 3 ++ 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc index b7dc686..8b70e4c 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -88,6 +88,13 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, { SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); } + // If a char is treated as image, it is not subject to cover test + // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
+ else if (param.process_covered_text) { + if (html_renderer->get_chars_covered()[drawn_char_count]) + SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + drawn_char_count++; + } } void SplashBackgroundRenderer::beginTextObject(GfxState *state) @@ -122,6 +129,7 @@ static GBool annot_cb(Annot *, void * pflag) { bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno) { + drawn_char_count = 0; bool process_annotation = param.process_annotation; doc->displayPage(this, pageno, param.h_dpi, param.v_dpi, 0, diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h index 11d9534..6646e7b 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.h +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h @@ -70,6 +70,7 @@ protected: HTMLRenderer * html_renderer; const Param & param; std::string format; + int drawn_char_count; }; } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 4c77c6d..9d47095 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -347,7 +347,6 @@ protected: #endif BackgroundRenderer * bg_renderer; BackgroundRenderer * fallback_bg_renderer; - bool fallback_bg_required; struct { std::ofstream fs; diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index c032e0e..9d24c60 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -153,12 +153,7 @@ void HTMLRenderer::process(PDFDoc *doc) false, // printing nullptr, nullptr, nullptr, nullptr); - if(param.process_nontext) - { - fallback_bg_required = !bg_renderer->render_page(doc, i); - if (fallback_bg_required && fallback_bg_renderer != nullptr) - fallback_bg_renderer->render_page(doc, i); - } + if(param.split_pages) { @@ -207,13 +202,15 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) this->pageNum = pageNum; - double pageWidth = state->getPageWidth(); - 
double pageHeight = state->getPageHeight(); + html_text_page.set_page_size(state->getPageWidth(), state->getPageHeight()); - html_text_page.set_page_size(pageWidth, pageHeight); + reset_state(); +} + +void HTMLRenderer::endPage() { + long long wid = all_manager.width.install(html_text_page.get_width()); + long long hid = all_manager.height.install(html_text_page.get_height()); - long long wid = all_manager.width.install(pageWidth); - long long hid = all_manager.height.install(pageHeight); (*f_curpage) << "
render_page(cur_doc, pageNum)) bg_renderer->embed_image(pageNum); - else if (fallback_bg_renderer != nullptr) - fallback_bg_renderer->embed_image(pageNum); + else + { + if (fallback_bg_renderer->render_page(cur_doc, pageNum)) + fallback_bg_renderer->embed_image(pageNum); + } } - reset_state(); -} - -void HTMLRenderer::endPage() { // dump all text html_text_page.dump_text(*f_curpage); html_text_page.dump_css(f_css.fs); diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h index 7bffec4..ccaa564 100644 --- a/src/HTMLTextPage.h +++ b/src/HTMLTextPage.h @@ -39,6 +39,9 @@ public: void set_page_size(double width, double height); void clip(const HTMLClipState & clip_state); + double get_width() { return page_width; } + double get_height() { return page_height; } + private: void optimize(void); From 69ceff7dbdef3869d07d7316102bc8fa12a0524e Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Tue, 17 Jun 2014 17:20:13 +0800 Subject: [PATCH 06/23] Delete an unintentionally commited file. --- src/util/math.bk.cc | 72 --------------------------------------------- 1 file changed, 72 deletions(-) delete mode 100644 src/util/math.bk.cc diff --git a/src/util/math.bk.cc b/src/util/math.bk.cc deleted file mode 100644 index 0abafaf..0000000 --- a/src/util/math.bk.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include -#include - -#include "math.h" - -using std::min; -using std::max; - -namespace pdf2htmlEX { - -void tm_transform(const double * tm, double & x, double & y, bool is_delta) -{ - double xx = x, yy = y; - x = tm[0] * xx + tm[2] * yy; - y = tm[1] * xx + tm[3] * yy; - if(!is_delta) - { - x += tm[4]; - y += tm[5]; - } -} - -void tm_multiply(const double * tm_left, const double * tm_right, double * tm_result) -{ - double old[4]; - memcpy(old, tm_left, sizeof(old)); - - tm_result[0] = tm_left[0] * tm_right[0] + tm_left[2] * tm_right[1]; - tm_result[1] = tm_left[1] * tm_right[0] + tm_left[3] * tm_right[1]; - tm_result[2] = tm_left[0] * tm_right[2] + tm_left[2] * tm_right[3]; - 
tm_result[3] = tm_left[1] * tm_right[2] + tm_left[3] * tm_right[3]; - tm_result[4] += tm_left[0] * tm_right[4] + tm_left[2] * tm_right[5]; - tm_result[5] += tm_left[1] * tm_right[4] + tm_left[3] * tm_right[5]; -} - -void tm_transform_bbox(const double * tm, double * bbox) -{ - double & x1 = bbox[0]; - double & y1 = bbox[1]; - double & x2 = bbox[2]; - double & y2 = bbox[3]; - double _[4][2]; - _[0][0] = _[1][0] = x1; - _[0][1] = _[2][1] = y1; - _[2][0] = _[3][0] = x2; - _[1][1] = _[3][1] = y2; - - x1 = y1 = std::numeric_limits::max(); - x2 = y2 = std::numeric_limits::min(); - for(int i = 0; i < 4; ++i) - { - auto & x = _[i][0]; - auto & y = _[i][1]; - tm_transform(tm, x, y); - if(x < x1) x1 = x; - if(x > x2) x2 = x; - if(y < y1) y1 = y; - if(y > y2) y2 = y; - } -} - -bool bbox_intersect(double * bbox1, double * bbox2) -{ - return min(bbox1[0], bbox1[2]) < max(bbox2[0], bbox2[2]) - && max(bbox1[0], bbox1[2]) > min(bbox2[0], bbox2[2]) - && min(bbox1[1], bbox1[3]) < max(bbox2[1], bbox2[3]) - && max(bbox1[1], bbox1[3]) > min(bbox2[1], bbox2[3]); -} - -} //namespace pdf2htmlEX - From 4ab00e3644a78fec742657d46f94421eb85d086f Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Tue, 17 Jun 2014 17:33:37 +0800 Subject: [PATCH 07/23] Add check of nullptr to fallback_bg_renderer; remove outdated comment. --- src/HTMLRenderer/general.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 9d24c60..a2be519 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -142,9 +142,6 @@ void HTMLRenderer::process(PDFDoc *doc) cur_page_filename = filled_template_filename; } - // We handle covered texts during doc->displayPage(this...), - // and bg_renderer->render_page() depends on the result, so it must be called after - // doc->displayPage(this...). 
doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, 0, @@ -153,8 +150,6 @@ void HTMLRenderer::process(PDFDoc *doc) false, // printing nullptr, nullptr, nullptr, nullptr); - - if(param.split_pages) { delete f_curpage; @@ -245,7 +240,7 @@ void HTMLRenderer::endPage() { { if (bg_renderer->render_page(cur_doc, pageNum)) bg_renderer->embed_image(pageNum); - else + else if (fallback_bg_renderer != nullptr) { if (fallback_bg_renderer->render_page(cur_doc, pageNum)) fallback_bg_renderer->embed_image(pageNum); From e468f0349402a9cd72d51cb9c2a90f4329b94946 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Tue, 17 Jun 2014 23:37:25 +0800 Subject: [PATCH 08/23] Fix covered text processing issue (in DrawingTracer::set_ctm) when zoom != 1 --- src/DrawingTracer.cc | 32 +++++++++++++++++++++----------- src/DrawingTracer.h | 3 ++- src/HTMLRenderer/state.cc | 2 +- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index cb16c21..24fa6e2 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -41,19 +41,21 @@ void DrawingTracer::finish() } } -void DrawingTracer::set_ctm(GfxState *state) +// Poppler won't inform us its initial CTM, and the initial CTM is affected by zoom level. +// OutputDev::clip() may be called before OutputDev::updateCTM(), so we can't rely on GfxState::getCTM(), +// and should trace ctm changes ourself (via cairo). 
+void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32) { if (!param.process_covered_text) return; - double * ctm = state->getCTM(); cairo_matrix_t matrix; - matrix.xx = ctm[0]; - matrix.yx = ctm[1]; - matrix.xy = ctm[2]; - matrix.yy = ctm[3]; - matrix.x0 = ctm[4]; - matrix.y0 = ctm[5]; - cairo_set_matrix(cairo, &matrix); + matrix.xx = m11; + matrix.yx = m12; + matrix.xy = m21; + matrix.yy = m22; + matrix.x0 = m31; + matrix.y0 = m32; + cairo_transform(cairo, &matrix); } void DrawingTracer::clip(GfxState * state, bool even_odd) @@ -154,7 +156,7 @@ void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); if(bbox_intersect(cbox, bbox, bbox)) { - tm_transform_bbox(state->getCTM(), bbox); + transform_bbox_by_ctm(bbox); if (on_non_char_drawn) on_non_char_drawn(bbox); } @@ -188,7 +190,7 @@ void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); bbox_intersect(cbox, bbox, bbox); } - tm_transform_bbox(state->getCTM(), bbox); + transform_bbox_by_ctm(bbox); if (pt_in < 4) { if(on_char_clipped) @@ -247,4 +249,12 @@ void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, do draw_char_bbox(state, bbox); } +void DrawingTracer::transform_bbox_by_ctm(double * bbox) +{ + cairo_matrix_t mat; + cairo_get_matrix(cairo, &mat); + double mat_a[6] {mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0}; + tm_transform_bbox(mat_a, bbox); +} + } /* namespace pdf2htmlEX */ diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h index 032c511..d8cee1c 100644 --- a/src/DrawingTracer.h +++ b/src/DrawingTracer.h @@ -46,7 +46,7 @@ public: * An image is drawing */ void draw_image(GfxState * state); - void set_ctm(GfxState * state); + void update_ctm(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); void clip(GfxState * state, bool 
even_odd = false); void clip_to_stroke_path(GfxState * state); void fill(GfxState * state, bool even_odd = false); @@ -60,6 +60,7 @@ private: void do_path(GfxState * state, GfxPath * path); void draw_non_char_bbox(GfxState * state, double * bbox); void draw_char_bbox(GfxState * state, double * bbox); + void transform_bbox_by_ctm(double * bbox); const Param & param; cairo_t * cairo = nullptr; diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 498655a..9278e4e 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -46,7 +46,7 @@ void HTMLRenderer::updateFont(GfxState * state) void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) { ctm_changed = true; - tracer.set_ctm(state); + tracer.update_ctm(state, m11, m12, m21, m22, m31, m32); } void HTMLRenderer::updateTextMat(GfxState * state) { From a57b61731d8b326e4ef8445a2619f4b9e9a25c9c Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Wed, 18 Jun 2014 00:04:04 +0800 Subject: [PATCH 09/23] Fix Travis build --- src/DrawingTracer.cc | 2 +- src/DrawingTracer.h | 2 +- src/HTMLState.h | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index 24fa6e2..05ab81b 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -13,7 +13,7 @@ namespace pdf2htmlEX { -DrawingTracer::DrawingTracer(const Param & param):param(param) +DrawingTracer::DrawingTracer(const Param & param): param(param), cairo(nullptr) { } diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h index d8cee1c..a04b453 100644 --- a/src/DrawingTracer.h +++ b/src/DrawingTracer.h @@ -63,7 +63,7 @@ private: void transform_bbox_by_ctm(double * bbox); const Param & param; - cairo_t * cairo = nullptr; + cairo_t * cairo; }; } /* namespace pdf2htmlEX */ diff --git a/src/HTMLState.h b/src/HTMLState.h index 9327e5c..195af21 100644 --- a/src/HTMLState.h +++ b/src/HTMLState.h @@ -63,8 +63,10 @@ struct HTMLLineState 
double x,y; double transform_matrix[4]; // The page-cope char index(in drawing order) of the first char in this line. - int first_char_index = -1; - const std::vector * chars_covered = nullptr; + int first_char_index; + const std::vector * chars_covered; + + HTMLLineState(): first_char_index(-1), chars_covered(nullptr) { } }; struct HTMLClipState From f705a98581ef66d0e52648a513b85ed873f0c267 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Wed, 18 Jun 2014 17:58:05 +0800 Subject: [PATCH 10/23] Improve DrawingTracer::stroke() by break path into steps. --- src/DrawingTracer.cc | 67 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index 05ab81b..3a48363 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -10,6 +10,9 @@ #include "util/math.h" #include "DrawingTracer.h" +//#define DT_DEBUG(x) (x) +#define DT_DEBUG(x) + namespace pdf2htmlEX { @@ -128,14 +131,63 @@ void DrawingTracer::stroke(GfxState * state) { if (!param.process_covered_text) return; - // TODO - // 1. if stroke extents is large, break the path into pieces and handle each of them; - // 2. if the line width is small, could just ignore the path? - do_path(state, state->getPath()); + + DT_DEBUG(printf("DrawingTracer::stroke\n")); + cairo_set_line_width(cairo, state->getLineWidth()); - double sbox[4]; - cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3); - draw_non_char_bbox(state, sbox); + + // GfxPath is broken into steps, each step makes up a cairo path and its bbox is used for covering test. + // TODO + // 1. path steps that are not vertical or horizontal lines may still falsely "cover" many chars, + // can we slice those steps further? + // 2. if the line width is small, can we just ignore the path? 
+ GfxPath * path = state->getPath(); + for (int i = 0; i < path->getNumSubpaths(); ++i) { + GfxSubpath * subpath = path->getSubpath(i); + if (subpath->getNumPoints() <= 0) + continue; + double x = subpath->getX(0); + double y = subpath->getY(0); + //p: loop cursor; j: next point index + int p =1, j = 1; + int n = subpath->getNumPoints(); + while (p <= n) { + cairo_new_path(cairo); + cairo_move_to(cairo, x, y); + if (subpath->getCurve(j)) { + x = subpath->getX(j+2); + y = subpath->getY(j+2); + cairo_curve_to(cairo, + subpath->getX(j), subpath->getY(j), + subpath->getX(j+1), subpath->getY(j+1), + x, y); + p += 3; + } else { + x = subpath->getX(j); + y = subpath->getY(j); + cairo_line_to(cairo, x, y); + ++p; + } + + DT_DEBUG(printf("DrawingTracer::stroke:new box:\n")); + double sbox[4]; + cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3); + if (sbox[0] != sbox[2] && sbox[1] != sbox[3]) + draw_non_char_bbox(state, sbox); + else + DT_DEBUG(printf("DrawingTracer::stroke:zero box!\n")); + + if (p == n) + { + if (subpath->isClosed()) + j = 0; // if sub path is closed, go back to starting point + else + break; + } + else + j = p; + } + } } void DrawingTracer::fill(GfxState * state, bool even_odd) @@ -157,6 +209,7 @@ void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) if(bbox_intersect(cbox, bbox, bbox)) { transform_bbox_by_ctm(bbox); + DT_DEBUG(printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3])); if (on_non_char_drawn) on_non_char_drawn(bbox); } From b9899d8d4a00e5f308ab7a6dcd6c670947e41ddf Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Wed, 18 Jun 2014 21:15:58 +0800 Subject: [PATCH 11/23] Fix issue in char displacement calculation. 
--- src/DrawingTracer.cc | 2 ++ src/HTMLRenderer/text.cc | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index 3a48363..c282824 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -232,6 +232,7 @@ void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) if (pt_in == 0) { + transform_bbox_by_ctm(bbox); if(on_char_clipped) on_char_clipped(bbox, false); } @@ -255,6 +256,7 @@ void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) on_char_drawn(bbox); } } + DT_DEBUG(printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3])); } void DrawingTracer::draw_image(GfxState *state) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 13f2065..6191554 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -51,9 +51,14 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) char *p = s->getCString(); int len = s->getLength(); + //accumulated displacement of chars in this string, in text object space double dx = 0; double dy = 0; - double dx1,dy1; + //displacement of current char, in text object space + double ddx, ddy; + //advance of current char, in glyph space + double ax, ay; + //origin of current char, in glyph space double ox, oy; int nChars = 0; @@ -65,14 +70,15 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) while (len > 0) { - auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy); + auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); if(!(equal(ox, 0) && equal(oy, 0))) { cerr << "TODO: non-zero origins" << endl; } - - tracer.draw_char(state, dx, dy, dx1, dy1); //TODO dx dy seems not correct? 
+ ddx = (ax * cur_font_size + cur_letter_space) * cur_horiz_scaling; + ddy = ay * cur_font_size; + tracer.draw_char(state, dx, dy, ax, ay); bool is_space = false; if (n == 1 && *p == ' ') @@ -93,13 +99,13 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param.space_as_offset)) { // ignore horiz_scaling, as it has been merged into CTM - html_text_page.get_cur_line()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); + html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } else { if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { - html_text_page.get_cur_line()->append_unicodes(u, uLen, (dx1 * cur_font_size + cur_letter_space)); + html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx); } else { @@ -112,7 +118,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { uu = unicode_from_font(code, font); } - html_text_page.get_cur_line()->append_unicodes(&uu, 1, (dx1 * cur_font_size + cur_letter_space)); + html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx); /* * In PDF, word_space is appended if (n == 1 and *p = ' ') * but in HTML, word_space is appended if (uu == ' ') @@ -125,8 +131,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) } } - dx += dx1; - dy += dy1; + dx += ddx; + dy += ddy; ++nChars; p += n; From 5fdc4cbb36fdae27e0ed1453faad86cbaf44293a Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Thu, 19 Jun 2014 11:06:54 +0800 Subject: [PATCH 12/23] Improve stroke handling in DrawingTracer by simulating line join. --- src/DrawingTracer.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index c282824..66ebe8c 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -141,6 +141,10 @@ void DrawingTracer::stroke(GfxState * state) // 1. 
path steps that are not vertical or horizontal lines may still falsely "cover" many chars, // can we slice those steps further? // 2. if the line width is small, can we just ignore the path? + // 3. line join feature can't be retained. We use line-cap-square to minimize the problem that + // some chars actually covered by a line join are missed. However chars covered by a acute angle + // with line-join-miter may be still recognized as not covered. + cairo_set_line_cap(cairo, CAIRO_LINE_CAP_SQUARE); GfxPath * path = state->getPath(); for (int i = 0; i < path->getNumSubpaths(); ++i) { GfxSubpath * subpath = path->getSubpath(i); From 4c2fcb4f0dee65616d2260dfe8759c0ea3b1986f Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Fri, 20 Jun 2014 09:53:23 +0800 Subject: [PATCH 13/23] Fix of "Fix issue in char displacement calculation". --- src/HTMLRenderer/text.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 6191554..132f087 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -14,6 +14,9 @@ #include "util/namespace.h" #include "util/unicode.h" +//#define HR_DEBUG(x) (x) +#define HR_DEBUG(x) + namespace pdf2htmlEX { using std::all_of; @@ -61,16 +64,17 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) //origin of current char, in glyph space double ox, oy; - int nChars = 0; - int nSpaces = 0; int uLen; CharCode code; Unicode *u = nullptr; + HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", len)); + while (len > 0) { auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); + HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%d\n", u[0])); if(!(equal(ox, 0) && equal(oy, 0))) { @@ -93,7 +97,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) * There are always ugly PDF files with no useful info at all. 
*/ is_space = true; - ++nSpaces; + ddx += cur_word_space * cur_horiz_scaling; } if(is_space && (param.space_as_offset)) @@ -134,16 +138,10 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) dx += ddx; dy += ddy; - ++nChars; p += n; len -= n; } - // horiz_scaling is merged into ctm now, - // so the coordinate system is ugly - dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * cur_horiz_scaling; - dy *= cur_font_size; - cur_tx += dx; cur_ty += dy; From e69e9a8be83a79ae9955bf2f8dd11245cb68f8cd Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Wed, 25 Jun 2014 14:32:34 +0800 Subject: [PATCH 14/23] Fix word space problem in HTMLRenderer::drawString(). --- src/HTMLRenderer/text.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 132f087..8a2fae3 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -57,7 +57,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) //accumulated displacement of chars in this string, in text object space double dx = 0; double dy = 0; - //displacement of current char, in text object space + //displacement of current char, in text object space, including letter space but not word space. double ddx, ddy; //advance of current char, in glyph space double ax, ay; @@ -97,7 +97,6 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) * There are always ugly PDF files with no useful info at all. 
*/ is_space = true; - ddx += cur_word_space * cur_horiz_scaling; } if(is_space && (param.space_as_offset)) @@ -137,6 +136,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) dx += ddx; dy += ddy; + if (is_space) + dx += cur_word_space * cur_horiz_scaling; p += n; len -= n; From 65e82028bb33e9ea20ef6c3068a68ed9a5fa01d9 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Wed, 25 Jun 2014 18:49:40 +0800 Subject: [PATCH 15/23] Allow covered text handling (--process-covered-text) to work without cairo. --- src/DrawingTracer.cc | 47 +++++++++++++++++++++++++++++++++++++++++--- src/DrawingTracer.h | 11 ++++++++++- src/util/math.h | 7 +++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index 66ebe8c..8855de6 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -13,10 +13,17 @@ //#define DT_DEBUG(x) (x) #define DT_DEBUG(x) +#if !ENABLE_SVG +#warning "Cairo is disabled because ENABLE_SVG is off, --process-covered-text has limited functionality." +#endif + namespace pdf2htmlEX { -DrawingTracer::DrawingTracer(const Param & param): param(param), cairo(nullptr) +DrawingTracer::DrawingTracer(const Param & param): param(param) +#if ENABLE_SVG +, cairo(nullptr) +#endif { } @@ -30,18 +37,23 @@ void DrawingTracer::reset(GfxState *state) if (!param.process_covered_text) return; finish(); + +#if ENABLE_SVG cairo_rectangle_t page_box {0, 0, width:state->getPageWidth(), height:state->getPageHeight()}; cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box); cairo = cairo_create(surface); +#endif } void DrawingTracer::finish() { +#if ENABLE_SVG if (cairo) { cairo_destroy(cairo); cairo = nullptr; } +#endif } // Poppler won't inform us its initial CTM, and the initial CTM is affected by zoom level. 
@@ -51,6 +63,8 @@ void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m { if (!param.process_covered_text) return; + +#if ENABLE_SVG cairo_matrix_t matrix; matrix.xx = m11; matrix.yx = m12; @@ -59,15 +73,18 @@ void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m matrix.x0 = m31; matrix.y0 = m32; cairo_transform(cairo, &matrix); +#endif } void DrawingTracer::clip(GfxState * state, bool even_odd) { if (!param.process_covered_text) return; +#if ENABLE_SVG do_path(state, state->getPath()); cairo_set_fill_rule(cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); cairo_clip (cairo); +#endif } void DrawingTracer::clip_to_stroke_path(GfxState * state) @@ -81,17 +98,22 @@ void DrawingTracer::save() { if (!param.process_covered_text) return; +#if ENABLE_SVG cairo_save(cairo); +#endif } void DrawingTracer::restore() { if (!param.process_covered_text) return; +#if ENABLE_SVG cairo_restore(cairo); +#endif } void DrawingTracer::do_path(GfxState * state, GfxPath * path) { +#if ENABLE_SVG //copy from CairoOutputDev::doPath GfxSubpath *subpath; int i, j; @@ -125,10 +147,12 @@ void DrawingTracer::do_path(GfxState * state, GfxPath * path) } } } +#endif } void DrawingTracer::stroke(GfxState * state) { +#if ENABLE_SVG if (!param.process_covered_text) return; @@ -192,27 +216,33 @@ void DrawingTracer::stroke(GfxState * state) j = p; } } +#endif } void DrawingTracer::fill(GfxState * state, bool even_odd) { if (!param.process_covered_text) return; + +#if ENABLE_SVG do_path(state, state->getPath()); //cairo_fill_extents don't take fill rule into account. //cairo_set_fill_rule (cairo, even_odd? 
CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); double fbox[4]; cairo_fill_extents(cairo, fbox, fbox + 1, fbox + 2, fbox + 3); draw_non_char_bbox(state, fbox); +#endif } void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) { +#if ENABLE_SVG double cbox[4]; cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); if(bbox_intersect(cbox, bbox, bbox)) +#endif { - transform_bbox_by_ctm(bbox); + transform_bbox_by_ctm(bbox, state); DT_DEBUG(printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3])); if (on_non_char_drawn) on_non_char_drawn(bbox); @@ -221,6 +251,7 @@ void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) { +#if ENABLE_SVG // Note: even if 4 corners of the char are all in or all out of the clip area, // it could still be partially clipped. // TODO better solution? @@ -260,6 +291,11 @@ void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) on_char_drawn(bbox); } } +#else + transform_bbox_by_ctm(bbox, state); + if (on_char_drawn) + on_char_drawn(bbox); +#endif DT_DEBUG(printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3])); } @@ -308,12 +344,17 @@ void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, do draw_char_bbox(state, bbox); } -void DrawingTracer::transform_bbox_by_ctm(double * bbox) + +void DrawingTracer::transform_bbox_by_ctm(double * bbox, GfxState * state) { +#if ENABLE_SVG cairo_matrix_t mat; cairo_get_matrix(cairo, &mat); double mat_a[6] {mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0}; tm_transform_bbox(mat_a, bbox); +#else + tm_transform_bbox(state->getCTM(), bbox); +#endif } } /* namespace pdf2htmlEX */ diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h index a04b453..2e3159d 100644 --- a/src/DrawingTracer.h +++ b/src/DrawingTracer.h @@ -11,7 +11,12 @@ #include #include + +#include "pdf2htmlEX-config.h" + +#if ENABLE_SVG 
#include +#endif #include "Param.h" @@ -60,10 +65,14 @@ private: void do_path(GfxState * state, GfxPath * path); void draw_non_char_bbox(GfxState * state, double * bbox); void draw_char_bbox(GfxState * state, double * bbox); - void transform_bbox_by_ctm(double * bbox); + // If cairo is available, parameter state is ignored + void transform_bbox_by_ctm(double * bbox, GfxState * state = nullptr); const Param & param; + +#if ENABLE_SVG cairo_t * cairo; +#endif }; } /* namespace pdf2htmlEX */ diff --git a/src/util/math.h b/src/util/math.h index 75997f9..8302a93 100644 --- a/src/util/math.h +++ b/src/util/math.h @@ -24,6 +24,13 @@ static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6 return false; return true; } + +static inline void tm_init(double * tm) +{ + tm[0] = tm[3] = 1; + tm[1] = tm[2] = tm[4] = tm[5] = 0; +} + static inline void tm_multiply(double * result, const double * m1, const double * m2) { result[0] = m1[0] * m2[0] + m1[2] * m2[1]; From 39e171a73760c5120a7bd4c30d4fd4b06974b0b1 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Thu, 26 Jun 2014 12:39:35 +0800 Subject: [PATCH 16/23] Improve covered text handling: 1. take care of chars corespond to 0 or more than one unicode points; 2. merge sibling invisiable spans; 3. 
improve interfaces of HTMLLineState and HTMLRenderer; --- .../CairoBackgroundRenderer.cc | 2 +- .../SplashBackgroundRenderer.cc | 2 +- src/HTMLRenderer/HTMLRenderer.h | 9 ++- src/HTMLRenderer/state.cc | 5 +- src/HTMLRenderer/text.cc | 15 ++++- src/HTMLState.h | 7 ++- src/HTMLTextLine.cc | 61 ++++++++++++++----- src/HTMLTextLine.h | 28 ++++++++- 8 files changed, 101 insertions(+), 28 deletions(-) diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc index 19d5795..1ff8622 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -66,7 +66,7 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, // If a char is treated as image, it is not subject to cover test // (see HTMLRenderer::drawString), so don't increase drawn_char_count. else if (param.process_covered_text) { - if (html_renderer->get_chars_covered()[drawn_char_count]) + if (html_renderer->is_char_covered(drawn_char_count)) CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); drawn_char_count++; } diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc index 8b70e4c..4089da7 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -91,7 +91,7 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, // If a char is treated as image, it is not subject to cover test // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
else if (param.process_covered_text) { - if (html_renderer->get_chars_covered()[drawn_char_count]) + if (html_renderer->is_char_covered(drawn_char_count)) SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); drawn_char_count++; } diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 9d47095..3ec57c1 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -150,7 +150,14 @@ public: bool can_stroke(GfxState *state) { return false; } ////{ return css_do_path(state, false, true); } bool can_fill(GfxState *state) { return false; } ////{ return css_do_path(state, true, true); } - const std::vector & get_chars_covered() { return covered_text_handler.get_chars_covered(); } + /* + * Covered text handling. + */ + // Is a char (actually a glyph) covered by non-char's. Index in drawing order in current page. + // Does not fail on out-of-bound conditions, but return false. + bool is_char_covered(int index); + // Currently drawn char (glyph) count in current page. 
+ int get_char_count() { return (int)covered_text_handler.get_chars_covered().size(); } protected: //////////////////////////////////////////////////// diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 9278e4e..c46ed4c 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -123,8 +123,7 @@ void HTMLRenderer::reset_state() cur_line_state.y = 0; memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); - if (param.process_covered_text) - cur_line_state.chars_covered = &covered_text_handler.get_chars_covered(); + cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);}; cur_clip_state.xmin = 0; cur_clip_state.xmax = 0; @@ -510,7 +509,7 @@ void HTMLRenderer::prepare_text_line(GfxState * state) state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); if (param.process_covered_text) - cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size(); + cur_line_state.first_char_index = get_char_count(); html_text_page.open_new_line(cur_line_state); cur_text_state.vertical_align = 0; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 8a2fae3..2ec3877 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) while (len > 0) { auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); - HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%d\n", u[0])); + HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0])); if(!(equal(ox, 0) && equal(oy, 0))) { @@ -101,6 +101,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param.space_as_offset)) { + html_text_page.get_cur_line()->append_padding_char(); // ignore horiz_scaling, as it has been merged into CTM 
html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } @@ -150,4 +151,16 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) draw_ty += dy; } +bool HTMLRenderer::is_char_covered(int index) +{ + auto covered = covered_text_handler.get_chars_covered(); + if (index < 0 || index >= (int)covered.size()) + { + std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: " + << index << ", size: " << covered.size() < + #include "Color.h" namespace pdf2htmlEX { @@ -64,9 +66,10 @@ struct HTMLLineState double transform_matrix[4]; // The page-cope char index(in drawing order) of the first char in this line. int first_char_index; - const std::vector * chars_covered; + // A function to determine whether a char is covered at a given index. + std::function is_char_covered; - HTMLLineState(): first_char_index(-1), chars_covered(nullptr) { } + HTMLLineState(): first_char_index(-1) { } }; struct HTMLClipState diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index ba32209..1304e31 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -36,7 +36,14 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) { - text.insert(text.end(), u, u+l); + if (l == 1) + text.push_back(min(u[0], (unsigned)INT_MAX)); + else + { + text.push_back(- decomposed_text.size() - 1); + decomposed_text.emplace_back(); + decomposed_text.back().assign(u, u + l); + } this->width += width; } @@ -69,30 +76,54 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state) last_state.font_size *= last_state.font_info->font_size_scale; } -void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen) +void HTMLTextLine::dump_char(std::ostream & out, int pos) { - if (!line_state.chars_covered) + int c = text[pos]; + if (c > 0) { - writeUnicodes(out, u, uLen); + Unicode u = c; + 
writeUnicodes(out, &u, 1); + } + else if (c < 0) + { + auto dt = decomposed_text[- c - 1]; + writeUnicodes(out, &dt.front(), dt.size()); + } +} + +void HTMLTextLine::dump_chars(ostream & out, int begin, int len) +{ + if (line_state.first_char_index < 0) + { + for (int i = 0; i < len; i++) + dump_char(out, begin + i); return; } - //TODO merge sibling invisiable spans - int start = this->line_state.first_char_index + dumped_char_count; - for(int i = 0; i < uLen; i++) + bool invisible_group_open = false; + for(int i = 0; i < len; i++) { - if (!(*line_state.chars_covered)[start + i]) //visible + if (!line_state.is_char_covered(line_state.first_char_index + begin + i)) //visible { - writeUnicodes(out, u + i, 1); + if (invisible_group_open) + { + invisible_group_open = false; + out << ""; + } + dump_char(out, begin + i); } else { - out << ""; - writeUnicodes(out, u + i, 1); - out << ""; + if (!invisible_group_open) + { + out << ""; + invisible_group_open = true; + } + dump_char(out, begin + i); } } - dumped_char_count += uLen; + if (invisible_group_open) + out << ""; } void HTMLTextLine::dump_text(ostream & out) @@ -110,8 +141,6 @@ void HTMLTextLine::dump_text(ostream & out) return; } - dumped_char_count = 0; - // Start Output { // open
for the current text line @@ -244,7 +273,7 @@ void HTMLTextLine::dump_text(ostream & out) size_t next_text_idx = text_idx2; if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx) next_text_idx = cur_offset_iter->start_idx; - dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); + dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx); cur_text_idx = next_text_idx; } } diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h index 8fa814f..c8a3c8b 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -73,7 +73,16 @@ public: double width; }; + /** + * Append a drawn char (glyph)'s unicode. l > 1 mean this glyph correspond to + * multiple code points. + */ void append_unicodes(const Unicode * u, int l, double width); + /** + * Append a special padding char with 0 width, in order to keep char index consistent. + * The padding char is ignored during output. + */ + void append_padding_char() { text.push_back(0); } void append_offset(double width); void append_state(const HTMLTextState & text_state); void dump_text(std::ostream & out); @@ -91,7 +100,13 @@ public: private: void optimize_normal(std::vector &); void optimize_aggressive(std::vector &); - void dump_chars(std::ostream & out, const Unicode * u, int uLen); + + /** + * Dump chars' unicode to output stream. + * begin/pos is the index in 'text'. + */ + void dump_chars(std::ostream & out, int begin, int len); + void dump_char(std::ostream & out, int pos); const Param & param; AllStateManager & all_manager; @@ -103,9 +118,16 @@ private: std::vector states; std::vector offsets; - std::vector text; - int dumped_char_count; + /** + * Drawn chars (glyph) in this line are stored in 'text'. 
For each element c in 'text': + * - If c > 0, it is the unicode code point corresponding to the glyph; + * - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?); + * - If c < 0, this glyph corresponds to more than one unicode code point, + * which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'. + */ + std::vector<int> text; + std::vector<std::vector<Unicode> > decomposed_text; }; } // namespace pdf2htmlEX From ceb4e3eac6d163ea1e127f001448f63b0dc927e8 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Fri, 27 Jun 2014 01:31:24 +0800 Subject: [PATCH 17/23] Covered text handling: use class instead of inline style. --- src/HTMLTextLine.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index 1304e31..c1a624a 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -38,7 +38,7 @@ void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) { if (l == 1) text.push_back(min(u[0], (unsigned)INT_MAX)); - else + else if (l > 1) { text.push_back(- decomposed_text.size() - 1); decomposed_text.emplace_back(); @@ -93,6 +93,8 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos) void HTMLTextLine::dump_chars(ostream & out, int begin, int len) { + static const Color transparent { true, {0, 0, 0} }; + if (line_state.first_char_index < 0) { for (int i = 0; i < len; i++) @@ -116,7 +118,9 @@ void HTMLTextLine::dump_chars(ostream & out, int begin, int len) { if (!invisible_group_open) { - out << ""; + out << ""; invisible_group_open = true; } dump_char(out, begin + i); From 85f04be632563f4919973e8cc5cfc7edc5670133 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Fri, 27 Jun 2014 10:55:56 +0800 Subject: [PATCH 18/23] Covered text handling: rename option name to --correct-text-visibility.
--- pdf2htmlEX.1.in | 2 +- .../CairoBackgroundRenderer.cc | 2 +- .../SplashBackgroundRenderer.cc | 2 +- src/DrawingTracer.cc | 22 +++++++++---------- src/HTMLRenderer/state.cc | 4 +++- src/Param.h | 2 +- src/pdf2htmlEX.cc | 2 +- 7 files changed, 19 insertions(+), 17 deletions(-) diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 49266b0..fdd55db 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -243,7 +243,7 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above. If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong. .TP -.B --process-covered-text <0|1> (Default: 0) +.B --correct-text-visibility <0|1> (Default: 0) If set to 1, pdf2htmlEX will try to detect texts covered by other graphics and properly arrange them, i.e. covered texts are made transparent in text layer, and are drawn on background layer. diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc index 1ff8622..c86704a 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -65,7 +65,7 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, } // If a char is treated as image, it is not subject to cover test // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
- else if (param.process_covered_text) { + else if (param.correct_text_visibility) { if (html_renderer->is_char_covered(drawn_char_count)) CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); drawn_char_count++; diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc index 4089da7..f28a322 100644 --- a/src/BackgroundRenderer/SplashBackgroundRenderer.cc +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -90,7 +90,7 @@ void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, } // If a char is treated as image, it is not subject to cover test // (see HTMLRenderer::drawString), so don't increase drawn_char_count. - else if (param.process_covered_text) { + else if (param.correct_text_visibility) { if (html_renderer->is_char_covered(drawn_char_count)) SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); drawn_char_count++; diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc index 8855de6..8db1437 100644 --- a/src/DrawingTracer.cc +++ b/src/DrawingTracer.cc @@ -14,7 +14,7 @@ #define DT_DEBUG(x) #if !ENABLE_SVG -#warning "Cairo is disabled because ENABLE_SVG is off, --process-covered-text has limited functionality." +#warning "Cairo is disabled because ENABLE_SVG is off, --correct-text-visibility has limited functionality." #endif namespace pdf2htmlEX @@ -34,7 +34,7 @@ DrawingTracer::~DrawingTracer() void DrawingTracer::reset(GfxState *state) { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; finish(); @@ -61,7 +61,7 @@ void DrawingTracer::finish() // and should trace ctm changes ourself (via cairo). 
void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32) { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; #if ENABLE_SVG @@ -78,7 +78,7 @@ void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m void DrawingTracer::clip(GfxState * state, bool even_odd) { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; #if ENABLE_SVG do_path(state, state->getPath()); @@ -89,14 +89,14 @@ void DrawingTracer::clip(GfxState * state, bool even_odd) void DrawingTracer::clip_to_stroke_path(GfxState * state) { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; // TODO cairo_stroke_to_path() ? } void DrawingTracer::save() { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; #if ENABLE_SVG cairo_save(cairo); @@ -104,7 +104,7 @@ void DrawingTracer::save() } void DrawingTracer::restore() { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; #if ENABLE_SVG cairo_restore(cairo); @@ -153,7 +153,7 @@ void DrawingTracer::do_path(GfxState * state, GfxPath * path) void DrawingTracer::stroke(GfxState * state) { #if ENABLE_SVG - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; DT_DEBUG(printf("DrawingTracer::stroke\n")); @@ -221,7 +221,7 @@ void DrawingTracer::stroke(GfxState * state) void DrawingTracer::fill(GfxState * state, bool even_odd) { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; #if ENABLE_SVG @@ -301,7 +301,7 @@ void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) void DrawingTracer::draw_image(GfxState *state) { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; double bbox[4] {0, 0, 1, 1}; draw_non_char_bbox(state, bbox); @@ -309,7 +309,7 @@ void DrawingTracer::draw_image(GfxState *state) void DrawingTracer::draw_char(GfxState *state, double 
x, double y, double ax, double ay) { - if (!param.process_covered_text) + if (!param.correct_text_visibility) return; Matrix tm, itm; diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index c46ed4c..e452827 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -508,8 +508,10 @@ void HTMLRenderer::prepare_text_line(GfxState * state) double rise_x, rise_y; state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); - if (param.process_covered_text) + + if (param.correct_text_visibility) cur_line_state.first_char_index = get_char_count(); + html_text_page.open_new_line(cur_line_state); cur_text_state.vertical_align = 0; diff --git a/src/Param.h b/src/Param.h index d3d67b2..314ea9b 100644 --- a/src/Param.h +++ b/src/Param.h @@ -38,7 +38,7 @@ struct Param int process_nontext; int process_outline; int process_annotation; - int process_covered_text; + int correct_text_visibility; int printing; int fallback; int tmp_file_size_limit; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index d338ab9..038412f 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -187,7 +187,7 @@ void parse_options (int argc, char **argv) .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets") .add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)") .add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text") - .add("process-covered-text", &param.process_covered_text, 0, "try to detect texts covered by other graphics and properly arrange them") + .add("correct-text-visibility", &param.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them") // background image .add("bg-format", &param.bg_format, "png", "specify background image format") From 25d53ba0d1378b374bfbaab61c0acd8f7b15c27a Mon Sep 17
00:00:00 2001 From: Duan Yao Date: Fri, 27 Jun 2014 17:18:29 +0800 Subject: [PATCH 19/23] Fix call to Color constructor. --- src/HTMLTextLine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index c1a624a..7cb29f5 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -93,7 +93,7 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos) void HTMLTextLine::dump_chars(ostream & out, int begin, int len) { - static const Color transparent { true, {0, 0, 0} }; + static const Color transparent(0, 0, 0, true); if (line_state.first_char_index < 0) { From 24b5eeb1e5d8d8b054c2f1489452607683b86795 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sun, 29 Jun 2014 13:51:31 +0800 Subject: [PATCH 20/23] re-apply "fix rise with optimize-text(9c0b2a8a)" after rebase --- src/HTMLRenderer/text.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 2ec3877..95d8186 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -80,7 +80,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { cerr << "TODO: non-zero origins" << endl; } - ddx = (ax * cur_font_size + cur_letter_space) * cur_horiz_scaling; + ddx = ax * cur_font_size + cur_letter_space; ddy = ay * cur_font_size; tracer.draw_char(state, dx, dy, ax, ay); @@ -135,7 +135,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) } } - dx += ddx; + dx += ddx * cur_horiz_scaling; dy += ddy; if (is_space) dx += cur_word_space * cur_horiz_scaling; From 9bd1eb2c2aec9004a6999f6dc0738f8a162777db Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sun, 29 Jun 2014 14:39:51 +0800 Subject: [PATCH 21/23] Remove unuseful constructor/destructor of CoveredTextHandler --- src/CoveredTextHandler.cc | 11 ----------- src/CoveredTextHandler.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/src/CoveredTextHandler.cc b/src/CoveredTextHandler.cc index 9f8074a..ace7bdc 100644 
--- a/src/CoveredTextHandler.cc +++ b/src/CoveredTextHandler.cc @@ -11,17 +11,6 @@ namespace pdf2htmlEX { -CoveredTextHandler::CoveredTextHandler() -{ - // TODO Auto-generated constructor stub - -} - -CoveredTextHandler::~CoveredTextHandler() -{ - // TODO Auto-generated destructor stub -} - void CoveredTextHandler::reset() { char_bboxes.clear(); diff --git a/src/CoveredTextHandler.h b/src/CoveredTextHandler.h index 0531236..f491e9b 100644 --- a/src/CoveredTextHandler.h +++ b/src/CoveredTextHandler.h @@ -18,8 +18,6 @@ namespace pdf2htmlEX { class CoveredTextHandler { public: - CoveredTextHandler(); - virtual ~CoveredTextHandler(); /** * Reset to initial state. Should be called when start drawing a page. From d0348b3974cedc67a37ad5562aea820462323d44 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sun, 29 Jun 2014 14:48:57 +0800 Subject: [PATCH 22/23] Rename class CoveredTextHandler to CoveredTextDetector --- src/CoveredTextHandler.cc | 8 ++++---- src/CoveredTextHandler.h | 6 +++--- src/HTMLRenderer/HTMLRenderer.h | 4 ++-- src/HTMLRenderer/general.cc | 8 ++++---- src/HTMLRenderer/text.cc | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/CoveredTextHandler.cc b/src/CoveredTextHandler.cc index ace7bdc..31da294 100644 --- a/src/CoveredTextHandler.cc +++ b/src/CoveredTextHandler.cc @@ -11,19 +11,19 @@ namespace pdf2htmlEX { -void CoveredTextHandler::reset() +void CoveredTextDetector::reset() { char_bboxes.clear(); chars_covered.clear(); } -void CoveredTextHandler::add_char_bbox(double * bbox) +void CoveredTextDetector::add_char_bbox(double * bbox) { char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); chars_covered.push_back(false); } -void CoveredTextHandler::add_char_bbox_clipped(double * bbox, bool patially) +void CoveredTextDetector::add_char_bbox_clipped(double * bbox, bool patially) { char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); chars_covered.push_back(true); @@ -31,7 +31,7 @@ void CoveredTextHandler::add_char_bbox_clipped(double * 
bbox, bool patially) add_non_char_bbox(bbox, chars_covered.size() - 1); } -void CoveredTextHandler::add_non_char_bbox(double * bbox, int index) +void CoveredTextDetector::add_non_char_bbox(double * bbox, int index) { if (index < 0) index = chars_covered.size(); diff --git a/src/CoveredTextHandler.h b/src/CoveredTextHandler.h index f491e9b..3203053 100644 --- a/src/CoveredTextHandler.h +++ b/src/CoveredTextHandler.h @@ -5,8 +5,8 @@ * Author: duanyao */ -#ifndef COVEREDTEXTHANDLER_H__ -#define COVEREDTEXTHANDLER_H__ +#ifndef COVEREDTEXTDETECTOR_H__ +#define COVEREDTEXTDETECTOR_H__ #include @@ -15,7 +15,7 @@ namespace pdf2htmlEX { /** * Detect characters that are covered by non-char graphics on a page. */ -class CoveredTextHandler +class CoveredTextDetector { public: diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 3ec57c1..0dd107b 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -157,7 +157,7 @@ public: // Does not fail on out-of-bound conditions, but return false. bool is_char_covered(int index); // Currently drawn char (glyph) count in current page. 
- int get_char_count() { return (int)covered_text_handler.get_chars_covered().size(); } + int get_char_count() { return (int)covered_text_detecor.get_chars_covered().size(); } protected: //////////////////////////////////////////////////// @@ -364,7 +364,7 @@ protected: static const std::string MANIFEST_FILENAME; - CoveredTextHandler covered_text_handler; + CoveredTextDetector covered_text_detecor; DrawingTracer tracer; }; diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index a2be519..7f211db 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -80,11 +80,11 @@ HTMLRenderer::HTMLRenderer(const Param & param) all_manager.bottom .set_eps(EPS); tracer.on_char_drawn = - [this](double * box) { covered_text_handler.add_char_bbox(box); }; + [this](double * box) { covered_text_detecor.add_char_bbox(box); }; tracer.on_char_clipped = - [this](double * box, bool partial) { covered_text_handler.add_char_bbox_clipped(box, partial); }; + [this](double * box, bool partial) { covered_text_detecor.add_char_bbox_clipped(box, partial); }; tracer.on_non_char_drawn = - [this](double * box) { covered_text_handler.add_non_char_bbox(box); }; + [this](double * box) { covered_text_detecor.add_non_char_bbox(box); }; } HTMLRenderer::~HTMLRenderer() @@ -192,7 +192,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) #endif { - covered_text_handler.reset(); + covered_text_detecor.reset(); tracer.reset(state); this->pageNum = pageNum; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 95d8186..5d5ecd9 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -153,7 +153,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) bool HTMLRenderer::is_char_covered(int index) { - auto covered = covered_text_handler.get_chars_covered(); + auto covered = covered_text_detecor.get_chars_covered(); if (index < 0 || index >= 
(int)covered.size()) { std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: " From 448b0d6dad518ead618aab51c1b41bc07cfc3187 Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sun, 29 Jun 2014 14:55:48 +0800 Subject: [PATCH 23/23] Rename files CoveredTextHandler.* to CoveredTextDetector.* --- CMakeLists.txt | 4 ++-- src/{CoveredTextHandler.cc => CoveredTextDetector.cc} | 4 ++-- src/{CoveredTextHandler.h => CoveredTextDetector.h} | 4 ++-- src/HTMLRenderer/HTMLRenderer.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) rename src/{CoveredTextHandler.cc => CoveredTextDetector.cc} (94%) rename src/{CoveredTextHandler.h => CoveredTextDetector.h} (96%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d75405..8f0aad4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,8 +161,8 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC} src/Base64Stream.cc src/Color.h src/Color.cc - src/CoveredTextHandler.h - src/CoveredTextHandler.cc + src/CoveredTextDetector.h + src/CoveredTextDetector.cc src/DrawingTracer.h src/DrawingTracer.cc src/HTMLState.h diff --git a/src/CoveredTextHandler.cc b/src/CoveredTextDetector.cc similarity index 94% rename from src/CoveredTextHandler.cc rename to src/CoveredTextDetector.cc index 31da294..e109b3f 100644 --- a/src/CoveredTextHandler.cc +++ b/src/CoveredTextDetector.cc @@ -1,11 +1,11 @@ /* - * CoveredTextHandler.cc + * CoveredTextDetector.cc * * Created on: 2014-6-14 * Author: duanyao */ -#include "CoveredTextHandler.h" +#include "CoveredTextDetector.h" #include "util/math.h" diff --git a/src/CoveredTextHandler.h b/src/CoveredTextDetector.h similarity index 96% rename from src/CoveredTextHandler.h rename to src/CoveredTextDetector.h index 3203053..bee6c17 100644 --- a/src/CoveredTextHandler.h +++ b/src/CoveredTextDetector.h @@ -1,5 +1,5 @@ /* - * CoveredTextHandler.h + * CoveredTextDetector.h * * Created on: 2014-6-14 * Author: duanyao @@ -58,4 +58,4 @@ private: } -#endif /* COVEREDTEXTHANDLER_H__ */ +#endif /* 
COVEREDTEXTDETECTOR_H__ */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 0dd107b..a8d9e12 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -31,7 +31,7 @@ #include "HTMLTextPage.h" #include "BackgroundRenderer/BackgroundRenderer.h" -#include "CoveredTextHandler.h" +#include "CoveredTextDetector.h" #include "DrawingTracer.h" #include "util/const.h"