diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ce16cc..bb47b10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,6 +161,8 @@ set(PDF2HTMLEX_SRC ${PDF2HTMLEX_SRC} src/Base64Stream.cc src/Color.h src/Color.cc + src/CoveredTextHandler.h + src/CoveredTextHandler.cc src/HTMLState.h src/HTMLTextLine.h src/HTMLTextLine.cc diff --git a/pdf2htmlEX.1.in b/pdf2htmlEX.1.in index 3d1d3bb..49266b0 100644 --- a/pdf2htmlEX.1.in +++ b/pdf2htmlEX.1.in @@ -242,6 +242,11 @@ If set to 0, pdf2htmlEX would try its best to balance the two methods above. .B --optimize-text <0|1> (Default: 0) If set to 1, pdf2htmlEX will try to reduce the number of HTML elements used for text. Turn it off if anything goes wrong. +.TP +.B --process-covered-text <0|1> (Default: 0) +If set to 1, pdf2htmlEX will try to detect texts covered by other graphics and properly arrange them, +i.e. covered texts are made transparent in text layer, and are drawn on background layer. + .SS Background Image .TP diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc index b2733c2..19d5795 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.cc +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -63,6 +63,13 @@ void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, { CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); } + // If a char is treated as image, it is not subject to cover test + // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
+ else if (param.process_covered_text) { + if (html_renderer->get_chars_covered()[drawn_char_count]) + CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + drawn_char_count++; + } } void CairoBackgroundRenderer::beginTextObject(GfxState *state) @@ -97,6 +104,7 @@ static GBool annot_cb(Annot *, void * pflag) { bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) { + drawn_char_count = 0; double page_width; double page_height; if(param.use_cropbox) diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h index b2d2f14..65164e6 100644 --- a/src/BackgroundRenderer/CairoBackgroundRenderer.h +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.h @@ -66,6 +66,7 @@ private: std::unordered_map bitmaps_ref_count; // id of bitmaps' stream used by current page std::vector bitmaps_in_current_page; + int drawn_char_count; }; } diff --git a/src/CoveredTextHandler.cc b/src/CoveredTextHandler.cc new file mode 100644 index 0000000..8557b57 --- /dev/null +++ b/src/CoveredTextHandler.cc @@ -0,0 +1,55 @@ +/* + * CoveredTextHandler.cc + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#include "CoveredTextHandler.h" + +#include "util/math.h" + +namespace pdf2htmlEX { + +CoveredTextHandler::CoveredTextHandler() +{ + // TODO Auto-generated constructor stub + +} + +CoveredTextHandler::~CoveredTextHandler() +{ + // TODO Auto-generated destructor stub +} + +void CoveredTextHandler::reset() +{ + char_bboxes.clear(); + chars_covered.clear(); +} + +void CoveredTextHandler::add_char_bbox(double * bbox) +{ + for (int i = 0; i < 4; i++) + char_bboxes.push_back(bbox[i]); + chars_covered.push_back(false); +} + +void CoveredTextHandler::add_non_char_bbox(double * bbox, int index) +{ + if (index < 0) + index = chars_covered.size(); + for (int i = 0; i < index; i++) + { + if (chars_covered[i]) + continue; + double * cbbox = &char_bboxes[i * 4]; + if (bbox_intersect(cbbox, bbox)) + { + 
chars_covered[i] = true; + add_non_char_bbox(cbbox, i); + } + } +} + +} diff --git a/src/CoveredTextHandler.h b/src/CoveredTextHandler.h new file mode 100644 index 0000000..34decf6 --- /dev/null +++ b/src/CoveredTextHandler.h @@ -0,0 +1,62 @@ +/* + * CoveredTextHandler.h + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#ifndef COVEREDTEXTHANDLER_H__ +#define COVEREDTEXTHANDLER_H__ + +#include + +namespace pdf2htmlEX { + +/** + * Detect characters that are covered by non-char graphics on a page. + */ +class CoveredTextHandler +{ +public: + CoveredTextHandler(); + virtual ~CoveredTextHandler(); + + /** + * Reset to initial state. Should be called when starting to draw a page. + */ + void reset(); + + /** + * Add a drawn character's bounding box. + * @param bbox (x0, y0, x1, y1) + */ + void add_char_bbox(double * bbox); + + /** + * Add a drawn non-char graphics' bounding box. + * If it intersects any previously drawn char's bbox, the char is marked as covered + * and treated as a non-char. + * @param bbox (x0, y0, x1, y1) + * @param index this graphics' drawing order: assume it is drawn after (index-1)th + * char. -1 means after the last char. + */ + void add_non_char_bbox(double * bbox, int index = -1); + + /** + * An array of flags indicating whether a char is covered by any non-char graphics. + * Indexed by the order in which these chars are added. + * This vector grows as add_char_bbox() is called, so its size is the count + * of currently drawn chars. + */ + const std::vector & get_chars_covered() { return chars_covered; } + +private: + //covered text test + std::vector chars_covered; + // x00, y00, x01, y01; x10, y10, x11, y11;... 
+ std::vector char_bboxes; +}; + +} + +#endif /* COVEREDTEXTHANDLER_H__ */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 73929ab..75293f8 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -31,6 +31,7 @@ #include "HTMLTextPage.h" #include "BackgroundRenderer/BackgroundRenderer.h" +#include "CoveredTextHandler.h" #include "util/const.h" #include "util/misc.h" @@ -125,6 +126,15 @@ public: virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg); + virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate); + virtual void stroke(GfxState *state) { css_do_path(state, false); } virtual void fill(GfxState *state) { css_do_path(state, true); } virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax); @@ -135,6 +145,8 @@ public: bool can_stroke(GfxState *state) { return css_do_path(state, false, true); } bool can_fill(GfxState *state) { return css_do_path(state, true, true); } + const std::vector & get_chars_covered() { return covered_text_handler.get_chars_covered(); } + protected: //////////////////////////////////////////////////// // misc @@ -215,6 +227,19 @@ protected: const GfxRGB * line_color, const GfxRGB * fill_color, void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr ); + //////////////////////////////////////////////////// + // Covered text handling + //////////////////////////////////////////////////// + /* + * Cue CoveredTextHandler that a character is drawn + * x, y: glyph-drawing position, in PDF text object space. + * ax, ay: glyph advance, in glyph space. 
+ */ + void add_char_bbox(GfxState *state, double x, double y, double ax, double ay); + /* + * Cue CoveredTextHandler that an image is drawn + */ + void add_image_bbox(GfxState *state); //////////////////////////////////////////////////// // PDF stuffs @@ -338,6 +363,8 @@ protected: std::string cur_page_filename; static const std::string MANIFEST_FILENAME; + + CoveredTextHandler covered_text_handler; }; } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 803bc2d..3d43ede 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -133,13 +133,10 @@ void HTMLRenderer::process(PDFDoc *doc) cur_page_filename = filled_template_filename; } - if(param.process_nontext) - { - fallback_bg_required = !bg_renderer->render_page(doc, i); - if (fallback_bg_required && fallback_bg_renderer != nullptr) - fallback_bg_renderer->render_page(doc, i); - } - + // We handle covered texts during doc->displayPage(this...), + // and bg_renderer->render_page() depends on the result, so it must be called after + // doc->displayPage(this...). 
+ covered_text_handler.reset(); doc->displayPage(this, i, text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, 0, @@ -148,6 +145,13 @@ void HTMLRenderer::process(PDFDoc *doc) false, // printing nullptr, nullptr, nullptr, nullptr); + if(param.process_nontext) + { + fallback_bg_required = !bg_renderer->render_page(doc, i); + if (fallback_bg_required && fallback_bg_renderer != nullptr) + fallback_bg_renderer->render_page(doc, i); + } + if(param.split_pages) { delete f_curpage; diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc index 9c3da52..3e4f8d0 100644 --- a/src/HTMLRenderer/image.cc +++ b/src/HTMLRenderer/image.cc @@ -14,6 +14,8 @@ namespace pdf2htmlEX { void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg) { + add_image_bbox(state); + return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg); #if 0 @@ -62,4 +64,30 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w #endif } +void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate) +{ + add_image_bbox(state); + + return OutputDev::drawSoftMaskedImage(state,ref,str, + width,height,colorMap,interpolate, + maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate); +} + +void HTMLRenderer::add_image_bbox(GfxState *state) +{ + if (!param.process_covered_text) + return; + auto ctm = state->getCTM(); + double bbox[4] {0, 0, 1, 1}; + tm_transform_bbox(ctm, bbox); + covered_text_handler.add_non_char_bbox(bbox); +} + } // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 320fd81..8df9700 100644 --- a/src/HTMLRenderer/state.cc +++ 
b/src/HTMLRenderer/state.cc @@ -119,6 +119,9 @@ void HTMLRenderer::reset_state() cur_line_state.y = 0; memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); + if (param.process_covered_text) + cur_line_state.chars_covered = &covered_text_handler.get_chars_covered(); + cur_clip_state.xmin = 0; cur_clip_state.xmax = 0; cur_clip_state.ymin = 0; @@ -502,6 +505,8 @@ void HTMLRenderer::prepare_text_line(GfxState * state) double rise_x, rise_y; state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); + if (param.process_covered_text) + cur_line_state.first_char_index = covered_text_handler.get_chars_covered().size(); html_text_page.open_new_line(cur_line_state); cur_text_state.vertical_align = 0; diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index dafe510..0122356 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -72,6 +72,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) cerr << "TODO: non-zero origins" << endl; } + add_char_bbox(state, dx, dy, dx1, dy1); + bool is_space = false; if (n == 1 && *p == ' ') { @@ -143,4 +145,43 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) draw_ty += dy; } +void HTMLRenderer::add_char_bbox(GfxState *state, double x, double y, double ax, double ay) +{ + if (!param.process_covered_text) + return; + + Matrix tm_ctm, tm, itm; + memcpy(tm_ctm.m, this->cur_text_tm, sizeof(tm_ctm.m)); + memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); + double fs = state->getFontSize(); + + double cx = state->getCurX(), cy = state->getCurY(), + ry = state->getRise(), h = state->getHorizScaling(); + + //cx and cy has been transformed by text matrix, we need to reverse them. + tm.invertTo(&itm); + double char_cx, char_cy; + itm.transform(cx, cy, &char_cx, &char_cy); + + //TODO Vertical? 
Currently vertical/type3 chars are treated as non-chars. + double tchar[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; + + double tfinal[6]; + tm_multiply(tfinal, tm_ctm.m, tchar); + + auto font = state->getFont(); + double bbox[4] {0, 0, ax, ay}; + double desc = font->getDescent(), asc = font->getAscent(); + if (font->getWMode() == 0) + { + bbox[1] += desc; + bbox[3] += asc; + } + else + {//TODO Vertical? + } + tm_transform_bbox(tfinal, bbox); + covered_text_handler.add_char_bbox(bbox); +} + } // namespace pdf2htmlEX diff --git a/src/HTMLState.h b/src/HTMLState.h index b8a470b..9327e5c 100644 --- a/src/HTMLState.h +++ b/src/HTMLState.h @@ -62,6 +62,9 @@ struct HTMLLineState { double x,y; double transform_matrix[4]; + // The page-scope char index (in drawing order) of the first char in this line. + int first_char_index = -1; + const std::vector * chars_covered = nullptr; }; struct HTMLClipState diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index f2320c5..ba32209 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -69,6 +69,32 @@ void HTMLTextLine::append_state(const HTMLTextState & text_state) last_state.font_size *= last_state.font_info->font_size_scale; } +void HTMLTextLine::dump_chars(ostream & out, const Unicode * u, int uLen) +{ + if (!line_state.chars_covered) + { + writeUnicodes(out, u, uLen); + return; + } + + //TODO merge sibling invisible spans + int start = this->line_state.first_char_index + dumped_char_count; + for(int i = 0; i < uLen; i++) + { + if (!(*line_state.chars_covered)[start + i]) //visible + { + writeUnicodes(out, u + i, 1); + } + else + { + out << ""; + writeUnicodes(out, u + i, 1); + out << ""; + } + } + dumped_char_count += uLen; +} + void HTMLTextLine::dump_text(ostream & out) { /* @@ -84,6 +110,8 @@ void HTMLTextLine::dump_text(ostream & out) return; } + dumped_char_count = 0; + // Start Output { // open
for the current text line @@ -216,7 +244,7 @@ void HTMLTextLine::dump_text(ostream & out) size_t next_text_idx = text_idx2; if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx) next_text_idx = cur_offset_iter->start_idx; - writeUnicodes(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); + dump_chars(out, (&text.front()) + cur_text_idx, next_text_idx - cur_text_idx); cur_text_idx = next_text_idx; } } diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h index 782b491..8fa814f 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -91,6 +91,7 @@ public: private: void optimize_normal(std::vector &); void optimize_aggressive(std::vector &); + void dump_chars(std::ostream & out, const Unicode * u, int uLen); const Param & param; AllStateManager & all_manager; @@ -103,6 +104,8 @@ private: std::vector states; std::vector offsets; std::vector text; + + int dumped_char_count; }; } // namespace pdf2htmlEX diff --git a/src/Param.h b/src/Param.h index 84a2f55..d3d67b2 100644 --- a/src/Param.h +++ b/src/Param.h @@ -38,6 +38,7 @@ struct Param int process_nontext; int process_outline; int process_annotation; + int process_covered_text; int printing; int fallback; int tmp_file_size_limit; diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc index cd1ae46..d338ab9 100644 --- a/src/pdf2htmlEX.cc +++ b/src/pdf2htmlEX.cc @@ -187,6 +187,7 @@ void parse_options (int argc, char **argv) .add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets") .add("tounicode", ¶m.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)") .add("optimize-text", ¶m.optimize_text, 0, "try to reduce the number of HTML elements used for text") + .add("process-covered-text", ¶m.process_covered_text, 0, "try to detect texts covered by other graphics and properly arrange them") // background image .add("bg-format", ¶m.bg_format, "png", "specify background image format") diff --git a/src/util/math.bk.cc 
b/src/util/math.bk.cc new file mode 100644 index 0000000..0abafaf --- /dev/null +++ b/src/util/math.bk.cc @@ -0,0 +1,72 @@ +#include <cstring> +#include <limits> +#include <algorithm> + +#include "math.h" + +using std::min; +using std::max; + +namespace pdf2htmlEX { + +void tm_transform(const double * tm, double & x, double & y, bool is_delta) /* apply the affine matrix tm to (x, y) in place; when is_delta is set the translation tm[4], tm[5] is skipped */ +{ + double xx = x, yy = y; + x = tm[0] * xx + tm[2] * yy; + y = tm[1] * xx + tm[3] * yy; + if(!is_delta) + { + x += tm[4]; + y += tm[5]; + } +} + +void tm_multiply(const double * tm_left, const double * tm_right, double * tm_result) /* tm_result = tm_left * tm_right (affine product: tm_right is applied first); tm_result may alias tm_left */ +{ + double old[4]; /* snapshot of the linear part, so writes to an aliased tm_result cannot corrupt later terms */ + memcpy(old, tm_left, sizeof(old)); + + tm_result[0] = old[0] * tm_right[0] + old[2] * tm_right[1]; + tm_result[1] = old[1] * tm_right[0] + old[3] * tm_right[1]; + tm_result[2] = old[0] * tm_right[2] + old[2] * tm_right[3]; + tm_result[3] = old[1] * tm_right[2] + old[3] * tm_right[3]; + tm_result[4] = old[0] * tm_right[4] + old[2] * tm_right[5] + tm_left[4]; /* assign instead of accumulating into the caller's (possibly uninitialised) buffer */ + tm_result[5] = old[1] * tm_right[4] + old[3] * tm_right[5] + tm_left[5]; +} + +void tm_transform_bbox(const double * tm, double * bbox) /* replace bbox (x0, y0, x1, y1) with the axis-aligned bounds of its four corners transformed by tm */ +{ + double & x1 = bbox[0]; + double & y1 = bbox[1]; + double & x2 = bbox[2]; + double & y2 = bbox[3]; + double _[4][2]; + _[0][0] = _[1][0] = x1; + _[0][1] = _[2][1] = y1; + _[2][0] = _[3][0] = x2; + _[1][1] = _[3][1] = y2; + + x1 = y1 = std::numeric_limits<double>::max(); + x2 = y2 = std::numeric_limits<double>::lowest(); /* min() is the smallest positive double and would give a wrong upper bound for boxes with only negative coordinates */ + for(int i = 0; i < 4; ++i) + { + auto & x = _[i][0]; + auto & y = _[i][1]; + tm_transform(tm, x, y); + if(x < x1) x1 = x; + if(x > x2) x2 = x; + if(y < y1) y1 = y; + if(y > y2) y2 = y; + } +} + +bool bbox_intersect(double * bbox1, double * bbox2) /* true when the two boxes overlap with non-zero area; the corner order inside each box does not matter */ +{ + return min(bbox1[0], bbox1[2]) < max(bbox2[0], bbox2[2]) + && max(bbox1[0], bbox1[2]) > min(bbox2[0], bbox2[2]) + && min(bbox1[1], bbox1[3]) < max(bbox2[1], bbox2[3]) + && max(bbox1[1], bbox1[3]) > min(bbox2[1], bbox2[3]); +} + +} //namespace pdf2htmlEX + diff --git a/src/util/math.cc b/src/util/math.cc index fd8e77c..fb898c6 100644 --- 
a/src/util/math.cc +++ b/src/util/math.cc @@ -1,8 +1,12 @@ #include #include +#include #include "math.h" +using std::min; +using std::max; + namespace pdf2htmlEX { void tm_transform(const double * tm, double & x, double & y, bool is_delta) @@ -56,5 +60,13 @@ void tm_transform_bbox(const double * tm, double * bbox) } } +bool bbox_intersect(double * bbox1, double * bbox2) +{ + return min(bbox1[0], bbox1[2]) < max(bbox2[0], bbox2[2]) + && max(bbox1[0], bbox1[2]) > min(bbox2[0], bbox2[2]) + && min(bbox1[1], bbox1[3]) < max(bbox2[1], bbox2[3]) + && max(bbox1[1], bbox1[3]) > min(bbox2[1], bbox2[3]); +} + } //namespace pdf2htmlEX diff --git a/src/util/math.h b/src/util/math.h index 759bbcc..fcdebc4 100644 --- a/src/util/math.h +++ b/src/util/math.h @@ -40,5 +40,7 @@ void tm_transform(const double * tm, double & x, double & y, bool is_delta = fal void tm_multiply(double * tm_left, const double * tm_right); void tm_transform_bbox(const double * tm, double * bbox); +bool bbox_intersect(double * bbox1, double * bbox2); + } //namespace pdf2htmlEX #endif //MATH_H__