diff --git a/share/base.css.in b/share/base.css.in index 83a9246..cffa338 100644 --- a/share/base.css.in +++ b/share/base.css.in @@ -49,7 +49,7 @@ .@CSS_CSS_DRAW_CN@ { display:none; } } /* Part 2: Page Elements: Modify with caution - * The followings are base classes, which are meant to be override by PDF specific classes + * The followings are base classes, some of which are meant to be override by PDF specific classes * So do not increase the specificity (e.g. ".classname" -> "#page-container .classname") */ .@CSS_PAGE_DECORATION_CN@ { /* page decoration */ @@ -71,12 +71,15 @@ .@CSS_PAGE_CONTENT_BOX_CN@ { /* content of a page */ position:absolute; border-width:0; + padding:0; + margin:0; top:0; left:0; width:100%; height:100%; overflow:hidden; display:block; + /* set transform-origin for scaling */ transform-origin:0% 0%; -ms-transform-origin:0% 0%; -moz-transform-origin:0% 0%; @@ -114,6 +117,14 @@ .@CSS_PAGE_CONTENT_BOX_CN@ {overflow:visible;} } } +.@CSS_CLIP_CN@ { /* clip box */ + position:absolute; + border-width:0; + padding:0; + margin:0; + overflow:hidden; + display:block; +} .@CSS_LINE_CN@ { /* text line */ position:absolute; white-space:pre; @@ -144,7 +155,7 @@ span { /* text blocks within a line */ .@CSS_PAGE_DATA_CN@ { /* info for Javascript */ display:none; } -.@CSS_LINE_CN@ { /* annotation links */ +.@CSS_LINK_CN@ { /* annotation links */ } /* transparent color - WebKit */ .@CSS_CSS_DRAW_CN@ { /* css drawing */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 80e3182..cdd17e0 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -296,8 +296,7 @@ protected: { NLS_NONE, NLS_NEWSTATE, - NLS_NEWLINE, - NLS_NEWCLIP + NLS_NEWLINE } new_line_state; // for font reencoding diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 950501f..daafcea 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -173,8 +173,13 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) { this->pageNum = pageNum; - long long wid = all_manager.width.install(state->getPageWidth()); - long long hid = all_manager.height.install(state->getPageHeight()); + double pageWidth = state->getPageWidth(); + double pageHeight = state->getPageHeight(); + + html_text_page.set_page_size(pageWidth, pageHeight); + + long long wid = all_manager.width.install(pageWidth); + long long hid = all_manager.height.install(pageHeight); f_pages.fs << "
getClipBBox(&x1, &y1, &x2, &y2); + html_text_page.clip(x1, y1, x2, y2); + } + bool need_recheck_position = false; bool need_rescale_font = false; bool draw_text_scale_changed = false; diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index 598b95f..e2d81f7 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -29,6 +29,8 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para :param(param) ,all_manager(all_manager) ,line_state(line_state) + ,clip_x1(0) + ,clip_y1(0) { } void HTMLTextLine::append_unicodes(const Unicode * u, int l) @@ -81,9 +83,9 @@ void HTMLTextLine::dump_text(ostream & out) // open
for the current text line out << "
states; std::vector offsets; diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc index 8780e5e..d1e092b 100644 --- a/src/HTMLTextPage.cc +++ b/src/HTMLTextPage.cc @@ -7,6 +7,7 @@ */ #include "HTMLTextPage.h" +#include "util/css_const.h" namespace pdf2htmlEX { @@ -17,6 +18,8 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager) : param(param) , all_manager(all_manager) , cur_line(nullptr) + , page_width(0) + , page_height(0) { } void HTMLTextPage::dump_text(ostream & out) @@ -25,8 +28,42 @@ void HTMLTextPage::dump_text(ostream & out) (*iter)->prepare(); if(param.optimize_text) optimize(); - for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) - (*iter)->dump_text(out); + + //push a dummy entry for convenience + clip_boxes.emplace_back(0, 0, page_width, page_height, text_lines.size()); + + ClipBox cur_cb(0, 0, page_width, page_height, 0); + bool has_clip = false; + + auto text_line_iter = text_lines.begin(); + for(auto clip_iter = clip_boxes.begin(); clip_iter != clip_boxes.end(); ++clip_iter) + { + if(has_clip) + { + out << "
"; + } + + auto next_text_line_iter = text_lines.begin() + clip_iter->start_idx; + while(text_line_iter != next_text_line_iter) + { + (*text_line_iter)->clip(cur_cb.x1, cur_cb.y1, cur_cb.x2, cur_cb.y2); + (*text_line_iter)->dump_text(out); + ++text_line_iter; + } + if(has_clip) + { + out << "
"; + } + + cur_cb = *clip_iter; + has_clip = !(equal(0, cur_cb.x1) && equal(0, cur_cb.y1) + && equal(page_width, cur_cb.x2) && equal(page_height, cur_cb.y2)); + } } void HTMLTextPage::dump_css(ostream & out) @@ -37,6 +74,7 @@ void HTMLTextPage::dump_css(ostream & out) void HTMLTextPage::clear(void) { text_lines.clear(); + clip_boxes.clear(); cur_line = nullptr; } @@ -50,6 +88,40 @@ void HTMLTextPage::open_new_line(const HTMLLineState & line_state) cur_line = text_lines.back().get(); } +void HTMLTextPage::set_page_size(double width, double height) +{ + page_width = width; + page_height = height; +} + +void HTMLTextPage::clip(double x1, double y1, double x2, double y2) +{ + if(!clip_boxes.empty()) + { + auto & cb = clip_boxes.back(); + if(cb.start_idx == text_lines.size()) + { + /* + * Previous ClipBox is not used + */ + cb.x1 = x1; + cb.y1 = y1; + cb.x2 = x2; + cb.y2 = y2; + return; + } + if(equal(cb.x1, x1) && equal(cb.y1, y1) + && equal(cb.x2, x2) && equal(cb.y2, y2)) + { + /* + * same as previous ClipBox + */ + return; + } + } + clip_boxes.emplace_back(x1, y1, x2, y2, text_lines.size()); +} + void HTMLTextPage::optimize(void) { //TODO diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h index 5e6683e..2125519 100644 --- a/src/HTMLTextPage.h +++ b/src/HTMLTextPage.h @@ -35,6 +35,10 @@ public: void clear(void); void open_new_line(const HTMLLineState & line_state); + + /* for clipping */ + void set_page_size(double width, double height); + void clip(double x1, double y1, double x2, double y2); private: void optimize(void); @@ -42,7 +46,18 @@ private: const Param & param; AllStateManager & all_manager; HTMLTextLine * cur_line; + double page_width, page_height; + std::vector> text_lines; + + struct ClipBox { + ClipBox(double x1, double y1, double x2, double y2, size_t start_idx) + :x1(x1),y1(y1),x2(x2),y2(y2),start_idx(start_idx) + { } + double x1, y1, x2, y2; + size_t start_idx; + }; + std::vector clip_boxes; }; } //namespace pdf2htmlEX diff --git a/src/css_class_names.cmakelists.txt b/src/css_class_names.cmakelists.txt index 6b604e6..18217f5 100644 --- a/src/css_class_names.cmakelists.txt +++ b/src/css_class_names.cmakelists.txt @@ -9,6 +9,7 @@ set(CSS_INVALID_ID "_") set(CSS_LINE_CN "t") # Text set(CSS_TRANSFORM_MATRIX_CN "m") # Matrix +set(CSS_CLIP_CN "c") # Clip set(CSS_PAGE_DECORATION_CN "pd") # Page Decoration set(CSS_PAGE_FRAME_CN "pf") # Page Frame diff --git a/src/util/css_const.h.in b/src/util/css_const.h.in index 1e9f1aa..edc4f0f 100644 --- a/src/util/css_const.h.in +++ b/src/util/css_const.h.in @@ -26,6 +26,7 @@ const char * const INVALID_ID = "@CSS_INVALID_ID@"; const char * const LINE_CN = "@CSS_LINE_CN@"; const char * const TRANSFORM_MATRIX_CN = "@CSS_TRANSFORM_MATRIX_CN@"; +const char * const CLIP_CN = "@CSS_CLIP_CN@"; // page_decoration is for shadow etc // page_frame cannot have margin or border-width, pdf2htmlEX.js will use it to determine the coordinates