From 6079a05e345b59ab4493ef69f761df7ce6072d0e Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 4 Sep 2012 23:33:15 +0800 Subject: [PATCH] first implemenation of LineBuffer --- CMakeLists.txt | 1 + src/HTMLRenderer.h | 78 ++++++++++++---- src/HTMLRenderer/LineBuffer.cc | 158 +++++++++++++++++++++++++++++++++ src/HTMLRenderer/export.cc | 20 ++--- src/HTMLRenderer/general.cc | 5 +- src/HTMLRenderer/install.cc | 12 +-- src/HTMLRenderer/state.cc | 127 ++++++++------------------ src/HTMLRenderer/text.cc | 4 +- 8 files changed, 279 insertions(+), 126 deletions(-) create mode 100644 src/HTMLRenderer/LineBuffer.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index c573001..dbe133e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ add_executable(pdf2htmlEX src/HTMLRenderer/text.cc src/HTMLRenderer/image.cc src/HTMLRenderer/namespace.h + src/HTMLRenderer/LineBuffer.cc src/ff/ff.h src/ff/ff.c src/BackgroundRenderer.h diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index eed9a65..54c4766 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -132,7 +132,7 @@ class HTMLRenderer : public OutputDev //////////////////////////////////////////////////// // manage styles //////////////////////////////////////////////////// - FontInfo install_font(GfxFont * font); + const FontInfo * install_font(GfxFont * font); void install_embedded_font(GfxFont * font, FontInfo & info); void install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info); void install_external_font (GfxFont * font, FontInfo & info); @@ -193,15 +193,13 @@ class HTMLRenderer : public OutputDev //////////////////////////////////////////////////// // states //////////////////////////////////////////////////// - //line status - //indicating the status for current line & next line - //see comments: meaning for current line || meaning for next line - enum class LineStatus + bool line_opened; + enum class NewLineState { - NONE, // no line is opened (last
<div> is closed) || stay with the same style - SPAN, // there's a pending opening <span> (within a pending opening <div>
) || open a new <span> if possible, otherwise a new <div>
- DIV // there's a pending opening <div>
(but no <span>) || has to open a new <div>
- } line_status, new_line_status; + NONE, // stay with the same style + SPAN, // open a new <span> if possible, otherwise a new <div>
+ DIV // has to open a new <div>
+ } new_line_state; // The order is according to the appearance in check_state_change // any state changed @@ -211,7 +209,7 @@ class HTMLRenderer : public OutputDev bool text_pos_changed; // font & size - FontInfo cur_font_info; + const FontInfo * cur_font_info; double cur_font_size; long long cur_fs_id; bool font_changed; @@ -264,11 +262,61 @@ class HTMLRenderer : public OutputDev double draw_tx, draw_ty; // some metrics have to be determined after all elements in the lines have been seen - // TODO: add a class for these - double line_x, line_y; - long long line_tm_id; - double line_ascent, line_height; - std::stringstream line_buf; + class LineBuffer { + public: + LineBuffer (HTMLRenderer * renderer) : renderer(renderer) { } + + class State { + public: + void begin(std::ostream & out) const; + static void end(std::ostream & out); + + enum { + FONT_ID, + FONT_SIZE_ID, + COLOR_ID, + LETTER_SPACE_ID, + WORD_SPACE_ID, + RISE_ID, + + ID_COUNT + }; + + long long ids[ID_COUNT]; + double ascent; + size_t start_idx; // index of the first Text using this state + + static const char * format_str; // class names for each id + }; + + + class Offset { + public: + size_t start_idx; // should put this idx before text[start_idx]; + double width; + }; + + void reset(GfxState * state); + void append_unicodes(const Unicode * u, int l); + void append_offset(double width); + void append_state(void); + void flush(void); + + private: + // retrieve state from renderer + void set_state(State & state); + + HTMLRenderer * renderer; + + double x, y; + long long tm_id; + + std::vector<State> states; + std::vector<Offset> offsets; + std::vector<Unicode> text; + + } line_buf; + friend class LineBuffer; // for font reencoding int32_t * cur_mapping; diff --git a/src/HTMLRenderer/LineBuffer.cc b/src/HTMLRenderer/LineBuffer.cc new file mode 100644 index 0000000..3cc74cc --- /dev/null +++ b/src/HTMLRenderer/LineBuffer.cc @@ -0,0 +1,158 @@ +/* + * LineBuffer.cc + * + * Generate optimized HTML for one line + * + * by 
WangLu + * 2012.09.04 + */ + +#include "HTMLRenderer.h" +#include "HTMLRenderer/namespace.h" + +using std::min; +using std::max; +using std::hex; +using std::dec; + +void HTMLRenderer::LineBuffer::reset(GfxState * state) +{ + state->transform(state->getCurX(), state->getCurY(), &x, &y); + tm_id = renderer->cur_tm_id; +} + +void HTMLRenderer::LineBuffer::append_unicodes(const Unicode * u, int l) +{ + text.insert(text.end(), u, u+l); +} + +void HTMLRenderer::LineBuffer::append_offset(double width) +{ + if((!offsets.empty()) && (offsets.back().start_idx == text.size())) + offsets.back().width += width; + else + offsets.push_back({text.size(), width}); +} + +void HTMLRenderer::LineBuffer::append_state(void) +{ + if(states.empty() || (states.back().start_idx != text.size())) + { + states.resize(states.size() + 1); + states.back().start_idx = text.size(); + } + + set_state(states.back()); +} + +void HTMLRenderer::LineBuffer::flush(void) +{ + /* + * Each Line is an independent absolute positioned block + * so even if we have a few states or offsets, we may omit them + */ + if(text.empty()) return; + + if(states.empty() || (states[0].start_idx != 0)) + { + cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl; + return; + } + + states.resize(states.size() + 1); + states.back().start_idx = text.size(); + + offsets.push_back({text.size(), 0}); + + // TODO: optimize state + double max_ascent = 0; + for(const State & s : states) + max_ascent = max(max_ascent, s.ascent); + + // TODO: class for height ? + ostream & out = renderer->html_fout; + out << format("
") + % x % y + % max_ascent + % tm_id + ; + + auto cur_state_iter = states.begin(); + auto cur_offset_iter = offsets.begin(); + + double dx = 0; + + size_t cur_text_idx = 0; + while(cur_text_idx < text.size()) + { + if(cur_text_idx >= cur_state_iter->start_idx) + { + if(cur_text_idx) + State::end(out); + + cur_state_iter->begin(out); + + ++ cur_state_iter; + } + + if(cur_text_idx >= cur_offset_iter->start_idx) + { + double target = cur_offset_iter->width + dx; + double w; + + auto wid = renderer->install_whitespace(target, w); + + // TODO +// double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold); + double threshold = 0; + out << format("%2%") % wid % (target > (threshold - EPS) ? " " : ""); + + dx = target - w; + + ++ cur_offset_iter; + } + + size_t next_text_idx = min(cur_state_iter->start_idx, cur_offset_iter->start_idx); + outputUnicodes(out, text.data() + cur_text_idx, next_text_idx - cur_text_idx); + cur_text_idx = next_text_idx; + } + + State::end(out); + out << "
"; + + + states.clear(); + offsets.clear(); + text.clear(); + +} + +void HTMLRenderer::LineBuffer::set_state (State & state) +{ + state.ids[State::FONT_ID] = renderer->cur_font_info->id; + state.ids[State::FONT_SIZE_ID] = renderer->cur_fs_id; + state.ids[State::COLOR_ID] = renderer->cur_color_id; + state.ids[State::LETTER_SPACE_ID] = renderer->cur_ls_id; + state.ids[State::WORD_SPACE_ID] = renderer->cur_ws_id; + state.ids[State::RISE_ID] = renderer->cur_rise_id; + + state.ascent = renderer->cur_font_info->ascent * renderer->draw_font_size; +} + +void HTMLRenderer::LineBuffer::State::begin (ostream & out) const +{ + out << " 0) out << ' '; + out << format("%1%%|2$x|") % format_str[i] % ids[i]; + } + out << "\">"; +} + +void HTMLRenderer::LineBuffer::State::end(ostream & out) +{ + out << ""; +} + +const char * HTMLRenderer::LineBuffer::State::format_str = "fsclwr"; diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc index a61c7e6..d6e1b2e 100644 --- a/src/HTMLRenderer/export.cc +++ b/src/HTMLRenderer/export.cc @@ -16,7 +16,7 @@ using boost::algorithm::ifind_first; -void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font) +void HTMLRenderer::export_remote_font(const FontInfo & info, const string & suffix, const string & fontfileformat, GfxFont * font) { allcss_fout << format("@font-face{font-family:f%|1$x|;src:url(") % info.id; @@ -45,12 +45,12 @@ static string general_font_family(GfxFont * font) } // TODO: this function is called when some font is unable to process, may use the name there as a hint -void HTMLRenderer::export_remote_default_font(long long fn_id) +void HTMLRenderer::export_remote_default_font(long long fn_id) { allcss_fout << format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id << endl; } -void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont) +void 
HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont) { allcss_fout << format(".f%|1$x|{") % info.id; allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";"; @@ -68,12 +68,12 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons allcss_fout << "}" << endl; } -void HTMLRenderer::export_font_size (long long fs_id, double font_size) +void HTMLRenderer::export_font_size (long long fs_id, double font_size) { allcss_fout << format(".s%|1$x|{font-size:%2%px;}") % fs_id % font_size << endl; } -void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) +void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) { allcss_fout << format(".t%|1$x|{") % tm_id; @@ -101,24 +101,24 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) allcss_fout << "}" << endl; } -void HTMLRenderer::export_letter_space (long long ls_id, double letter_space) +void HTMLRenderer::export_letter_space (long long ls_id, double letter_space) { allcss_fout << format(".l%|1$x|{letter-spacing:%2%px;}") % ls_id % letter_space << endl; } -void HTMLRenderer::export_word_space (long long ws_id, double word_space) +void HTMLRenderer::export_word_space (long long ws_id, double word_space) { allcss_fout << format(".w%|1$x|{word-spacing:%2%px;}") % ws_id % word_space << endl; } -void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) +void HTMLRenderer::export_color (long long color_id, const GfxRGB * rgb) { allcss_fout << format(".c%|1$x|{color:rgb(%2%,%3%,%4%);}") % color_id % (int)colToByte(rgb->r) % (int)colToByte(rgb->g) % (int)colToByte(rgb->b) << endl; } -void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) +void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) { if(ws_width > 0) allcss_fout << 
format("._%|1$x|{display:inline-block;width:%2%px;}") % ws_id % ws_width << endl; @@ -126,7 +126,7 @@ void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) allcss_fout << format("._%|1$x|{display:inline;margin-left:%2%px;}") % ws_id % ws_width << endl; } -void HTMLRenderer::export_rise (long long rise_id, double rise) +void HTMLRenderer::export_rise (long long rise_id, double rise) { allcss_fout << format(".r%|1$x|{top:%2%px;}") % rise_id % (-rise) << endl; } diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index ac587e2..ab15a4c 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -25,7 +25,8 @@ static void dummy(void *, ErrorCategory, int pos, char *) } HTMLRenderer::HTMLRenderer(const Param * param) - :line_status(LineStatus::NONE) + :line_opened(false) + ,line_buf(this) ,image_count(0) ,param(param) ,dest_dir(param->dest_dir) @@ -147,7 +148,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) this->pageWidth = state->getPageWidth(); this->pageHeight = state->getPageHeight(); - assert(line_status == LineStatus::NONE); + assert((!line_opened) && "Open line in startPage detected!"); html_fout << format("
second; + return &(iter->second); long long new_fn_id = font_name_map.size(); @@ -38,7 +38,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font) if(font == nullptr) { export_remote_default_font(new_fn_id); - return cur_info_iter->second; + return &(cur_info_iter->second); } cur_info_iter->second.ascent = font->getAscent(); @@ -52,12 +52,12 @@ FontInfo HTMLRenderer::install_font(GfxFont * font) if(font->getType() == fontType3) { cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl; export_remote_default_font(new_fn_id); - return cur_info_iter->second; + return &(cur_info_iter->second); } if(font->getWMode()) { cerr << "Writing mode is unsupported and will be rendered as Image" << endl; export_remote_default_font(new_fn_id); - return cur_info_iter->second; + return &(cur_info_iter->second); } auto * font_loc = font->locateFont(xref, gTrue); @@ -86,7 +86,7 @@ FontInfo HTMLRenderer::install_font(GfxFont * font) export_remote_default_font(new_fn_id); } - return cur_info_iter->second; + return &(cur_info_iter->second); } void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info) diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 545e3ed..5ff8eae 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -72,13 +72,10 @@ void HTMLRenderer::updateFillColor(GfxState * state) } void HTMLRenderer::check_state_change(GfxState * state) { - //TODO: - // close but not
, to use the first style of the line - // DEPENDENCY WARNING // don't adjust the order of state checking - new_line_status = LineStatus::NONE; + new_line_state = NewLineState::NONE; bool need_recheck_position = false; bool need_rescale_font = false; @@ -94,11 +91,11 @@ void HTMLRenderer::check_state_change(GfxState * state) // font name & size if(all_changed || font_changed) { - FontInfo new_font_info = install_font(state->getFont()); + const FontInfo * new_font_info = install_font(state->getFont()); - if(!(new_font_info.id == cur_font_info.id)) + if(!(new_font_info->id == cur_font_info->id)) { - new_line_status = max(new_line_status, LineStatus::SPAN); + new_line_state = max(new_line_state, NewLineState::SPAN); cur_font_info = new_font_info; } @@ -168,13 +165,13 @@ void HTMLRenderer::check_state_change(GfxState * state) if(!(_equal(new_draw_font_size, draw_font_size))) { - new_line_status = max(new_line_status, LineStatus::SPAN); + new_line_state = max(new_line_state, NewLineState::SPAN); draw_font_size = new_draw_font_size; cur_fs_id = install_font_size(draw_font_size); } if(!(_tm_equal(new_draw_ctm, draw_ctm, 4))) { - new_line_status = max(new_line_status, LineStatus::DIV); + new_line_state = max(new_line_state, NewLineState::DIV); memcpy(draw_ctm, new_draw_ctm, sizeof(draw_ctm)); cur_tm_id = install_transform_matrix(draw_ctm); } @@ -236,7 +233,7 @@ void HTMLRenderer::check_state_change(GfxState * state) if(!merged) { - new_line_status = max(new_line_status, LineStatus::DIV); + new_line_state = max(new_line_state, NewLineState::DIV); } } @@ -247,7 +244,7 @@ void HTMLRenderer::check_state_change(GfxState * state) double new_letter_space = state->getCharSpace(); if(!_equal(cur_letter_space, new_letter_space)) { - new_line_status = max(new_line_status, LineStatus::SPAN); + new_line_state = max(new_line_state, NewLineState::SPAN); cur_letter_space = new_letter_space; cur_ls_id = install_letter_space(cur_letter_space * draw_scale); } @@ -260,7 +257,7 @@ void 
HTMLRenderer::check_state_change(GfxState * state) double new_word_space = state->getWordSpace(); if(!_equal(cur_word_space, new_word_space)) { - new_line_status = max(new_line_status, LineStatus::SPAN); + new_line_state = max(new_line_state, NewLineState::SPAN); cur_word_space = new_word_space; cur_ws_id = install_word_space(cur_word_space * draw_scale); } @@ -273,7 +270,7 @@ void HTMLRenderer::check_state_change(GfxState * state) state->getFillRGB(&new_color); if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b))) { - new_line_status = max(new_line_status, LineStatus::SPAN); + new_line_state = max(new_line_state, NewLineState::SPAN); cur_color = new_color; cur_color_id = install_color(&new_color); } @@ -286,7 +283,7 @@ void HTMLRenderer::check_state_change(GfxState * state) double new_rise = state->getRise(); if(!_equal(cur_rise, new_rise)) { - new_line_status = max(new_line_status, LineStatus::SPAN); + new_line_state = max(new_line_state, NewLineState::SPAN); cur_rise = new_rise; cur_rise_id = install_rise(new_rise * draw_scale); } @@ -314,31 +311,22 @@ void HTMLRenderer::reset_state_change() } void HTMLRenderer::prepare_line(GfxState * state) { - // close old tags when necessary - if((line_status == LineStatus::NONE) || (new_line_status == LineStatus::NONE)) + if(!line_opened) { - //pass - } - else if(new_line_status == LineStatus::DIV) - { - close_line(); - } - else - { - assert(new_line_status == LineStatus::SPAN); - if(line_status == LineStatus::SPAN) - html_fout << ""; - else - assert(line_status == LineStatus::DIV); - // don't change line_status - } - - if(line_status == LineStatus::NONE) - { - new_line_status = LineStatus::DIV; + new_line_state = NewLineState::DIV; } - if(new_line_status != LineStatus::DIV) + if(new_line_state == NewLineState::DIV) + { + close_line(); + + line_buf.reset(state); + + //resync position + draw_ty = cur_ty; + draw_tx = cur_tx; + } + else { // align horizontal position // try to 
merge with the last line if possible @@ -349,67 +337,24 @@ void HTMLRenderer::prepare_line(GfxState * state) } else { - // don't close a pending span here, keep the styling - double w; - auto wid = install_whitespace(target, w); - double threshold = draw_font_size * (cur_font_info.ascent - cur_font_info.descent) * (param->space_threshold); - line_buf << format("%2%") % wid % (target > (threshold - EPS) ? " " : ""); - draw_tx += w / draw_scale; + line_buf.append_offset(target); + draw_tx += target / draw_scale; } } - if(new_line_status != LineStatus::NONE) + if(new_line_state != NewLineState::NONE) { - // have to open a new tag - if (new_line_status == LineStatus::DIV) - { - state->transform(state->getCurX(), state->getCurY(), &line_x, &line_y); - line_tm_id = cur_tm_id; - line_ascent = cur_font_info.ascent * draw_font_size; - line_height = (cur_font_info.ascent - cur_font_info.descent) * draw_font_size; - - //resync position - draw_ty = cur_ty; - draw_tx = cur_tx; - } - else if(new_line_status == LineStatus::SPAN) - { - // pass - } - else - { - assert(false && "Bad value of new_line_status"); - } - - line_buf << format("") - % cur_font_info.id % cur_fs_id % cur_color_id % cur_ls_id % cur_ws_id % cur_rise_id; - line_ascent = max(line_ascent, cur_font_info.ascent * draw_font_size); - line_height = max(line_height, (cur_font_info.ascent - cur_font_info.descent) * draw_font_size); - - line_status = LineStatus::SPAN; + line_buf.append_state(); } + + line_opened = true; } + void HTMLRenderer::close_line() { - if(line_status == LineStatus::NONE) - return; - - // TODO class for height - html_fout << format("
") - % line_x - % line_y - % line_tm_id - % line_ascent - ; - html_fout << line_buf.rdbuf(); - line_buf.str(""); - - if(line_status == LineStatus::SPAN) - html_fout << ""; - else - assert(line_status == LineStatus::DIV); - - html_fout << "
"; - line_status = LineStatus::NONE; - + if(line_opened) + { + line_opened = false; + line_buf.flush(); + } } diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 931f585..a013e44 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -407,8 +407,8 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) ++nSpaces; } - Unicode uu = (cur_font_info.use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); - outputUnicodes(line_buf, &uu, 1); + Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font)); + line_buf.append_unicodes(&uu, 1); dx += dx1; dy += dy1;