diff --git a/CMakeLists.txt b/CMakeLists.txt index 97e2a81..d396417 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,15 +181,17 @@ add_executable(pdf2htmlEX src/util/path.cc src/util/unicode.h src/util/unicode.cc - src/HTMLState.h src/ArgParser.h src/ArgParser.cc src/Base64Stream.h src/Base64Stream.cc src/Color.h src/Color.cc + src/HTMLState.h src/HTMLTextLine.h src/HTMLTextLine.cc + src/HTMLTextPage.h + src/HTMLTextPage.cc src/Preprocessor.h src/Preprocessor.cc src/StringFormatter.h diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index f23b559..16a24cc 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -27,7 +26,7 @@ #include "TmpFiles.h" #include "Color.h" #include "StateManager.h" -#include "HTMLTextLine.h" +#include "HTMLTextPage.h" #include "util/const.h" #include "util/misc.h" @@ -242,17 +241,11 @@ protected: double print_scale (void) const { return 96.0 / DEFAULT_DPI / text_zoom_factor(); } + const Param & param; + //////////////////////////////////////////////////// // PDF states //////////////////////////////////////////////////// - bool line_opened; - enum NewLineState - { - NLS_NONE, // stay with the same style - NLS_SPAN, // open a new if possible, otherwise a new
- NLS_DIV // has to open a new
- } new_line_state; - // track the original (unscaled) values to determine scaling and merge lines // current position double cur_tx, cur_ty; // real text position, in text coords @@ -290,8 +283,18 @@ protected: // also keep in mind that they are not the final position, as they will be transform by CTM (also true for cur_tx/ty) double draw_tx, draw_ty; - // some metrics have to be determined after all elements in the lines have been seen - std::vector> text_lines; + // managers store values actually used in HTML (i.e. scaled) + AllStateManater all_manager; + + enum NewLineState + { + NLS_NONE, // stay with the same style + NLS_SPAN, // open a new if possible, otherwise a new
+ NLS_DIV // has to open a new
+ } new_line_state; + + + HTMLTextPage html_text_page; // for font reencoding int32_t * cur_mapping; @@ -311,11 +314,6 @@ protected: HTMLState cur_html_state; std::unordered_map font_info_map; - // managers store values actually used in HTML (i.e. scaled) - AllStateManater all_manager; - - const Param & param; - struct { std::ofstream fs; std::string path; diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 427d091..865f7ae 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -40,10 +40,10 @@ using std::endl; HTMLRenderer::HTMLRenderer(const Param & param) :OutputDev() - ,line_opened(false) + ,param(param) + ,html_text_page(param, all_manager) ,preprocessor(param) ,tmp_files(param) - ,param(param) { if(!(param.debug)) { @@ -51,7 +51,6 @@ HTMLRenderer::HTMLRenderer(const Param & param) globalParams->setErrQuiet(gTrue); } - text_lines.emplace_back(new HTMLTextLine(param, all_manager)); ffw_init(param.debug); cur_mapping = new int32_t [0x10000]; cur_mapping2 = new char* [0x100]; @@ -169,8 +168,6 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state) void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) { - assert((!line_opened) && "Open line in startPage detected!"); - this->pageNum = pageNum; long long wid = all_manager.width.install(state->getPageWidth()); @@ -213,8 +210,9 @@ void HTMLRenderer::endPage() { close_text_line(); // dump all text - for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) - (*iter)->flush(f_pages.fs); + html_text_page.dump_text(f_pages.fs); + html_text_page.dump_css(f_css.fs); + html_text_page.clear(); // process links before the page is closed cur_doc->processLinks(this, pageNum); diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc index 249d3f5..f039681 100644 --- a/src/HTMLRenderer/state.cc +++ b/src/HTMLRenderer/state.cc @@ -335,7 +335,7 @@ void HTMLRenderer::check_state_change(GfxState * state) if(merged) { - text_lines.back()->append_offset(dx * old_draw_text_scale); + html_text_page.append_offset(dx * old_draw_text_scale); if(equal(dy, 0)) { cur_html_state.vertical_align = 0; @@ -434,11 +434,6 @@ void HTMLRenderer::check_state_change(GfxState * state) void HTMLRenderer::prepare_text_line(GfxState * state) { - if(!line_opened) - { - new_line_state = NLS_DIV; - } - if(new_line_state == NLS_DIV) { close_text_line(); @@ -458,26 +453,20 @@ void HTMLRenderer::prepare_text_line(GfxState * state) double target = (cur_tx - draw_tx) * draw_text_scale; if(!equal(target, 0)) { - text_lines.back()->append_offset(target); + html_text_page.append_offset(target); draw_tx += target / draw_text_scale; } } if(new_line_state != NLS_NONE) { - text_lines.back()->append_state(cur_html_state); + html_text_page.append_state(cur_html_state); } - - line_opened = true; } void HTMLRenderer::close_text_line() { - if(line_opened) - { - line_opened = false; - text_lines.emplace_back(new HTMLTextLine(param, all_manager)); - } + html_text_page.open_new_line(); } } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index 7529438..8a0411a 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -90,13 +90,13 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) if(is_space && (param.space_as_offset)) { // ignore horiz_scaling, as it's merged in CTM - text_lines.back()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); + html_text_page.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); } else { if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode)) { - text_lines.back()->append_unicodes(u, uLen); + html_text_page.append_unicodes(u, uLen); // TODO: decomposed characters may be not with the same width as the original ligature, need to fix it. } else @@ -110,14 +110,14 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) { uu = unicode_from_font(code, font); } - text_lines.back()->append_unicodes(&uu, 1); + html_text_page.append_unicodes(&uu, 1); /* * In PDF, word_space is appended if (n == 1 and *p = ' ') * but in HTML, word_space is appended if (uu == ' ') */ int space_count = (is_space ? 1 : 0) - (uu == ' ' ? 1 : 0); if(space_count != 0) - text_lines.back()->append_offset(cur_word_space * draw_text_scale * space_count); + html_text_page.append_offset(cur_word_space * draw_text_scale * space_count); } } diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index ee20126..e4f8f30 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -6,17 +6,13 @@ * Copyright (C) 2012,2013 Lu Wang */ -#include #include #include #include "HTMLTextLine.h" -#include "util/namespace.h" -#include "util/unicode.h" -#include "util/math.h" -#include "util/css_const.h" #include "util/encoding.h" +#include "util/css_const.h" namespace pdf2htmlEX { @@ -29,6 +25,10 @@ using std::endl; using std::find; using std::abs; +HTMLTextLine::HTMLTextLine (const Param & param, AllStateManater & all_manager) + : param(param), all_manager(all_manager) +{ } + void HTMLTextLine::append_unicodes(const Unicode * u, int l) { text.insert(text.end(), u, u+l); @@ -59,18 +59,14 @@ void HTMLTextLine::append_state(const HTMLState & html_state) (HTMLState&)(states.back()) = html_state; } -void HTMLTextLine::flush(ostream & out) +void HTMLTextLine::dump_text(ostream & out) { /* * Each Line is an independent absolute positioned block * so even we have a few states or offsets, we may omit them */ if(text.empty()) - { - states.clear(); - offsets.clear(); return; - } // remove unuseful states in the end while((!states.empty()) && (states.back().start_idx >= text.size())) @@ -79,9 +75,6 @@ void HTMLTextLine::flush(ostream & out) if(states.empty() || (states[0].start_idx != 0)) { cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl; - states.clear(); - text.clear(); - offsets.clear(); return; } @@ -256,7 +249,10 @@ void HTMLTextLine::flush(ostream & out) } out << "
"; +} +void HTMLTextLine::clear(void) +{ states.clear(); offsets.clear(); text.clear(); diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h index 3183555..107b9a9 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -27,8 +27,7 @@ namespace pdf2htmlEX { class HTMLTextLine { public: - HTMLTextLine (const Param & param, AllStateManater & all_manager) - : param(param), all_manager(all_manager) { } + HTMLTextLine (const Param & param, AllStateManater & all_manager); struct State : public HTMLState { // before output @@ -78,7 +77,10 @@ public: void append_unicodes(const Unicode * u, int l); void append_offset(double width); void append_state(const HTMLState & html_state); - void flush(std::ostream & out); + void dump_text(std::ostream & out); + + bool empty(void) const { return text.empty(); } + void clear(void); private: void optimize(void); diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc new file mode 100644 index 0000000..1b6de71 --- /dev/null +++ b/src/HTMLTextPage.cc @@ -0,0 +1,78 @@ +/* + * HTMLTextPage.cc + * + * Generate and optimized HTML for one Page + * + * Copyright (C) 2013 Lu Wang + */ + +#include "HTMLTextPage.h" + +namespace pdf2htmlEX { + +using std::ostream; + +HTMLTextPage::HTMLTextPage(const Param & param, AllStateManater & all_manager) + : param(param) + , all_manager(all_manager) + , last_line(nullptr) +{ } + +void HTMLTextPage::dump_text(ostream & out) +{ + for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) + (*iter)->dump_text(out); +} + +void HTMLTextPage::append_unicodes(const Unicode * u, int l) +{ + if(!last_line) + open_new_line(); + last_line->append_unicodes(u, l); +} + +void HTMLTextPage::append_offset(double offset) +{ + if(!last_line) + open_new_line(); + last_line->append_offset(offset); +} + +void HTMLTextPage::append_state(const HTMLState & state) +{ + if(!last_line) + open_new_line(); + last_line->append_state(state); +} + +void HTMLTextPage::dump_css(ostream & out) +{ + //TODO +} + +void HTMLTextPage::clear(void) +{ + text_lines.clear(); + last_line = nullptr; +} + +void HTMLTextPage::open_new_line(void) +{ + if(last_line && (last_line->empty())) + { + // state and offsets might be nonempty + last_line->clear(); + } + else + { + text_lines.emplace_back(new HTMLTextLine(param, all_manager)); + last_line = text_lines.back().get(); + } +} + +void HTMLTextPage::optimize(void) +{ + //TODO +} + +} // namespace pdf2htmlEX diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h new file mode 100644 index 0000000..bbe813e --- /dev/null +++ b/src/HTMLTextPage.h @@ -0,0 +1,51 @@ +/* + * Header file for HTMLTextPage + * Copyright (C) 2013 Lu Wang + */ + +#ifndef HTMLTEXTPAGE_H__ +#define HTMLTEXTPAGE_H__ + +#include +#include +#include + +#include "Param.h" +#include "StateManager.h" +#include "HTMLTextLine.h" +#include "HTMLState.h" + +namespace pdf2htmlEX { + +/* + * Store and optimize a page of text in HTML + * + * contains a series of HTMLTextLine + */ + +class HTMLTextPage +{ +public: + HTMLTextPage (const Param & param, AllStateManater & all_manager); + + void append_unicodes(const Unicode * u, int l); + void append_offset(double offset); + void append_state(const HTMLState & state); + + void dump_text(std::ostream & out); + void dump_css(std::ostream & out); + void clear(void); + + void open_new_line(void); + +private: + void optimize(void); + + const Param & param; + AllStateManater & all_manager; + HTMLTextLine * last_line; + std::vector> text_lines; +}; + +} //namespace pdf2htmlEX +#endif //HTMLTEXTPAGE_H__