/* * Header file for HTMLTextLine * Copyright (C) 2013 Lu Wang */ #ifndef HTMLTEXTLINE_H__ #define HTMLTEXTLINE_H__ #include #include #include #include "Param.h" #include "StateManager.h" #include "HTMLState.h" namespace pdf2htmlEX { /* * Store and optimize a line of text in HTML * * contains a series of * - Text * - Shift * - State change */ class HTMLTextLine { public: HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager); struct State : public HTMLTextState { // before output void begin(std::ostream & out, const State * prev_state); // after output void end(std::ostream & out) const; // calculate the hash code void hash(void); // calculate the difference between another State int diff(const State & s) const; enum { FONT_ID, FONT_SIZE_ID, FILL_COLOR_ID, STROKE_COLOR_ID, LETTER_SPACE_ID, WORD_SPACE_ID, HASH_ID_COUNT, VERTICAL_ALIGN_ID = HASH_ID_COUNT, ID_COUNT }; static long long umask_by_id(int id); long long ids[ID_COUNT]; size_t start_idx; // index of the first Text using this state // for optimization long long hash_value; long long hash_umask; // some states may not be actually used bool need_close; static const char * const css_class_names []; // class names for each id }; struct Offset { Offset(size_t size_idx, double width) :start_idx(size_idx),width(width) { } size_t start_idx; // should put this Offset right before text[start_idx]; double width; }; /** * Append a drawn char (glyph)'s unicode. l > 1 mean this glyph correspond to * multiple code points. */ void append_unicodes(const Unicode * u, int l, double width); /** * Append a special padding char with 0 width, in order to keep char index consistent. * The padding char is ignored during output. */ void append_padding_char() { text.push_back(0); } void append_offset(double width); void append_state(const HTMLTextState & text_state); void dump_text(std::ostream & out); bool text_empty(void) const { return text.empty(); } void clear(void); void clip(const HTMLClipState &); /* * Optimize and calculate necessary values */ void prepare(void); void optimize(std::vector &); private: void optimize_normal(std::vector &); void optimize_aggressive(std::vector &); /** * Dump chars' unicode to output stream. * begin/pos is the index in 'text'. */ void dump_chars(std::ostream & out, int begin, int len); void dump_char(std::ostream & out, int pos); const Param & param; AllStateManager & all_manager; HTMLLineState line_state; double ascent, descent; double clip_x1, clip_y1; double width; std::vector states; std::vector offsets; /** * Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text': * - If c > 0, it is the unicode code point corresponds to the glyph; * - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?); * - If c < -1, this glyph corresponds to more than one unicode code points, * which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'. */ std::vector text; std::vector > decomposed_text; }; } // namespace pdf2htmlEX #endif //HTMLTEXTLINE_H__