1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-07-05 09:38:40 +00:00
This commit is contained in:
Lu Wang 2013-04-07 16:10:52 +08:00
parent da0b6fcabb
commit 4a0f422b31
6 changed files with 71 additions and 48 deletions

View File

@ -284,7 +284,7 @@ protected:
double draw_tx, draw_ty; double draw_tx, draw_ty;
// managers store values actually used in HTML (i.e. scaled) // managers store values actually used in HTML (i.e. scaled)
AllStateManater all_manager; AllStateManager all_manager;
enum NewLineState enum NewLineState
{ {

View File

@ -25,7 +25,7 @@ using std::endl;
using std::find; using std::find;
using std::abs; using std::abs;
HTMLTextLine::HTMLTextLine (const Param & param, AllStateManater & all_manager) HTMLTextLine::HTMLTextLine (const Param & param, AllStateManager & all_manager)
: param(param), all_manager(all_manager) : param(param), all_manager(all_manager)
{ } { }
@ -59,53 +59,35 @@ void HTMLTextLine::append_state(const HTMLState & html_state)
(HTMLState&)(states.back()) = html_state; (HTMLState&)(states.back()) = html_state;
} }
void HTMLTextLine::dump_text(ostream & out) bool HTMLTextLine::dump_text(ostream & out)
{ {
/* /*
* Each Line is an independent absolute positioned block * Each Line is an independent absolute positioned block
* so even we have a few states or offsets, we may omit them * so even we have a few states or offsets, we may omit them
*/ */
if(text.empty()) if(text.empty())
return; return false;
// remove unuseful states in the end
while((!states.empty()) && (states.back().start_idx >= text.size()))
states.pop_back();
if(states.empty() || (states[0].start_idx != 0)) if(states.empty() || (states[0].start_idx != 0))
{ {
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl; cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
return; return false;
} }
// optimize before output
optimize();
// Start Output // Start Output
{ {
// max_ascent determines the height of the div
double accum_vertical_align = 0; // accumulated
double max_ascent = 0;
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
accum_vertical_align += iter->vertical_align;
double cur_ascent = accum_vertical_align + iter->font_info->ascent * iter->font_size;
if(cur_ascent > max_ascent)
max_ascent = cur_ascent;
}
// open <div> for the current text line // open <div> for the current text line
out << "<div class=\"" << CSS::LINE_CN out << "<div class=\"" << CSS::LINE_CN
<< " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(states[0].transform_matrix) << " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(states[0].transform_matrix)
<< " " << CSS::LEFT_CN << all_manager.left.install(states[0].x) << " " << CSS::LEFT_CN << all_manager.left.install(states[0].x)
<< " " << CSS::HEIGHT_CN << all_manager.height.install(max_ascent) << " " << CSS::HEIGHT_CN << all_manager.height.install(ascent)
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(states[0].y) << " " << CSS::BOTTOM_CN << all_manager.bottom.install(states[0].y)
; ;
// it will be closed by the first state // it will be closed by the first state
} }
std::vector<State*> stack;
// a special safeguard in the bottom // a special safeguard in the bottom
stack.clear();
stack.push_back(nullptr); stack.push_back(nullptr);
//accumulated horizontal offset; //accumulated horizontal offset;
@ -123,15 +105,6 @@ void HTMLTextLine::dump_text(ostream & out)
{ {
// export current state, find a closest parent // export current state, find a closest parent
{ {
// set id
state_iter1->ids[State::FONT_ID] = state_iter1->font_info->id;
state_iter1->ids[State::FONT_SIZE_ID] = all_manager.font_size.install(state_iter1->font_size);
state_iter1->ids[State::FILL_COLOR_ID] = all_manager.fill_color.install(state_iter1->fill_color);
state_iter1->ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(state_iter1->stroke_color);
state_iter1->ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(state_iter1->letter_space);
state_iter1->ids[State::WORD_SPACE_ID] = all_manager.word_space.install(state_iter1->word_space);
state_iter1->hash();
// greedy // greedy
double vertical_align = state_iter1->vertical_align; double vertical_align = state_iter1->vertical_align;
int best_cost = State::HASH_ID_COUNT + 1; int best_cost = State::HASH_ID_COUNT + 1;
@ -249,6 +222,7 @@ void HTMLTextLine::dump_text(ostream & out)
} }
out << "</div>"; out << "</div>";
return true;
} }
void HTMLTextLine::clear(void) void HTMLTextLine::clear(void)
@ -258,14 +232,46 @@ void HTMLTextLine::clear(void)
text.clear(); text.clear();
} }
void HTMLTextLine::prepare(void)
{
if(param.optimize_text)
optimize();
// max_ascent determines the height of the div
double accum_vertical_align = 0; // accumulated
ascent = 0;
descent = 0;
// note that vertical_align cannot be calculated here
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
iter->ids[State::FONT_ID] = iter->font_info->id;
iter->ids[State::FONT_SIZE_ID] = all_manager.font_size.install(iter->font_size);
iter->ids[State::FILL_COLOR_ID] = all_manager.fill_color.install(iter->fill_color);
iter->ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(iter->stroke_color);
iter->ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(iter->letter_space);
iter->ids[State::WORD_SPACE_ID] = all_manager.word_space.install(iter->word_space);
iter->hash();
accum_vertical_align += iter->vertical_align;
double cur_ascent = accum_vertical_align + iter->font_info->ascent * iter->font_size;
if(cur_ascent > ascent)
ascent = cur_ascent;
double cur_descent = accum_vertical_align + iter->font_info->descent * iter->font_size;
if(cur_descent < descent)
descent = cur_descent;
}
}
/* /*
* Adjust letter space and word space in order to reduce the number of HTML elements * Adjust letter space and word space in order to reduce the number of HTML elements
* May also unmask word space * May also unmask word space
*/ */
void HTMLTextLine::optimize() void HTMLTextLine::optimize()
{ {
if(!(param.optimize_text)) // remove unuseful states in the end
return; while((!states.empty()) && (states.back().start_idx >= text.size()))
states.pop_back();
assert(!states.empty()); assert(!states.empty());

View File

@ -27,7 +27,7 @@ namespace pdf2htmlEX {
class HTMLTextLine class HTMLTextLine
{ {
public: public:
HTMLTextLine (const Param & param, AllStateManater & all_manager); HTMLTextLine (const Param & param, AllStateManager & all_manager);
struct State : public HTMLState { struct State : public HTMLState {
// before output // before output
@ -65,7 +65,6 @@ public:
static const char * const css_class_names []; // class names for each id static const char * const css_class_names []; // class names for each id
}; };
struct Offset { struct Offset {
Offset(size_t size_idx, double width) Offset(size_t size_idx, double width)
:start_idx(size_idx),width(width) :start_idx(size_idx),width(width)
@ -77,26 +76,33 @@ public:
void append_unicodes(const Unicode * u, int l); void append_unicodes(const Unicode * u, int l);
void append_offset(double width); void append_offset(double width);
void append_state(const HTMLState & html_state); void append_state(const HTMLState & html_state);
void dump_text(std::ostream & out); // return if anything dumped
bool dump_text(std::ostream & out);
bool empty(void) const { return text.empty(); } bool empty(void) const { return text.empty(); }
void clear(void); void clear(void);
/*
* Optimize and calculate necessary values
*/
void prepare(void);
double get_ascent (void) const { return ascent; }
double get_descent(void) const { return descent; }
private: private:
void optimize(void); void optimize(void);
const Param & param; const Param & param;
AllStateManater & all_manager; AllStateManager & all_manager;
double x, y; double x, y;
long long tm_id; long long tm_id;
double ascent, descent;
std::vector<State> states; std::vector<State> states;
std::vector<Offset> offsets; std::vector<Offset> offsets;
std::vector<Unicode> text; std::vector<Unicode> text;
// for flush
std::vector<State*> stack;
}; };
} // namespace pdf2htmlEX } // namespace pdf2htmlEX

View File

@ -12,7 +12,7 @@ namespace pdf2htmlEX {
using std::ostream; using std::ostream;
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManater & all_manager) HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
: param(param) : param(param)
, all_manager(all_manager) , all_manager(all_manager)
, last_line(nullptr) , last_line(nullptr)
@ -20,7 +20,7 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManater & all_manager)
void HTMLTextPage::dump_text(ostream & out) void HTMLTextPage::dump_text(ostream & out)
{ {
optimize(); prepare();
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter) for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
(*iter)->dump_text(out); (*iter)->dump_text(out);
} }
@ -71,6 +71,16 @@ void HTMLTextPage::open_new_line(void)
} }
} }
void HTMLTextPage::prepare(void)
{
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
{
(*iter)->prepare();
}
if(param.optimize_text)
optimize();
}
void HTMLTextPage::optimize(void) void HTMLTextPage::optimize(void)
{ {
//TODO //TODO

View File

@ -26,7 +26,7 @@ namespace pdf2htmlEX {
class HTMLTextPage class HTMLTextPage
{ {
public: public:
HTMLTextPage (const Param & param, AllStateManater & all_manager); HTMLTextPage (const Param & param, AllStateManager & all_manager);
void append_unicodes(const Unicode * u, int l); void append_unicodes(const Unicode * u, int l);
void append_offset(double offset); void append_offset(double offset);
@ -39,10 +39,11 @@ public:
void open_new_line(void); void open_new_line(void);
private: private:
void prepare(void);
void optimize(void); void optimize(void);
const Param & param; const Param & param;
AllStateManater & all_manager; AllStateManager & all_manager;
HTMLTextLine * last_line; HTMLTextLine * last_line;
std::vector<std::unique_ptr<HTMLTextLine>> text_lines; std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
}; };

View File

@ -411,7 +411,7 @@ private:
std::unordered_map<int, std::pair<double,double>> value_map; std::unordered_map<int, std::pair<double,double>> value_map;
}; };
struct AllStateManater struct AllStateManager
{ {
TransformMatrixManager transform_matrix; TransformMatrixManager transform_matrix;
VerticalAlignManager vertical_align; VerticalAlignManager vertical_align;