mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-07-05 09:38:40 +00:00
working
This commit is contained in:
parent
da0b6fcabb
commit
4a0f422b31
|
@ -284,7 +284,7 @@ protected:
|
||||||
double draw_tx, draw_ty;
|
double draw_tx, draw_ty;
|
||||||
|
|
||||||
// managers store values actually used in HTML (i.e. scaled)
|
// managers store values actually used in HTML (i.e. scaled)
|
||||||
AllStateManater all_manager;
|
AllStateManager all_manager;
|
||||||
|
|
||||||
enum NewLineState
|
enum NewLineState
|
||||||
{
|
{
|
||||||
|
|
|
@ -25,7 +25,7 @@ using std::endl;
|
||||||
using std::find;
|
using std::find;
|
||||||
using std::abs;
|
using std::abs;
|
||||||
|
|
||||||
HTMLTextLine::HTMLTextLine (const Param & param, AllStateManater & all_manager)
|
HTMLTextLine::HTMLTextLine (const Param & param, AllStateManager & all_manager)
|
||||||
: param(param), all_manager(all_manager)
|
: param(param), all_manager(all_manager)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
|
@ -59,53 +59,35 @@ void HTMLTextLine::append_state(const HTMLState & html_state)
|
||||||
(HTMLState&)(states.back()) = html_state;
|
(HTMLState&)(states.back()) = html_state;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTextLine::dump_text(ostream & out)
|
bool HTMLTextLine::dump_text(ostream & out)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Each Line is an independent absolute positioned block
|
* Each Line is an independent absolute positioned block
|
||||||
* so even we have a few states or offsets, we may omit them
|
* so even we have a few states or offsets, we may omit them
|
||||||
*/
|
*/
|
||||||
if(text.empty())
|
if(text.empty())
|
||||||
return;
|
return false;
|
||||||
|
|
||||||
// remove unuseful states in the end
|
|
||||||
while((!states.empty()) && (states.back().start_idx >= text.size()))
|
|
||||||
states.pop_back();
|
|
||||||
|
|
||||||
if(states.empty() || (states[0].start_idx != 0))
|
if(states.empty() || (states[0].start_idx != 0))
|
||||||
{
|
{
|
||||||
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
|
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// optimize before output
|
|
||||||
optimize();
|
|
||||||
|
|
||||||
// Start Output
|
// Start Output
|
||||||
{
|
{
|
||||||
// max_ascent determines the height of the div
|
|
||||||
double accum_vertical_align = 0; // accumulated
|
|
||||||
double max_ascent = 0;
|
|
||||||
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
|
||||||
{
|
|
||||||
accum_vertical_align += iter->vertical_align;
|
|
||||||
double cur_ascent = accum_vertical_align + iter->font_info->ascent * iter->font_size;
|
|
||||||
if(cur_ascent > max_ascent)
|
|
||||||
max_ascent = cur_ascent;
|
|
||||||
}
|
|
||||||
|
|
||||||
// open <div> for the current text line
|
// open <div> for the current text line
|
||||||
out << "<div class=\"" << CSS::LINE_CN
|
out << "<div class=\"" << CSS::LINE_CN
|
||||||
<< " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(states[0].transform_matrix)
|
<< " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(states[0].transform_matrix)
|
||||||
<< " " << CSS::LEFT_CN << all_manager.left.install(states[0].x)
|
<< " " << CSS::LEFT_CN << all_manager.left.install(states[0].x)
|
||||||
<< " " << CSS::HEIGHT_CN << all_manager.height.install(max_ascent)
|
<< " " << CSS::HEIGHT_CN << all_manager.height.install(ascent)
|
||||||
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(states[0].y)
|
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(states[0].y)
|
||||||
;
|
;
|
||||||
// it will be closed by the first state
|
// it will be closed by the first state
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<State*> stack;
|
||||||
// a special safeguard in the bottom
|
// a special safeguard in the bottom
|
||||||
stack.clear();
|
|
||||||
stack.push_back(nullptr);
|
stack.push_back(nullptr);
|
||||||
|
|
||||||
//accumulated horizontal offset;
|
//accumulated horizontal offset;
|
||||||
|
@ -123,15 +105,6 @@ void HTMLTextLine::dump_text(ostream & out)
|
||||||
{
|
{
|
||||||
// export current state, find a closest parent
|
// export current state, find a closest parent
|
||||||
{
|
{
|
||||||
// set id
|
|
||||||
state_iter1->ids[State::FONT_ID] = state_iter1->font_info->id;
|
|
||||||
state_iter1->ids[State::FONT_SIZE_ID] = all_manager.font_size.install(state_iter1->font_size);
|
|
||||||
state_iter1->ids[State::FILL_COLOR_ID] = all_manager.fill_color.install(state_iter1->fill_color);
|
|
||||||
state_iter1->ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(state_iter1->stroke_color);
|
|
||||||
state_iter1->ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(state_iter1->letter_space);
|
|
||||||
state_iter1->ids[State::WORD_SPACE_ID] = all_manager.word_space.install(state_iter1->word_space);
|
|
||||||
state_iter1->hash();
|
|
||||||
|
|
||||||
// greedy
|
// greedy
|
||||||
double vertical_align = state_iter1->vertical_align;
|
double vertical_align = state_iter1->vertical_align;
|
||||||
int best_cost = State::HASH_ID_COUNT + 1;
|
int best_cost = State::HASH_ID_COUNT + 1;
|
||||||
|
@ -249,6 +222,7 @@ void HTMLTextLine::dump_text(ostream & out)
|
||||||
}
|
}
|
||||||
|
|
||||||
out << "</div>";
|
out << "</div>";
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTextLine::clear(void)
|
void HTMLTextLine::clear(void)
|
||||||
|
@ -258,14 +232,46 @@ void HTMLTextLine::clear(void)
|
||||||
text.clear();
|
text.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void HTMLTextLine::prepare(void)
|
||||||
|
{
|
||||||
|
if(param.optimize_text)
|
||||||
|
optimize();
|
||||||
|
|
||||||
|
// max_ascent determines the height of the div
|
||||||
|
double accum_vertical_align = 0; // accumulated
|
||||||
|
ascent = 0;
|
||||||
|
descent = 0;
|
||||||
|
// note that vertical_align cannot be calculated here
|
||||||
|
for(auto iter = states.begin(); iter != states.end(); ++iter)
|
||||||
|
{
|
||||||
|
iter->ids[State::FONT_ID] = iter->font_info->id;
|
||||||
|
iter->ids[State::FONT_SIZE_ID] = all_manager.font_size.install(iter->font_size);
|
||||||
|
iter->ids[State::FILL_COLOR_ID] = all_manager.fill_color.install(iter->fill_color);
|
||||||
|
iter->ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(iter->stroke_color);
|
||||||
|
iter->ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(iter->letter_space);
|
||||||
|
iter->ids[State::WORD_SPACE_ID] = all_manager.word_space.install(iter->word_space);
|
||||||
|
iter->hash();
|
||||||
|
|
||||||
|
accum_vertical_align += iter->vertical_align;
|
||||||
|
double cur_ascent = accum_vertical_align + iter->font_info->ascent * iter->font_size;
|
||||||
|
if(cur_ascent > ascent)
|
||||||
|
ascent = cur_ascent;
|
||||||
|
double cur_descent = accum_vertical_align + iter->font_info->descent * iter->font_size;
|
||||||
|
if(cur_descent < descent)
|
||||||
|
descent = cur_descent;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Adjust letter space and word space in order to reduce the number of HTML elements
|
* Adjust letter space and word space in order to reduce the number of HTML elements
|
||||||
* May also unmask word space
|
* May also unmask word space
|
||||||
*/
|
*/
|
||||||
void HTMLTextLine::optimize()
|
void HTMLTextLine::optimize()
|
||||||
{
|
{
|
||||||
if(!(param.optimize_text))
|
// remove unuseful states in the end
|
||||||
return;
|
while((!states.empty()) && (states.back().start_idx >= text.size()))
|
||||||
|
states.pop_back();
|
||||||
|
|
||||||
assert(!states.empty());
|
assert(!states.empty());
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,7 @@ namespace pdf2htmlEX {
|
||||||
class HTMLTextLine
|
class HTMLTextLine
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
HTMLTextLine (const Param & param, AllStateManater & all_manager);
|
HTMLTextLine (const Param & param, AllStateManager & all_manager);
|
||||||
|
|
||||||
struct State : public HTMLState {
|
struct State : public HTMLState {
|
||||||
// before output
|
// before output
|
||||||
|
@ -65,7 +65,6 @@ public:
|
||||||
static const char * const css_class_names []; // class names for each id
|
static const char * const css_class_names []; // class names for each id
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
struct Offset {
|
struct Offset {
|
||||||
Offset(size_t size_idx, double width)
|
Offset(size_t size_idx, double width)
|
||||||
:start_idx(size_idx),width(width)
|
:start_idx(size_idx),width(width)
|
||||||
|
@ -77,26 +76,33 @@ public:
|
||||||
void append_unicodes(const Unicode * u, int l);
|
void append_unicodes(const Unicode * u, int l);
|
||||||
void append_offset(double width);
|
void append_offset(double width);
|
||||||
void append_state(const HTMLState & html_state);
|
void append_state(const HTMLState & html_state);
|
||||||
void dump_text(std::ostream & out);
|
// return if anything dumped
|
||||||
|
bool dump_text(std::ostream & out);
|
||||||
|
|
||||||
bool empty(void) const { return text.empty(); }
|
bool empty(void) const { return text.empty(); }
|
||||||
void clear(void);
|
void clear(void);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Optimize and calculate necessary values
|
||||||
|
*/
|
||||||
|
void prepare(void);
|
||||||
|
|
||||||
|
double get_ascent (void) const { return ascent; }
|
||||||
|
double get_descent(void) const { return descent; }
|
||||||
private:
|
private:
|
||||||
void optimize(void);
|
void optimize(void);
|
||||||
|
|
||||||
const Param & param;
|
const Param & param;
|
||||||
AllStateManater & all_manager;
|
AllStateManager & all_manager;
|
||||||
|
|
||||||
double x, y;
|
double x, y;
|
||||||
long long tm_id;
|
long long tm_id;
|
||||||
|
|
||||||
|
double ascent, descent;
|
||||||
|
|
||||||
std::vector<State> states;
|
std::vector<State> states;
|
||||||
std::vector<Offset> offsets;
|
std::vector<Offset> offsets;
|
||||||
std::vector<Unicode> text;
|
std::vector<Unicode> text;
|
||||||
|
|
||||||
// for flush
|
|
||||||
std::vector<State*> stack;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace pdf2htmlEX
|
} // namespace pdf2htmlEX
|
||||||
|
|
|
@ -12,7 +12,7 @@ namespace pdf2htmlEX {
|
||||||
|
|
||||||
using std::ostream;
|
using std::ostream;
|
||||||
|
|
||||||
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManater & all_manager)
|
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
|
||||||
: param(param)
|
: param(param)
|
||||||
, all_manager(all_manager)
|
, all_manager(all_manager)
|
||||||
, last_line(nullptr)
|
, last_line(nullptr)
|
||||||
|
@ -20,7 +20,7 @@ HTMLTextPage::HTMLTextPage(const Param & param, AllStateManater & all_manager)
|
||||||
|
|
||||||
void HTMLTextPage::dump_text(ostream & out)
|
void HTMLTextPage::dump_text(ostream & out)
|
||||||
{
|
{
|
||||||
optimize();
|
prepare();
|
||||||
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
||||||
(*iter)->dump_text(out);
|
(*iter)->dump_text(out);
|
||||||
}
|
}
|
||||||
|
@ -71,6 +71,16 @@ void HTMLTextPage::open_new_line(void)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void HTMLTextPage::prepare(void)
|
||||||
|
{
|
||||||
|
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
||||||
|
{
|
||||||
|
(*iter)->prepare();
|
||||||
|
}
|
||||||
|
if(param.optimize_text)
|
||||||
|
optimize();
|
||||||
|
}
|
||||||
|
|
||||||
void HTMLTextPage::optimize(void)
|
void HTMLTextPage::optimize(void)
|
||||||
{
|
{
|
||||||
//TODO
|
//TODO
|
||||||
|
|
|
@ -26,7 +26,7 @@ namespace pdf2htmlEX {
|
||||||
class HTMLTextPage
|
class HTMLTextPage
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
HTMLTextPage (const Param & param, AllStateManater & all_manager);
|
HTMLTextPage (const Param & param, AllStateManager & all_manager);
|
||||||
|
|
||||||
void append_unicodes(const Unicode * u, int l);
|
void append_unicodes(const Unicode * u, int l);
|
||||||
void append_offset(double offset);
|
void append_offset(double offset);
|
||||||
|
@ -39,10 +39,11 @@ public:
|
||||||
void open_new_line(void);
|
void open_new_line(void);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void prepare(void);
|
||||||
void optimize(void);
|
void optimize(void);
|
||||||
|
|
||||||
const Param & param;
|
const Param & param;
|
||||||
AllStateManater & all_manager;
|
AllStateManager & all_manager;
|
||||||
HTMLTextLine * last_line;
|
HTMLTextLine * last_line;
|
||||||
std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
|
std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
|
||||||
};
|
};
|
||||||
|
|
|
@ -411,7 +411,7 @@ private:
|
||||||
std::unordered_map<int, std::pair<double,double>> value_map;
|
std::unordered_map<int, std::pair<double,double>> value_map;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct AllStateManater
|
struct AllStateManager
|
||||||
{
|
{
|
||||||
TransformMatrixManager transform_matrix;
|
TransformMatrixManager transform_matrix;
|
||||||
VerticalAlignManager vertical_align;
|
VerticalAlignManager vertical_align;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user