1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-09-12 11:24:34 +00:00
pdf2htmlEX/src/HTMLTextLine.h

135 lines
3.7 KiB
C
Raw Normal View History

2013-04-06 15:55:32 +00:00
/*
* Header file for HTMLTextLine
* Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
*/
#ifndef HTMLTEXTLINE_H__
#define HTMLTEXTLINE_H__
2012-12-11 12:48:10 +00:00
2013-04-07 06:12:43 +00:00
#include <ostream>
2012-12-11 12:48:10 +00:00
#include <vector>
2013-04-06 08:45:01 +00:00
#include <CharTypes.h>
#include "Param.h"
2013-04-06 08:45:01 +00:00
#include "StateManager.h"
#include "HTMLState.h"
2012-12-11 12:48:10 +00:00
namespace pdf2htmlEX {
2013-01-19 11:19:15 +00:00
/*
2013-04-06 15:52:14 +00:00
* Store and optimize a line of text in HTML
*
* contains a series of
2013-01-19 11:19:15 +00:00
* - Text
* - Shift
* - State change
*/
2013-04-06 15:51:33 +00:00
class HTMLTextLine
2012-12-11 12:48:10 +00:00
{
public:
// Declared here, defined elsewhere. Note that 'param' and 'all_manager' are
// kept as reference members below, so (presumably being bound to these
// arguments — confirm in the implementation) the referents must outlive
// this object.
HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager);
2012-12-11 12:48:10 +00:00
// A text state (HTMLTextState) extended with the bookkeeping declared
// below: per-property ids, a hash over them, and the index of the first
// text element the state applies to.
struct State : public HTMLTextState {
2013-03-30 17:00:04 +00:00
// before output
// (prev_state is a pointer, so it can presumably be null when there is no
// previous state — confirm in the implementation)
void begin(std::ostream & out, const State * prev_state);
// after output
void end(std::ostream & out) const;
// calculate the hash code
void hash(void);
// calculate the difference between another State
int diff(const State & s) const;
// Indices into 'ids' (and, per its comment, 'css_class_names').
// The six ids listed before HASH_ID_COUNT get the values 0..5, so
// HASH_ID_COUNT == 6. VERTICAL_ALIGN_ID deliberately reuses that value,
// which makes ID_COUNT == 7.
// NOTE(review): the naming suggests only the ids below HASH_ID_COUNT take
// part in hash_value — confirm in the implementation.
enum {
FONT_ID,
FONT_SIZE_ID,
FILL_COLOR_ID,
STROKE_COLOR_ID,
LETTER_SPACE_ID,
WORD_SPACE_ID,
2013-04-05 13:53:34 +00:00
HASH_ID_COUNT,
2013-03-30 17:00:04 +00:00
2013-04-05 13:53:34 +00:00
VERTICAL_ALIGN_ID = HASH_ID_COUNT,
2013-03-30 17:00:04 +00:00
ID_COUNT
};
// Mask associated with a given id (one of the enum values above);
// presumably used together with hash_umask — confirm in the implementation.
static long long umask_by_id(int id);
// One slot per enum id above.
long long ids[ID_COUNT];
size_t start_idx; // index of the first Text using this state
2014-07-13 23:59:30 +00:00
// for optimization
2013-03-30 17:00:04 +00:00
long long hash_value;
long long hash_umask; // some states may not be actually used
// NOTE(review): presumably records whether begin() emitted something that
// end() has to close — confirm in the implementation.
bool need_close;
static const char * const css_class_names []; // class names for each id
2012-12-11 12:48:10 +00:00
};
2013-04-06 15:54:07 +00:00
// An offset of the given width, positioned relative to an element of 'text'.
struct Offset {
2013-12-26 11:57:24 +00:00
// 'size_idx' initialises start_idx. In the mem-initializer 'width(width)'
// the name outside the parentheses is the member and the one inside is
// the parameter.
Offset(size_t size_idx, double width)
:start_idx(size_idx),width(width)
2013-04-03 17:35:44 +00:00
{ }
2013-03-30 17:00:04 +00:00
size_t start_idx; // should put this Offset right before text[start_idx];
double width;
2012-12-11 12:48:10 +00:00
};
/**
* Append a drawn char (glyph)'s unicode. l > 1 means this glyph corresponds to
* multiple code points.
* u points to the code points and l is their count.
*/
2013-12-22 08:59:59 +00:00
void append_unicodes(const Unicode * u, int l, double width);
/**
* Append a special padding char with 0 width, in order to keep char index consistent.
* The padding char is ignored during output.
* It is stored as the value 0 in 'text'.
*/
void append_padding_char() { text.push_back(0); }
2012-12-11 12:48:10 +00:00
// Append an Offset / a State to this line (defined elsewhere).
void append_offset(double width);
void append_state(const HTMLTextState & text_state);
2013-04-07 09:10:36 +00:00
// Write this line to 'out' (defined elsewhere).
void dump_text(std::ostream & out);
2013-04-07 09:30:07 +00:00
// True when 'text' holds no elements at all. Padding chars are elements
// too, so a line consisting only of padding chars is not reported as empty.
bool text_empty(void) const { return text.empty(); }
void clear(void);
2012-12-11 12:48:10 +00:00
2013-05-04 13:17:35 +00:00
// NOTE(review): presumably records the clip state used for clip_x1/clip_y1
// below — confirm in the implementation.
void clip(const HTMLClipState &);
2013-05-04 11:26:26 +00:00
2013-04-07 08:10:52 +00:00
/*
* Optimize and calculate necessary values
*/
void prepare(void);
2013-12-22 08:59:59 +00:00
// NOTE(review): the vector of line pointers is taken by non-const
// reference, so it is presumably an output (resulting lines) — confirm
// ownership and contents in the implementation.
void optimize(std::vector<HTMLTextLine*> &);
2012-12-11 12:48:10 +00:00
private:
2013-12-22 08:59:59 +00:00
// Two optimization variants with the same signature as optimize();
// presumably selected by optimize() — confirm in the implementation.
void optimize_normal(std::vector<HTMLTextLine*> &);
void optimize_aggressive(std::vector<HTMLTextLine*> &);
2013-03-20 15:46:58 +00:00
/**
* Dump chars' unicode to output stream.
* begin/pos is the index in 'text'.
*/
void dump_chars(std::ostream & out, int begin, int len);
void dump_char(std::ostream & out, int pos);
2013-03-20 15:46:58 +00:00
2013-04-06 09:01:05 +00:00
// Reference members: not owned by this object, and they make the class
// non-copy-assignable by default.
const Param & param;
2013-04-07 08:10:52 +00:00
AllStateManager & all_manager;
2012-12-11 12:48:10 +00:00
// Stored by value (a copy, unlike the two references above).
HTMLLineState line_state;
2013-04-07 08:10:52 +00:00
double ascent, descent;
2013-05-04 11:26:26 +00:00
double clip_x1, clip_y1;
2013-12-22 08:59:59 +00:00
double width;
2013-04-07 08:10:52 +00:00
2012-12-11 12:48:10 +00:00
// The State and Offset entries of this line; each refers to a position
// in 'text' through its start_idx.
std::vector<State> states;
std::vector<Offset> offsets;
2014-06-14 19:44:28 +00:00
/**
* Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
* - If c > 0, it is the unicode code point corresponding to the glyph;
* - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
* - If c < 0, this glyph corresponds to more than one unicode code point,
* which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
* NOTE(review): this condition previously read "c < -1", but with the index
* formula (-c-1) the first entry (index 0) is encoded as c == -1 — confirm
* against the implementation.
*/
std::vector<int> text;
std::vector<std::vector<Unicode> > decomposed_text;
2012-12-11 12:48:10 +00:00
};
} // namespace pdf2htmlEX
2013-04-06 15:55:32 +00:00
#endif //HTMLTEXTLINE_H__