1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-09-19 21:50:08 +00:00
pdf2htmlEX/src/HTMLRenderer/HTMLRenderer.h

448 lines
15 KiB
C
Raw Normal View History

2012-08-04 18:03:53 +00:00
/*
 * HTMLRenderer.h
 *
 * by WangLu
 */
#ifndef HTMLRENDERER_H_
#define HTMLRENDERER_H_
#include <unordered_map>
#include <map>
#include <vector>
2012-08-15 04:27:41 +00:00
#include <set>
#include <sstream>
2012-09-03 13:54:48 +00:00
#include <cstdint>
2012-09-09 17:40:37 +00:00
#include <fstream>
2012-08-14 09:13:29 +00:00
2012-08-04 18:03:53 +00:00
#include <OutputDev.h>
#include <GfxState.h>
#include <Stream.h>
#include <PDFDoc.h>
#include <goo/gtypes.h>
#include <Object.h>
#include <GfxFont.h>
2012-09-16 10:30:34 +00:00
#include <Annot.h>
2012-08-04 18:03:53 +00:00
#include "Param.h"
2012-12-11 12:17:36 +00:00
#include "util/Preprocessor.h"
2012-11-29 10:28:07 +00:00
#include "util/const.h"
2012-11-29 12:39:30 +00:00
#include "util/StringFormatter.h"
2012-11-29 09:28:05 +00:00
#include "util/TmpFiles.h"
2012-08-14 18:28:19 +00:00
/*
 * Naming Convention
 *
 * CSS classes
 *
 * _ - white space
 * a - Annot link
 * b - page Box
 * d - page Decoration
 * l - Line
 * j - Js data
 * p - Page
 *
 * Cd - CSS Draw
 *
 * Numbered CSS classes
 * See also: HTMLRenderer::TextLineBuffer::format_str
 *
 * t<hex> - Transform matrix
 * f<hex> - Font (also for font names)
 * s<hex> - font Size
 * l<hex> - Letter spacing
 * w<hex> - Word spacing
 * c<hex> - Color
 * _<hex> - white space
 * r<hex> - Rise
 * h<hex> - Height
 * L<hex> - Left
 *
 */
namespace pdf2htmlEX {
2012-11-29 10:16:05 +00:00
// we may need more info of a font in the future
/*
 * Per-font record kept by the renderer.
 *
 * Instances are stored by value in HTMLRenderer::font_name_map (keyed by a
 * long long) and handed out as `const FontInfo *` by
 * HTMLRenderer::install_font.
 *
 * No member has a default initializer: a default-constructed FontInfo holds
 * indeterminate values until every field has been assigned.
 */
class FontInfo
{
public:
    long long id;            // numeric id of the font
    bool use_tounicode;      // NOTE(review): presumably whether the font's ToUnicode map is used - confirm
    int em_size;             // NOTE(review): presumably the em size in font units - confirm
    double ascent, descent;  // vertical metrics; units/normalisation not visible here - confirm
    bool is_type3;           // NOTE(review): presumably set for PDF Type 3 fonts - confirm
};
class GfxRGB_hash
{
public:
size_t operator () (const GfxRGB & rgb) const
{
2013-01-24 14:44:08 +00:00
return ( (((size_t)colToByte(rgb.r)) << 16)
| (((size_t)colToByte(rgb.g)) << 8)
| ((size_t)colToByte(rgb.b))
);
2012-11-29 10:16:05 +00:00
}
};
class GfxRGB_equal
{
public:
bool operator ()(const GfxRGB & rgb1, const GfxRGB & rgb2) const
{
2013-01-24 14:41:59 +00:00
return ((rgb1.r == rgb2.r) && (rgb1.g == rgb2.g) && (rgb1.b == rgb2.b));
2012-11-29 10:16:05 +00:00
}
};
/*
 * "Less than" functor on Matrix; used as the comparator of
 * HTMLRenderer::transform_matrix_map.
 *
 * Elements are compared in index order. The first element that differs by
 * more than EPS decides the result; elements within EPS of each other are
 * treated as equal and the next one is examined.
 */
class Matrix_less
{
public:
    bool operator () (const Matrix & m1, const Matrix & m2) const
    {
        // Note that we only care about the first 4 elements
        int idx = 0;
        while(idx < 4)
        {
            const auto & lhs = m1.m[idx];
            const auto & rhs = m2.m[idx];

            // clearly smaller: m1 orders before m2
            if(lhs < rhs - EPS)
                return true;
            // clearly larger: m1 does not order before m2
            if(lhs > rhs + EPS)
                return false;

            ++idx;
        }
        // all four inspected elements are within EPS of each other
        return false;
    }
};
2012-11-29 12:39:30 +00:00
2012-08-04 18:03:53 +00:00
class HTMLRenderer : public OutputDev
{
public:
HTMLRenderer(const Param * param);
virtual ~HTMLRenderer();
void process(PDFDoc * doc);
2012-08-14 09:24:54 +00:00
////////////////////////////////////////////////////
// OutputDev interface
////////////////////////////////////////////////////
2012-08-04 18:03:53 +00:00
// Does this device use upside-down coordinates?
// (Upside-down means (0,0) is the top left corner of the page.)
virtual GBool upsideDown() { return gFalse; }
// Does this device use drawChar() or drawString()?
2012-08-07 07:03:06 +00:00
virtual GBool useDrawChar() { return gFalse; }
2012-08-04 18:03:53 +00:00
2012-10-02 18:19:40 +00:00
// Does this device use functionShadedFill(), axialShadedFill(), and
// radialShadedFill()? If this returns false, these shaded fills
// will be reduced to a series of other drawing operations.
2012-10-03 14:46:27 +00:00
virtual GBool useShadedFills(int type) { return (type == 2) ? gTrue: gFalse; }
2012-10-02 18:19:40 +00:00
2012-08-04 18:03:53 +00:00
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
virtual GBool interpretType3Chars() { return gFalse; }
// Does this device need non-text content?
2012-10-03 14:46:27 +00:00
virtual GBool needNonText() { return (param->process_nontext) ? gTrue: gFalse; }
2012-08-04 18:03:53 +00:00
2012-09-16 10:30:34 +00:00
virtual void setDefaultCTM(double *ctm);
2012-08-04 18:03:53 +00:00
// Start a page.
2013-01-28 11:45:40 +00:00
// UGLY: These 2 versions are for different versions of poppler
2012-08-04 18:03:53 +00:00
virtual void startPage(int pageNum, GfxState *state);
2013-01-28 11:45:40 +00:00
virtual void startPage(int pageNum, GfxState *state, XRef * xref);
2012-08-04 18:03:53 +00:00
// End a page.
virtual void endPage();
2012-08-07 07:03:06 +00:00
/*
* To optmize false alarms
* We just mark as changed, and recheck if they have been changed when we are about to output a new string
*/
2012-10-03 14:46:27 +00:00
/*
* Ugly implementation of save/restore
*/
2013-01-24 12:39:24 +00:00
virtual void saveState(GfxState * state) { updateAll(state); }
virtual void restoreState(GfxState * state) { updateAll(state); }
2012-10-03 14:46:27 +00:00
2012-08-07 11:39:47 +00:00
virtual void updateAll(GfxState * state);
virtual void updateRise(GfxState * state);
2012-08-07 11:39:47 +00:00
virtual void updateTextPos(GfxState * state);
virtual void updateTextShift(GfxState * state, double shift);
virtual void updateFont(GfxState * state);
virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
virtual void updateTextMat(GfxState * state);
virtual void updateHorizScaling(GfxState * state);
virtual void updateCharSpace(GfxState * state);
virtual void updateWordSpace(GfxState * state);
2013-01-24 12:39:24 +00:00
virtual void updateRender(GfxState * state);
virtual void updateFillColorSpace(GfxState * state);
virtual void updateStrokeColorSpace(GfxState * state);
2012-08-07 11:39:47 +00:00
virtual void updateFillColor(GfxState * state);
2013-01-24 12:39:24 +00:00
virtual void updateStrokeColor(GfxState * state);
2012-08-04 18:03:53 +00:00
/*
* Rendering
*/
2012-08-04 18:03:53 +00:00
virtual void drawString(GfxState * state, GooString * s);
2012-08-12 10:53:22 +00:00
virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
2012-10-03 14:46:27 +00:00
virtual void stroke(GfxState *state) { css_do_path(state, false); }
virtual void fill(GfxState *state) { css_do_path(state, true); }
2012-10-02 18:19:40 +00:00
virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax);
2012-10-01 17:59:04 +00:00
virtual void processLink(AnnotLink * al);
2012-10-03 14:46:27 +00:00
/* capacity test */
bool can_stroke(GfxState *state) { return css_do_path(state, false, true); }
bool can_fill(GfxState *state) { return css_do_path(state, true, true); }
2012-08-13 14:20:38 +00:00
protected:
2012-08-15 04:27:41 +00:00
////////////////////////////////////////////////////
// misc
////////////////////////////////////////////////////
2012-10-24 14:24:33 +00:00
void pre_process(PDFDoc * doc);
2012-09-16 10:30:34 +00:00
void post_process();
2013-01-28 13:01:02 +00:00
void process_outline();
void process_outline_items(GooList * items);
2012-11-29 12:39:30 +00:00
void set_stream_flags (std::ostream & out);
2012-09-09 16:21:46 +00:00
std::string dump_embedded_font (GfxFont * font, long long fn_id);
void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false);
2012-08-10 15:40:57 +00:00
2013-01-28 13:01:02 +00:00
// convert a LinkAction to a string that our Javascript code can understand
std::string get_linkaction_str(LinkAction *, std::string & detail);
2013-01-28 10:31:02 +00:00
2012-08-14 09:24:54 +00:00
////////////////////////////////////////////////////
// manage styles
////////////////////////////////////////////////////
2012-09-04 15:33:15 +00:00
const FontInfo * install_font(GfxFont * font);
void install_embedded_font(GfxFont * font, FontInfo & info);
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info);
void install_external_font (GfxFont * font, FontInfo & info);
2012-08-04 18:03:53 +00:00
long long install_font_size(double font_size);
2012-08-06 10:10:06 +00:00
long long install_transform_matrix(const double * tm);
long long install_letter_space(double letter_space);
long long install_word_space(double word_space);
2012-08-06 16:48:33 +00:00
long long install_color(const GfxRGB * rgb);
long long install_whitespace(double ws_width, double & actual_width);
2012-08-24 17:40:43 +00:00
long long install_rise(double rise);
2012-09-16 07:53:41 +00:00
long long install_height(double height);
2013-01-25 08:39:20 +00:00
long long install_left(double left);
2012-08-04 18:03:53 +00:00
2012-08-14 09:24:54 +00:00
////////////////////////////////////////////////////
// export css styles
////////////////////////////////////////////////////
2012-08-04 18:03:53 +00:00
/*
* remote font: to be retrieved from the web server
* local font: to be substituted with a local (client side) font
*/
2012-08-30 15:36:30 +00:00
void export_remote_font(const FontInfo & info, const std::string & suffix, const std::string & fontfileformat, GfxFont * font);
2012-08-04 18:03:53 +00:00
void export_remote_default_font(long long fn_id);
void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
2012-08-04 18:03:53 +00:00
void export_font_size(long long fs_id, double font_size);
2012-08-06 10:10:06 +00:00
void export_transform_matrix(long long tm_id, const double * tm);
void export_letter_space(long long ls_id, double letter_space);
void export_word_space(long long ws_id, double word_space);
2012-08-06 16:48:33 +00:00
void export_color(long long color_id, const GfxRGB * rgb);
void export_whitespace(long long ws_id, double ws_width);
2012-08-24 17:40:43 +00:00
void export_rise(long long rise_id, double rise);
2012-09-16 07:53:41 +00:00
void export_height(long long height_id, double height);
2013-01-25 08:39:20 +00:00
void export_left(long long left_id, double left);
2012-09-12 15:26:14 +00:00
// depending on single-html, to embed the content or add a link to it
// "type": specify the file type, usually it's the suffix, in which case this parameter could be ""
2012-09-12 15:26:14 +00:00
// "copy": indicates whether to copy the file into dest_dir, if not embedded
void embed_file(std::ostream & out, const std::string & path, const std::string & type, bool copy);
2012-09-12 15:26:14 +00:00
2012-08-14 09:24:54 +00:00
////////////////////////////////////////////////////
// state tracking
////////////////////////////////////////////////////
// check updated states, and determine new_line_stauts
// make sure this function can be called several times consecutively without problem
2012-08-14 09:24:54 +00:00
void check_state_change(GfxState * state);
// reset all ***_changed flags
void reset_state_change();
// prepare the line context, (close old tags, open new tags)
// make sure the current HTML style consistent with PDF
2012-10-01 17:59:04 +00:00
void prepare_text_line(GfxState * state);
void close_text_line();
////////////////////////////////////////////////////
// CSS drawing
////////////////////////////////////////////////////
2012-10-03 14:46:27 +00:00
/*
* test_only is for capacity check
*/
bool css_do_path(GfxState *state, bool fill, bool test_only = false);
2012-10-01 20:06:38 +00:00
/*
* coordinates are to transformed by state->getCTM()
* (x,y) should be the bottom-left corner INCLUDING border
* w,h should be the metrics WITHOUT border
*
* line_color & fill_color may be specified as nullptr to indicate none
2012-10-02 18:19:40 +00:00
* style_function & style_function_data may be provided to provide more styles
2012-10-01 20:06:38 +00:00
*/
2012-10-02 18:19:40 +00:00
void css_draw_rectangle(double x, double y, double w, double h, const double * tm,
2012-10-01 20:06:38 +00:00
double * line_width_array, int line_width_count,
2012-10-02 18:19:40 +00:00
const GfxRGB * line_color, const GfxRGB * fill_color,
void (*style_function)(void *, std::ostream &) = nullptr, void * style_function_data = nullptr );
2012-08-14 09:24:54 +00:00
////////////////////////////////////////////////////
// PDF stuffs
////////////////////////////////////////////////////
2012-08-06 16:48:33 +00:00
XRef * xref;
2012-09-16 10:30:34 +00:00
PDFDoc * cur_doc;
2013-01-28 10:31:02 +00:00
Catalog * cur_catalog;
2012-09-16 10:30:34 +00:00
double default_ctm[6];
2012-08-04 18:03:53 +00:00
// page info
2012-08-14 09:50:16 +00:00
int pageNum;
2012-08-05 11:39:37 +00:00
double pageWidth ;
double pageHeight ;
2012-08-04 18:03:53 +00:00
2012-09-22 14:47:44 +00:00
/*
* The content of each page is first scaled with factor1 (>=1), then scale back with factor2(<=1)
*
* factor1 is use to multiplied with all metrics (height/width/font-size...), in order to improve accuracy
* factor2 is applied with css transform, and is exposed to Javascript
*
* factor1 & factor 2 are determined according to zoom and font-size-multiplier
*
*/
2012-10-02 06:19:20 +00:00
double text_zoom_factor (void) const { return text_scale_factor1 * text_scale_factor2; }
double text_scale_factor1;
double text_scale_factor2;
2012-09-22 14:47:44 +00:00
2012-08-05 11:39:37 +00:00
2012-08-14 09:24:54 +00:00
////////////////////////////////////////////////////
// states
////////////////////////////////////////////////////
2012-09-04 15:33:15 +00:00
bool line_opened;
2012-09-09 06:48:10 +00:00
enum NewLineState
{
2012-09-09 06:48:10 +00:00
NLS_NONE, // stay with the same style
NLS_SPAN, // open a new <span> if possible, otherwise a new <div>
NLS_DIV // has to open a new <div>
2012-09-04 15:33:15 +00:00
} new_line_state;
// The order is according to the appearance in check_state_change
2012-08-14 09:50:16 +00:00
// any state changed
bool all_changed;
2012-08-06 16:48:33 +00:00
// current position
2012-08-07 11:39:47 +00:00
double cur_tx, cur_ty; // real text position, in text coords
bool text_pos_changed;
2012-08-05 11:39:37 +00:00
// font & size
2012-09-04 15:33:15 +00:00
const FontInfo * cur_font_info;
2012-08-05 11:39:37 +00:00
double cur_font_size;
long long cur_fs_id;
bool font_changed;
2012-08-04 18:03:53 +00:00
// transform matrix
2012-10-01 17:59:04 +00:00
long long cur_ttm_id;
2012-08-06 10:10:06 +00:00
bool ctm_changed;
bool text_mat_changed;
// horizontal scaling
bool hori_scale_changed;
2012-10-01 17:59:04 +00:00
// this is CTM * TextMAT in PDF
// [4] and [5] are ignored,
// as we'll calculate the position of the origin separately
2012-10-01 17:59:04 +00:00
double cur_text_tm[6]; // unscaled
2012-08-06 10:10:06 +00:00
// letter spacing
long long cur_ls_id;
double cur_letter_space;
bool letter_space_changed;
// word spacing
long long cur_ws_id;
double cur_word_space;
bool word_space_changed;
2012-10-12 14:39:58 +00:00
// text color
2012-08-06 16:48:33 +00:00
long long cur_color_id;
GfxRGB cur_color;
bool color_changed;
2012-08-06 10:10:06 +00:00
2012-08-24 17:40:43 +00:00
// rise
long long cur_rise_id;
double cur_rise;
bool rise_changed;
2012-08-14 06:35:55 +00:00
// optimize for web
2012-08-06 14:46:50 +00:00
// we try to render the final font size directly
// to reduce the effect of ctm as much as possible
2012-08-07 07:03:06 +00:00
2012-10-02 06:19:20 +00:00
// draw_ctm is cur_ctm scaled by 1/draw_text_scale,
// so everything redenered should be multiplied by draw_text_scale
2012-10-01 17:59:04 +00:00
double draw_text_tm[6];
2012-08-06 10:10:06 +00:00
double draw_font_size;
2012-10-02 06:19:20 +00:00
double draw_text_scale;
2012-08-04 18:03:53 +00:00
2012-08-07 11:39:47 +00:00
// the position of next char, in text coords
2012-08-28 11:10:31 +00:00
// this is actual position (in HTML), which might be different from cur_tx/ty (in PDF)
// also keep in mind that they are not the final position, as they will be transform by CTM (also true for cur_tx/ty)
2012-08-07 11:39:47 +00:00
double draw_tx, draw_ty;
// some metrics have to be determined after all elements in the lines have been seen
2013-01-19 11:19:15 +00:00
// see TextLineBuffer.h
2012-12-11 12:48:01 +00:00
class TextLineBuffer;
friend class TextLineBuffer;
TextLineBuffer * text_line_buf;
2012-09-03 13:54:48 +00:00
// for font reencoding
int32_t * cur_mapping;
char ** cur_mapping2;
int * width_list;
2012-11-29 12:39:30 +00:00
Preprocessor preprocessor;
2012-11-26 21:38:13 +00:00
TmpFiles tmp_files;
2012-09-07 16:38:41 +00:00
// for string formatting
2012-11-29 12:39:30 +00:00
StringFormatter str_fmt;
2012-09-03 13:54:48 +00:00
2012-08-14 09:24:54 +00:00
////////////////////////////////////////////////////
// styles & resources
////////////////////////////////////////////////////
2012-08-14 09:13:29 +00:00
std::unordered_map<long long, FontInfo> font_name_map;
std::map<double, long long> font_size_map;
2012-10-02 18:19:40 +00:00
std::map<Matrix, long long, Matrix_less> transform_matrix_map;
std::map<double, long long> letter_space_map;
std::map<double, long long> word_space_map;
std::unordered_map<GfxRGB, long long, GfxRGB_hash, GfxRGB_equal> color_map;
std::map<double, long long> whitespace_map;
2012-08-24 17:40:43 +00:00
std::map<double, long long> rise_map;
2012-09-16 07:53:41 +00:00
std::map<double, long long> height_map;
2013-01-25 08:39:20 +00:00
std::map<double, long long> left_map;
2012-08-04 18:03:53 +00:00
const Param * param;
2013-01-28 10:46:44 +00:00
struct {
std::ofstream fs;
std::string path;
2013-01-28 12:00:20 +00:00
} f_outline, f_pages, f_css;
2012-09-12 15:26:14 +00:00
static const std::string MANIFEST_FILENAME;
2012-08-04 18:03:53 +00:00
};
} //namespace pdf2htmlEX
2012-08-04 18:03:53 +00:00
#endif /* HTMLRENDERER_H_ */