2012-08-04 18:03:53 +00:00
|
|
|
/*
 * HTMLRenderer.h
 *
 * by WangLu
 */
|
|
|
|
|
|
|
|
#ifndef HTMLRENDERER_H_
|
|
|
|
#define HTMLRENDERER_H_
|
|
|
|
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <map>
|
|
|
|
#include <vector>
|
2012-08-15 04:27:41 +00:00
|
|
|
#include <set>
|
2012-08-30 16:25:05 +00:00
|
|
|
#include <sstream>
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-14 09:50:16 +00:00
|
|
|
#include <boost/format.hpp>
|
2012-08-14 09:13:29 +00:00
|
|
|
#include <boost/filesystem/fstream.hpp>
|
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
#include <OutputDev.h>
|
|
|
|
#include <GfxState.h>
|
|
|
|
#include <Stream.h>
|
|
|
|
#include <XRef.h>
|
|
|
|
#include <Catalog.h>
|
|
|
|
#include <Page.h>
|
|
|
|
#include <PDFDoc.h>
|
|
|
|
#include <goo/gtypes.h>
|
|
|
|
#include <Object.h>
|
|
|
|
#include <GfxFont.h>
|
|
|
|
|
|
|
|
#include "Param.h"
|
2012-08-10 13:30:41 +00:00
|
|
|
#include "util.h"
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-14 18:28:19 +00:00
|
|
|
|
|
|
|
/*
 * Naming Convention
 *
 * CSS classes
 *
 * p - Page
 * l - Line
 * _ - white space
 * i - Image
 *
 * Reusable CSS classes
 *
 * t<hex> - Transform matrix
 * f<hex> - Font (also for font names)
 * s<hex> - font Size
 * l<hex> - Letter spacing
 * w<hex> - Word spacing
 * c<hex> - Color
 * _<hex> - white space
 * r<hex> - Rise
 *
 */
|
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
class HTMLRenderer : public OutputDev
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
HTMLRenderer(const Param * param);
|
|
|
|
virtual ~HTMLRenderer();
|
|
|
|
|
|
|
|
void process(PDFDoc * doc);
|
|
|
|
|
2012-08-14 09:24:54 +00:00
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// OutputDev interface
|
|
|
|
////////////////////////////////////////////////////
|
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
// Does this device use upside-down coordinates?
|
|
|
|
// (Upside-down means (0,0) is the top left corner of the page.)
|
|
|
|
virtual GBool upsideDown() { return gFalse; }
|
|
|
|
|
|
|
|
// Does this device use drawChar() or drawString()?
|
2012-08-07 07:03:06 +00:00
|
|
|
virtual GBool useDrawChar() { return gFalse; }
|
2012-08-04 18:03:53 +00:00
|
|
|
|
|
|
|
// Does this device use beginType3Char/endType3Char? Otherwise,
|
|
|
|
// text in Type 3 fonts will be drawn with drawChar/drawString.
|
|
|
|
virtual GBool interpretType3Chars() { return gFalse; }
|
|
|
|
|
|
|
|
// Does this device need non-text content?
|
|
|
|
virtual GBool needNonText() { return gFalse; }
|
|
|
|
|
2012-08-14 12:30:18 +00:00
|
|
|
virtual void pre_process();
|
|
|
|
virtual void post_process();
|
|
|
|
virtual void process_single_html();
|
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
// Start a page.
|
|
|
|
virtual void startPage(int pageNum, GfxState *state);
|
|
|
|
|
|
|
|
// End a page.
|
|
|
|
virtual void endPage();
|
|
|
|
|
2012-08-07 07:03:06 +00:00
|
|
|
/*
|
|
|
|
* To optmize false alarms
|
|
|
|
* We just mark as changed, and recheck if they have been changed when we are about to output a new string
|
|
|
|
*/
|
2012-08-07 11:39:47 +00:00
|
|
|
virtual void updateAll(GfxState * state);
|
2012-08-15 10:48:11 +00:00
|
|
|
|
|
|
|
virtual void updateRise(GfxState * state);
|
2012-08-07 11:39:47 +00:00
|
|
|
virtual void updateTextPos(GfxState * state);
|
|
|
|
virtual void updateTextShift(GfxState * state, double shift);
|
2012-08-15 10:48:11 +00:00
|
|
|
|
|
|
|
virtual void updateFont(GfxState * state);
|
|
|
|
virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
|
|
|
|
virtual void updateTextMat(GfxState * state);
|
|
|
|
virtual void updateHorizScaling(GfxState * state);
|
|
|
|
|
|
|
|
virtual void updateCharSpace(GfxState * state);
|
|
|
|
virtual void updateWordSpace(GfxState * state);
|
|
|
|
|
2012-08-07 11:39:47 +00:00
|
|
|
virtual void updateFillColor(GfxState * state);
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-15 10:48:11 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Rendering
|
|
|
|
*/
|
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
virtual void drawString(GfxState * state, GooString * s);
|
|
|
|
|
2012-08-12 10:53:22 +00:00
|
|
|
virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
|
|
|
|
|
2012-08-13 14:20:38 +00:00
|
|
|
protected:
|
2012-08-15 04:27:41 +00:00
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// misc
|
|
|
|
////////////////////////////////////////////////////
|
|
|
|
void add_tmp_file (const std::string & fn);
|
|
|
|
void clean_tmp_files ();
|
2012-08-31 07:27:17 +00:00
|
|
|
boost::filesystem::path dump_embedded_font (GfxFont * font, long long fn_id);
|
2012-08-31 07:50:14 +00:00
|
|
|
void embed_font(const boost::filesystem::path & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false);
|
2012-08-10 15:40:57 +00:00
|
|
|
|
2012-08-14 09:24:54 +00:00
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// manage styles
|
|
|
|
////////////////////////////////////////////////////
|
2012-08-27 15:09:01 +00:00
|
|
|
FontInfo install_font(GfxFont * font);
|
2012-08-31 07:27:17 +00:00
|
|
|
void install_embedded_font(GfxFont * font, FontInfo & info);
|
|
|
|
void install_base_font(GfxFont * font, GfxFontLoc * font_loc, FontInfo & info);
|
|
|
|
void install_external_font (GfxFont * font, FontInfo & info);
|
2012-08-04 18:03:53 +00:00
|
|
|
|
|
|
|
long long install_font_size(double font_size);
|
2012-08-06 10:10:06 +00:00
|
|
|
long long install_transform_matrix(const double * tm);
|
2012-08-15 10:48:11 +00:00
|
|
|
long long install_letter_space(double letter_space);
|
|
|
|
long long install_word_space(double word_space);
|
2012-08-06 16:48:33 +00:00
|
|
|
long long install_color(const GfxRGB * rgb);
|
2012-08-15 10:48:11 +00:00
|
|
|
long long install_whitespace(double ws_width, double & actual_width);
|
2012-08-24 17:40:43 +00:00
|
|
|
long long install_rise(double rise);
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-14 09:24:54 +00:00
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// export css styles
|
|
|
|
////////////////////////////////////////////////////
|
2012-08-04 18:03:53 +00:00
|
|
|
/*
|
|
|
|
* remote font: to be retrieved from the web server
|
|
|
|
* local font: to be substituted with a local (client side) font
|
|
|
|
*/
|
2012-08-30 15:36:30 +00:00
|
|
|
void export_remote_font(const FontInfo & info, const std::string & suffix, const std::string & fontfileformat, GfxFont * font);
|
2012-08-04 18:03:53 +00:00
|
|
|
void export_remote_default_font(long long fn_id);
|
2012-08-31 07:50:14 +00:00
|
|
|
void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
|
2012-08-29 17:01:45 +00:00
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
void export_font_size(long long fs_id, double font_size);
|
2012-08-06 10:10:06 +00:00
|
|
|
void export_transform_matrix(long long tm_id, const double * tm);
|
2012-08-15 10:48:11 +00:00
|
|
|
void export_letter_space(long long ls_id, double letter_space);
|
|
|
|
void export_word_space(long long ws_id, double word_space);
|
2012-08-06 16:48:33 +00:00
|
|
|
void export_color(long long color_id, const GfxRGB * rgb);
|
2012-08-15 10:48:11 +00:00
|
|
|
void export_whitespace(long long ws_id, double ws_width);
|
2012-08-24 17:40:43 +00:00
|
|
|
void export_rise(long long rise_id, double rise);
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-14 09:24:54 +00:00
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// state tracking
|
|
|
|
////////////////////////////////////////////////////
|
2012-08-16 12:26:09 +00:00
|
|
|
// check updated states, and determine new_line_stauts
|
|
|
|
// make sure this function can be called several times consecutively without problem
|
2012-08-14 09:24:54 +00:00
|
|
|
void check_state_change(GfxState * state);
|
2012-08-16 12:26:09 +00:00
|
|
|
// reset all ***_changed flags
|
|
|
|
void reset_state_change();
|
|
|
|
// prepare the line context, (close old tags, open new tags)
|
|
|
|
// make sure the current HTML style consistent with PDF
|
|
|
|
void prepare_line(GfxState * state);
|
2012-08-16 10:11:22 +00:00
|
|
|
void close_line();
|
2012-08-14 09:24:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// PDF stuffs
|
|
|
|
////////////////////////////////////////////////////
|
|
|
|
|
2012-08-06 16:48:33 +00:00
|
|
|
XRef * xref;
|
2012-08-04 18:03:53 +00:00
|
|
|
|
|
|
|
// page info
|
2012-08-14 09:50:16 +00:00
|
|
|
int pageNum;
|
2012-08-05 11:39:37 +00:00
|
|
|
double pageWidth ;
|
|
|
|
double pageHeight ;
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-05 11:39:37 +00:00
|
|
|
|
2012-08-14 09:24:54 +00:00
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// states
|
|
|
|
////////////////////////////////////////////////////
|
2012-08-16 10:11:22 +00:00
|
|
|
//line status
|
|
|
|
//indicating the status for current line & next line
|
|
|
|
//see comments: meaning for current line || meaning for next line
|
2012-08-16 08:20:16 +00:00
|
|
|
enum class LineStatus
|
|
|
|
{
|
2012-08-16 10:11:22 +00:00
|
|
|
NONE, // no line is opened (last <div> is closed) || stay with the same style
|
|
|
|
SPAN, // there's a pending opening <span> (within a pending opening <div>) || open a new <span> if possible, otherwise a new <div>
|
|
|
|
DIV // there's a pending opening <div> (but no <span>) || has to open a new <div>
|
|
|
|
} line_status, new_line_status;
|
2012-08-15 10:48:11 +00:00
|
|
|
|
|
|
|
// The order is according to the appearance in check_state_change
|
2012-08-14 09:50:16 +00:00
|
|
|
// any state changed
|
|
|
|
bool all_changed;
|
2012-08-06 16:48:33 +00:00
|
|
|
// current position
|
2012-08-07 11:39:47 +00:00
|
|
|
double cur_tx, cur_ty; // real text position, in text coords
|
|
|
|
bool text_pos_changed;
|
2012-08-05 11:39:37 +00:00
|
|
|
|
2012-08-15 10:48:11 +00:00
|
|
|
// font & size
|
2012-08-27 15:09:01 +00:00
|
|
|
FontInfo cur_font_info;
|
2012-08-05 11:39:37 +00:00
|
|
|
double cur_font_size;
|
|
|
|
long long cur_fs_id;
|
|
|
|
bool font_changed;
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-15 10:48:11 +00:00
|
|
|
// transform matrix
|
2012-08-06 10:10:06 +00:00
|
|
|
long long cur_tm_id;
|
|
|
|
bool ctm_changed;
|
|
|
|
bool text_mat_changed;
|
2012-08-15 10:48:11 +00:00
|
|
|
// horizontal scaling
|
|
|
|
bool hori_scale_changed;
|
2012-08-07 07:03:06 +00:00
|
|
|
// this is CTM * TextMAT in PDF, not only CTM
|
2012-08-15 10:48:11 +00:00
|
|
|
// [4] and [5] are ignored,
|
|
|
|
// as we'll calculate the position of the origin separately
|
|
|
|
// TODO: changed this for images
|
2012-08-07 07:03:06 +00:00
|
|
|
double cur_ctm[6]; // unscaled
|
2012-08-06 10:10:06 +00:00
|
|
|
|
2012-08-15 10:48:11 +00:00
|
|
|
// letter spacing
|
|
|
|
long long cur_ls_id;
|
|
|
|
double cur_letter_space;
|
|
|
|
bool letter_space_changed;
|
|
|
|
|
|
|
|
// word spacing
|
|
|
|
long long cur_ws_id;
|
|
|
|
double cur_word_space;
|
|
|
|
bool word_space_changed;
|
|
|
|
|
|
|
|
// color
|
2012-08-06 16:48:33 +00:00
|
|
|
long long cur_color_id;
|
|
|
|
GfxRGB cur_color;
|
|
|
|
bool color_changed;
|
2012-08-06 10:10:06 +00:00
|
|
|
|
2012-08-24 17:40:43 +00:00
|
|
|
// rise
|
|
|
|
long long cur_rise_id;
|
|
|
|
double cur_rise;
|
|
|
|
bool rise_changed;
|
|
|
|
|
2012-08-14 06:35:55 +00:00
|
|
|
// optimize for web
|
2012-08-06 14:46:50 +00:00
|
|
|
// we try to render the final font size directly
|
|
|
|
// to reduce the effect of ctm as much as possible
|
2012-08-07 07:03:06 +00:00
|
|
|
|
2012-08-07 11:39:47 +00:00
|
|
|
// draw_ctm is cur_ctm scaled by 1/draw_scale,
|
|
|
|
// so everything redenered should be multiplied by draw_scale
|
2012-08-06 10:10:06 +00:00
|
|
|
double draw_ctm[6];
|
|
|
|
double draw_font_size;
|
2012-08-07 07:03:06 +00:00
|
|
|
double draw_scale;
|
2012-08-04 18:03:53 +00:00
|
|
|
|
2012-08-07 11:39:47 +00:00
|
|
|
// the position of next char, in text coords
|
2012-08-28 11:10:31 +00:00
|
|
|
// this is actual position (in HTML), which might be different from cur_tx/ty (in PDF)
|
|
|
|
// also keep in mind that they are not the final position, as they will be transform by CTM (also true for cur_tx/ty)
|
2012-08-07 11:39:47 +00:00
|
|
|
double draw_tx, draw_ty;
|
|
|
|
|
2012-08-30 16:25:05 +00:00
|
|
|
// some metrics have to be determined after all elements in the lines have been seen
|
|
|
|
// TODO: add a class for these
|
|
|
|
double line_x, line_y;
|
|
|
|
long long line_tm_id;
|
2012-08-30 17:08:31 +00:00
|
|
|
double line_ascent, line_height;
|
2012-08-30 16:25:05 +00:00
|
|
|
std::stringstream line_buf;
|
|
|
|
|
2012-08-14 09:24:54 +00:00
|
|
|
////////////////////////////////////////////////////
|
|
|
|
// styles & resources
|
|
|
|
////////////////////////////////////////////////////
|
|
|
|
|
2012-08-14 09:13:29 +00:00
|
|
|
std::unordered_map<long long, FontInfo> font_name_map;
|
|
|
|
std::map<double, long long> font_size_map;
|
2012-08-15 10:48:11 +00:00
|
|
|
|
2012-08-14 09:13:29 +00:00
|
|
|
std::map<TM, long long> transform_matrix_map;
|
2012-08-15 10:48:11 +00:00
|
|
|
|
|
|
|
std::map<double, long long> letter_space_map;
|
|
|
|
std::map<double, long long> word_space_map;
|
|
|
|
|
2012-08-14 09:13:29 +00:00
|
|
|
std::map<GfxRGB, long long> color_map;
|
2012-08-06 16:48:33 +00:00
|
|
|
|
2012-08-15 10:48:11 +00:00
|
|
|
std::map<double, long long> whitespace_map;
|
2012-08-24 17:40:43 +00:00
|
|
|
std::map<double, long long> rise_map;
|
2012-08-15 10:48:11 +00:00
|
|
|
|
2012-08-12 10:53:22 +00:00
|
|
|
int image_count;
|
|
|
|
|
2012-08-04 18:03:53 +00:00
|
|
|
const Param * param;
|
2012-08-14 09:50:16 +00:00
|
|
|
boost::filesystem::path dest_dir, tmp_dir;
|
2012-08-14 18:46:53 +00:00
|
|
|
boost::filesystem::ofstream html_fout, allcss_fout;
|
2012-08-15 04:27:41 +00:00
|
|
|
std::set<std::string> tmp_files;
|
2012-08-15 07:29:35 +00:00
|
|
|
|
|
|
|
static const std::string HEAD_HTML_FILENAME;
|
|
|
|
static const std::string NECK_HTML_FILENAME;
|
|
|
|
static const std::string TAIL_HTML_FILENAME;
|
|
|
|
static const std::string CSS_FILENAME;
|
2012-08-31 04:45:16 +00:00
|
|
|
static const std::string UNIFY_SCRIPT_FILENAME;
|
2012-08-15 07:43:49 +00:00
|
|
|
// for cross-platform purpose, use a "null" file instead of /dev/null
|
2012-08-15 07:29:35 +00:00
|
|
|
static const std::string NULL_FILENAME;
|
2012-08-04 18:03:53 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* HTMLRENDERER_H_ */
|