mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 13:00:08 +00:00
reorganizing
This commit is contained in:
parent
63287ce491
commit
35fccdc28c
@ -163,15 +163,19 @@ add_executable(pdf2htmlEX
|
||||
src/BackgroundRenderer/SplashBackgroundRenderer.cc
|
||||
src/BackgroundRenderer/CairoBackgroundRenderer.h
|
||||
src/BackgroundRenderer/CairoBackgroundRenderer.cc
|
||||
src/util/namespace.h
|
||||
src/util/ffw.h
|
||||
src/util/ffw.c
|
||||
src/util/util.h
|
||||
src/util/util.cc
|
||||
src/util/TmpFiles.h
|
||||
src/util/TmpFiles.cc
|
||||
src/util/ArgParser.h
|
||||
src/util/ArgParser.cc
|
||||
src/util/const.h
|
||||
src/util/const.cc
|
||||
src/util/ffw.h
|
||||
src/util/ffw.c
|
||||
src/util/namespace.h
|
||||
src/util/TmpFiles.h
|
||||
src/util/TmpFiles.cc
|
||||
src/util/unicode.h
|
||||
src/util/unicode.cc
|
||||
src/util/util.h
|
||||
src/util/util.cc
|
||||
)
|
||||
target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS})
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include "HTMLRenderer.h"
|
||||
#include "util/namespace.h"
|
||||
#include "util/unicode.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
|
@ -99,7 +99,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
|
||||
// we have already shifted the origin
|
||||
|
||||
// TODO: recognize common matrices
|
||||
if(_tm_equal(tm, id_matrix, 4))
|
||||
if(_tm_equal(tm, ID_MATRIX, 4))
|
||||
{
|
||||
auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"};
|
||||
for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter)
|
||||
|
@ -170,8 +170,8 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||
cur_font_size = draw_font_size = 0;
|
||||
cur_fs_id = install_font_size(cur_font_size);
|
||||
|
||||
memcpy(cur_text_tm, id_matrix, sizeof(cur_text_tm));
|
||||
memcpy(draw_text_tm, id_matrix, sizeof(draw_text_tm));
|
||||
memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm));
|
||||
memcpy(draw_text_tm, ID_MATRIX, sizeof(draw_text_tm));
|
||||
cur_ttm_id = install_transform_matrix(draw_text_tm);
|
||||
|
||||
cur_letter_space = cur_word_space = 0;
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "HTMLRenderer.h"
|
||||
#include "util/ffw.h"
|
||||
#include "util/namespace.h"
|
||||
#include "util/unicode.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
|
39
src/util/const.cc
Normal file
39
src/util/const.cc
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Constants
|
||||
*
|
||||
* by WangLu
|
||||
* 2012.11.29
|
||||
*/
|
||||
|
||||
#include "const.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::map;
|
||||
using std::string;
|
||||
|
||||
// Six-element transform matrix with unit diagonal terms and zero translation,
// i.e. the identity transform. Declared `extern` in const.h.
const double ID_MATRIX[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
|
||||
|
||||
// PDF base-14 font family name -> CSS font-family fallback list.
// The "Times" entry names "Times New Roman"; the previous spelling
// ("Time New Roman") is not an existing font family, so that fallback
// could never match.
const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP({
    { "Courier", "Courier,monospace" },
    { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    { "Times", "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol", "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
});
|
||||
|
||||
// Lookup table for GB encoded font names: each key is the raw byte
// sequence of a font name (written with hex escapes), each value is the
// ASCII font name it is replaced with.
const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP = {
    { "\xCB\xCE\xCC\xE5",        "SimSun"  },
    { "\xBA\xDA\xCC\xE5",        "SimHei"  },
    { "\xBF\xAC\xCC\xE5_GB2312", "SimKai"  },
    { "\xB7\xC2\xCB\xCE_GB2312", "SimFang" },
    { "\xC1\xA5\xCA\xE9",        "SimLi"   },
};
|
||||
|
||||
// Markup used to pull a file into the generated HTML.
//   key:   (file suffix, whether the file content is embedded inline)
//   value: (text emitted before, text emitted after)
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP = {
    // stylesheet: referenced by URL, then embedded inline
    { { ".css", false }, { "<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>" } },
    { { ".css", true  }, { "<style type=\"text/css\">", "</style>" } },
    // script: referenced by URL, then embedded inline
    { { ".js",  false }, { "<script type=\"text/javascript\" src=\"", "\"></script>" } },
    { { ".js",  true  }, { "<script type=\"text/javascript\">", "</script>" } },
};
|
||||
} //namespace pdf2htmlEX
|
31
src/util/const.h
Normal file
31
src/util/const.h
Normal file
@ -0,0 +1,31 @@
|
||||
/*
|
||||
* Constants
|
||||
*
|
||||
* by WangLu
|
||||
* 2012.11.29
|
||||
*/
|
||||
|
||||
#ifndef CONST_H__
|
||||
#define CONST_H__
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
// Absolute tolerance for floating-point comparisons.
static const double EPS = 1e-6;
|
||||
// Default resolution in dots per inch.
// NOTE(review): presumably matches PDF's 72 units per inch -- confirm against callers.
static const double DEFAULT_DPI = 72.0;
|
||||
extern const double ID_MATRIX[6];
|
||||
|
||||
// PDF base 14 font name -> CSS font name
|
||||
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
|
||||
// For GB encoded font names
|
||||
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
|
||||
// map to embed files into html
|
||||
// key: (suffix, if_embed_content)
|
||||
// value: (prefix string, suffix string)
|
||||
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
||||
#endif //CONST_H__
|
157
src/util/unicode.cc
Normal file
157
src/util/unicode.cc
Normal file
@ -0,0 +1,157 @@
|
||||
/*
|
||||
* Unicode manipulation functions
|
||||
*
|
||||
* by WangLu
|
||||
* 2012.11.29
|
||||
*/
|
||||
|
||||
#include <GlobalParams.h>
|
||||
|
||||
#include "unicode.h"
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
using std::cerr;
|
||||
using std::endl;
|
||||
using std::ostream;
|
||||
|
||||
/*
 * Returns false for code points in 0..31, 127..159 and 0xD800..0xDFFF,
 * true for everything else.
 * Note that tab (9), LF (10) and CR (13) are rejected as well: an
 * exemption for them existed once but was disabled.
 */
bool isLegalUnicode(Unicode u)
{
    const bool in_low_control_range  = (u < 0x20);
    const bool in_high_control_range = (u > 0x7e) && (u < 0xa0);
    const bool in_surrogate_range    = (u > 0xd7ff) && (u < 0xe000);

    return !(in_low_control_range || in_high_control_range || in_surrogate_range);
}
|
||||
|
||||
/*
 * Map a character code to a private-use code point.
 *
 * The code is first offset by 0xE000. When the result passes 0xF8FF the
 * overflow is carried past 0xF0000, and when that passes 0xFFFFD it is
 * carried past 0x100000. If even that range is exceeded (> 0x10FFFD) a
 * warning is printed to cerr and the out-of-range value is still returned.
 */
Unicode map_to_private(CharCode code)
{
    Unicode mapped = (Unicode)(code + 0xE000);
    if(mapped <= 0xF8FF)
        return mapped;

    // first range exhausted, continue above 0xF0000
    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
    if(mapped <= 0xFFFFD)
        return mapped;

    // second range exhausted, continue above 0x100000
    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
    if(mapped > 0x10FFFD)
        cerr << "Warning: all private use unicode are used" << endl;

    return mapped;
}
|
||||
|
||||
/*
 * Derive a Unicode value for `code` from the font itself.
 *
 * For a non-CID font the character name is looked up and translated with
 * globalParams->mapNameToUnicode(); the result is used only if
 * isLegalUnicode() accepts it. In every other case (CID font, no
 * character name, illegal result) the code gets a private-use mapping.
 */
Unicode unicode_from_font (CharCode code, GfxFont * font)
{
    if(font->isCIDFont())
        return map_to_private(code);

    // may be untranslated ligature
    char * glyph_name = dynamic_cast<Gfx8BitFont*>(font)->getCharName(code);
    if(!glyph_name)
        return map_to_private(code);

    Unicode named = globalParams->mapNameToUnicode(glyph_name);
    return isLegalUnicode(named) ? named : map_to_private(code);
}
|
||||
|
||||
/*
 * Pick the single Unicode value used for `code`:
 *  - no value given (len == 0): private-use mapping;
 *  - exactly one value that isLegalUnicode() accepts: that value;
 *  - anything else (illegal single value, or several values):
 *    whatever unicode_from_font() determines.
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
{
    switch(len)
    {
        case 0:
            return map_to_private(code);

        case 1:
            if(isLegalUnicode(*u))
                return *u;
            break;

        default:
            break;
    }

    return unicode_from_font(code, font);
}
|
||||
|
||||
/*
|
||||
* Copied from UTF.h / UTF8.h in poppler
|
||||
*/
|
||||
/*
 * Encode one code point as UTF-8 into buf.
 * Returns the number of bytes written (1..4), or 0 when u is above
 * 0x10ffff or bufSize is smaller than the encoding; buf is left untouched
 * in the 0 case.
 */
static int mapUTF8(Unicode u, char *buf, int bufSize) {
  // Length of the encoding is decided by the magnitude of u.
  int nbytes;
  if (u <= 0x0000007f)
    nbytes = 1;
  else if (u <= 0x000007ff)
    nbytes = 2;
  else if (u <= 0x0000ffff)
    nbytes = 3;
  else if (u <= 0x0010ffff)
    nbytes = 4;
  else
    return 0;

  if (bufSize < nbytes)
    return 0;

  switch (nbytes) {
    case 1:
      buf[0] = (char)u;
      break;
    case 2:
      buf[0] = (char)(0xc0 + (u >> 6));
      buf[1] = (char)(0x80 + (u & 0x3f));
      break;
    case 3:
      buf[0] = (char)(0xe0 + (u >> 12));
      buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
      buf[2] = (char)(0x80 + (u & 0x3f));
      break;
    default: // 4 bytes
      buf[0] = (char)(0xf0 + (u >> 18));
      buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
      buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
      buf[3] = (char)(0x80 + (u & 0x3f));
      break;
  }
  return nbytes;
}
|
||||
|
||||
/*
 * Write uLen code points from u to out as HTML-safe UTF-8 text.
 *
 * The five markup-significant characters & " ' < > are replaced by their
 * named character references; every other code point is UTF-8 encoded
 * with mapUTF8(). When mapUTF8() returns 0 (value above 0x10ffff) nothing
 * is written for that code point.
 */
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
    for(int i = 0; i < uLen; ++i)
    {
        switch(u[i])
        {
            case '&':
                out << "&amp;";
                break;
            case '\"':
                out << "&quot;";
                break;
            case '\'':
                out << "&apos;";
                break;
            case '<':
                out << "&lt;";
                break;
            case '>':
                out << "&gt;";
                break;
            default:
                {
                    // 4 bytes is the longest encoding mapUTF8 produces
                    char buf[4];
                    auto n = mapUTF8(u[i], buf, 4);
                    out.write(buf, n);
                }
        }
    }
}
|
||||
|
||||
} //namespace pdf2htmlEX
|
41
src/util/unicode.h
Normal file
41
src/util/unicode.h
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Unicode manipulation functions
|
||||
*
|
||||
* by WangLu
|
||||
* 2012.11.29
|
||||
*/
|
||||
|
||||
#ifndef UNICODE_H__
|
||||
#define UNICODE_H__
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <GfxFont.h>
|
||||
#include <CharTypes.h>
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
/*
|
||||
* Check if the unicode is valid for HTML
|
||||
* http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
|
||||
*/
|
||||
bool isLegalUnicode(Unicode u);
|
||||
|
||||
Unicode map_to_private(CharCode code);
|
||||
|
||||
/* Try to determine the Unicode value directly from the information in the font */
|
||||
Unicode unicode_from_font (CharCode code, GfxFont * font);
|
||||
|
||||
/*
|
||||
* We have to use a single Unicode value to reencode fonts
|
||||
* if we got multi-unicode values, it might be expanded ligature, try to restore it
|
||||
* if we cannot figure it out at the end, use a private mapping
|
||||
*/
|
||||
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
|
||||
|
||||
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
|
||||
|
||||
|
||||
} // namespace pdf2htmlEX
|
||||
|
||||
#endif //UNICODE_H__
|
163
src/util/util.cc
163
src/util/util.cc
@ -29,31 +29,6 @@ using std::ostream;
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
// Six-element transform matrix with unit diagonal terms and zero translation,
// i.e. the identity transform.
const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
|
||||
|
||||
// PDF base-14 font family name -> CSS font-family fallback list.
// The "Times" entry names "Times New Roman"; the previous spelling
// ("Time New Roman") is not an existing font family, so that fallback
// could never match.
const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP({
    { "Courier", "Courier,monospace" },
    { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    { "Times", "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol", "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
});
|
||||
|
||||
// Lookup table keyed by raw font-name byte sequences (written with hex
// escapes); each value is the ASCII font name it is replaced with.
const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP = {
    { "\xCB\xCE\xCC\xE5",        "SimSun"  },
    { "\xBA\xDA\xCC\xE5",        "SimHei"  },
    { "\xBF\xAC\xCC\xE5_GB2312", "SimKai"  },
    { "\xB7\xC2\xCB\xCE_GB2312", "SimFang" },
    { "\xC1\xA5\xCA\xE9",        "SimLi"   },
};
|
||||
|
||||
// Markup used to pull a file into the generated HTML.
//   key:   (file suffix, whether the file content is embedded inline)
//   value: (text emitted before, text emitted after)
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP = {
    // stylesheet: referenced by URL, then embedded inline
    { { ".css", false }, { "<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>" } },
    { { ".css", true  }, { "<style type=\"text/css\">", "</style>" } },
    // script: referenced by URL, then embedded inline
    { { ".js",  false }, { "<script type=\"text/javascript\" src=\"", "\"></script>" } },
    { { ".js",  true  }, { "<script type=\"text/javascript\">", "</script>" } },
};
|
||||
|
||||
void _tm_transform(const double * tm, double & x, double & y, bool is_delta)
|
||||
{
|
||||
double xx = x, yy = y;
|
||||
@ -79,144 +54,6 @@ void _tm_multiply(double * tm_left, const double * tm_right)
|
||||
tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5];
|
||||
}
|
||||
|
||||
/*
 * Returns false for code points in 0..31, 127..159 and 0xD800..0xDFFF,
 * true for everything else.
 * Note that tab (9), LF (10) and CR (13) are rejected as well: an
 * exemption for them existed once but was disabled.
 */
bool isLegalUnicode(Unicode u)
{
    const bool in_low_control_range  = (u < 0x20);
    const bool in_high_control_range = (u > 0x7e) && (u < 0xa0);
    const bool in_surrogate_range    = (u > 0xd7ff) && (u < 0xe000);

    return !(in_low_control_range || in_high_control_range || in_surrogate_range);
}
|
||||
|
||||
/*
 * Map a character code to a private-use code point.
 *
 * The code is first offset by 0xE000. When the result passes 0xF8FF the
 * overflow is carried past 0xF0000, and when that passes 0xFFFFD it is
 * carried past 0x100000. If even that range is exceeded (> 0x10FFFD) a
 * warning is printed to cerr and the out-of-range value is still returned.
 */
Unicode map_to_private(CharCode code)
{
    Unicode mapped = (Unicode)(code + 0xE000);
    if(mapped <= 0xF8FF)
        return mapped;

    // first range exhausted, continue above 0xF0000
    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
    if(mapped <= 0xFFFFD)
        return mapped;

    // second range exhausted, continue above 0x100000
    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
    if(mapped > 0x10FFFD)
        cerr << "Warning: all private use unicode are used" << endl;

    return mapped;
}
|
||||
|
||||
/*
 * Derive a Unicode value for `code` from the font itself.
 *
 * For a non-CID font the character name is looked up and translated with
 * globalParams->mapNameToUnicode(); the result is used only if
 * isLegalUnicode() accepts it. In every other case (CID font, no
 * character name, illegal result) the code gets a private-use mapping.
 */
Unicode unicode_from_font (CharCode code, GfxFont * font)
{
    if(font->isCIDFont())
        return map_to_private(code);

    // may be untranslated ligature
    char * glyph_name = dynamic_cast<Gfx8BitFont*>(font)->getCharName(code);
    if(!glyph_name)
        return map_to_private(code);

    Unicode named = globalParams->mapNameToUnicode(glyph_name);
    return isLegalUnicode(named) ? named : map_to_private(code);
}
|
||||
|
||||
/*
 * Pick the single Unicode value used for `code`:
 *  - no value given (len == 0): private-use mapping;
 *  - exactly one value that isLegalUnicode() accepts: that value;
 *  - anything else (illegal single value, or several values):
 *    whatever unicode_from_font() determines.
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
{
    switch(len)
    {
        case 0:
            return map_to_private(code);

        case 1:
            if(isLegalUnicode(*u))
                return *u;
            break;

        default:
            break;
    }

    return unicode_from_font(code, font);
}
|
||||
|
||||
/*
|
||||
* Copied from UTF.h / UTF8.h in poppler
|
||||
*/
|
||||
/*
 * Encode one code point as UTF-8 into buf.
 * Returns the number of bytes written (1..4), or 0 when u is above
 * 0x10ffff or bufSize is smaller than the encoding; buf is left untouched
 * in the 0 case.
 */
static int mapUTF8(Unicode u, char *buf, int bufSize) {
  // Length of the encoding is decided by the magnitude of u.
  int nbytes;
  if (u <= 0x0000007f)
    nbytes = 1;
  else if (u <= 0x000007ff)
    nbytes = 2;
  else if (u <= 0x0000ffff)
    nbytes = 3;
  else if (u <= 0x0010ffff)
    nbytes = 4;
  else
    return 0;

  if (bufSize < nbytes)
    return 0;

  switch (nbytes) {
    case 1:
      buf[0] = (char)u;
      break;
    case 2:
      buf[0] = (char)(0xc0 + (u >> 6));
      buf[1] = (char)(0x80 + (u & 0x3f));
      break;
    case 3:
      buf[0] = (char)(0xe0 + (u >> 12));
      buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
      buf[2] = (char)(0x80 + (u & 0x3f));
      break;
    default: // 4 bytes
      buf[0] = (char)(0xf0 + (u >> 18));
      buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
      buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
      buf[3] = (char)(0x80 + (u & 0x3f));
      break;
  }
  return nbytes;
}
|
||||
|
||||
/*
 * Write uLen code points from u to out as HTML-safe UTF-8 text.
 *
 * The five markup-significant characters & " ' < > are replaced by their
 * named character references; every other code point is UTF-8 encoded
 * with mapUTF8(). When mapUTF8() returns 0 (value above 0x10ffff) nothing
 * is written for that code point.
 */
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
    for(int i = 0; i < uLen; ++i)
    {
        switch(u[i])
        {
            case '&':
                out << "&amp;";
                break;
            case '\"':
                out << "&quot;";
                break;
            case '\'':
                out << "&apos;";
                break;
            case '<':
                out << "&lt;";
                break;
            case '>':
                out << "&gt;";
                break;
            default:
                {
                    // 4 bytes is the longest encoding mapUTF8 produces
                    char buf[4];
                    auto n = mapUTF8(u[i], buf, 4);
                    out.write(buf, n);
                }
        }
    }
}
|
||||
|
||||
// 64-character alphabet: 'A'-'Z', 'a'-'z', '0'-'9', then '+' and '/'.
const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
/*
|
||||
* Constants & Misc functions
|
||||
*
|
||||
* Help classes and Functions
|
||||
*
|
||||
* by WangLu
|
||||
* 2012.08.10
|
||||
@ -19,7 +18,8 @@
|
||||
#include <map>
|
||||
|
||||
#include <GfxState.h>
|
||||
#include <CharTypes.h>
|
||||
|
||||
#include "const.h"
|
||||
|
||||
#ifndef nullptr
|
||||
#define nullptr (NULL)
|
||||
@ -27,18 +27,6 @@
|
||||
|
||||
namespace pdf2htmlEX {
|
||||
|
||||
// Absolute tolerance used by the floating-point helpers _round, _equal and _is_positive.
static const double EPS = 1e-6;
|
||||
extern const double id_matrix[6];
|
||||
|
||||
// Default resolution in dots per inch.
// NOTE(review): presumably matches PDF's 72 units per inch -- confirm against callers.
static const double DEFAULT_DPI = 72.0;
|
||||
|
||||
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
|
||||
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
|
||||
// map to embed files into html
|
||||
// key: (suffix, if_embed_content)
|
||||
// value: (prefix string, suffix string)
|
||||
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
|
||||
|
||||
// Snap values within EPS of zero to exactly 0.0; any other value is returned unchanged.
static inline double _round(double x) { return (std::abs(x) > EPS) ? x : 0.0; }
|
||||
// True when x and y differ by less than EPS.
static inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; }
|
||||
// True only when x exceeds EPS, so values within EPS of zero do not count as positive.
static inline bool _is_positive(double x) { return x > EPS; }
|
||||
@ -59,26 +47,6 @@ static inline long long hash_ref(const Ref * id)
|
||||
return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the unicode is valid for HTML
|
||||
* http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
|
||||
*/
|
||||
bool isLegalUnicode(Unicode u);
|
||||
|
||||
Unicode map_to_private(CharCode code);
|
||||
|
||||
/* Try to determine the Unicode value directly from the information in the font */
|
||||
Unicode unicode_from_font (CharCode code, GfxFont * font);
|
||||
|
||||
/*
|
||||
* We have to use a single Unicode value to reencode fonts
|
||||
* if we got multi-unicode values, it might be expanded ligature, try to restore it
|
||||
* if we cannot figure it out at the end, use a private mapping
|
||||
*/
|
||||
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
|
||||
|
||||
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
|
||||
|
||||
class GfxRGB_hash
|
||||
{
|
||||
public:
|
||||
@ -233,5 +201,6 @@ void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2,
|
||||
|
||||
std::ostream & operator << (std::ostream & out, const GfxRGB & rgb);
|
||||
|
||||
} // namespace util
|
||||
} // namespace pdf2htmlEX
|
||||
|
||||
#endif //UTIL_H__
|
||||
|
Loading…
Reference in New Issue
Block a user