1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

reorganizing

This commit is contained in:
Lu Wang 2012-11-29 17:45:26 +08:00
parent 63287ce491
commit 35fccdc28c
11 changed files with 289 additions and 209 deletions

View File

@ -163,15 +163,19 @@ add_executable(pdf2htmlEX
src/BackgroundRenderer/SplashBackgroundRenderer.cc
src/BackgroundRenderer/CairoBackgroundRenderer.h
src/BackgroundRenderer/CairoBackgroundRenderer.cc
src/util/namespace.h
src/util/ffw.h
src/util/ffw.c
src/util/util.h
src/util/util.cc
src/util/TmpFiles.h
src/util/TmpFiles.cc
src/util/ArgParser.h
src/util/ArgParser.cc
src/util/const.h
src/util/const.cc
src/util/ffw.h
src/util/ffw.c
src/util/namespace.h
src/util/TmpFiles.h
src/util/TmpFiles.cc
src/util/unicode.h
src/util/unicode.cc
src/util/util.h
src/util/util.cc
)
target_link_libraries(pdf2htmlEX ${PDF2HTMLEX_LIBS})

View File

@ -11,6 +11,7 @@
#include "HTMLRenderer.h"
#include "util/namespace.h"
#include "util/unicode.h"
namespace pdf2htmlEX {

View File

@ -99,7 +99,7 @@ void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm)
// we have already shifted the origin
// TODO: recognize common matrices
if(_tm_equal(tm, id_matrix, 4))
if(_tm_equal(tm, ID_MATRIX, 4))
{
auto prefixes = {"", "-ms-", "-moz-", "-webkit-", "-o-"};
for(auto iter = prefixes.begin(); iter != prefixes.end(); ++iter)

View File

@ -170,8 +170,8 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
cur_font_size = draw_font_size = 0;
cur_fs_id = install_font_size(cur_font_size);
memcpy(cur_text_tm, id_matrix, sizeof(cur_text_tm));
memcpy(draw_text_tm, id_matrix, sizeof(draw_text_tm));
memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm));
memcpy(draw_text_tm, ID_MATRIX, sizeof(draw_text_tm));
cur_ttm_id = install_transform_matrix(draw_text_tm);
cur_letter_space = cur_word_space = 0;

View File

@ -18,6 +18,7 @@
#include "HTMLRenderer.h"
#include "util/ffw.h"
#include "util/namespace.h"
#include "util/unicode.h"
namespace pdf2htmlEX {

39
src/util/const.cc Normal file
View File

@ -0,0 +1,39 @@
/*
* Constants
*
* by WangLu
* 2012.11.29
*/
#include "const.h"
namespace pdf2htmlEX {
using std::map;
using std::string;
// Identity transform in 6-element form: the first four entries hold the
// 2x2 identity, the last two entries are zero.
const double ID_MATRIX[6] = {
    1.0, 0.0,
    0.0, 1.0,
    0.0, 0.0
};
// PDF base 14 font name -> CSS font name list.
// Every quoted family must be spelled exactly like the real font, otherwise
// that fallback never matches: the Times entry names "Times New Roman"
// (it used to read "Time New Roman").
const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP({
    { "Courier",      "Courier,monospace" },
    { "Helvetica",    "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    { "Times",        "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol",       "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
});
// Font names given as raw (GB encoded) byte strings -> replacement font name.
// The keys are byte sequences, not readable text; keep the escapes untouched.
const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP({
    { "\xCB\xCE\xCC\xE5",        "SimSun"  },
    { "\xBA\xDA\xCC\xE5",        "SimHei"  },
    { "\xBF\xAC\xCC\xE5_GB2312", "SimKai"  },
    { "\xB7\xC2\xCB\xCE_GB2312", "SimFang" },
    { "\xC1\xA5\xCA\xE9",        "SimLi"   },
});
// How to reference or inline an external file in the generated HTML.
// key:   (file suffix, true when the file content is embedded)
// value: (text placed before, text placed after)
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({
    // stylesheets
    { { ".css", false }, { "<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>" } },
    { { ".css", true  }, { "<style type=\"text/css\">", "</style>" } },
    // scripts
    { { ".js",  false }, { "<script type=\"text/javascript\" src=\"", "\"></script>" } },
    { { ".js",  true  }, { "<script type=\"text/javascript\">", "</script>" } }
});
} //namespace pdf2htmlEX

31
src/util/const.h Normal file
View File

@ -0,0 +1,31 @@
/*
* Constants
*
* by WangLu
* 2012.11.29
*/
#ifndef CONST_H__
#define CONST_H__
#include <map>
#include <string>
namespace pdf2htmlEX {
// Absolute tolerance used when comparing floating-point values.
static const double EPS = 1e-6;
// Default resolution in dots per inch.
// NOTE(review): 72 presumably mirrors the PDF unit of 72 points per inch - confirm.
static const double DEFAULT_DPI = 72.0;
// 6-element identity transform matrix; defined in const.cc as {1, 0, 0, 1, 0, 0}.
extern const double ID_MATRIX[6];
// PDF base 14 font name -> CSS font name
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
// For GB encoded font names: raw byte-string name -> replacement font name
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
// map to embed files into html
// key: (suffix, if_embed_content)
// value: (prefix string, suffix string)
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
} // namespace pdf2htmlEX
#endif //CONST_H__

157
src/util/unicode.cc Normal file
View File

@ -0,0 +1,157 @@
/*
* Unicode manipulation functions
*
* by WangLu
* 2012.11.29
*/
#include <GlobalParams.h>
#include "unicode.h"
namespace pdf2htmlEX {
using std::cerr;
using std::endl;
using std::ostream;
/*
 * Decide whether a code point is acceptable.
 *
 * Rejected ranges:
 *   0x00   - 0x1f    (tab, LF and CR included; no exception is made for them)
 *   0x7f   - 0x9f
 *   0xd800 - 0xdfff  (surrogates)
 * Everything else is accepted.
 */
bool isLegalUnicode(Unicode u)
{
    const bool low_control  = (u < 0x20);
    const bool high_control = (u > 0x7e) && (u < 0xa0);
    const bool surrogate    = (u > 0xd7ff) && (u < 0xe000);

    return !(low_control || high_control || surrogate);
}
/*
 * Map a character code into the private-use code point ranges.
 *
 * The code is first offset from 0xE000. Values running past 0xF8FF are
 * re-based onto 0xF0000, and values running past 0xFFFFD are re-based onto
 * 0x100000. If even that exceeds 0x10FFFD a warning is printed to stderr and
 * the out-of-range value is still returned.
 */
Unicode map_to_private(CharCode code)
{
    Unicode mapped = (Unicode)(code + 0xE000);
    if(mapped <= 0xF8FF)
        return mapped;

    // first range exhausted, continue in the second one
    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
    if(mapped <= 0xFFFFD)
        return mapped;

    // second range exhausted, continue in the third one
    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
    if(mapped > 0x10FFFD)
        cerr << "Warning: all private use unicode are used" << endl;

    return mapped;
}
/*
 * Try to determine the Unicode value directly from the information in the font.
 *
 * For a non-CID font the glyph name of `code` is looked up and translated
 * through globalParams->mapNameToUnicode(); the result is returned when
 * isLegalUnicode() accepts it. In every other case (CID font, font that is
 * not actually a Gfx8BitFont, no glyph name, illegal result) the code is
 * mapped to a private-use code point via map_to_private().
 */
Unicode unicode_from_font (CharCode code, GfxFont * font)
{
    if(!font->isCIDFont())
    {
        // The cast yields a null pointer if the font is not a Gfx8BitFont;
        // never dereference it unchecked.
        Gfx8BitFont * font8 = dynamic_cast<Gfx8BitFont*>(font);
        if(font8)
        {
            char * cname = font8->getCharName(code);
            // may be untranslated ligature
            if(cname)
            {
                Unicode ou = globalParams->mapNameToUnicode(cname);
                if(isLegalUnicode(ou))
                    return ou;
            }
        }
    }
    return map_to_private(code);
}
/*
 * Reduce the Unicode sequence of one character code to a single value.
 *
 *   len == 0           -> private mapping of `code`
 *   len == 1 and legal -> that value itself
 *   anything else      -> whatever unicode_from_font() derives from the font
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
{
    switch(len)
    {
        case 0:
            return map_to_private(code);
        case 1:
            if(isLegalUnicode(u[0]))
                return u[0];
            break;
        default:
            break;
    }
    return unicode_from_font(code, font);
}
/*
 * Encode one code point as UTF-8.
 * Copied from UTF.h / UTF8.h in poppler
 *
 *   u       - value to encode
 *   buf     - receives the encoded bytes (no terminator is appended)
 *   bufSize - capacity of buf in bytes
 *
 * Returns the number of bytes written (1 to 4), or 0 when buf is too small
 * for the sequence or u is greater than 0x10ffff.
 */
static int mapUTF8(Unicode u, char *buf, int bufSize) {
    // How many bytes does this value need?
    int len;
    if (u <= 0x0000007f)
        len = 1;
    else if (u <= 0x000007ff)
        len = 2;
    else if (u <= 0x0000ffff)
        len = 3;
    else if (u <= 0x0010ffff)
        len = 4;
    else
        return 0;

    if (bufSize < len)
        return 0;

    switch (len) {
    case 1:
        buf[0] = (char)u;
        break;
    case 2:
        buf[0] = (char)(0xc0 | (u >> 6));
        buf[1] = (char)(0x80 | (u & 0x3f));
        break;
    case 3:
        buf[0] = (char)(0xe0 | (u >> 12));
        buf[1] = (char)(0x80 | ((u >> 6) & 0x3f));
        buf[2] = (char)(0x80 | (u & 0x3f));
        break;
    default: // 4
        buf[0] = (char)(0xf0 | (u >> 18));
        buf[1] = (char)(0x80 | ((u >> 12) & 0x3f));
        buf[2] = (char)(0x80 | ((u >> 6) & 0x3f));
        buf[3] = (char)(0x80 | (u & 0x3f));
        break;
    }
    return len;
}
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
for(int i = 0; i < uLen; ++i)
{
switch(u[i])
{
case '&':
out << "&amp;";
break;
case '\"':
out << "&quot;";
break;
case '\'':
out << "&apos;";
break;
case '<':
out << "&lt;";
break;
case '>':
out << "&gt;";
break;
default:
{
char buf[4];
auto n = mapUTF8(u[i], buf, 4);
out.write(buf, n);
}
}
}
}
} //namespace pdf2htmlEX

41
src/util/unicode.h Normal file
View File

@ -0,0 +1,41 @@
/*
* Unicode manipulation functions
*
* by WangLu
* 2012.11.29
*/
#ifndef UNICODE_H__
#define UNICODE_H__
#include <iostream>
#include <GfxFont.h>
#include <CharTypes.h>
namespace pdf2htmlEX {
/*
 * Check if the unicode is valid for HTML
 * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
 */
bool isLegalUnicode(Unicode u);
/*
 * Map a character code to a private-use code point (starting at 0xE000)
 */
Unicode map_to_private(CharCode code);
/*
 * Try to determine the Unicode value directly from the information in the font
 */
Unicode unicode_from_font (CharCode code, GfxFont * font);
/*
 * We have to use a single Unicode value to reencode fonts
 * if we got multi-unicode values, it might be expanded ligature, try to restore it
 * if we cannot figure it out at the end, use a private mapping
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
/*
 * Write uLen code points from u to out: & " ' < > as named entities,
 * everything else UTF-8 encoded
 */
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
} // namespace pdf2htmlEX
#endif //UNICODE_H__

View File

@ -29,31 +29,6 @@ using std::ostream;
namespace pdf2htmlEX {
// Identity transform in 6-element form: the first four entries hold the
// 2x2 identity, the last two entries are zero.
const double id_matrix[6] = {
    1.0, 0.0,
    0.0, 1.0,
    0.0, 0.0
};
// PDF base 14 font name -> CSS font name list.
// Every quoted family must be spelled exactly like the real font, otherwise
// that fallback never matches: the Times entry names "Times New Roman"
// (it used to read "Time New Roman").
const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP({
    { "Courier",      "Courier,monospace" },
    { "Helvetica",    "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    { "Times",        "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol",       "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
});
// Font names given as raw (GB encoded) byte strings -> replacement font name.
// The keys are byte sequences, not readable text; keep the escapes untouched.
const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP({
    { "\xCB\xCE\xCC\xE5",        "SimSun"  },
    { "\xBA\xDA\xCC\xE5",        "SimHei"  },
    { "\xBF\xAC\xCC\xE5_GB2312", "SimKai"  },
    { "\xB7\xC2\xCB\xCE_GB2312", "SimFang" },
    { "\xC1\xA5\xCA\xE9",        "SimLi"   },
});
// How to reference or inline an external file in the generated HTML.
// key:   (file suffix, true when the file content is embedded)
// value: (text placed before, text placed after)
const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP({
    // stylesheets
    { { ".css", false }, { "<link rel=\"stylesheet\" type=\"text/css\" href=\"", "\"/>" } },
    { { ".css", true  }, { "<style type=\"text/css\">", "</style>" } },
    // scripts
    { { ".js",  false }, { "<script type=\"text/javascript\" src=\"", "\"></script>" } },
    { { ".js",  true  }, { "<script type=\"text/javascript\">", "</script>" } }
});
void _tm_transform(const double * tm, double & x, double & y, bool is_delta)
{
double xx = x, yy = y;
@ -79,144 +54,6 @@ void _tm_multiply(double * tm_left, const double * tm_right)
tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5];
}
/*
 * Decide whether a code point is acceptable.
 *
 * Rejected ranges:
 *   0x00   - 0x1f    (tab, LF and CR included; no exception is made for them)
 *   0x7f   - 0x9f
 *   0xd800 - 0xdfff  (surrogates)
 * Everything else is accepted.
 */
bool isLegalUnicode(Unicode u)
{
    const bool low_control  = (u < 0x20);
    const bool high_control = (u > 0x7e) && (u < 0xa0);
    const bool surrogate    = (u > 0xd7ff) && (u < 0xe000);

    return !(low_control || high_control || surrogate);
}
/*
 * Map a character code into the private-use code point ranges.
 *
 * The code is first offset from 0xE000. Values running past 0xF8FF are
 * re-based onto 0xF0000, and values running past 0xFFFFD are re-based onto
 * 0x100000. If even that exceeds 0x10FFFD a warning is printed to stderr and
 * the out-of-range value is still returned.
 */
Unicode map_to_private(CharCode code)
{
    Unicode mapped = (Unicode)(code + 0xE000);
    if(mapped <= 0xF8FF)
        return mapped;

    // first range exhausted, continue in the second one
    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
    if(mapped <= 0xFFFFD)
        return mapped;

    // second range exhausted, continue in the third one
    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
    if(mapped > 0x10FFFD)
        cerr << "Warning: all private use unicode are used" << endl;

    return mapped;
}
/*
 * Try to determine the Unicode value directly from the information in the font.
 *
 * For a non-CID font the glyph name of `code` is looked up and translated
 * through globalParams->mapNameToUnicode(); the result is returned when
 * isLegalUnicode() accepts it. In every other case (CID font, font that is
 * not actually a Gfx8BitFont, no glyph name, illegal result) the code is
 * mapped to a private-use code point via map_to_private().
 */
Unicode unicode_from_font (CharCode code, GfxFont * font)
{
    if(!font->isCIDFont())
    {
        // The cast yields a null pointer if the font is not a Gfx8BitFont;
        // never dereference it unchecked.
        Gfx8BitFont * font8 = dynamic_cast<Gfx8BitFont*>(font);
        if(font8)
        {
            char * cname = font8->getCharName(code);
            // may be untranslated ligature
            if(cname)
            {
                Unicode ou = globalParams->mapNameToUnicode(cname);
                if(isLegalUnicode(ou))
                    return ou;
            }
        }
    }
    return map_to_private(code);
}
/*
 * Reduce the Unicode sequence of one character code to a single value.
 *
 *   len == 0           -> private mapping of `code`
 *   len == 1 and legal -> that value itself
 *   anything else      -> whatever unicode_from_font() derives from the font
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
{
    switch(len)
    {
        case 0:
            return map_to_private(code);
        case 1:
            if(isLegalUnicode(u[0]))
                return u[0];
            break;
        default:
            break;
    }
    return unicode_from_font(code, font);
}
/*
 * Encode one code point as UTF-8.
 * Copied from UTF.h / UTF8.h in poppler
 *
 *   u       - value to encode
 *   buf     - receives the encoded bytes (no terminator is appended)
 *   bufSize - capacity of buf in bytes
 *
 * Returns the number of bytes written (1 to 4), or 0 when buf is too small
 * for the sequence or u is greater than 0x10ffff.
 */
static int mapUTF8(Unicode u, char *buf, int bufSize) {
    // How many bytes does this value need?
    int len;
    if (u <= 0x0000007f)
        len = 1;
    else if (u <= 0x000007ff)
        len = 2;
    else if (u <= 0x0000ffff)
        len = 3;
    else if (u <= 0x0010ffff)
        len = 4;
    else
        return 0;

    if (bufSize < len)
        return 0;

    switch (len) {
    case 1:
        buf[0] = (char)u;
        break;
    case 2:
        buf[0] = (char)(0xc0 | (u >> 6));
        buf[1] = (char)(0x80 | (u & 0x3f));
        break;
    case 3:
        buf[0] = (char)(0xe0 | (u >> 12));
        buf[1] = (char)(0x80 | ((u >> 6) & 0x3f));
        buf[2] = (char)(0x80 | (u & 0x3f));
        break;
    default: // 4
        buf[0] = (char)(0xf0 | (u >> 18));
        buf[1] = (char)(0x80 | ((u >> 12) & 0x3f));
        buf[2] = (char)(0x80 | ((u >> 6) & 0x3f));
        buf[3] = (char)(0x80 | (u & 0x3f));
        break;
    }
    return len;
}
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
for(int i = 0; i < uLen; ++i)
{
switch(u[i])
{
case '&':
out << "&amp;";
break;
case '\"':
out << "&quot;";
break;
case '\'':
out << "&apos;";
break;
case '<':
out << "&lt;";
break;
case '>':
out << "&gt;";
break;
default:
{
char buf[4];
auto n = mapUTF8(u[i], buf, 4);
out.write(buf, n);
}
}
}
}
const char * base64stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

View File

@ -1,6 +1,5 @@
/*
* Constants & Misc functions
*
* Help classes and Functions
*
* by WangLu
* 2012.08.10
@ -19,7 +18,8 @@
#include <map>
#include <GfxState.h>
#include <CharTypes.h>
#include "const.h"
#ifndef nullptr
#define nullptr (NULL)
@ -27,18 +27,6 @@
namespace pdf2htmlEX {
// Absolute tolerance used when comparing floating-point values.
static const double EPS = 1e-6;
// 6-element identity transform matrix; its definition holds {1, 0, 0, 1, 0, 0}.
extern const double id_matrix[6];
// Default resolution in dots per inch.
// NOTE(review): 72 presumably mirrors the PDF unit of 72 points per inch - confirm.
static const double DEFAULT_DPI = 72.0;
// PDF base 14 font name -> CSS font name
extern const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP;
// For GB encoded font names: raw byte-string name -> replacement font name
extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
// map to embed files into html
// key: (suffix, if_embed_content)
// value: (prefix string, suffix string)
extern const std::map<std::pair<std::string, bool>, std::pair<std::string, std::string> > EMBED_STRING_MAP;
// Pass x through unchanged when its magnitude exceeds EPS, otherwise give 0.0.
static inline double _round(double x)
{
    if(std::abs(x) > EPS)
        return x;
    return 0.0;
}
// True when x and y differ by less than EPS.
static inline bool _equal(double x, double y)
{
    const double diff = x - y;
    return std::abs(diff) < EPS;
}
// True only when x is larger than EPS (values within EPS of zero do not count).
static inline bool _is_positive(double x)
{
    return EPS < x;
}
@ -59,26 +47,6 @@ static inline long long hash_ref(const Ref * id)
return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen);
}
/*
 * Check if the unicode is valid for HTML
 * http://en.wikipedia.org/wiki/HTML_decimal_character_rendering
 */
bool isLegalUnicode(Unicode u);
/*
 * Map a character code to a private-use code point (starting at 0xE000)
 */
Unicode map_to_private(CharCode code);
/*
 * Try to determine the Unicode value directly from the information in the font
 */
Unicode unicode_from_font (CharCode code, GfxFont * font);
/*
 * We have to use a single Unicode value to reencode fonts
 * if we got multi-unicode values, it might be expanded ligature, try to restore it
 * if we cannot figure it out at the end, use a private mapping
 */
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
/*
 * Write uLen code points from u to out: & " ' < > as named entities,
 * everything else UTF-8 encoded
 */
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
class GfxRGB_hash
{
public:
@ -233,5 +201,6 @@ void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2,
std::ostream & operator << (std::ostream & out, const GfxRGB & rgb);
} // namespace util
} // namespace pdf2htmlEX
#endif //UTIL_H__