1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

url escaping in HTML

This commit is contained in:
Lu Wang 2013-02-15 13:07:00 +08:00
parent b8ff5be406
commit 03b0f382c8
8 changed files with 170 additions and 79 deletions

View File

@ -167,6 +167,8 @@ add_executable(pdf2htmlEX
src/util/base64stream.cc
src/util/const.h
src/util/const.cc
src/util/encoding.h
src/util/encoding.cc
src/util/ffw.h
src/util/ffw.c
src/util/math.h

View File

@ -14,6 +14,7 @@
#include "util/unicode.h"
#include "util/math.h"
#include "util/CSSClassNames.h"
#include "util/encoding.h"
namespace pdf2htmlEX {

View File

@ -17,6 +17,7 @@
#include "util/namespace.h"
#include "util/math.h"
#include "util/misc.h"
#include "util/encoding.h"
namespace pdf2htmlEX {
@ -193,7 +194,9 @@ void HTMLRenderer::processLink(AnnotLink * al)
if(!dest_str.empty())
{
f_pages.fs << "<a class=\"" << CSS::LINK_CN << "\" href=\"" << dest_str << "\"";
f_pages.fs << "<a class=\"" << CSS::LINK_CN << "\" href=\"";
outputURL(f_pages.fs, dest_str);
f_pages.fs << "\"";
if(!dest_detail_str.empty())
f_pages.fs << " data-dest-detail='" << dest_detail_str << "'";

View File

@ -14,7 +14,7 @@
#include "HTMLRenderer.h"
#include "util/namespace.h"
#include "util/unicode.h"
#include "util/encoding.h"
namespace pdf2htmlEX {

129
src/util/encoding.cc Normal file
View File

@ -0,0 +1,129 @@
/*
* Encodings used in HTML
*
* by WangLu
* 2013.02.15
*/
#include <cstring>
#include "encoding.h"
namespace pdf2htmlEX {
using std::ostream;
using std::string;
/*
* Copied from UTF.h / UTF8.h in poppler
*/
static int mapUTF8(Unicode u, char *buf, int bufSize)
{
    /*
     * Encode the single code point `u` as UTF-8 into `buf`.
     *
     * Returns the number of bytes written (1 to 4), or 0 when `u` is
     * larger than 0x10ffff or when `bufSize` is too small to hold the
     * whole sequence. Nothing is written to `buf` when 0 is returned.
     */

    // How many bytes does this code point need?
    int len;
    if (u <= 0x0000007f)
        len = 1;
    else if (u <= 0x000007ff)
        len = 2;
    else if (u <= 0x0000ffff)
        len = 3;
    else if (u <= 0x0010ffff)
        len = 4;
    else
        return 0;

    if (bufSize < len)
        return 0;

    // Marker bits of the first byte, indexed by sequence length
    static const unsigned int leadBits[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };

    // Trailing bytes carry 6 payload bits each, filled from last to first
    for (int pos = len - 1; pos > 0; --pos)
    {
        buf[pos] = (char)(0x80 + (u & 0x3f));
        u >>= 6;
    }

    // Whatever is left of `u` goes into the first byte
    buf[0] = (char)(leadBits[len] + u);
    return len;
}
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
    /*
     * Write the first `uLen` code points of `u` to `out`.
     *
     * & " ' < > are replaced by their named entities; every other code
     * point is written as the UTF-8 bytes produced by mapUTF8 (which
     * yields no bytes for values it cannot encode).
     */
    int idx = 0;
    while(idx < uLen)
    {
        const Unicode cp = u[idx];
        ++idx;

        const char * entity = nullptr;
        if(cp == '&')
            entity = "&amp;";
        else if(cp == '\"')
            entity = "&quot;";
        else if(cp == '\'')
            entity = "&apos;";
        else if(cp == '<')
            entity = "&lt;";
        else if(cp == '>')
            entity = "&gt;";

        if(entity)
        {
            out << entity;
            continue;
        }

        char utf8[4];
        auto count = mapUTF8(cp, utf8, 4);
        out.write(utf8, count);
    }
}
/*
 * Write the byte `c` to `out` as exactly two lowercase hexadecimal digits.
 */
static void outputHEX(std::ostream & out, char c)
{
    static const char hexchars[] = "0123456789abcdef";
    // Work on the unsigned value so that bytes >= 0x80 never yield a negative index
    const unsigned char byte = static_cast<unsigned char>(c);
    out << hexchars[byte >> 4] << hexchars[byte & 0xf];
}

/*
 * Write `s` to `out` with URL (percent) encoding applied.
 *
 * Bytes that are reserved or unreserved characters of RFC 3986, and '%',
 * are copied unchanged; every other byte is written as '%' followed by
 * two lowercase hex digits.
 *
 * http://tools.ietf.org/html/rfc3986#section-2
 */
void outputURL(std::ostream & out, const std::string & s)
{
    // Lookup table, one entry per possible byte value: true = copy as is
    struct KeepTable
    {
        bool keep[256];

        KeepTable()
        {
            for(int i = 0; i < 256; ++i)
                keep[i] = false;

            /*
             * Also includes '%', in case that the original url has been escaped
             */
            const char * no_escape_chars = ":/?#[]@!$&'()*+,;="
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                "abcdefghijklmnopqrstuvwxyz"
                "0123456789"
                "-._~"
                "%";
            for(const char * p = no_escape_chars; *p; ++p)
                keep[static_cast<unsigned char>(*p)] = true;
        }
    };

    // Built on the first call; a static object needs no manual allocation or cleanup
    static const KeepTable table;

    for(auto iter = s.begin(); iter != s.end(); ++iter)
    {
        const char c = *iter;
        // Index by unsigned value: plain char may be signed, and a negative
        // index for bytes >= 0x80 would read outside the table
        if(table.keep[static_cast<unsigned char>(c)])
            out << c;
        else
        {
            out << '%';
            outputHEX(out, c);
        }
    }
}
} //namespace pdf2htmlEX

31
src/util/encoding.h Normal file
View File

@ -0,0 +1,31 @@
/*
* Encodings used in HTML
*
* by WangLu
* 2013.02.15
*/
#ifndef ENCODING_H__
#define ENCODING_H__
#include <string>
#include <iostream>
#include <CharTypes.h>
namespace pdf2htmlEX {
/*
 * Escape necessary characters, and map Unicode to UTF-8
 *
 * Writes the first uLen code points of u to out.
 * The characters & " ' < > are written as the entities
 * &amp; &quot; &apos; &lt; &gt;
 * All other code points are written as UTF-8; values above 0x10ffff
 * produce no output.
 */
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
/*
 * URL encoding
 *
 * Writes s to out. Reserved and unreserved characters of RFC 3986,
 * as well as '%', are copied unchanged; any other byte is written as
 * '%' followed by two lowercase hexadecimal digits.
 */
void outputURL(std::ostream & out, const std::string & s);
} // namespace pdf2htmlEX
#endif //ENCODING_H__

View File

@ -5,6 +5,8 @@
* 2012.11.29
*/
#include <iostream>
#include <GlobalParams.h>
#include "unicode.h"
@ -84,74 +86,4 @@ Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
return unicode_from_font(code, font);
}
/*
* Copied from UTF.h / UTF8.h in poppler
*/
static int mapUTF8(Unicode u, char *buf, int bufSize) {
    /*
     * Encode the single code point `u` as UTF-8 into `buf`.
     *
     * Returns the number of bytes written (1 to 4), or 0 when `u` is
     * larger than 0x10ffff or when `bufSize` is too small to hold the
     * whole sequence. Nothing is written to `buf` when 0 is returned.
     */

    // How many bytes does this code point need?
    int len;
    if (u <= 0x0000007f)
        len = 1;
    else if (u <= 0x000007ff)
        len = 2;
    else if (u <= 0x0000ffff)
        len = 3;
    else if (u <= 0x0010ffff)
        len = 4;
    else
        return 0;

    if (bufSize < len)
        return 0;

    // Marker bits of the first byte, indexed by sequence length
    static const unsigned int leadBits[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };

    // Trailing bytes carry 6 payload bits each, filled from last to first
    for (int pos = len - 1; pos > 0; --pos) {
        buf[pos] = (char)(0x80 + (u & 0x3f));
        u >>= 6;
    }

    // Whatever is left of `u` goes into the first byte
    buf[0] = (char)(leadBits[len] + u);
    return len;
}
void outputUnicodes(ostream & out, const Unicode * u, int uLen)
{
    /*
     * Write the first `uLen` code points of `u` to `out`.
     *
     * & " ' < > are replaced by their named entities; every other code
     * point is written as the UTF-8 bytes produced by mapUTF8 (which
     * yields no bytes for values it cannot encode).
     */
    int idx = 0;
    while(idx < uLen)
    {
        const Unicode cp = u[idx];
        ++idx;

        const char * entity = nullptr;
        if(cp == '&')
            entity = "&amp;";
        else if(cp == '\"')
            entity = "&quot;";
        else if(cp == '\'')
            entity = "&apos;";
        else if(cp == '<')
            entity = "&lt;";
        else if(cp == '>')
            entity = "&gt;";

        if(entity)
        {
            out << entity;
            continue;
        }

        char utf8[4];
        auto count = mapUTF8(cp, utf8, 4);
        out.write(utf8, count);
    }
}
} //namespace pdf2htmlEX

View File

@ -8,8 +8,6 @@
#ifndef UNICODE_H__
#define UNICODE_H__
#include <iostream>
#include <GfxFont.h>
#include <CharTypes.h>
@ -33,11 +31,6 @@ Unicode unicode_from_font (CharCode code, GfxFont * font);
*/
Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
/*
* Escape necessary characters, and map Unicode to UTF-8
*/
void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
} // namespace pdf2htmlEX