url escaping in HTML

2024-12-22 04:50:09 +00:00 · 2013-02-15 13:07:00 +08:00 · 2013-02-15 13:07:00 +08:00 · 03b0f382c8
commit 03b0f382c8
parent b8ff5be406
8 changed files with 170 additions and 79 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -167,6 +167,8 @@ add_executable(pdf2htmlEX
    src/util/base64stream.cc
    src/util/const.h
    src/util/const.cc
+    src/util/encoding.h
+    src/util/encoding.cc
    src/util/ffw.h
    src/util/ffw.c
    src/util/math.h
--- a/src/HTMLRenderer/TextLineBuffer.cc
+++ b/src/HTMLRenderer/TextLineBuffer.cc
@ -14,6 +14,7 @@
 #include "util/unicode.h"
 #include "util/math.h"
 #include "util/CSSClassNames.h"
+#include "util/encoding.h"

 namespace pdf2htmlEX {

--- a/src/HTMLRenderer/link.cc
+++ b/src/HTMLRenderer/link.cc
@ -17,6 +17,7 @@
 #include "util/namespace.h"
 #include "util/math.h"
 #include "util/misc.h"
+#include "util/encoding.h"

 namespace pdf2htmlEX {
   
@ -193,7 +194,9 @@ void HTMLRenderer::processLink(AnnotLink * al)

    if(!dest_str.empty())
    {
-        f_pages.fs << "<a class=\"" << CSS::LINK_CN << "\" href=\"" << dest_str << "\"";
+        f_pages.fs << "<a class=\"" << CSS::LINK_CN << "\" href=\"";
+        outputURL(f_pages.fs, dest_str);
+        f_pages.fs << "\"";

        if(!dest_detail_str.empty())
            f_pages.fs << " data-dest-detail='" << dest_detail_str << "'";
--- a/src/HTMLRenderer/outline.cc
+++ b/src/HTMLRenderer/outline.cc
@ -14,7 +14,7 @@

 #include "HTMLRenderer.h"
 #include "util/namespace.h"
-#include "util/unicode.h"
+#include "util/encoding.h"

 namespace pdf2htmlEX {

--- a/src/util/encoding.cc
+++ b/src/util/encoding.cc
@ -0,0 +1,129 @@
+/*
+ * Encodings used in HTML
+ *
+ * by WangLu
+ * 2013.02.15
+ */
+
+#include <cstring>
+
+#include "encoding.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+using std::string;
+
+/*
+ * Encode the code point u as UTF-8 into buf, which has room for bufSize
+ * bytes. Returns the number of bytes written (1 to 4), or 0 without
+ * touching buf when u is above 0x10ffff or buf is too small.
+ * No terminating NUL is appended.
+ *
+ * Copied from UTF.h / UTF8.h in poppler
+ */
+static int mapUTF8(Unicode u, char *buf, int bufSize)
+{
+    // Value added into the first byte for each encoded length
+    // (index 0 is never used).
+    static const unsigned char lead[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
+
+    // Number of bytes the encoding takes; stays 0 when u is out of range.
+    int len = 0;
+    if (u <= 0x0000007f)
+        len = 1;
+    else if (u <= 0x000007ff)
+        len = 2;
+    else if (u <= 0x0000ffff)
+        len = 3;
+    else if (u <= 0x0010ffff)
+        len = 4;
+
+    // Nothing is written unless the whole sequence fits.
+    if (len == 0 || bufSize < len)
+        return 0;
+
+    // Fill the trailing bytes from last to first, six bits of u apiece;
+    // what is left of u afterwards belongs in the first byte.
+    for (int i = len - 1; i > 0; --i) {
+        buf[i] = (char)(0x80 + (u & 0x3f));
+        u >>= 6;
+    }
+
+    buf[0] = (char)(lead[len] + u);
+    return len;
+}
+
+/*
+ * Write the uLen code points in u to out as UTF-8, with & " ' < >
+ * replaced by their HTML entities.
+ */
+void outputUnicodes(ostream & out, const Unicode * u, int uLen)
+{
+    for(int idx = 0; idx < uLen; ++idx)
+    {
+        const Unicode cp = u[idx];
+        const char * entity = nullptr; // stays null unless cp needs escaping
+        switch(cp)
+        {
+            case '&':  entity = "&amp;";  break;
+            case '\"': entity = "&quot;"; break;
+            case '\'': entity = "&apos;"; break;
+            case '<':  entity = "&lt;";   break;
+            case '>':  entity = "&gt;";   break;
+        }
+        if(entity)
+        {
+            out << entity;
+            continue;
+        }
+
+        // Anything else is emitted as its UTF-8 byte sequence.
+        char utf8[4];
+        const int written = mapUTF8(cp, utf8, 4);
+        out.write(utf8, written);
+    }
+}
+
+/*
+ * Write the byte c to out as two lowercase hexadecimal digits.
+ */
+static void outputHEX(ostream & out, char c)
+{
+    static const char * hexchars = "0123456789abcdef";
+    out << hexchars[(c>>4)&0xf] << hexchars[c&0xf];
+}
+
+/*
+ * Percent-encode s into out. Bytes that RFC 3986 section 2 lists as
+ * reserved or unreserved are copied unchanged, and so is '%', in case
+ * that the original url has been escaped. Every other byte, non-ASCII
+ * ones included, is written as '%' followed by two hex digits.
+ */
+void outputURL(ostream & out, const string & s)
+{
+    // Table indexed by byte value, filled in once on the first call.
+    static const bool * const dont_escape = [] {
+        static bool table[256] = {};
+        // http://tools.ietf.org/html/rfc3986#section-2
+        const char * no_escape_chars = ":/?#[]@!$&'()*+,;="
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            "abcdefghijklmnopqrstuvwxyz"
+            "0123456789" "-._~" "%";
+        for(; *no_escape_chars; ++no_escape_chars)
+            table[static_cast<unsigned char>(*no_escape_chars)] = true;
+        return table;
+    }();
+
+    for(char c : s)
+    {
+        // char may be signed: go through unsigned char so that bytes
+        // >= 0x80 do not index before the start of the table.
+        if(dont_escape[static_cast<unsigned char>(c)])
+            out << c;
+        else
+            outputHEX(out << '%', c);
+    }
+}
+
+} //namespace pdf2htmlEX
--- a/src/util/encoding.h
+++ b/src/util/encoding.h
@ -0,0 +1,31 @@
+/*
+ * Encodings used in HTML
+ *
+ * by WangLu
+ * 2013.02.15
+ */
+
+#ifndef ENCODING_H__
+#define ENCODING_H__
+
+#include <string>
+#include <iostream>
+
+#include <CharTypes.h>
+
+namespace pdf2htmlEX {
+
+/*
+ * Escape necessary characters, and map Unicode to UTF-8:
+ * writes the uLen code points of u to out, with & " ' < > as HTML entities
+ */
+void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
+
+/*
+ * URL encoding: bytes outside the RFC 3986 set (other than '%') become %xx
+ */
+void outputURL(std::ostream & out, const std::string & s);
+
+} // namespace pdf2htmlEX
+
+#endif //ENCODING_H__
--- a/src/util/unicode.cc
+++ b/src/util/unicode.cc
@ -5,6 +5,8 @@
 * 2012.11.29
 */

+#include <iostream>
+
 #include <GlobalParams.h>

 #include "unicode.h"
@ -84,74 +86,4 @@ Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
    return unicode_from_font(code, font);
 }

-/*
- * Copied from UTF.h / UTF8.h in poppler
- */
-static int mapUTF8(Unicode u, char *buf, int bufSize) {
-  if        (u <= 0x0000007f) {
-    if (bufSize < 1) {
-      return 0;
-    }
-    buf[0] = (char)u;
-    return 1;
-  } else if (u <= 0x000007ff) {
-    if (bufSize < 2) {
-      return 0;
-    }
-    buf[0] = (char)(0xc0 + (u >> 6));
-    buf[1] = (char)(0x80 + (u & 0x3f));
-    return 2;
-  } else if (u <= 0x0000ffff) {
-    if (bufSize < 3) {
-      return 0;
-    }
-    buf[0] = (char)(0xe0 + (u >> 12));
-    buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
-    buf[2] = (char)(0x80 + (u & 0x3f));
-    return 3;
-  } else if (u <= 0x0010ffff) {
-    if (bufSize < 4) {
-      return 0;
-    }
-    buf[0] = (char)(0xf0 + (u >> 18));
-    buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
-    buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
-    buf[3] = (char)(0x80 + (u & 0x3f));
-    return 4;
-  } else {
-    return 0;
-  }
-}
-
-void outputUnicodes(ostream & out, const Unicode * u, int uLen)
-{
-    for(int i = 0; i < uLen; ++i)
-    {
-        switch(u[i])
-        {
-            case '&':
-                out << "&amp;";
-                break;
-            case '\"':
-                out << "&quot;";
-                break;
-            case '\'':
-                out << "&apos;";
-                break;
-            case '<':
-                out << "&lt;";
-                break;
-            case '>':
-                out << "&gt;";
-                break;
-            default:
-                {
-                    char buf[4];
-                    auto n = mapUTF8(u[i], buf, 4);
-                    out.write(buf, n);
-                }
-        }
-    }
-}
-
 } //namespace pdf2htmlEX
--- a/src/util/unicode.h
+++ b/src/util/unicode.h
@ -8,8 +8,6 @@
 #ifndef UNICODE_H__
 #define UNICODE_H__

-#include <iostream>
-
 #include <GfxFont.h>
 #include <CharTypes.h>

@ -33,11 +31,6 @@ Unicode unicode_from_font (CharCode code, GfxFont * font);
 */
 Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);

-/*
- * Escape necessary characters, and map Unicode to UTF-8
- */
-void outputUnicodes(std::ostream & out, const Unicode * u, int uLen);
-

 } // namespace pdf2htmlEX