1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00

merge README

This commit is contained in:
Lu Wang 2012-08-07 10:06:48 +08:00
commit 37f2082817
6 changed files with 107 additions and 97 deletions

View File

@ -17,7 +17,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h) add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h src/Consts.h)
target_link_libraries(pdftohtmlEX poppler boost_program_options) target_link_libraries(pdftohtmlEX poppler boost_program_options)

View File

@ -6,13 +6,9 @@ pdf2html**EX**
Introduction Introduction
----------------------------- -----------------------------
Traditional pdf -> html conversion tools are more likely pdf -> text tools. pdf2htmlEX renders PDF files in HTML, utilizing modern technologies of html/css, aims to provide an accuracy rendering, while keeping optimized for Web display.
For those who are not satisfied with them, this might be the right one for you. pdf2htmlEX is optimized for recent versions of modern web browsers such as Mozilla Firefox & Google Chrome.
pdf2htmlEX utilizes latest technologies of html/css, aims to provide an accuracy rendering, while keeping optimized for Web display.
pdf2htmlEX is optimized for recent versions of moderm web browsers such as Mozilla Firefox & Google Chrome.
Features Features
---------------------------- ----------------------------

View File

@ -14,7 +14,7 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y,
CharCode code, int nBytes, Unicode *u, int uLen) CharCode code, int nBytes, Unicode *u, int uLen)
{ {
auto font = state->getFont(); auto font = state->getFont();
if((font->getType() == fontType3) || (font->getWMode())) // if((font->getType() == fontType3) || (font->getWMode()))
{ {
SplashOutputDev::drawChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); SplashOutputDev::drawChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen);
} }

79
src/Consts.h Normal file
View File

@ -0,0 +1,79 @@
/*
* Constants
*
* by WangLu
* 2012.08.07
*/
#ifndef CONSTS_H__
#define CONSTS_H__
#include <string>
#include <map>
/*
 * HTML_HEAD
 *
 * Fixed opening part of the generated HTML document, up to and including
 * the opening <div id="pdf-main"> tag (the div is left open here and is
 * closed by HTML_TAIL).
 *
 * Contents, in order:
 *  - doctype, <html>, <head>, utf-8 charset declaration
 *  - inline CSS for the #pdf-main container and the .p / .l / .w classes
 *    (.p elements start with display:none), plus selection colours
 *  - a link to the external stylesheet "all.css"
 *  - show_pages(): run from <body onload>, walks the child nodes of
 *    #pdf-main and sets display='block' on one node every 100 ms
 *
 * The literal is built with backslash-newline continuations, so the line
 * breaks below are NOT part of the string; only the explicit "\n" after
 * the doctype is. Do not put anything after a trailing backslash.
 *
 * The pointer itself is const so that the variable has internal linkage:
 * this definition lives in a header, and a plain `const char *` would be
 * an external definition repeated in every file that includes it.
 */
const char * const HTML_HEAD = "<!DOCTYPE html>\n\
<html><head>\
<meta charset=\"utf-8\">\
<style type=\"text/css\">\
#pdf-main {\
font-family: sans-serif;\
position:absolute;\
top:0;\
left:0;\
bottom:0;\
right:0;\
overflow:auto;\
background-color:grey;\
}\
#pdf-main > .p {\
position:relative;\
margin:13px auto;\
background-color:white;\
overflow:hidden;\
display:none;\
}\
.p > .l {\
position:absolute; \
white-space:pre;\
}\
.l > .w {\
display:inline-block;\
}\
::selection{\
background: rgba(168,209,255,0.5);\
}\
::-moz-selection{\
background: rgba(168,209,255,0.5);\
}\
</style><link rel=\"stylesheet\" type=\"text/css\" href=\"all.css\" />\
<script type=\"text/javascript\">\
function show_pages()\
{\
var pages = document.getElementById('pdf-main').childNodes;\
var idx = 0;\
var f = function(){\
if (idx < pages.length) {\
try{\
pages[idx].style.display='block';\
}catch(e){}\
++idx;\
setTimeout(f,100);\
}\
};\
f();\
};\
</script>\
</head><body onload=\"show_pages();\"><div id=\"pdf-main\">";
// Closing part of the generated HTML document: closes the <div>, <body>
// and <html> elements. The pointer is const so this header-level
// definition has internal linkage and can be included from several files.
const char * const HTML_TAIL = "</div></body></html>";
// Maps a font name ("Courier", "Helvetica", "Times", "Symbol",
// "ZapfDingbats") to the CSS font-family list used for it: the name
// itself first, followed by fallback families.
// Types are fully qualified so the header does not rely on the including
// file having brought std::string into scope.
const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP = {
    { "Courier", "Courier,monospace" },
    { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    { "Times", "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol", "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
};
// Six-element matrix {1, 0, 0, 1, 0, 0}; copied into draw_ctm to reset it
// at the start of a page.
const double id_matrix[6] = {
    1.0, 0.0,
    0.0, 1.0,
    0.0, 0.0
};
#endif //CONSTS_H__

View File

@ -25,6 +25,7 @@
#include "HTMLRenderer.h" #include "HTMLRenderer.h"
#include "BackgroundRenderer.h" #include "BackgroundRenderer.h"
#include "Consts.h"
/* /*
* CSS classes * CSS classes
@ -41,74 +42,9 @@
* w<hex> - White space * w<hex> - White space
* t<hex> - Transform matrix * t<hex> - Transform matrix
* c<hex> - Color * c<hex> - Color
*
*/ */
const char * HTML_HEAD = "<!DOCTYPE html>\n\
<html><head>\
<meta charset=\"utf-8\">\
<style type=\"text/css\">\
#pdf-main {\
font-family: sans-serif;\
position:absolute;\
top:0;\
left:0;\
bottom:0;\
right:0;\
overflow:auto;\
background-color:grey;\
}\
#pdf-main > .p {\
position:relative;\
margin:13px auto;\
background-color:white;\
overflow:hidden;\
display:none;\
}\
.p > .l {\
position:absolute; \
white-space:pre;\
}\
.l > .w {\
display:inline-block;\
}\
::selection{\
background: rgba(168,209,255,0.5);\
}\
::-moz-selection{\
background: rgba(168,209,255,0.5);\
}\
</style><link rel=\"stylesheet\" type=\"text/css\" href=\"all.css\" />\
<script type=\"text/javascript\">\
function show_pages()\
{\
var pages = document.getElementById('pdf-main').childNodes;\
var idx = 0;\
var f = function(){\
if (idx < pages.length) {\
try{\
pages[idx].style.display='block';\
}catch(e){}\
++idx;\
setTimeout(f,100);\
}\
};\
f();\
};\
</script>\
</head><body onload=\"show_pages();\"><div id=\"pdf-main\">";
const char * HTML_TAIL = "</div></body></html>";
const std::map<string, string> BASE_14_FONT_CSS_FONT_MAP({\
{ "Courier", "Courier,monospace" },\
{ "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },\
{ "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" },\
{ "Symbol", "Symbol" },\
{ "ZapfDingbats", "ZapfDingbats" },\
});
const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
TextString::TextString(GfxState *state) TextString::TextString(GfxState *state)
:unicodes() :unicodes()
,x(state->getCurX()) ,x(state->getCurX())
@ -242,7 +178,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0; cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0;
cur_line_x_offset = 0; cur_line_x_offset = 0;
cur_line_y = 0; cur_tx = cur_ty = 0;
cur_font_size = 0; cur_font_size = 0;
memcpy(draw_ctm, id_matrix, sizeof(draw_ctm)); memcpy(draw_ctm, id_matrix, sizeof(draw_ctm));
@ -370,6 +306,7 @@ void HTMLRenderer::beginString(GfxState *state, GooString *s) {
void HTMLRenderer::endString(GfxState *state) { void HTMLRenderer::endString(GfxState *state) {
if (cur_string->getSize() == 0) { if (cur_string->getSize() == 0) {
delete cur_string ; delete cur_string ;
cur_string = nullptr;
return; return;
} }
@ -378,8 +315,9 @@ void HTMLRenderer::endString(GfxState *state) {
{ {
if(at_same_line(cur_line, cur_string)) if(at_same_line(cur_line, cur_string))
{ {
double x1 = cur_line->getX() + cur_line->getWidth(); // TODO: this is not correct
double x2 = cur_string->getX(); double x1 = cur_line->getState()->getLineX() + cur_line->getWidth();
double x2 = cur_string->getState()->getLineX();
double target = (x2-x1-cur_line_x_offset) * draw_scale; double target = (x2-x1-cur_line_x_offset) * draw_scale;
if(target > -param->h_eps) if(target > -param->h_eps)
@ -437,7 +375,8 @@ void HTMLRenderer::endString(GfxState *state) {
html_fout << "\""; html_fout << "\"";
double x,y; double x,y;
cur_state->transform(cur_state->getCurX(), cur_state->getCurY(), &x, &y); cur_state->transform(cur_state->getCurX(), cur_state->getCurY(), &x, &y);
html_fout << boost::format(" data-x=\"%1%\" data-y=\"%2%\" hs=\"%3%")%x%y%(cur_state->getHorizScaling()); html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-scale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%")
%x%y%(cur_state->getHorizScaling())%draw_scale%cur_state->getLineX()%cur_state->getLineY();
} }
html_fout << "\">"; html_fout << "\">";
@ -462,17 +401,6 @@ void HTMLRenderer::drawChar(GfxState *state, double x, double y,
cur_string->addUnicodes(state, x, y, dx, dy, u, uLen); cur_string->addUnicodes(state, x, y, dx, dy, u, uLen);
else else
{ {
if(nBytes > 0)
{
std::cerr << "Cannot map to Unicode!" << std::endl;
std::cerr << cur_fn_id << std::endl;
std::cerr << "*";
for(int i = 0; i < nBytes; ++i)
{
std::cerr << (int)(((char*)&code)[i]);
}
std::cerr << std::endl;
}
cur_string->addChars(state, x, y, dx, dy, code, nBytes); cur_string->addChars(state, x, y, dx, dy, code, nBytes);
} }
} }
@ -480,11 +408,16 @@ void HTMLRenderer::drawChar(GfxState *state, double x, double y,
// TODO // TODO
void HTMLRenderer::drawString(GfxState * state, GooString * s) void HTMLRenderer::drawString(GfxState * state, GooString * s)
{ {
check_state_change(state);
auto font = state->getFont(); auto font = state->getFont();
if(font->getWMode()) if(font->getWMode())
std::cerr << "TODO: writing mode" << std::endl; {
//TODO
return;
}
// stolen from poppler // from poppler
double dx = 0; double dx = 0;
double dy = 0; double dy = 0;
double dx2, dy2; double dx2, dy2;
@ -992,12 +925,11 @@ void HTMLRenderer::check_state_change(GfxState * state)
{ {
if(pos_changed) if(pos_changed)
{ {
if(!_equal(state->getLineY(), cur_line_y)) if(!_equal(state->getLineY(), cur_ty))
{ {
close_cur_line(); close_cur_line();
cur_line_y = state->getLineY(); cur_ty = state->getLineY();
} }
} }
if(color_changed) if(color_changed)
@ -1012,7 +944,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
} }
} }
bool need_rescale_font = true; bool need_rescale_font = false;
if(font_changed) if(font_changed)
{ {
long long new_fn_id = install_font(state->getFont()); long long new_fn_id = install_font(state->getFont());
@ -1022,6 +954,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
close_cur_line(); close_cur_line();
cur_fn_id = new_fn_id; cur_fn_id = new_fn_id;
} }
if(!_equal(cur_font_size, state->getFontSize())) if(!_equal(cur_font_size, state->getFontSize()))
{ {
cur_font_size = state->getFontSize(); cur_font_size = state->getFontSize();
@ -1043,7 +976,9 @@ void HTMLRenderer::check_state_change(GfxState * state)
new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3]; new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3];
new_ctm[4] = new_ctm[5] = 0; new_ctm[4] = new_ctm[5] = 0;
if(!_tm_equal(new_ctm, draw_ctm, 4)) // TODO: this is not correct
// what to check?
if(!_tm_equal(new_ctm, draw_ctm, 4)) { }
{ {
need_rescale_font = true; need_rescale_font = true;
} }

View File

@ -199,7 +199,7 @@ class HTMLRenderer : public OutputDev
double cur_line_x_offset; double cur_line_x_offset;
// current position // current position
double cur_line_y; double cur_tx, cur_ty; // in text coords
bool pos_changed; bool pos_changed;
long long cur_fn_id; long long cur_fn_id;