mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-12-22 04:50:09 +00:00
merge README
This commit is contained in:
commit
37f2082817
@ -17,7 +17,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
|
||||
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
|
||||
|
||||
add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h)
|
||||
add_executable(pdftohtmlEX src/pdftohtmlEX.cc src/HTMLRenderer.cc src/HTMLRenderer.h src/BackgroundRenderer.cc src/BackgroundRenderer.h src/Consts.h)
|
||||
target_link_libraries(pdftohtmlEX poppler boost_program_options)
|
||||
|
||||
|
||||
|
@ -6,13 +6,9 @@ pdf2html**EX**
|
||||
|
||||
Introduction
|
||||
-----------------------------
|
||||
Traditional pdf -> html conversion tools are more likely pdf -> text tools.
|
||||
pdf2htmlEX renders PDF files in HTML, utilizing modern HTML/CSS technologies, and aims to provide accurate rendering while remaining optimized for Web display.
|
||||
|
||||
For those who are not satisfied with them, this might be the right one for you.
|
||||
|
||||
pdf2htmlEX utilizes latest technologies of html/css, aims to provide an accuracy rendering, while keeping optimized for Web display.
|
||||
|
||||
pdf2htmlEX is optimized for recent versions of moderm web browsers such as Mozilla Firefox & Google Chrome.
|
||||
pdf2htmlEX is optimized for recent versions of modern web browsers such as Mozilla Firefox & Google Chrome.
|
||||
|
||||
Features
|
||||
----------------------------
|
||||
|
@ -14,7 +14,7 @@ void BackgroundRenderer::drawChar(GfxState *state, double x, double y,
|
||||
CharCode code, int nBytes, Unicode *u, int uLen)
|
||||
{
|
||||
auto font = state->getFont();
|
||||
if((font->getType() == fontType3) || (font->getWMode()))
|
||||
// if((font->getType() == fontType3) || (font->getWMode()))
|
||||
{
|
||||
SplashOutputDev::drawChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen);
|
||||
}
|
||||
|
79
src/Consts.h
Normal file
79
src/Consts.h
Normal file
@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Constants
|
||||
*
|
||||
* by WangLu
|
||||
* 2012.08.07
|
||||
*/
|
||||
|
||||
#ifndef CONSTS_H__
|
||||
#define CONSTS_H__
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
/*
 * Opening part of every generated HTML document: doctype, inline CSS for the
 * page container (#pdf-main), pages (.p), text lines (.l) and whitespace
 * spans (.w), the link to all.css, and the show_pages() script that reveals
 * the pages one by one (100 ms apart) after the document has loaded.
 *
 * The pointer itself is const, which gives the variable internal linkage;
 * a plain `const char *` defined in a header is an external definition and
 * breaks the link as soon as a second translation unit includes this file.
 *
 * The text is built from adjacent string literals, so the only newline in
 * the resulting string is the explicit "\n" after the doctype.
 */
const char * const HTML_HEAD =
    "<!DOCTYPE html>\n"
    "<html><head>"
    "<meta charset=\"utf-8\">"
    "<style type=\"text/css\">"
    "#pdf-main {"
    "font-family: sans-serif;"
    "position:absolute;"
    "top:0;"
    "left:0;"
    "bottom:0;"
    "right:0;"
    "overflow:auto;"
    "background-color:grey;"
    "}"
    "#pdf-main > .p {"
    "position:relative;"
    "margin:13px auto;"
    "background-color:white;"
    "overflow:hidden;"
    "display:none;"
    "}"
    ".p > .l {"
    "position:absolute; "
    "white-space:pre;"
    "}"
    ".l > .w {"
    "display:inline-block;"
    "}"
    "::selection{"
    "background: rgba(168,209,255,0.5);"
    "}"
    "::-moz-selection{"
    "background: rgba(168,209,255,0.5);"
    "}"
    "</style><link rel=\"stylesheet\" type=\"text/css\" href=\"all.css\" />"
    "<script type=\"text/javascript\">"
    "function show_pages()"
    "{"
    "var pages = document.getElementById('pdf-main').childNodes;"
    "var idx = 0;"
    "var f = function(){"
    "if (idx < pages.length) {"
    "try{"
    "pages[idx].style.display='block';"
    "}catch(e){}"
    "++idx;"
    "setTimeout(f,100);"
    "}"
    "};"
    "f();"
    "};"
    "</script>"
    "</head><body onload=\"show_pages();\"><div id=\"pdf-main\">";
|
||||
|
||||
// Closing markup written at the end of the generated document: it closes the
// <div>, <body> and <html> elements. The pointer is const so the variable has
// internal linkage and this header can be included from several translation
// units without duplicate-symbol link errors.
const char * const HTML_TAIL = "</div></body></html>";
|
||||
|
||||
// Maps a PDF base-14 font family name to the CSS font-family fallback list
// used for it. Key: font family name; value: comma-separated CSS families,
// with names containing spaces quoted.
//
// std::string is fully qualified because this header does not (and should
// not) rely on a using-declaration from whoever includes it.
const std::map<std::string, std::string> BASE_14_FONT_CSS_FONT_MAP{
    { "Courier", "Courier,monospace" },
    { "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },
    // "Times New Roman" is the actual family name; "Time New Roman" never matches.
    { "Times", "Times,\"Times New Roman\",\"Nimbus Roman No9 L\",serif" },
    { "Symbol", "Symbol,\"Standard Symbols L\"" },
    { "ZapfDingbats", "ZapfDingbats,\"Dingbats\"" },
};
|
||||
|
||||
// Six-element matrix {1, 0, 0, 1, 0, 0}; HTMLRenderer::startPage copies it
// into draw_ctm to reset the drawing transform. Presumably laid out in the
// PDF [a b c d e f] affine order (identity) -- confirm against draw_ctm users.
const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
|
||||
|
||||
#endif //CONSTS_H__
|
@ -25,6 +25,7 @@
|
||||
|
||||
#include "HTMLRenderer.h"
|
||||
#include "BackgroundRenderer.h"
|
||||
#include "Consts.h"
|
||||
|
||||
/*
|
||||
* CSS classes
|
||||
@ -41,74 +42,9 @@
|
||||
* w<hex> - White space
|
||||
* t<hex> - Transform matrix
|
||||
* c<hex> - Color
|
||||
*
|
||||
*/
|
||||
|
||||
const char * HTML_HEAD = "<!DOCTYPE html>\n\
|
||||
<html><head>\
|
||||
<meta charset=\"utf-8\">\
|
||||
<style type=\"text/css\">\
|
||||
#pdf-main {\
|
||||
font-family: sans-serif;\
|
||||
position:absolute;\
|
||||
top:0;\
|
||||
left:0;\
|
||||
bottom:0;\
|
||||
right:0;\
|
||||
overflow:auto;\
|
||||
background-color:grey;\
|
||||
}\
|
||||
#pdf-main > .p {\
|
||||
position:relative;\
|
||||
margin:13px auto;\
|
||||
background-color:white;\
|
||||
overflow:hidden;\
|
||||
display:none;\
|
||||
}\
|
||||
.p > .l {\
|
||||
position:absolute; \
|
||||
white-space:pre;\
|
||||
}\
|
||||
.l > .w {\
|
||||
display:inline-block;\
|
||||
}\
|
||||
::selection{\
|
||||
background: rgba(168,209,255,0.5);\
|
||||
}\
|
||||
::-moz-selection{\
|
||||
background: rgba(168,209,255,0.5);\
|
||||
}\
|
||||
</style><link rel=\"stylesheet\" type=\"text/css\" href=\"all.css\" />\
|
||||
<script type=\"text/javascript\">\
|
||||
function show_pages()\
|
||||
{\
|
||||
var pages = document.getElementById('pdf-main').childNodes;\
|
||||
var idx = 0;\
|
||||
var f = function(){\
|
||||
if (idx < pages.length) {\
|
||||
try{\
|
||||
pages[idx].style.display='block';\
|
||||
}catch(e){}\
|
||||
++idx;\
|
||||
setTimeout(f,100);\
|
||||
}\
|
||||
};\
|
||||
f();\
|
||||
};\
|
||||
</script>\
|
||||
</head><body onload=\"show_pages();\"><div id=\"pdf-main\">";
|
||||
|
||||
const char * HTML_TAIL = "</div></body></html>";
|
||||
|
||||
const std::map<string, string> BASE_14_FONT_CSS_FONT_MAP({\
|
||||
{ "Courier", "Courier,monospace" },\
|
||||
{ "Helvetica", "Helvetica,Arial,\"Nimbus Sans L\",sans-serif" },\
|
||||
{ "Times", "Times,\"Time New Roman\",\"Nimbus Roman No9 L\",serif" },\
|
||||
{ "Symbol", "Symbol" },\
|
||||
{ "ZapfDingbats", "ZapfDingbats" },\
|
||||
});
|
||||
|
||||
const double id_matrix[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
|
||||
|
||||
TextString::TextString(GfxState *state)
|
||||
:unicodes()
|
||||
,x(state->getCurX())
|
||||
@ -242,7 +178,7 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||
|
||||
cur_fn_id = cur_fs_id = cur_tm_id = cur_color_id = 0;
|
||||
cur_line_x_offset = 0;
|
||||
cur_line_y = 0;
|
||||
cur_tx = cur_ty = 0;
|
||||
cur_font_size = 0;
|
||||
|
||||
memcpy(draw_ctm, id_matrix, sizeof(draw_ctm));
|
||||
@ -370,6 +306,7 @@ void HTMLRenderer::beginString(GfxState *state, GooString *s) {
|
||||
void HTMLRenderer::endString(GfxState *state) {
|
||||
if (cur_string->getSize() == 0) {
|
||||
delete cur_string ;
|
||||
cur_string = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -378,8 +315,9 @@ void HTMLRenderer::endString(GfxState *state) {
|
||||
{
|
||||
if(at_same_line(cur_line, cur_string))
|
||||
{
|
||||
double x1 = cur_line->getX() + cur_line->getWidth();
|
||||
double x2 = cur_string->getX();
|
||||
// TODO: this is not correct
|
||||
double x1 = cur_line->getState()->getLineX() + cur_line->getWidth();
|
||||
double x2 = cur_string->getState()->getLineX();
|
||||
double target = (x2-x1-cur_line_x_offset) * draw_scale;
|
||||
|
||||
if(target > -param->h_eps)
|
||||
@ -437,7 +375,8 @@ void HTMLRenderer::endString(GfxState *state) {
|
||||
html_fout << "\"";
|
||||
double x,y;
|
||||
cur_state->transform(cur_state->getCurX(), cur_state->getCurY(), &x, &y);
|
||||
html_fout << boost::format(" data-x=\"%1%\" data-y=\"%2%\" hs=\"%3%")%x%y%(cur_state->getHorizScaling());
|
||||
html_fout << boost::format("data-lx=\"%5%\" data-ly=\"%6%\" data-scale=\"%4%\" data-x=\"%1%\" data-y=\"%2%\" data-hs=\"%3%")
|
||||
%x%y%(cur_state->getHorizScaling())%draw_scale%cur_state->getLineX()%cur_state->getLineY();
|
||||
}
|
||||
|
||||
html_fout << "\">";
|
||||
@ -462,17 +401,6 @@ void HTMLRenderer::drawChar(GfxState *state, double x, double y,
|
||||
cur_string->addUnicodes(state, x, y, dx, dy, u, uLen);
|
||||
else
|
||||
{
|
||||
if(nBytes > 0)
|
||||
{
|
||||
std::cerr << "Cannot map to Unicode!" << std::endl;
|
||||
std::cerr << cur_fn_id << std::endl;
|
||||
std::cerr << "*";
|
||||
for(int i = 0; i < nBytes; ++i)
|
||||
{
|
||||
std::cerr << (int)(((char*)&code)[i]);
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
cur_string->addChars(state, x, y, dx, dy, code, nBytes);
|
||||
}
|
||||
}
|
||||
@ -480,11 +408,16 @@ void HTMLRenderer::drawChar(GfxState *state, double x, double y,
|
||||
// TODO
|
||||
void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||
{
|
||||
check_state_change(state);
|
||||
|
||||
auto font = state->getFont();
|
||||
if(font->getWMode())
|
||||
std::cerr << "TODO: writing mode" << std::endl;
|
||||
{
|
||||
//TODO
|
||||
return;
|
||||
}
|
||||
|
||||
// stolen from poppler
|
||||
// from poppler
|
||||
double dx = 0;
|
||||
double dy = 0;
|
||||
double dx2, dy2;
|
||||
@ -992,12 +925,11 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
{
|
||||
if(pos_changed)
|
||||
{
|
||||
if(!_equal(state->getLineY(), cur_line_y))
|
||||
if(!_equal(state->getLineY(), cur_ty))
|
||||
{
|
||||
close_cur_line();
|
||||
cur_line_y = state->getLineY();
|
||||
cur_ty = state->getLineY();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(color_changed)
|
||||
@ -1012,7 +944,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
}
|
||||
}
|
||||
|
||||
bool need_rescale_font = true;
|
||||
bool need_rescale_font = false;
|
||||
if(font_changed)
|
||||
{
|
||||
long long new_fn_id = install_font(state->getFont());
|
||||
@ -1022,6 +954,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
close_cur_line();
|
||||
cur_fn_id = new_fn_id;
|
||||
}
|
||||
|
||||
if(!_equal(cur_font_size, state->getFontSize()))
|
||||
{
|
||||
cur_font_size = state->getFontSize();
|
||||
@ -1043,7 +976,9 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||
new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3];
|
||||
new_ctm[4] = new_ctm[5] = 0;
|
||||
|
||||
if(!_tm_equal(new_ctm, draw_ctm, 4))
|
||||
// TODO: this is not correct
|
||||
// what to check?
|
||||
if(!_tm_equal(new_ctm, draw_ctm, 4)) { }
|
||||
{
|
||||
need_rescale_font = true;
|
||||
}
|
||||
|
@ -199,7 +199,7 @@ class HTMLRenderer : public OutputDev
|
||||
double cur_line_x_offset;
|
||||
|
||||
// current position
|
||||
double cur_line_y;
|
||||
double cur_tx, cur_ty; // in text coords
|
||||
bool pos_changed;
|
||||
|
||||
long long cur_fn_id;
|
||||
|
Loading…
Reference in New Issue
Block a user