mirror of
https://github.com/pdf2htmlEX/pdf2htmlEX.git
synced 2024-07-03 00:35:40 +00:00
add HTMLTextPage for further optimization
This commit is contained in:
parent
eb26833dd1
commit
68791299e5
|
@ -181,15 +181,17 @@ add_executable(pdf2htmlEX
|
||||||
src/util/path.cc
|
src/util/path.cc
|
||||||
src/util/unicode.h
|
src/util/unicode.h
|
||||||
src/util/unicode.cc
|
src/util/unicode.cc
|
||||||
src/HTMLState.h
|
|
||||||
src/ArgParser.h
|
src/ArgParser.h
|
||||||
src/ArgParser.cc
|
src/ArgParser.cc
|
||||||
src/Base64Stream.h
|
src/Base64Stream.h
|
||||||
src/Base64Stream.cc
|
src/Base64Stream.cc
|
||||||
src/Color.h
|
src/Color.h
|
||||||
src/Color.cc
|
src/Color.cc
|
||||||
|
src/HTMLState.h
|
||||||
src/HTMLTextLine.h
|
src/HTMLTextLine.h
|
||||||
src/HTMLTextLine.cc
|
src/HTMLTextLine.cc
|
||||||
|
src/HTMLTextPage.h
|
||||||
|
src/HTMLTextPage.cc
|
||||||
src/Preprocessor.h
|
src/Preprocessor.h
|
||||||
src/Preprocessor.cc
|
src/Preprocessor.cc
|
||||||
src/StringFormatter.h
|
src/StringFormatter.h
|
||||||
|
|
|
@ -10,7 +10,6 @@
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
#include <OutputDev.h>
|
#include <OutputDev.h>
|
||||||
#include <GfxState.h>
|
#include <GfxState.h>
|
||||||
|
@ -27,7 +26,7 @@
|
||||||
#include "TmpFiles.h"
|
#include "TmpFiles.h"
|
||||||
#include "Color.h"
|
#include "Color.h"
|
||||||
#include "StateManager.h"
|
#include "StateManager.h"
|
||||||
#include "HTMLTextLine.h"
|
#include "HTMLTextPage.h"
|
||||||
|
|
||||||
#include "util/const.h"
|
#include "util/const.h"
|
||||||
#include "util/misc.h"
|
#include "util/misc.h"
|
||||||
|
@ -242,17 +241,11 @@ protected:
|
||||||
double print_scale (void) const { return 96.0 / DEFAULT_DPI / text_zoom_factor(); }
|
double print_scale (void) const { return 96.0 / DEFAULT_DPI / text_zoom_factor(); }
|
||||||
|
|
||||||
|
|
||||||
|
const Param & param;
|
||||||
|
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
// PDF states
|
// PDF states
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
bool line_opened;
|
|
||||||
enum NewLineState
|
|
||||||
{
|
|
||||||
NLS_NONE, // stay with the same style
|
|
||||||
NLS_SPAN, // open a new <span> if possible, otherwise a new <div>
|
|
||||||
NLS_DIV // has to open a new <div>
|
|
||||||
} new_line_state;
|
|
||||||
|
|
||||||
// track the original (unscaled) values to determine scaling and merge lines
|
// track the original (unscaled) values to determine scaling and merge lines
|
||||||
// current position
|
// current position
|
||||||
double cur_tx, cur_ty; // real text position, in text coords
|
double cur_tx, cur_ty; // real text position, in text coords
|
||||||
|
@ -290,8 +283,18 @@ protected:
|
||||||
// also keep in mind that they are not the final position, as they will be transform by CTM (also true for cur_tx/ty)
|
// also keep in mind that they are not the final position, as they will be transform by CTM (also true for cur_tx/ty)
|
||||||
double draw_tx, draw_ty;
|
double draw_tx, draw_ty;
|
||||||
|
|
||||||
// some metrics have to be determined after all elements in the lines have been seen
|
// managers store values actually used in HTML (i.e. scaled)
|
||||||
std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
|
AllStateManater all_manager;
|
||||||
|
|
||||||
|
enum NewLineState
|
||||||
|
{
|
||||||
|
NLS_NONE, // stay with the same style
|
||||||
|
NLS_SPAN, // open a new <span> if possible, otherwise a new <div>
|
||||||
|
NLS_DIV // has to open a new <div>
|
||||||
|
} new_line_state;
|
||||||
|
|
||||||
|
|
||||||
|
HTMLTextPage html_text_page;
|
||||||
|
|
||||||
// for font reencoding
|
// for font reencoding
|
||||||
int32_t * cur_mapping;
|
int32_t * cur_mapping;
|
||||||
|
@ -311,11 +314,6 @@ protected:
|
||||||
HTMLState cur_html_state;
|
HTMLState cur_html_state;
|
||||||
std::unordered_map<long long, FontInfo> font_info_map;
|
std::unordered_map<long long, FontInfo> font_info_map;
|
||||||
|
|
||||||
// managers store values actually used in HTML (i.e. scaled)
|
|
||||||
AllStateManater all_manager;
|
|
||||||
|
|
||||||
const Param & param;
|
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
std::ofstream fs;
|
std::ofstream fs;
|
||||||
std::string path;
|
std::string path;
|
||||||
|
|
|
@ -40,10 +40,10 @@ using std::endl;
|
||||||
|
|
||||||
HTMLRenderer::HTMLRenderer(const Param & param)
|
HTMLRenderer::HTMLRenderer(const Param & param)
|
||||||
:OutputDev()
|
:OutputDev()
|
||||||
,line_opened(false)
|
,param(param)
|
||||||
|
,html_text_page(param, all_manager)
|
||||||
,preprocessor(param)
|
,preprocessor(param)
|
||||||
,tmp_files(param)
|
,tmp_files(param)
|
||||||
,param(param)
|
|
||||||
{
|
{
|
||||||
if(!(param.debug))
|
if(!(param.debug))
|
||||||
{
|
{
|
||||||
|
@ -51,7 +51,6 @@ HTMLRenderer::HTMLRenderer(const Param & param)
|
||||||
globalParams->setErrQuiet(gTrue);
|
globalParams->setErrQuiet(gTrue);
|
||||||
}
|
}
|
||||||
|
|
||||||
text_lines.emplace_back(new HTMLTextLine(param, all_manager));
|
|
||||||
ffw_init(param.debug);
|
ffw_init(param.debug);
|
||||||
cur_mapping = new int32_t [0x10000];
|
cur_mapping = new int32_t [0x10000];
|
||||||
cur_mapping2 = new char* [0x100];
|
cur_mapping2 = new char* [0x100];
|
||||||
|
@ -169,8 +168,6 @@ void HTMLRenderer::startPage(int pageNum, GfxState *state)
|
||||||
|
|
||||||
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
|
||||||
{
|
{
|
||||||
assert((!line_opened) && "Open line in startPage detected!");
|
|
||||||
|
|
||||||
this->pageNum = pageNum;
|
this->pageNum = pageNum;
|
||||||
|
|
||||||
long long wid = all_manager.width.install(state->getPageWidth());
|
long long wid = all_manager.width.install(state->getPageWidth());
|
||||||
|
@ -213,8 +210,9 @@ void HTMLRenderer::endPage() {
|
||||||
close_text_line();
|
close_text_line();
|
||||||
|
|
||||||
// dump all text
|
// dump all text
|
||||||
for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
|
html_text_page.dump_text(f_pages.fs);
|
||||||
(*iter)->flush(f_pages.fs);
|
html_text_page.dump_css(f_css.fs);
|
||||||
|
html_text_page.clear();
|
||||||
|
|
||||||
// process links before the page is closed
|
// process links before the page is closed
|
||||||
cur_doc->processLinks(this, pageNum);
|
cur_doc->processLinks(this, pageNum);
|
||||||
|
|
|
@ -335,7 +335,7 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||||
|
|
||||||
if(merged)
|
if(merged)
|
||||||
{
|
{
|
||||||
text_lines.back()->append_offset(dx * old_draw_text_scale);
|
html_text_page.append_offset(dx * old_draw_text_scale);
|
||||||
if(equal(dy, 0))
|
if(equal(dy, 0))
|
||||||
{
|
{
|
||||||
cur_html_state.vertical_align = 0;
|
cur_html_state.vertical_align = 0;
|
||||||
|
@ -434,11 +434,6 @@ void HTMLRenderer::check_state_change(GfxState * state)
|
||||||
|
|
||||||
void HTMLRenderer::prepare_text_line(GfxState * state)
|
void HTMLRenderer::prepare_text_line(GfxState * state)
|
||||||
{
|
{
|
||||||
if(!line_opened)
|
|
||||||
{
|
|
||||||
new_line_state = NLS_DIV;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(new_line_state == NLS_DIV)
|
if(new_line_state == NLS_DIV)
|
||||||
{
|
{
|
||||||
close_text_line();
|
close_text_line();
|
||||||
|
@ -458,26 +453,20 @@ void HTMLRenderer::prepare_text_line(GfxState * state)
|
||||||
double target = (cur_tx - draw_tx) * draw_text_scale;
|
double target = (cur_tx - draw_tx) * draw_text_scale;
|
||||||
if(!equal(target, 0))
|
if(!equal(target, 0))
|
||||||
{
|
{
|
||||||
text_lines.back()->append_offset(target);
|
html_text_page.append_offset(target);
|
||||||
draw_tx += target / draw_text_scale;
|
draw_tx += target / draw_text_scale;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(new_line_state != NLS_NONE)
|
if(new_line_state != NLS_NONE)
|
||||||
{
|
{
|
||||||
text_lines.back()->append_state(cur_html_state);
|
html_text_page.append_state(cur_html_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
line_opened = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLRenderer::close_text_line()
|
void HTMLRenderer::close_text_line()
|
||||||
{
|
{
|
||||||
if(line_opened)
|
html_text_page.open_new_line();
|
||||||
{
|
|
||||||
line_opened = false;
|
|
||||||
text_lines.emplace_back(new HTMLTextLine(param, all_manager));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} //namespace pdf2htmlEX
|
} //namespace pdf2htmlEX
|
||||||
|
|
|
@ -90,13 +90,13 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||||
if(is_space && (param.space_as_offset))
|
if(is_space && (param.space_as_offset))
|
||||||
{
|
{
|
||||||
// ignore horiz_scaling, as it's merged in CTM
|
// ignore horiz_scaling, as it's merged in CTM
|
||||||
text_lines.back()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
|
html_text_page.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
|
if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
|
||||||
{
|
{
|
||||||
text_lines.back()->append_unicodes(u, uLen);
|
html_text_page.append_unicodes(u, uLen);
|
||||||
// TODO: decomposed characters may be not with the same width as the original ligature, need to fix it.
|
// TODO: decomposed characters may be not with the same width as the original ligature, need to fix it.
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -110,14 +110,14 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
|
||||||
{
|
{
|
||||||
uu = unicode_from_font(code, font);
|
uu = unicode_from_font(code, font);
|
||||||
}
|
}
|
||||||
text_lines.back()->append_unicodes(&uu, 1);
|
html_text_page.append_unicodes(&uu, 1);
|
||||||
/*
|
/*
|
||||||
* In PDF, word_space is appended if (n == 1 and *p = ' ')
|
* In PDF, word_space is appended if (n == 1 and *p = ' ')
|
||||||
* but in HTML, word_space is appended if (uu == ' ')
|
* but in HTML, word_space is appended if (uu == ' ')
|
||||||
*/
|
*/
|
||||||
int space_count = (is_space ? 1 : 0) - (uu == ' ' ? 1 : 0);
|
int space_count = (is_space ? 1 : 0) - (uu == ' ' ? 1 : 0);
|
||||||
if(space_count != 0)
|
if(space_count != 0)
|
||||||
text_lines.back()->append_offset(cur_word_space * draw_text_scale * space_count);
|
html_text_page.append_offset(cur_word_space * draw_text_scale * space_count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,17 +6,13 @@
|
||||||
* Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
|
* Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
#include "HTMLTextLine.h"
|
#include "HTMLTextLine.h"
|
||||||
|
|
||||||
#include "util/namespace.h"
|
|
||||||
#include "util/unicode.h"
|
|
||||||
#include "util/math.h"
|
|
||||||
#include "util/css_const.h"
|
|
||||||
#include "util/encoding.h"
|
#include "util/encoding.h"
|
||||||
|
#include "util/css_const.h"
|
||||||
|
|
||||||
namespace pdf2htmlEX {
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
|
@ -29,6 +25,10 @@ using std::endl;
|
||||||
using std::find;
|
using std::find;
|
||||||
using std::abs;
|
using std::abs;
|
||||||
|
|
||||||
|
// Construct an empty line that keeps references to the conversion
// parameters and to the shared state managers.
HTMLTextLine::HTMLTextLine (const Param & param, AllStateManater & all_manager)
    : param(param)
    , all_manager(all_manager)
{
}
|
||||||
|
|
||||||
void HTMLTextLine::append_unicodes(const Unicode * u, int l)
|
void HTMLTextLine::append_unicodes(const Unicode * u, int l)
|
||||||
{
|
{
|
||||||
text.insert(text.end(), u, u+l);
|
text.insert(text.end(), u, u+l);
|
||||||
|
@ -59,18 +59,14 @@ void HTMLTextLine::append_state(const HTMLState & html_state)
|
||||||
(HTMLState&)(states.back()) = html_state;
|
(HTMLState&)(states.back()) = html_state;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTextLine::flush(ostream & out)
|
void HTMLTextLine::dump_text(ostream & out)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Each Line is an independent absolute positioned block
|
* Each Line is an independent absolute positioned block
|
||||||
* so even we have a few states or offsets, we may omit them
|
* so even we have a few states or offsets, we may omit them
|
||||||
*/
|
*/
|
||||||
if(text.empty())
|
if(text.empty())
|
||||||
{
|
|
||||||
states.clear();
|
|
||||||
offsets.clear();
|
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
// remove unuseful states in the end
|
// remove unuseful states in the end
|
||||||
while((!states.empty()) && (states.back().start_idx >= text.size()))
|
while((!states.empty()) && (states.back().start_idx >= text.size()))
|
||||||
|
@ -79,9 +75,6 @@ void HTMLTextLine::flush(ostream & out)
|
||||||
if(states.empty() || (states[0].start_idx != 0))
|
if(states.empty() || (states[0].start_idx != 0))
|
||||||
{
|
{
|
||||||
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
|
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
|
||||||
states.clear();
|
|
||||||
text.clear();
|
|
||||||
offsets.clear();
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -256,7 +249,10 @@ void HTMLTextLine::flush(ostream & out)
|
||||||
}
|
}
|
||||||
|
|
||||||
out << "</div>";
|
out << "</div>";
|
||||||
|
}
|
||||||
|
|
||||||
|
void HTMLTextLine::clear(void)
|
||||||
|
{
|
||||||
states.clear();
|
states.clear();
|
||||||
offsets.clear();
|
offsets.clear();
|
||||||
text.clear();
|
text.clear();
|
||||||
|
|
|
@ -27,8 +27,7 @@ namespace pdf2htmlEX {
|
||||||
class HTMLTextLine
|
class HTMLTextLine
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
HTMLTextLine (const Param & param, AllStateManater & all_manager)
|
HTMLTextLine (const Param & param, AllStateManater & all_manager);
|
||||||
: param(param), all_manager(all_manager) { }
|
|
||||||
|
|
||||||
struct State : public HTMLState {
|
struct State : public HTMLState {
|
||||||
// before output
|
// before output
|
||||||
|
@ -78,7 +77,10 @@ public:
|
||||||
void append_unicodes(const Unicode * u, int l);
|
void append_unicodes(const Unicode * u, int l);
|
||||||
void append_offset(double width);
|
void append_offset(double width);
|
||||||
void append_state(const HTMLState & html_state);
|
void append_state(const HTMLState & html_state);
|
||||||
void flush(std::ostream & out);
|
void dump_text(std::ostream & out);
|
||||||
|
|
||||||
|
bool empty(void) const { return text.empty(); }
|
||||||
|
void clear(void);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void optimize(void);
|
void optimize(void);
|
||||||
|
|
78
src/HTMLTextPage.cc
Normal file
78
src/HTMLTextPage.cc
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
/*
|
||||||
|
* HTMLTextPage.cc
|
||||||
|
*
|
||||||
|
* Generate and optimize HTML for one page
|
||||||
|
*
|
||||||
|
* Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "HTMLTextPage.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
|
using std::ostream;
|
||||||
|
|
||||||
|
// Build an empty page. The page only keeps references to the conversion
// parameters and the state managers; there is no current line yet
// (last_line starts out null).
HTMLTextPage::HTMLTextPage(const Param & param, AllStateManater & all_manager)
    : param(param), all_manager(all_manager), last_line(nullptr)
{
}
|
||||||
|
|
||||||
|
// Write every stored line to `out`, in the order the lines were created.
void HTMLTextPage::dump_text(ostream & out)
{
    for(const auto & line : text_lines)
    {
        line->dump_text(out);
    }
}
|
||||||
|
|
||||||
|
void HTMLTextPage::append_unicodes(const Unicode * u, int l)
|
||||||
|
{
|
||||||
|
if(!last_line)
|
||||||
|
open_new_line();
|
||||||
|
last_line->append_unicodes(u, l);
|
||||||
|
}
|
||||||
|
|
||||||
|
void HTMLTextPage::append_offset(double offset)
|
||||||
|
{
|
||||||
|
if(!last_line)
|
||||||
|
open_new_line();
|
||||||
|
last_line->append_offset(offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
void HTMLTextPage::append_state(const HTMLState & state)
|
||||||
|
{
|
||||||
|
if(!last_line)
|
||||||
|
open_new_line();
|
||||||
|
last_line->append_state(state);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Placeholder: CSS output is not implemented yet, so nothing is
// written to `out`.
void HTMLTextPage::dump_css(ostream & out)
{
    (void)out; // unused until the TODO below is done
    //TODO
}
|
||||||
|
|
||||||
|
void HTMLTextPage::clear(void)
|
||||||
|
{
|
||||||
|
text_lines.clear();
|
||||||
|
last_line = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void HTMLTextPage::open_new_line(void)
|
||||||
|
{
|
||||||
|
if(last_line && (last_line->empty()))
|
||||||
|
{
|
||||||
|
// state and offsets might be nonempty
|
||||||
|
last_line->clear();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
text_lines.emplace_back(new HTMLTextLine(param, all_manager));
|
||||||
|
last_line = text_lines.back().get();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Placeholder for page-level optimization; currently does nothing.
void HTMLTextPage::optimize(void)
{
    //TODO
}
|
||||||
|
|
||||||
|
} // namespace pdf2htmlEX
|
51
src/HTMLTextPage.h
Normal file
51
src/HTMLTextPage.h
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
/*
|
||||||
|
* Header file for HTMLTextPage
|
||||||
|
* Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef HTMLTEXTPAGE_H__
|
||||||
|
#define HTMLTEXTPAGE_H__
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <memory>
|
||||||
|
#include <ostream>
|
||||||
|
|
||||||
|
#include "Param.h"
|
||||||
|
#include "StateManager.h"
|
||||||
|
#include "HTMLTextLine.h"
|
||||||
|
#include "HTMLState.h"
|
||||||
|
|
||||||
|
namespace pdf2htmlEX {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Store and optimize a page of text in HTML
|
||||||
|
*
|
||||||
|
* contains a series of HTMLTextLine
|
||||||
|
*/
|
||||||
|
|
||||||
|
class HTMLTextPage
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
HTMLTextPage (const Param & param, AllStateManater & all_manager);
|
||||||
|
|
||||||
|
void append_unicodes(const Unicode * u, int l);
|
||||||
|
void append_offset(double offset);
|
||||||
|
void append_state(const HTMLState & state);
|
||||||
|
|
||||||
|
void dump_text(std::ostream & out);
|
||||||
|
void dump_css(std::ostream & out);
|
||||||
|
void clear(void);
|
||||||
|
|
||||||
|
void open_new_line(void);
|
||||||
|
|
||||||
|
private:
|
||||||
|
void optimize(void);
|
||||||
|
|
||||||
|
const Param & param;
|
||||||
|
AllStateManater & all_manager;
|
||||||
|
HTMLTextLine * last_line;
|
||||||
|
std::vector<std::unique_ptr<HTMLTextLine>> text_lines;
|
||||||
|
};
|
||||||
|
|
||||||
|
} //namespace pdf2htmlEX
|
||||||
|
#endif //HTMLTEXTPAGE_H__
|
Loading…
Reference in New Issue
Block a user