1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 13:00:08 +00:00
pdf2htmlEX/src/HTMLTextLine.cc

737 lines
24 KiB
C++
Raw Permalink Normal View History

2012-09-04 15:33:15 +00:00
/*
2013-04-06 15:51:33 +00:00
* HTMLTextLine.cc
2012-09-04 15:33:15 +00:00
*
* Generate and optimize HTML for one line
*
2013-02-05 06:36:36 +00:00
* Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
2012-09-04 15:33:15 +00:00
*/
2013-03-21 04:18:26 +00:00
#include <cmath>
2013-03-20 15:46:58 +00:00
#include <algorithm>
2012-09-05 07:13:21 +00:00
2013-04-06 15:51:33 +00:00
#include "HTMLTextLine.h"
2013-04-06 08:45:01 +00:00
2013-02-15 05:07:00 +00:00
#include "util/encoding.h"
#include "util/css_const.h"
2012-09-04 15:33:15 +00:00
2012-09-12 15:26:14 +00:00
namespace pdf2htmlEX {
2012-09-04 15:33:15 +00:00
using std::min;
using std::max;
2012-09-05 07:13:21 +00:00
using std::vector;
2012-09-10 17:53:33 +00:00
using std::ostream;
2012-11-29 10:28:07 +00:00
using std::cerr;
using std::endl;
2013-03-20 15:46:58 +00:00
using std::find;
2013-03-21 04:18:26 +00:00
using std::abs;
2012-09-04 15:33:15 +00:00
/*
 * Construct an empty text line.
 *
 * The three arguments initialise the members of the same name; the clip
 * origin (clip_x1, clip_y1) and the accumulated width start at zero.
 */
HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager)
    : param(param)
    , all_manager(all_manager)
    , line_state(line_state)
    , clip_x1(0)
    , clip_y1(0)
    , width(0)
{
}
2013-12-22 08:59:59 +00:00
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
2012-09-04 15:33:15 +00:00
{
if (l == 1)
text.push_back(min(u[0], (unsigned)INT_MAX));
else if (l > 1)
{
text.push_back(- decomposed_text.size() - 1);
decomposed_text.emplace_back();
decomposed_text.back().assign(u, u + l);
}
2013-12-22 08:59:59 +00:00
this->width += width;
2012-09-04 15:33:15 +00:00
}
2013-04-06 15:51:33 +00:00
void HTMLTextLine::append_offset(double width)
2012-09-04 15:33:15 +00:00
{
2013-04-03 17:35:44 +00:00
/*
* If the last offset is very thin, we can ignore it and directly use it
* But this should not happen often, and we will also filter near-zero offsets when outputting them
* So don't check it.
*
* Offset must be appended immediately after the last real (non-padding) char, or the text optimizing
* algorithm may be confused: it may wrongly convert offsets at the beginning of a line to word-space.
2013-04-03 17:35:44 +00:00
*/
auto offset_idx = text.size();
while (offset_idx > 0 && text[offset_idx - 1] == 0)
--offset_idx;
if((!offsets.empty()) && (offsets.back().start_idx == offset_idx))
2012-09-04 15:33:15 +00:00
offsets.back().width += width;
else
offsets.emplace_back(offset_idx, width);
2013-12-22 08:59:59 +00:00
this->width += width;
2012-09-04 15:33:15 +00:00
}
void HTMLTextLine::append_state(const HTMLTextState & text_state)
2012-09-04 15:33:15 +00:00
{
if(states.empty() || (states.back().start_idx != text.size()))
{
2013-04-03 17:35:44 +00:00
states.emplace_back();
2012-09-04 15:33:15 +00:00
states.back().start_idx = text.size();
2013-03-20 15:46:58 +00:00
states.back().hash_umask = 0;
2012-09-04 15:33:15 +00:00
}
HTMLTextState & last_state = states.back();
last_state = text_state;
//apply font scale
last_state.font_size *= last_state.font_info->font_size_scale;
2012-09-04 15:33:15 +00:00
}
void HTMLTextLine::dump_char(std::ostream & out, int pos)
2014-06-14 19:44:28 +00:00
{
int c = text[pos];
if (c > 0)
{
Unicode u = c;
writeUnicodes(out, &u, 1);
}
else if (c < 0)
2014-06-14 19:44:28 +00:00
{
auto dt = decomposed_text[- c - 1];
writeUnicodes(out, &dt.front(), dt.size());
}
}
/*
 * Write `len` characters of the line, starting at index `begin`, to `out`.
 *
 * When line_state.first_char_index is negative the characters are written
 * without any visibility handling. Otherwise every run of characters that
 * line_state reports as covered is wrapped in a <span> carrying the
 * transparent fill and stroke colour classes.
 */
void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
    static const Color transparent(0, 0, 0, true);

    if (line_state.first_char_index < 0)
    {
        for (int k = 0; k < len; k++)
            dump_char(out, begin + k);
        return;
    }

    bool invisible_group_open = false;
    for (int k = 0; k < len; k++)
    {
        const int pos = begin + k;
        const bool covered = line_state.is_char_covered(line_state.first_char_index + pos);

        // open or close the invisible group only when visibility changes
        if (covered && !invisible_group_open)
        {
            out << "<span class=\"" << all_manager.fill_color.get_css_class_name()
                << all_manager.fill_color.install(transparent) << " " << all_manager.stroke_color.get_css_class_name()
                << all_manager.stroke_color.install(transparent) << "\">";
            invisible_group_open = true;
        }
        else if (!covered && invisible_group_open)
        {
            invisible_group_open = false;
            out << "</span>";
        }

        dump_char(out, pos);
    }

    if (invisible_group_open)
        out << "</span>";
}
2013-04-07 09:10:36 +00:00
void HTMLTextLine::dump_text(ostream & out)
2012-09-04 15:33:15 +00:00
{
/*
2012-12-11 12:52:36 +00:00
* Each Line is an independent absolute positioned block
2012-09-04 15:33:15 +00:00
* so even we have a few states or offsets, we may omit them
*/
if(text.empty())
2013-04-07 09:10:36 +00:00
return;
2013-04-03 06:04:39 +00:00
2012-09-04 15:33:15 +00:00
if(states.empty() || (states[0].start_idx != 0))
{
cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
2013-04-07 09:10:36 +00:00
return;
2012-09-04 15:33:15 +00:00
}
2013-04-03 17:35:44 +00:00
// Start Output
2013-04-03 06:04:39 +00:00
{
2013-04-03 17:35:44 +00:00
// open <div> for the current text line
2013-04-03 06:04:39 +00:00
out << "<div class=\"" << CSS::LINE_CN
<< " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(line_state.transform_matrix)
2013-05-04 11:26:26 +00:00
<< " " << CSS::LEFT_CN << all_manager.left.install(line_state.x - clip_x1)
2013-04-07 08:10:52 +00:00
<< " " << CSS::HEIGHT_CN << all_manager.height.install(ascent)
2013-05-04 11:26:26 +00:00
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y - clip_y1)
2013-04-04 14:57:50 +00:00
;
2013-04-04 08:28:59 +00:00
// it will be closed by the first state
2013-04-03 06:04:39 +00:00
}
2012-09-04 15:33:15 +00:00
2013-04-07 08:10:52 +00:00
std::vector<State*> stack;
2013-04-03 17:35:44 +00:00
// a special safeguard in the bottom
stack.push_back(nullptr);
2013-04-03 06:04:39 +00:00
//accumulated horizontal offset;
double dx = 0;
// whenever a negative offset appears, we should not pop out that <span>
// otherwise the effect of negative margin-left would disappear
2012-09-06 11:05:49 +00:00
size_t last_text_pos_with_negative_offset = 0;
2012-09-04 15:33:15 +00:00
size_t cur_text_idx = 0;
2013-04-03 06:04:39 +00:00
auto cur_offset_iter = offsets.begin();
for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++;
state_iter1 != states.end();
++state_iter1, ++state_iter2)
2012-09-04 15:33:15 +00:00
{
2013-04-03 06:04:39 +00:00
// export current state, find a closest parent
{
// greedy
2013-04-05 13:53:34 +00:00
double vertical_align = state_iter1->vertical_align;
int best_cost = State::HASH_ID_COUNT + 1;
// we have a nullptr at the beginning, so no need to check for rend
for(auto iter = stack.rbegin(); *iter; ++iter)
2012-09-05 07:13:21 +00:00
{
2013-04-03 06:04:39 +00:00
int cost = state_iter1->diff(**iter);
2013-04-05 13:53:34 +00:00
if(!equal(vertical_align,0))
++cost;
if(cost < best_cost)
{
while(stack.back() != *iter)
{
stack.back()->end(out);
stack.pop_back();
}
best_cost = cost;
2013-04-05 13:53:34 +00:00
state_iter1->vertical_align = vertical_align;
if(best_cost == 0)
break;
}
2012-09-05 07:13:21 +00:00
// cannot go further
if((*iter)->start_idx <= last_text_pos_with_negative_offset)
break;
2013-04-05 13:53:34 +00:00
vertical_align += (*iter)->vertical_align;
}
2013-04-05 13:53:34 +00:00
//
state_iter1->ids[State::VERTICAL_ALIGN_ID] = all_manager.vertical_align.install(state_iter1->vertical_align);
2013-04-03 17:35:44 +00:00
// export the diff between *state_iter1 and stack.back()
2013-04-03 06:04:39 +00:00
state_iter1->begin(out, stack.back());
stack.push_back(&*state_iter1);
2012-09-04 15:33:15 +00:00
}
2013-04-03 17:35:44 +00:00
// [state_iter1->start_idx, text_idx2) are covered by the current state
2013-04-03 06:04:39 +00:00
size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
2012-09-04 15:33:15 +00:00
2013-04-03 06:10:11 +00:00
// dump all text and offsets before next state
2013-04-03 06:04:39 +00:00
while(true)
{
2013-04-03 17:35:44 +00:00
if((cur_offset_iter != offsets.end())
&& (cur_offset_iter->start_idx <= cur_text_idx))
2013-03-21 04:18:26 +00:00
{
2013-04-03 06:04:39 +00:00
if(cur_offset_iter->start_idx > text_idx2)
break;
2013-04-03 06:10:11 +00:00
// next is offset
2013-04-03 06:04:39 +00:00
double target = cur_offset_iter->width + dx;
double actual_offset = 0;
2013-04-03 17:35:44 +00:00
//ignore near-zero offsets
2014-06-24 08:31:33 +00:00
if(std::abs(target) <= param.h_eps)
2013-03-24 13:42:51 +00:00
{
2013-04-03 06:04:39 +00:00
actual_offset = 0;
2013-03-24 13:42:51 +00:00
}
2013-04-03 06:04:39 +00:00
else
2013-03-24 13:42:51 +00:00
{
2013-04-03 06:04:39 +00:00
bool done = false;
2013-04-03 17:35:44 +00:00
// check if the offset is equivalent to a single ' '
2013-04-03 06:04:39 +00:00
if(!(state_iter1->hash_umask & State::umask_by_id(State::WORD_SPACE_ID)))
{
double space_off = state_iter1->single_space_offset();
if(std::abs(target - space_off) <= param.h_eps)
2013-04-03 06:04:39 +00:00
{
Unicode u = ' ';
New master (#2) * Show header in font map files * fix a usage of unique_ptr with array * Added '--quiet' argument to hide progress messages (resolves #503) * Revert cout messages to cerr (see #622) * bump version * fix build; fix some coverity warnings * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Rationlise DPI to single number. Implement actual_dpi - clamp maximum background image size in cases of huge PDF pages * DPI fixes - increase DPI when partially covered text to covered-text-dpi Add font-style italic for oblique fonts Reduce char bbox for occlusion tests * Don't shrink bbox - not required if zoom=25 used * Ignore occlusion from stroke/fill with opacity < 0.5 Better compute char bbox for occlusion Use 10% inset for char bbox for occlusion Back out adding font-weight: bold to potentially bold fonts Fix bug to ensure CID ascent/descent matches subfont values * Removed zero char logging * Remove forced italic - missing italic is due to fontforge bug which needs fixing * Typos fixed, readme updated * Typos * Increase maximum background image width Fix private use range to avoid stupid mobile safari switching to emoji font * included -pthread switch to link included 3rdparty poppler files. * Updated files from poppler 0.59.0 and adjusted includes. * Support updated "Object" class from poppler 0.59.0
2018-01-10 19:31:38 +00:00
// Sometimes we guess wrong whether we have a valid space character, so ensure it is always hidden
out << "<span class=\"" << CSS::WHITESPACE_CN << "\">";
2013-10-18 08:31:59 +00:00
writeUnicodes(out, &u, 1);
New master (#2) * Show header in font map files * fix a usage of unique_ptr with array * Added '--quiet' argument to hide progress messages (resolves #503) * Revert cout messages to cerr (see #622) * bump version * fix build; fix some coverity warnings * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Rationlise DPI to single number. Implement actual_dpi - clamp maximum background image size in cases of huge PDF pages * DPI fixes - increase DPI when partially covered text to covered-text-dpi Add font-style italic for oblique fonts Reduce char bbox for occlusion tests * Don't shrink bbox - not required if zoom=25 used * Ignore occlusion from stroke/fill with opacity < 0.5 Better compute char bbox for occlusion Use 10% inset for char bbox for occlusion Back out adding font-weight: bold to potentially bold fonts Fix bug to ensure CID ascent/descent matches subfont values * Removed zero char logging * Remove forced italic - missing italic is due to fontforge bug which needs fixing * Typos fixed, readme updated * Typos * Increase maximum background image width Fix private use range to avoid stupid mobile safari switching to emoji font * included -pthread switch to link included 3rdparty poppler files. * Updated files from poppler 0.59.0 and adjusted includes. * Support updated "Object" class from poppler 0.59.0
2018-01-10 19:31:38 +00:00
out << "</span>";
2013-04-03 06:04:39 +00:00
actual_offset = space_off;
done = true;
}
}
2012-09-04 15:33:15 +00:00
2013-04-03 17:35:44 +00:00
// finally, just dump it
2013-04-03 06:04:39 +00:00
if(!done)
2013-03-24 13:42:51 +00:00
{
2013-12-22 08:59:59 +00:00
long long wid = all_manager.whitespace.install(target, &actual_offset);
2013-12-21 14:47:54 +00:00
2013-12-22 08:59:59 +00:00
if(!equal(actual_offset, 0))
2013-12-21 14:47:54 +00:00
{
2013-12-22 08:59:59 +00:00
if(is_positive(-actual_offset))
last_text_pos_with_negative_offset = cur_text_idx;
2013-12-21 14:47:54 +00:00
double threshold = state_iter1->em_size() * (param.space_threshold);
2013-12-22 08:59:59 +00:00
out << "<span class=\"" << CSS::WHITESPACE_CN
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
2013-04-03 06:04:39 +00:00
}
2013-03-24 13:42:51 +00:00
}
}
2013-04-03 06:04:39 +00:00
dx = target - actual_offset;
++ cur_offset_iter;
2013-03-21 04:18:26 +00:00
}
2013-04-03 06:04:39 +00:00
else
{
if(cur_text_idx >= text_idx2)
break;
2013-04-03 06:10:11 +00:00
// next is text
2013-04-03 06:04:39 +00:00
size_t next_text_idx = text_idx2;
if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
next_text_idx = cur_offset_iter->start_idx;
dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
2013-04-03 06:04:39 +00:00
cur_text_idx = next_text_idx;
}
}
2012-09-04 15:33:15 +00:00
}
2012-09-05 07:13:21 +00:00
// we have a nullptr in the bottom
while(stack.back())
2012-09-05 07:13:21 +00:00
{
stack.back()->end(out);
stack.pop_back();
2012-09-05 07:13:21 +00:00
}
2012-09-04 15:33:15 +00:00
out << "</div>";
}
2012-09-04 15:33:15 +00:00
void HTMLTextLine::clear(void)
{
2012-09-04 15:33:15 +00:00
states.clear();
offsets.clear();
text.clear();
}
2013-05-04 13:17:35 +00:00
void HTMLTextLine::clip(const HTMLClipState & clip_state)
2013-05-04 11:26:26 +00:00
{
2013-05-04 13:17:35 +00:00
clip_x1 = clip_state.xmin;
clip_y1 = clip_state.ymin;
2013-05-04 11:26:26 +00:00
}
2013-04-07 08:10:52 +00:00
void HTMLTextLine::prepare(void)
{
// max_ascent determines the height of the div
double accum_vertical_align = 0; // accumulated
ascent = 0;
descent = 0;
// note that vertical_align cannot be calculated here
for(auto iter = states.begin(); iter != states.end(); ++iter)
{
auto font_info = iter->font_info;
iter->ids[State::FONT_ID] = font_info->id;
2013-04-07 08:10:52 +00:00
iter->ids[State::FONT_SIZE_ID] = all_manager.font_size.install(iter->font_size);
iter->ids[State::FILL_COLOR_ID] = all_manager.fill_color.install(iter->fill_color);
iter->ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(iter->stroke_color);
iter->ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(iter->letter_space);
iter->ids[State::WORD_SPACE_ID] = all_manager.word_space.install(iter->word_space);
iter->hash();
accum_vertical_align += iter->vertical_align;
double cur_ascent = accum_vertical_align + font_info->ascent * iter->font_size;
2013-04-07 08:10:52 +00:00
if(cur_ascent > ascent)
ascent = cur_ascent;
double cur_descent = accum_vertical_align + font_info->descent * iter->font_size;
2013-04-07 08:10:52 +00:00
if(cur_descent < descent)
descent = cur_descent;
}
}
2013-12-22 08:59:59 +00:00
/*
 * Optimize this line, passing `lines` on to the selected optimizer.
 * param.optimize_text == 3 selects the aggressive optimizer, any other
 * value the normal one.
 */
void HTMLTextLine::optimize(std::vector<HTMLTextLine*> & lines)
{
    if(param.optimize_text != 3)
    {
        optimize_normal(lines);
        return;
    }

    optimize_aggressive(lines);
}
2013-04-03 07:44:28 +00:00
/*
* Adjust letter space and word space in order to reduce the number of HTML elements
* May also unmask word space
*/
2013-12-22 08:59:59 +00:00
void HTMLTextLine::optimize_normal(std::vector<HTMLTextLine*> & lines)
2013-03-20 15:46:58 +00:00
{
2014-07-13 23:59:30 +00:00
// remove useless states in the end
2013-04-07 08:10:52 +00:00
while((!states.empty()) && (states.back().start_idx >= text.size()))
states.pop_back();
2013-03-20 15:46:58 +00:00
assert(!states.empty());
2013-04-03 07:44:28 +00:00
const long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID);
2013-04-03 06:04:39 +00:00
// for optimization, we need accurate values
auto & ls_manager = all_manager.letter_space;
auto & ws_manager = all_manager.word_space;
2013-04-03 06:04:39 +00:00
2013-04-03 07:44:28 +00:00
// statistics of widths
std::map<double, size_t> width_map;
2013-04-03 07:44:28 +00:00
// store optimized offsets
std::vector<Offset> new_offsets;
new_offsets.reserve(offsets.size());
2013-03-30 17:00:04 +00:00
2013-04-03 07:44:28 +00:00
auto offset_iter1 = offsets.begin();
New master (#2) * Show header in font map files * fix a usage of unique_ptr with array * Added '--quiet' argument to hide progress messages (resolves #503) * Revert cout messages to cerr (see #622) * bump version * fix build; fix some coverity warnings * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Rationlise DPI to single number. Implement actual_dpi - clamp maximum background image size in cases of huge PDF pages * DPI fixes - increase DPI when partially covered text to covered-text-dpi Add font-style italic for oblique fonts Reduce char bbox for occlusion tests * Don't shrink bbox - not required if zoom=25 used * Ignore occlusion from stroke/fill with opacity < 0.5 Better compute char bbox for occlusion Use 10% inset for char bbox for occlusion Back out adding font-weight: bold to potentially bold fonts Fix bug to ensure CID ascent/descent matches subfont values * Removed zero char logging * Remove forced italic - missing italic is due to fontforge bug which needs fixing * Typos fixed, readme updated * Typos * Increase maximum background image width Fix private use range to avoid stupid mobile safari switching to emoji font * included -pthread switch to link included 3rdparty poppler files. * Updated files from poppler 0.59.0 and adjusted includes. * Support updated "Object" class from poppler 0.59.0
2018-01-10 19:31:38 +00:00
for(auto state_iter1 = states.begin(); state_iter1 != states.end(); ++state_iter1)
2013-03-21 04:18:26 +00:00
{
New master (#2) * Show header in font map files * fix a usage of unique_ptr with array * Added '--quiet' argument to hide progress messages (resolves #503) * Revert cout messages to cerr (see #622) * bump version * fix build; fix some coverity warnings * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Rationlise DPI to single number. Implement actual_dpi - clamp maximum background image size in cases of huge PDF pages * DPI fixes - increase DPI when partially covered text to covered-text-dpi Add font-style italic for oblique fonts Reduce char bbox for occlusion tests * Don't shrink bbox - not required if zoom=25 used * Ignore occlusion from stroke/fill with opacity < 0.5 Better compute char bbox for occlusion Use 10% inset for char bbox for occlusion Back out adding font-weight: bold to potentially bold fonts Fix bug to ensure CID ascent/descent matches subfont values * Removed zero char logging * Remove forced italic - missing italic is due to fontforge bug which needs fixing * Typos fixed, readme updated * Typos * Increase maximum background image width Fix private use range to avoid stupid mobile safari switching to emoji font * included -pthread switch to link included 3rdparty poppler files. * Updated files from poppler 0.59.0 and adjusted includes. * Support updated "Object" class from poppler 0.59.0
2018-01-10 19:31:38 +00:00
const auto state_iter2 = std::next(state_iter1);
2013-04-03 07:44:28 +00:00
const size_t text_idx1 = state_iter1->start_idx;
const size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
New master (#2) * Show header in font map files * fix a usage of unique_ptr with array * Added '--quiet' argument to hide progress messages (resolves #503) * Revert cout messages to cerr (see #622) * bump version * fix build; fix some coverity warnings * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Many bug fixes and improvements, including: - Incorporated latest Cairo files from cairo-0.15.2 - Moved build to out-of-source - Added clean script - Rewritten correct_text_visibility option to improve accuracy - Transparent characters drawn on background layer - Improved bad unicode detection * Rationlise DPI to single number. Implement actual_dpi - clamp maximum background image size in cases of huge PDF pages * DPI fixes - increase DPI when partially covered text to covered-text-dpi Add font-style italic for oblique fonts Reduce char bbox for occlusion tests * Don't shrink bbox - not required if zoom=25 used * Ignore occlusion from stroke/fill with opacity < 0.5 Better compute char bbox for occlusion Use 10% inset for char bbox for occlusion Back out adding font-weight: bold to potentially bold fonts Fix bug to ensure CID ascent/descent matches subfont values * Removed zero char logging * Remove forced italic - missing italic is due to fontforge bug which needs fixing * Typos fixed, readme updated * Typos * Increase maximum background image width Fix private use range to avoid stupid mobile safari switching to emoji font * included -pthread switch to link included 3rdparty poppler files. * Updated files from poppler 0.59.0 and adjusted includes. * Support updated "Object" class from poppler 0.59.0
2018-01-10 19:31:38 +00:00
const size_t text_count = text_idx2 - text_idx1;
2013-03-30 17:00:04 +00:00
2013-04-03 17:35:44 +00:00
// there might be some offsets before the first state
while((offset_iter1 != offsets.end())
&& (offset_iter1->start_idx <= text_idx1))
2013-04-03 07:51:55 +00:00
{
new_offsets.push_back(*(offset_iter1++));
}
2013-04-03 17:35:44 +00:00
// find the last offset covered by the current state
2013-04-03 06:17:27 +00:00
auto offset_iter2 = offset_iter1;
for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { }
2013-04-03 17:35:44 +00:00
2013-04-03 07:44:28 +00:00
// There are `offset_count` <span>'s, the target is to reduce this number
size_t offset_count = offset_iter2 - offset_iter1;
assert(text_count >= offset_count);
2013-04-03 17:35:44 +00:00
2013-04-03 07:44:28 +00:00
// Optimize letter space
2013-04-03 17:35:44 +00:00
// how much letter_space is changed
// will be later used for optimizing word space
double letter_space_diff = 0;
width_map.clear();
2013-03-30 17:00:04 +00:00
2013-04-03 07:44:28 +00:00
// In some PDF files all letter spaces are implemented as position shifts between each letter
// try to simplify it with a proper letter space
if(offset_count > 0)
2013-03-21 04:18:26 +00:00
{
2013-04-03 07:44:28 +00:00
// mark the current letter_space
if(text_count > offset_count)
width_map.insert(std::make_pair(0, text_count - offset_count));
2013-03-30 17:21:14 +00:00
2013-04-03 07:44:28 +00:00
for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter)
2013-03-21 04:18:26 +00:00
{
2013-04-03 07:44:28 +00:00
const double target = off_iter->width;
auto iter = width_map.lower_bound(target-EPS);
2014-06-24 08:31:33 +00:00
if((iter != width_map.end()) && (std::abs(iter->first - target) <= EPS))
2013-04-03 07:44:28 +00:00
{
++ iter->second;
}
else
{
width_map.insert(iter, std::make_pair(target, 1));
}
}
2013-04-03 17:35:44 +00:00
// TODO snapping the widths may result a better result
// e.g. for (-0.7 0.6 -0.2 0.3 10 10), 0 is better than 10
2013-04-03 07:44:28 +00:00
double most_used_width = 0;
size_t max_count = 0;
2013-04-03 07:44:28 +00:00
for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
{
if(iter->second > max_count)
{
most_used_width = iter->first;
max_count = iter->second;
}
}
2013-04-05 13:53:34 +00:00
// negative letter space may cause problems
2013-04-05 14:44:49 +00:00
if((max_count <= text_count / 2) || (!is_positive(state_iter1->letter_space + most_used_width)))
2013-04-03 07:44:28 +00:00
{
// the old value is the best
2013-04-03 17:35:44 +00:00
// just copy old offsets
2013-04-03 07:44:28 +00:00
new_offsets.insert(new_offsets.end(), offset_iter1, offset_iter2);
2013-03-21 04:18:26 +00:00
}
2013-03-30 17:00:04 +00:00
else
2013-03-21 04:18:26 +00:00
{
2013-04-05 13:53:34 +00:00
// now we would like to adjust letter space to most_used width
2013-04-03 07:44:28 +00:00
// install new letter space
2013-04-04 14:57:50 +00:00
const double old_ls = state_iter1->letter_space;
state_iter1->ids[State::LETTER_SPACE_ID] = ls_manager.install(old_ls + most_used_width, &(state_iter1->letter_space));
letter_space_diff = old_ls - state_iter1->letter_space;
2013-04-03 07:44:28 +00:00
// update offsets
auto off_iter = offset_iter1;
// re-count number of offsets
offset_count = 0;
for(size_t cur_text_idx = text_idx1; cur_text_idx < text_idx2; ++cur_text_idx)
{
double cur_width = 0;
if((off_iter != offset_iter2) && (off_iter->start_idx == cur_text_idx + 1))
{
cur_width = off_iter->width + letter_space_diff;
++off_iter;
}
else
{
cur_width = letter_space_diff ;
}
if(!equal(cur_width, 0))
{
2013-04-03 17:35:44 +00:00
new_offsets.emplace_back(cur_text_idx+1, cur_width);
2013-04-03 07:44:28 +00:00
++ offset_count;
}
}
2013-03-21 04:18:26 +00:00
}
2013-03-30 17:00:04 +00:00
}
2013-03-21 04:18:26 +00:00
2013-04-03 07:44:28 +00:00
// Optimize word space
2014-07-13 23:59:30 +00:00
// In some PDF files all spaces are converted into positioning shift
2013-04-03 07:44:28 +00:00
// We may try to change (some of) them to ' ' by adjusting word_space
2014-07-13 23:59:30 +00:00
// for now, we consider only the no-space scenario
2013-04-06 09:01:05 +00:00
// which also includes the case when param.space_as_offset is set
2013-04-03 17:35:44 +00:00
// get the text segment covered by current state (*state_iter1)
const auto text_iter1 = text.begin() + text_idx1;
const auto text_iter2 = text.begin() + text_idx2;
2013-05-02 06:32:17 +00:00
if(find(text_iter1, text_iter2, ' ') == text_iter2)
2013-03-30 17:00:04 +00:00
{
2013-04-03 07:44:28 +00:00
// if there is not any space, we may change the value of word_space arbitrarily
// note that we may only change word space, no offset will be affected
// The actual effect will emerge during flushing, where it could be detected that an offset can be optimized as a single space character
if(offset_count > 0)
2013-03-21 04:18:26 +00:00
{
2013-04-06 09:01:05 +00:00
double threshold = (state_iter1->em_size()) * (param.space_threshold);
2013-04-03 07:44:28 +00:00
// set word_space for the most frequently used offset
double most_used_width = 0;
size_t max_count = 0;
2013-04-03 07:44:28 +00:00
// if offset_count > 0, we must have updated width_map in the previous step
// find the most frequent width, with new letter space applied
for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
{
2013-04-03 17:35:44 +00:00
double fixed_width = iter->first + letter_space_diff; // this is the actual offset in HTML
2013-04-03 07:44:28 +00:00
// we don't want to add spaces for tiny gaps, or even negative shifts
if((fixed_width >= threshold - EPS) && (iter->second > max_count))
{
max_count = iter->second;
most_used_width = fixed_width;
}
}
2013-04-04 14:57:50 +00:00
state_iter1->word_space = 0; // clear word_space for single_space_offset
2013-04-03 07:44:28 +00:00
double new_word_space = most_used_width - state_iter1->single_space_offset();
2013-04-04 14:57:50 +00:00
state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); // install new word_space
2013-04-03 17:35:44 +00:00
state_iter1->hash_umask &= (~word_space_umask); // mark that the word_space is not free
2013-04-03 07:44:28 +00:00
}
2013-04-03 17:35:44 +00:00
else // there is no offset at all
2013-04-03 07:44:28 +00:00
{
2013-04-03 17:35:44 +00:00
state_iter1->hash_umask |= word_space_umask; // we just free word_space
2013-03-21 04:18:26 +00:00
}
}
2013-04-03 06:17:27 +00:00
offset_iter1 = offset_iter2;
2013-03-30 17:00:04 +00:00
}
2013-04-03 07:44:28 +00:00
// apply optimization
std::swap(offsets, new_offsets);
2013-12-22 08:59:59 +00:00
lines.push_back(this);
}
// for optimize-text == 3
//
// The aggressive optimization is not implemented yet; only a sketch exists
// (kept below). Until it is implemented the line is handed over unchanged.
// Leaving `lines` untouched, as the empty body used to do, means this line
// is never appended, whereas optimize_normal() always appends it.
// NOTE(review): presumably the caller replaces its line list with `lines`,
// in which case the text was silently dropped -- confirm against the caller.
void HTMLTextLine::optimize_aggressive(std::vector<HTMLTextLine*> & lines)
{
    /*
    HTMLLineState original_line_state = line_state;
    // break the line if there are a large (positive or negative) shift
    // letter space / word space are not taken into consideration (yet)
    while(true)
    {
    }

    // aggressive optimization
    if(target > state_iter1->em_size() * (param.space_threshold) - EPS)
        out << ' ';
    dx = 0;
    lines.push_back(this);
    */

    // no optimization applied: keep the line as it is
    lines.push_back(this);
}
// this state will be converted to a child node of the node of prev_state
// dump the difference between previous state
// also clone corresponding states
2013-04-06 15:51:33 +00:00
void HTMLTextLine::State::begin (ostream & out, const State * prev_state)
2012-09-05 07:13:21 +00:00
{
2013-04-04 08:28:59 +00:00
if(prev_state)
2012-09-04 15:33:15 +00:00
{
2013-04-04 08:28:59 +00:00
long long cur_mask = 0xff;
bool first = true;
2013-04-05 13:53:34 +00:00
for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8)
2013-03-20 15:46:58 +00:00
{
2013-04-04 08:28:59 +00:00
if(hash_umask & cur_mask) // we don't care about this ID
2013-03-20 15:46:58 +00:00
{
2013-04-04 08:28:59 +00:00
if (prev_state->hash_umask & cur_mask) // if prev_state do not care about it either
continue;
// otherwise
2013-03-20 15:46:58 +00:00
// we have to inherit it
ids[i] = prev_state->ids[i];
hash_umask &= (~cur_mask);
2013-03-25 04:23:29 +00:00
//copy the corresponding value
//TODO: this is so ugly
switch(i)
{
2013-04-04 08:28:59 +00:00
case FONT_SIZE_ID:
2013-04-04 14:57:50 +00:00
font_size = prev_state->font_size;
2013-04-04 08:28:59 +00:00
break;
case LETTER_SPACE_ID:
2013-04-04 14:57:50 +00:00
letter_space = prev_state->letter_space;
2013-04-04 08:28:59 +00:00
break;
case WORD_SPACE_ID:
2013-04-04 14:57:50 +00:00
word_space = prev_state->word_space;
2013-04-04 08:28:59 +00:00
break;
default:
2013-04-05 13:53:34 +00:00
cerr << "unexpected state mask" << endl;
2013-04-04 08:28:59 +00:00
break;
2013-03-25 04:23:29 +00:00
}
2013-03-20 15:46:58 +00:00
}
2013-04-04 08:28:59 +00:00
// now we care about the ID
// if the value from prev_state is the same, we don't need to dump it
if((!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))
continue;
2012-09-05 07:13:21 +00:00
2013-04-04 08:28:59 +00:00
// so we have to dump it
if(first)
{
out << "<span class=\"";
first = false;
}
else
{
out << ' ';
}
// out should have hex set
out << css_class_names[i];
if (ids[i] == -1)
out << CSS::INVALID_ID;
else
out << ids[i];
2012-09-05 07:13:21 +00:00
}
2014-07-13 23:59:30 +00:00
// vertical align
2013-04-05 13:53:34 +00:00
if(!equal(vertical_align, 0))
{
// so we have to dump it
if(first)
{
out << "<span class=\"";
first = false;
}
else
{
out << ' ';
}
// out should have hex set
out << CSS::VERTICAL_ALIGN_CN;
auto id = ids[VERTICAL_ALIGN_ID];
if (id == -1)
out << CSS::INVALID_ID;
else
out << id;
}
2013-04-04 08:28:59 +00:00
if(first) // we actually just inherit the whole prev_state
2012-09-05 07:13:21 +00:00
{
2013-04-04 08:28:59 +00:00
need_close = false;
2012-09-05 07:13:21 +00:00
}
2013-01-31 22:21:57 +00:00
else
2013-04-04 08:28:59 +00:00
{
out << "\">";
need_close = true;
}
}
else
{
2013-04-04 08:28:59 +00:00
// prev_state == nullptr
// which means this is the first state of the line
// there should be a open pending <div> left there
2013-04-05 13:53:34 +00:00
// it is not necessary to output vertical align
2013-04-04 08:28:59 +00:00
long long cur_mask = 0xff;
2013-04-05 13:53:34 +00:00
for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8)
2013-04-04 08:28:59 +00:00
{
if(hash_umask & cur_mask) // we don't care about this ID
continue;
// now we care about the ID
out << ' ';
// out should have hex set
out << css_class_names[i];
if (ids[i] == -1)
out << CSS::INVALID_ID;
else
out << ids[i];
}
out << "\">";
2013-04-04 08:28:59 +00:00
need_close = false;
}
2012-09-04 15:33:15 +00:00
}
2013-04-06 15:51:33 +00:00
// Writes the closing </span> if need_close is set, otherwise writes nothing.
void HTMLTextLine::State::end(ostream & out) const
{
    if(!need_close)
        return;

    out << "</span>";
}
2013-04-06 15:51:33 +00:00
void HTMLTextLine::State::hash(void)
2012-09-05 07:13:21 +00:00
{
hash_value = 0;
for(int i = 0; i < ID_COUNT; ++i)
{
hash_value = (hash_value << 8) | (ids[i] & 0xff);
}
}
2013-04-06 15:51:33 +00:00
int HTMLTextLine::State::diff(const State & s) const
2012-09-04 15:33:15 +00:00
{
2012-09-05 07:13:21 +00:00
/*
* A quick check based on hash_value
* it could be wrong when there are more then 256 classes,
2012-09-05 08:19:01 +00:00
* in which case the output may not be optimal, but still 'correct' in terms of HTML
2012-09-05 07:13:21 +00:00
*/
2013-03-20 15:46:58 +00:00
long long common_mask = ~(hash_umask | s.hash_umask);
if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0;
2012-09-05 07:13:21 +00:00
2013-03-20 15:46:58 +00:00
long long cur_mask = 0xff;
2012-09-05 07:13:21 +00:00
int d = 0;
for(int i = 0; i < ID_COUNT; ++i)
2013-03-20 15:46:58 +00:00
{
if((common_mask & cur_mask) && (ids[i] != s.ids[i]))
2012-09-05 07:13:21 +00:00
++ d;
2013-03-20 15:46:58 +00:00
cur_mask <<= 8;
}
2012-09-05 07:13:21 +00:00
return d;
2012-09-04 15:33:15 +00:00
}
2013-04-06 15:51:33 +00:00
// Returns the mask covering byte number `id`, i.e. 0xff shifted left by 8*id bits.
long long HTMLTextLine::State::umask_by_id(int id)
{
    const long long low_byte = 0xff;
    return low_byte << (8 * id);
}
2013-02-05 10:19:25 +00:00
// the order should be the same as in the enum
2013-04-06 15:51:33 +00:00
const char * const HTMLTextLine::State::css_class_names [] = {
2013-02-28 07:59:14 +00:00
CSS::FONT_FAMILY_CN,
2013-02-05 10:19:25 +00:00
CSS::FONT_SIZE_CN,
CSS::FILL_COLOR_CN,
CSS::STROKE_COLOR_CN,
CSS::LETTER_SPACE_CN,
CSS::WORD_SPACE_CN,
2013-04-05 13:53:34 +00:00
CSS::VERTICAL_ALIGN_CN,
2013-02-05 10:19:25 +00:00
};
2012-09-12 15:26:14 +00:00
} //namespace pdf2htmlEX