1
0
mirror of https://github.com/pdf2htmlEX/pdf2htmlEX.git synced 2024-12-22 04:50:09 +00:00

option 'space-as-offset'

This commit is contained in:
Lu Wang 2012-09-07 08:39:21 +08:00
parent 9ae3e4dcb9
commit c82b140243
5 changed files with 24 additions and 5 deletions

View File

@ -92,6 +92,11 @@ A ToUnicode map may be provided for fonts in PDF which indicates the 'meaning' o
However, there is often better "ToUnicode" info in Type 1 fonts, and sometimes the ToUnicode map provided is wrong. So by default pdf2htmlEX will find the Unicode value directly from the fonts instead of the ToUnicode map. This behavior may be changed by turning on this switch.
.TP
.B --space-as-offset <0|1> (Default: 0)
Treat space characters as offsets, which may increase the size of the output.
Turn it on if space characters are not displayed correctly, or if you want to remove positional spaces.
.TP
.B --font-suffix <suffix> (Default: ".ttf"), --font-format <format> (Default: "truetype")
Specify the suffix and format of fonts extracted from the PDF file. They should be consistent.
.TP

View File

@ -139,7 +139,9 @@ void HTMLRenderer::LineBuffer::flush(void)
if(w < 0)
last_text_pos_with_negative_offset = cur_text_idx;
double threshold = cur_state_iter->draw_font_size * (cur_state_iter->ascent - cur_state_iter->descent) * (renderer->param->space_threshold);
auto * p = stack.back();
double threshold = p->draw_font_size * (p->ascent - p->descent) * (renderer->param->space_threshold);
out << format("<span class=\"_ _%|1$x|\">%2%</span>") % wid % (target > (threshold - EPS) ? " " : "");
dx = target - w;

View File

@ -426,19 +426,29 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
cerr << "TODO: non-zero origins" << endl;
}
bool is_space = false;
if (n == 1 && *p == ' ')
{
++nSpaces;
is_space = true;
}
if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode))
if(is_space && (param->space_as_offset))
{
line_buf.append_unicodes(u, uLen);
// ignore horiz_scaling, as it's merged in CTM
line_buf.append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_scale);
}
else
{
Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
line_buf.append_unicodes(&uu, 1);
if((param->decompose_ligature) && all_of(u, u+uLen, isLegalUnicode))
{
line_buf.append_unicodes(u, uLen);
}
else
{
Unicode uu = (cur_font_info->use_tounicode ? check_unicode(u, uLen, code, font) : unicode_from_font(code, font));
line_buf.append_unicodes(&uu, 1);
}
}
dx += dx1;

View File

@ -37,6 +37,7 @@ struct Param
double space_threshold;
double font_size_multiplier;
int always_apply_tounicode;
int space_as_offset;
std::string font_suffix, font_format;

View File

@ -85,6 +85,7 @@ po::variables_map parse_options (int argc, char **argv)
("space-threshold", po::value<double>(&param.space_threshold)->default_value(1.0/6), "distance no thiner than (threshold * em) will be considered as a space character")
("font-size-multiplier", po::value<double>(&param.font_size_multiplier)->default_value(10.0), "setting a value greater than 1 would increase the rendering accuracy")
("always-apply-tounicode", po::value<int>(&param.always_apply_tounicode)->default_value(0), "ToUnicode map is ignore for non-TTF fonts unless this switch is on")
("space-as-offset", po::value<int>(&param.space_as_offset)->default_value(0), "treat space characters as offsets")
("font-suffix", po::value<string>(&param.font_suffix)->default_value(".ttf"), "suffix for extracted font files")
("font-format", po::value<string>(&param.font_format)->default_value("truetype"), "format for extracted font files")