2012-08-14 08:23:15 +00:00
|
|
|
/*
|
2012-09-03 12:57:14 +00:00
|
|
|
* text.cc
|
2012-08-14 08:23:15 +00:00
|
|
|
*
|
2012-08-31 07:50:14 +00:00
|
|
|
* Handling text & font, and related stuff
|
2012-08-14 08:23:15 +00:00
|
|
|
*
|
2012-10-05 15:38:17 +00:00
|
|
|
* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
|
2012-08-14 08:23:15 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
2013-02-05 14:07:51 +00:00
|
|
|
#include <algorithm>
|
2012-08-27 15:09:01 +00:00
|
|
|
|
2012-08-14 08:23:15 +00:00
|
|
|
#include "HTMLRenderer.h"
|
2013-04-06 08:45:01 +00:00
|
|
|
|
2012-11-29 09:28:05 +00:00
|
|
|
#include "util/namespace.h"
|
2012-11-29 09:45:26 +00:00
|
|
|
#include "util/unicode.h"
|
2012-08-14 08:23:15 +00:00
|
|
|
|
2012-09-12 15:26:14 +00:00
|
|
|
namespace pdf2htmlEX {
|
|
|
|
|
2012-09-06 07:09:47 +00:00
|
|
|
using std::all_of;
|
2012-11-29 10:28:07 +00:00
|
|
|
using std::cerr;
|
|
|
|
using std::endl;
|
2012-08-20 21:48:21 +00:00
|
|
|
|
2012-08-14 08:23:15 +00:00
|
|
|
/*
 * Append the characters of a PDF string to the current HTML text line,
 * then move the text position by the displacement of the whole string.
 *
 * state: graphics state; the font, character/word spacing and horizontal
 *        scaling are read from it
 * s:     the string being shown; an empty string is ignored
 *
 * Nothing is emitted when there is no font or when the font reports a
 * non-zero writing mode.
 */
void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
    if(s->getLength() == 0)
        return;

    auto font = state->getFont();

    // unscaled
    double letter_spacing = state->getCharSpace();
    double word_spacing = state->getWordSpace();

    // Writing mode fonts are not emitted as HTML text here: the only known way
    // to show them in HTML is one div per character, which is too costly.
    // Type 3 fonts are hard to show in HTML as well because of the font matrix,
    // but the corresponding check is currently disabled:
    //     || (font->getType() == fontType3)
    const bool can_emit_text = (font != nullptr) && !(font->getWMode());
    if(!can_emit_text)
        return;

    // a change of state may require the current line to be closed first
    check_state_change(state);
    prepare_text_line(state);

    // Now ready to output: walk through the raw bytes of the string
    char * cursor = s->getCString();
    int remaining = s->getLength();

    // accumulated (unscaled) advance of the whole string
    double sum_dx = 0;
    double sum_dy = 0;

    // per-character outputs of getNextChar
    double adv_x, adv_y;
    double origin_x, origin_y;

    int char_count = 0;
    int pdf_space_count = 0;
    int mapped_len;

    CharCode code;
    Unicode * mapped = nullptr;

    while(remaining > 0)
    {
        auto consumed = font->getNextChar(cursor, remaining, &code, &mapped, &mapped_len, &adv_x, &adv_y, &origin_x, &origin_y);

        if(!equal(origin_x, 0) || !equal(origin_y, 0))
        {
            cerr << "TODO: non-zero origins" << endl;
        }

        /*
         * By the standard, a single-byte code equal to ' ' is a space.
         * However some PDF files use ' ' as a normal encoding slot,
         * such that it is mapped to other unicodes.
         * In that case, when space_as_offset is on, that character is simply ignored...
         *
         * Checking the mapped unicode may or may not work.
         * There are always ugly PDF files with no useful info at all.
         */
        const bool is_space = (consumed == 1) && (*cursor == ' ');
        if(is_space)
        {
            ++pdf_space_count;
        }

        if(is_space && (param.space_as_offset))
        {
            // horiz_scaling is ignored here, as it's merged in CTM
            html_text_page.get_cur_line()->append_offset((adv_x * cur_font_size + letter_spacing + word_spacing) * draw_text_scale);
        }
        else if((param.decompose_ligature) && (mapped_len > 1) && all_of(mapped, mapped + mapped_len, isLegalUnicode))
        {
            html_text_page.get_cur_line()->append_unicodes(mapped, mapped_len);
            // TODO: decomposed characters may be not with the same width as the original ligature, need to fix it.
        }
        else
        {
            Unicode uu = (cur_text_state.font_info->use_tounicode)
                ? check_unicode(mapped, mapped_len, code, font)
                : unicode_from_font(code, font);

            html_text_page.get_cur_line()->append_unicodes(&uu, 1);

            /*
             * In PDF, word_space is appended if (n == 1 and *p = ' ')
             * but in HTML, word_space is appended if (uu == ' ');
             * compensate for the difference with an explicit offset
             */
            const int pdf_side = is_space ? 1 : 0;
            const int html_side = (uu == ' ') ? 1 : 0;
            const int space_diff = pdf_side - html_side;
            if(space_diff != 0)
            {
                html_text_page.get_cur_line()->append_offset(word_spacing * draw_text_scale * space_diff);
            }
        }

        sum_dx += adv_x;
        sum_dy += adv_y;

        ++char_count;
        cursor += consumed;
        remaining -= consumed;
    }

    double hs = state->getHorizScaling();

    // horiz_scaling is merged into ctm now,
    // so the coordinate system is ugly
    const double shift_x = (sum_dx * cur_font_size + char_count * letter_spacing + pdf_space_count * word_spacing) * hs;
    const double shift_y = sum_dy * cur_font_size;

    cur_tx += shift_x;
    cur_ty += shift_y;

    draw_tx += shift_x;
    draw_ty += shift_y;
}
|
2012-09-12 15:26:14 +00:00
|
|
|
|
|
|
|
} // namespace pdf2htmlEX
|