pdf2htmlEX/src/HTMLRenderer/text.cc

/*
 * text.cc
 *
 * Handling text & fonts, and related stuff
 *
 * Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
 */


#include <algorithm>

#include "HTMLRenderer.h"

#include "util/namespace.h"
#include "util/unicode.h"

namespace pdf2htmlEX {

using std::all_of;
using std::cerr;
using std::endl;

/*
 * Append the characters of string `s` to the current text line buffer
 * and advance the tracked text positions (cur_tx/cur_ty, draw_tx/draw_ty)
 * by the accumulated advance of the whole string.
 *
 * Nothing is emitted for an empty string, a missing font, a font with a
 * non-zero writing mode, or a Type 3 font.
 */
void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
    if(s->getLength() == 0)
        return;

    auto font = state->getFont();

    // Writing mode fonts and Type 3 fonts are not emitted as HTML text
    // (the original author's note says they are rendered as images):
    //  - writing mode fonts would need one div per character, which is too costly
    //  - Type 3 fonts are hard to show in HTML because of the font matrix
    if(font == nullptr)
        return;
    if(font->getWMode())
        return;
    if(font->getType() == fontType3)
        return;

    // unscaled spacing values read from the graphics state
    const double letter_spacing = state->getCharSpace();
    const double word_spacing   = state->getWordSpace();

    // see if the line has to be closed due to state change
    check_state_change(state);
    prepare_text_line(state);

    // Walk the raw bytes of the string, one character code at a time
    char * cursor = s->getCString();
    int remaining = s->getLength();

    // accumulated (unscaled) advance and counters for the whole string
    double sum_dx = 0;
    double sum_dy = 0;
    int char_count = 0;
    int space_count = 0;

    // values handed to getNextChar by address on every iteration
    CharCode code;
    Unicode * unicodes = nullptr;
    int unicode_count;
    double step_dx, step_dy;
    double origin_x, origin_y;

    while(remaining > 0)
    {
        const auto consumed = font->getNextChar(cursor, remaining, &code, &unicodes, &unicode_count, &step_dx, &step_dy, &origin_x, &origin_y);

        if(!equal(origin_x, 0) || !equal(origin_y, 0))
            cerr << "TODO: non-zero origins" << endl;

        /*
         * A single byte ' ' is a space by the standard.
         * However some PDFs use ' ' as a normal encoding slot,
         * so that it is mapped to other unicodes.
         * In that case, when space_as_offset is on, that character is simply ignored...
         *
         * Checking the mapped unicode may or may not work;
         * there are always ugly PDF files with no useful info at all.
         */
        const bool is_space = (consumed == 1) && (*cursor == ' ');
        if(is_space)
            ++space_count;

        if(is_space && (param->space_as_offset))
        {
            // ignore horiz_scaling, as it's merged in CTM
            text_line_buffers.back()->append_offset((step_dx * cur_font_size + letter_spacing + word_spacing) * draw_text_scale);
        }
        else if((param->decompose_ligature) && (unicode_count > 1) && all_of(unicodes, unicodes + unicode_count, isLegalUnicode))
        {
            text_line_buffers.back()->append_unicodes(unicodes, unicode_count);
            // TODO: decomposed characters may not have the same width as the original ligature, need to fix it.
        }
        else
        {
            // pick the single unicode to emit for this character code
            Unicode mapped = cur_html_state.font_info->use_tounicode
                ? check_unicode(unicodes, unicode_count, code, font)
                : unicode_from_font(code, font);
            text_line_buffers.back()->append_unicodes(&mapped, 1);

            /*
             * In PDF, word_space is appended if (consumed == 1 and *cursor == ' ')
             * but in HTML, word_space is appended if (mapped == ' ').
             * Compensate for the difference with an explicit offset.
             */
            int word_space_balance = 0;
            if(is_space)
                ++word_space_balance;
            if(mapped == ' ')
                --word_space_balance;

            if(word_space_balance != 0)
                text_line_buffers.back()->append_offset(word_spacing * draw_text_scale * word_space_balance);
        }

        sum_dx += step_dx;
        sum_dy += step_dy;

        ++char_count;
        cursor += consumed;
        remaining -= consumed;
    }

    // horiz_scaling is merged into ctm now,
    // so the coordinate system is ugly
    const double horiz_scaling = state->getHorizScaling();
    const double total_dx = (sum_dx * cur_font_size + char_count * letter_spacing + space_count * word_spacing) * horiz_scaling;
    const double total_dy = sum_dy * cur_font_size;

    cur_tx  += total_dx;
    draw_tx += total_dx;

    cur_ty  += total_dy;
    draw_ty += total_dy;
}

} // namespace pdf2htmlEX
better file structure 2012-08-14 08:23:15 +00:00			`/*`
remove scripts, link to fontforge directly 2012-09-03 12:57:14 +00:00			`* text.cc`
better file structure 2012-08-14 08:23:15 +00:00			`*`
use asc/dec of local matched font if possible 2012-08-31 07:50:14 +00:00			`* Handling text & font, and relative stuffs`
better file structure 2012-08-14 08:23:15 +00:00			`*`
new option: remove unused glyphs 2012-10-05 15:38:17 +00:00			`* Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>`
better file structure 2012-08-14 08:23:15 +00:00			`*/`


clean 2013-02-05 14:07:51 +00:00			`#include <algorithm>`
disable toUnicode for non-ttf fonts by default 2012-08-27 15:09:01 +00:00
better file structure 2012-08-14 08:23:15 +00:00			`#include "HTMLRenderer.h"`
rearrange files 2013-04-06 08:45:01 +00:00
reorganize files 2012-11-29 09:28:05 +00:00			`#include "util/namespace.h"`
reorganizaing 2012-11-29 09:45:26 +00:00			`#include "util/unicode.h"`
better file structure 2012-08-14 08:23:15 +00:00
add manifest & split-pages 2012-09-12 15:26:14 +00:00			`namespace pdf2htmlEX {`

add option 'decompose-ligature' 2012-09-06 07:09:47 +00:00			`using std::all_of;`
split util.h 2012-11-29 10:28:07 +00:00			`using std::cerr;`
			`using std::endl;`
working on cid truetype 2012-08-20 21:48:21 +00:00
better file structure 2012-08-14 08:23:15 +00:00			`void HTMLRenderer::drawString(GfxState * state, GooString * s)`
			`{`
			`if(s->getLength() == 0)`
			`return;`

			`auto font = state->getFont();`
make sure contents are cleared in TextLineBuffer::flush 2013-04-03 08:28:06 +00:00			`// unscaled`
letter_space_tracker 2013-02-05 05:57:11 +00:00			`double cur_letter_space = state->getCharSpace();`
working on state trackers 2013-02-05 06:21:07 +00:00			`double cur_word_space = state->getWordSpace();`
fix width of type 3 fonts; don't show hidden text for type 3 fonts 2013-01-19 12:13:31 +00:00
			`// Writing mode fonts and Type 3 fonts are rendered as images`
			`// I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly`
			`// For type 3 fonts, due to the font matrix, still it's hard to show it on HTML`
			`if( (font == nullptr)`
			`\|\| (font->getWMode())`
			`\|\| (font->getType() == fontType3)`
			`)`
better file structure 2012-08-14 08:23:15 +00:00			`{`
			`return;`
			`}`

			`// see if the line has to be closed due to state change`
			`check_state_change(state);`
working on CSS draw 2012-10-01 17:59:04 +00:00			`prepare_text_line(state);`
better file structure 2012-08-14 08:23:15 +00:00
			`// Now ready to output`
			`// get the unicodes`
			`char *p = s->getCString();`
			`int len = s->getLength();`

			`double dx = 0;`
			`double dy = 0;`
			`double dx1,dy1;`
			`double ox, oy;`

			`int nChars = 0;`
			`int nSpaces = 0;`
			`int uLen;`

			`CharCode code;`
			`Unicode *u = nullptr;`

.. 2012-11-30 09:33:27 +00:00			`while (len > 0)`
			`{`
better file structure 2012-08-14 08:23:15 +00:00			`auto n = font->getNextChar(p, len, &code, &u, &uLen, &dx1, &dy1, &ox, &oy);`
.. 2012-09-17 18:37:30 +00:00
working on util.h 2012-11-29 10:16:05 +00:00			`if(!(equal(ox, 0) && equal(oy, 0)))`
better file structure 2012-08-14 08:23:15 +00:00			`{`
clean code: namespace 2012-08-14 09:13:29 +00:00			`cerr << "TODO: non-zero origins" << endl;`
better file structure 2012-08-14 08:23:15 +00:00			`}`

option 'space-as-offset' 2012-09-07 00:39:21 +00:00			`bool is_space = false;`
.. 2012-08-19 20:50:28 +00:00			`if (n == 1 && *p == ' ')`
			`{`
clean code 2013-04-03 17:35:44 +00:00			`/*`
			`* This is by standard`
			`* however some PDF will use ' ' as a normal encoding slot`
			`* such that it will be mapped to other unicodes`
			`* In that case, when sapce_as_offset is on, we will simply ignore that character...`
			`*`
			`* Checking mapped unicode may or may not work`
			`* There are always ugly PDF files with no usefull info at all.`
			`*/`
option 'space-as-offset' 2012-09-07 00:39:21 +00:00			`is_space = true;`
clean code 2013-04-03 17:35:44 +00:00			`++nSpaces;`
.. 2012-08-19 20:50:28 +00:00			`}`
.. 2012-08-24 06:21:20 +00:00
option 'space-as-offset' 2012-09-07 00:39:21 +00:00			`if(is_space && (param->space_as_offset))`
add option 'decompose-ligature' 2012-09-06 07:09:47 +00:00			`{`
option 'space-as-offset' 2012-09-07 00:39:21 +00:00			`// ignore horiz_scaling, as it's merged in CTM`
separate TextLinebuffer and HTMLRenderer 2013-04-06 08:32:31 +00:00			`text_line_buffers.back()->append_offset((dx1 * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);`
add option 'decompose-ligature' 2012-09-06 07:09:47 +00:00			`}`
			`else`
			`{`
dirty fix:decompose ligatures when tounicode is not used 2012-09-28 09:53:36 +00:00			`if((param->decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))`
option 'space-as-offset' 2012-09-07 00:39:21 +00:00			`{`
separate TextLinebuffer and HTMLRenderer 2013-04-06 08:32:31 +00:00			`text_line_buffers.back()->append_unicodes(u, uLen);`
fix word space 2013-04-05 10:07:37 +00:00			`// TODO: decomposed characters may be not with the same width as the original ligature, need to fix it.`
option 'space-as-offset' 2012-09-07 00:39:21 +00:00			`}`
			`else`
			`{`
fix word space 2013-04-05 10:07:37 +00:00			`Unicode uu;`
refactor state managers 2013-04-04 13:19:28 +00:00			`if(cur_html_state.font_info->use_tounicode)`
don't decompose ligature when tounicode map is not used, need more fix laster 2012-09-28 09:25:12 +00:00			`{`
fix word space 2013-04-05 10:07:37 +00:00			`uu = check_unicode(u, uLen, code, font);`
don't decompose ligature when tounicode map is not used, need more fix laster 2012-09-28 09:25:12 +00:00			`}`
			`else`
			`{`
fix word space 2013-04-05 10:07:37 +00:00			`uu = unicode_from_font(code, font);`
don't decompose ligature when tounicode map is not used, need more fix laster 2012-09-28 09:25:12 +00:00			`}`
separate TextLinebuffer and HTMLRenderer 2013-04-06 08:32:31 +00:00			`text_line_buffers.back()->append_unicodes(&uu, 1);`
fix word space 2013-04-05 10:07:37 +00:00			`/*`
			`* In PDF, word_space is appended if (n == 1 and *p = ' ')`
			`* but in HTML, word_space is appended if (uu == ' ')`
			`*/`
			`int space_count = (is_space ? 1 : 0) - (uu == ' ' ? 1 : 0);`
			`if(space_count != 0)`
separate TextLinebuffer and HTMLRenderer 2013-04-06 08:32:31 +00:00			`text_line_buffers.back()->append_offset(cur_word_space * draw_text_scale * space_count);`
option 'space-as-offset' 2012-09-07 00:39:21 +00:00			`}`
add option 'decompose-ligature' 2012-09-06 07:09:47 +00:00			`}`
better file structure 2012-08-14 08:23:15 +00:00
better positioning 2012-08-23 20:36:27 +00:00			`dx += dx1;`
			`dy += dy1;`
better file structure 2012-08-14 08:23:15 +00:00
			`++nChars;`
			`p += n;`
			`len -= n;`
			`}`

clean code 2012-09-04 04:54:47 +00:00			`double hs = state->getHorizScaling();`

.. 2012-08-21 20:34:39 +00:00			`// horiz_scaling is merged into ctm now,`
leave space for illegal unicodes 2012-08-21 19:44:48 +00:00			`// so the coordinate system is ugly`
clean code 2012-09-04 04:54:47 +00:00			`dx = (dx * cur_font_size + nChars * cur_letter_space + nSpaces * cur_word_space) * hs;`
better file structure 2012-08-14 08:23:15 +00:00
clean code 2012-09-04 04:54:47 +00:00			`dy *= cur_font_size;`
better file structure 2012-08-14 08:23:15 +00:00
			`cur_tx += dx;`
			`cur_ty += dy;`

.. 2013-01-26 11:45:48 +00:00			`draw_tx += dx;`
better file structure 2012-08-14 08:23:15 +00:00			`draw_ty += dy;`
			`}`
add manifest & split-pages 2012-09-12 15:26:14 +00:00
			`} // namespace pdf2htmlEX`