diff --git a/CMakeLists.txt b/CMakeLists.txt index da4b351..42b9ea4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ link_directories ( ${POPPLER_LIBRARY_DIRS} ) find_package(Boost REQUIRED COMPONENTS program_options) include_directories(${Boost_INCLUDE_DIRS}) link_directories ( ${Boost_LIBRARY_DIRS} ) +include_directories(src) set(PDF2HTMLEX_VERSION "0.1") @@ -23,6 +24,9 @@ add_executable(pdf2htmlEX src/pdf2htmlEX.cc src/HTMLRenderer.h src/HTMLRenderer.cc + src/HTMLRenderer/state.cc + src/HTMLRenderer/install.cc + src/HTMLRenderer/export.cc src/BackgroundRenderer.h src/BackgroundRenderer.cc src/Consts.h diff --git a/src/HTMLRenderer.cc b/src/HTMLRenderer.cc index e9b67c7..c1e7b84 100644 --- a/src/HTMLRenderer.cc +++ b/src/HTMLRenderer.cc @@ -165,53 +165,6 @@ void HTMLRenderer::endPage() { html_fout << "" << endl; } -void HTMLRenderer::close_cur_line() -{ - if(line_opened) - { - html_fout << "" << endl; - line_opened = false; - } -} - -void HTMLRenderer::updateAll(GfxState * state) -{ - all_changed = true; - updateTextPos(state); -} - -void HTMLRenderer::updateFont(GfxState * state) -{ - font_changed = true; -} - -void HTMLRenderer::updateTextMat(GfxState * state) -{ - text_mat_changed = true; -} - -void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) -{ - ctm_changed = true; -} - -void HTMLRenderer::updateTextPos(GfxState * state) -{ - text_pos_changed = true; - cur_tx = state->getLineX(); - cur_ty = state->getLineY(); -} - -void HTMLRenderer::updateTextShift(GfxState * state, double shift) -{ - text_pos_changed = true; - cur_tx -= shift * 0.001 * state->getFontSize() * state->getHorizScaling(); -} - -void HTMLRenderer::updateFillColor(GfxState * state) -{ - color_changed = true; -} void HTMLRenderer::drawString(GfxState * state, GooString * s) { @@ -424,600 +377,7 @@ void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int w ++ image_count; } -// The font installation code is stolen from PSOutputDev.cc in poppler -long long HTMLRenderer::install_font(GfxFont * font) -{ - assert(sizeof(long long) == 2*sizeof(int)); - - long long fn_id = (font == nullptr) ? 0 : *reinterpret_cast(font->getID()); - auto iter = font_name_map.find(fn_id); - if(iter != font_name_map.end()) - return iter->second.fn_id; - long long new_fn_id = font_name_map.size(); - font_name_map.insert(std::make_pair(fn_id, FontInfo({new_fn_id}))); - - if(font == nullptr) - { - export_remote_default_font(new_fn_id); - return new_fn_id; - } - - if(param->debug) - { - std::cerr << "Install font: (" << (font->getID()->num) << ' ' << (font->getID()->gen) << ") -> " << boost::format("f%|1$x|")%new_fn_id << std::endl; - } - - if(font->getType() == fontType3) { - std::cerr << "Type 3 fonts are unsupported and will be rendered as Image" << std::endl; - export_remote_default_font(new_fn_id); - return new_fn_id; - } - if(font->getWMode()) { - std::cerr << "Writing mode is unsupported and will be rendered as Image" << std::endl; - export_remote_default_font(new_fn_id); - return new_fn_id; - } - - auto * font_loc = font->locateFont(xref, gTrue); - if(font_loc != nullptr) - { - switch(font_loc -> locType) - { - case gfxFontLocEmbedded: - { - std::string suffix = dump_embedded_font(font, new_fn_id); - if(suffix != "") - { - install_embedded_font(font, suffix, new_fn_id); - } - else - { - export_remote_default_font(new_fn_id); - } - } - break; - case gfxFontLocExternal: - install_external_font(font, new_fn_id); - break; - case gfxFontLocResident: - install_base_font(font, font_loc, new_fn_id); - break; - default: - std::cerr << "TODO: other font loc" << std::endl; - export_remote_default_font(new_fn_id); - break; - } - delete font_loc; - } - else - { - export_remote_default_font(new_fn_id); - } - - return new_fn_id; - -} - -std::string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) -{ - // mupdf consulted - - Object ref_obj, font_obj, font_obj2, fontdesc_obj; - Object obj, obj1, obj2; - Dict * dict = nullptr; - - std::string suffix, subtype; - - char buf[1024]; - int len; - - ofstream outf; - - auto * id = font->getID(); - ref_obj.initRef(id->num, id->gen); - ref_obj.fetch(xref, &font_obj); - ref_obj.free(); - - if(!font_obj.isDict()) - { - std::cerr << "Font object is not a dictionary" << std::endl; - goto err; - } - - dict = font_obj.getDict(); - if(dict->lookup("DescendantFonts", &font_obj2)->isArray()) - { - if(font_obj2.arrayGetLength() == 0) - { - std::cerr << "Warning: empty DescendantFonts array" << std::endl; - } - else - { - if(font_obj2.arrayGetLength() > 1) - std::cerr << "TODO: multiple entries in DescendantFonts array" << std::endl; - - if(font_obj2.arrayGet(0, &obj2)->isDict()) - { - dict = obj2.getDict(); - } - } - } - - if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict()) - { - std::cerr << "Cannot find FontDescriptor " << std::endl; - goto err; - } - - dict = fontdesc_obj.getDict(); - - if(dict->lookup("FontFile3", &obj)->isStream()) - { - if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName()) - { - subtype = obj1.getName(); - if(subtype == "Type1C") - { - suffix = ".cff"; - } - else if (subtype == "CIDFontType0C") - { - suffix = ".cid"; - } - else - { - std::cerr << "Unknown subtype: " << subtype << std::endl; - goto err; - } - } - else - { - std::cerr << "Invalid subtype in font descriptor" << std::endl; - goto err; - } - } - else if (dict->lookup("FontFile2", &obj)->isStream()) - { - suffix = ".ttf"; - } - else if (dict->lookup("FontFile", &obj)->isStream()) - { - suffix = ".ttf"; - } - else - { - std::cerr << "Cannot find FontFile for dump" << std::endl; - goto err; - } - - if(suffix == "") - { - std::cerr << "Font type unrecognized" << std::endl; - goto err; - } - - obj.streamReset(); - outf.open((boost::format("%1%/f%|2$x|%3%")%TMP_DIR%fn_id%suffix).str().c_str(), ofstream::binary); - while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0) - { - outf.write(buf, len); - } - outf.close(); - obj.streamClose(); - -err: - obj2.free(); - obj1.free(); - obj.free(); - - fontdesc_obj.free(); - font_obj2.free(); - font_obj.free(); - return suffix; -} - -void HTMLRenderer::install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id) -{ - // TODO Should use standard way to handle CID fonts - /* - * How it works: - * - * 1.dump the font file directly from the font descriptor and put the glyphs into the correct slots - * - * for nonCID - * nothing need to do - * - * for CID + nonTrueType - * Flatten the font - * - * for CID Truetype - * Just use glyph order, and later we'll map GID (instead of char code) to Unicode - * - * - * 2. map charcode (or GID for CID truetype) to Unicode - * - * generate an encoding file and let fontforge handle it. - */ - - std::string fn = (boost::format("f%|1$x|") % fn_id).str(); - - fontscript_fout << boost::format("Open(\"%1%/%2%%3%\",1)") % TMP_DIR % fn % suffix << endl; - - auto ctu = font->getToUnicode(); - int * code2GID = nullptr; - if(ctu) - { - // TODO: ctu could be CID2Unicode for CID fonts - int maxcode = 0; - - if(!font->isCIDFont()) - { - maxcode = 0xff; - } - else - { - maxcode = 0xffff; - if(suffix != ".ttf") - { - fontscript_fout << "CIDFlatten()" << endl; - } - else - { - fontscript_fout << boost::format("Reencode(\"original\")") << endl; - int len; - // code2GID has been stored for embedded CID fonts - code2GID = dynamic_cast(font)->getCodeToGIDMap(nullptr, &len); - } - } - - if(maxcode > 0) - { - ofstream map_fout((boost::format("%1%/%2%.encoding") % TMP_DIR % fn).str().c_str()); - int cnt = 0; - for(int i = 0; i <= maxcode; ++i) - { - Unicode * u; - auto n = ctu->mapToUnicode(i, &u); - // not sure what to do when n > 1 - if(n > 0) - { - ++cnt; - map_fout << boost::format("0x%|1$X|") % (code2GID ? code2GID[i] : i); - for(int j = 0; j < n; ++j) - map_fout << boost::format(" 0x%|1$X|") % u[j]; - map_fout << boost::format(" # 0x%|1$X|") % i << endl; - } - } - - if(cnt > 0) - { - fontscript_fout << boost::format("LoadEncodingFile(\"%1%/%2%.encoding\", \"%2%\")") % TMP_DIR % fn << endl; - fontscript_fout << boost::format("Reencode(\"%1%\", 1)") % fn << endl; - } - } - - ctu->decRefCnt(); - } - - fontscript_fout << boost::format("Generate(\"%1%.ttf\")") % fn << endl; - - export_remote_font(fn_id, ".ttf", "truetype", font); -} - -void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id) -{ - std::string psname(font_loc->path->getCString()); - string basename = psname.substr(0, psname.find('-')); - string cssfont; - auto iter = BASE_14_FONT_CSS_FONT_MAP.find(basename); - if(iter == BASE_14_FONT_CSS_FONT_MAP.end()) - { - std::cerr << "PS Font: " << basename << " not found in the base 14 font map" << std::endl; - cssfont = ""; - } - else - cssfont = iter->second; - - export_local_font(fn_id, font, psname, cssfont); -} - -void HTMLRenderer::install_external_font( GfxFont * font, long long fn_id) -{ - std::string fontname(font->getName()->getCString()); - - // resolve bad encodings in GB - auto iter = GB_ENCODED_FONT_NAME_MAP.find(fontname); - if(iter != GB_ENCODED_FONT_NAME_MAP.end()) - { - fontname = iter->second; - std::cerr << "Warning: workaround for font names in bad encodings." << std::endl; - } - - export_local_font(fn_id, font, fontname, ""); -} - -long long HTMLRenderer::install_font_size(double font_size) -{ - auto iter = font_size_map.lower_bound(font_size - EPS); - if((iter != font_size_map.end()) && (_equal(iter->first, font_size))) - return iter->second; - - long long new_fs_id = font_size_map.size(); - font_size_map.insert(std::make_pair(font_size, new_fs_id)); - export_font_size(new_fs_id, font_size); - return new_fs_id; -} - -long long HTMLRenderer::install_whitespace(double ws_width, double & actual_width) -{ - auto iter = whitespace_map.lower_bound(ws_width - param->h_eps); - if((iter != whitespace_map.end()) && (std::abs(iter->first - ws_width) < param->h_eps)) - { - actual_width = iter->first; - return iter->second; - } - - actual_width = ws_width; - long long new_ws_id = whitespace_map.size(); - whitespace_map.insert(std::make_pair(ws_width, new_ws_id)); - export_whitespace(new_ws_id, ws_width); - return new_ws_id; -} - -long long HTMLRenderer::install_transform_matrix(const double * tm) -{ - TM m(tm); - auto iter = transform_matrix_map.lower_bound(m); - if((iter != transform_matrix_map.end()) && (m == (iter->first))) - return iter->second; - - long long new_tm_id = transform_matrix_map.size(); - transform_matrix_map.insert(std::make_pair(m, new_tm_id)); - export_transform_matrix(new_tm_id, tm); - return new_tm_id; -} - -long long HTMLRenderer::install_color(const GfxRGB * rgb) -{ - Color c(rgb); - auto iter = color_map.lower_bound(c); - if((iter != color_map.end()) && (c == (iter->first))) - return iter->second; - - long long new_color_id = color_map.size(); - color_map.insert(std::make_pair(c, new_color_id)); - export_color(new_color_id, rgb); - return new_color_id; -} - -void HTMLRenderer::export_remote_font(long long fn_id, const string & suffix, const string & format, GfxFont * font) -{ - allcss_fout << boost::format("@font-face{font-family:f%|1$x|;src:url(f%|1$x|%2%)format(\"%3%\");}.f%|1$x|{font-family:f%|1$x|;") % fn_id % suffix % format; - - double a = font->getAscent(); - double d = font->getDescent(); - double r = _is_positive(a-d) ? (a/(a-d)) : 1.0; - - for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) - { - allcss_fout << prefix << "transform-origin:0% " << (r*100.0) << "%;"; - } - - allcss_fout << "line-height:" << (a-d) << ";"; - - allcss_fout << "}" << endl; -} - -// TODO: this function is called when some font is unable to process, may use the name there as a hint -void HTMLRenderer::export_remote_default_font(long long fn_id) -{ - allcss_fout << boost::format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id << endl; -} - -void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, const string & original_font_name, const string & cssfont) -{ - allcss_fout << boost::format(".f%|1$x|{") % fn_id; - allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";"; - - if(font->isBold()) - allcss_fout << "font-weight:bold;"; - - if(boost::algorithm::ifind_first(original_font_name, "oblique")) - allcss_fout << "font-style:oblique;"; - else if(font->isItalic()) - allcss_fout << "font-style:italic;"; - - double a = font->getAscent(); - double d = font->getDescent(); - double r = _is_positive(a-d) ? (a/(a-d)) : 1.0; - - for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) - { - allcss_fout << prefix << "transform-origin:0% " << (r*100.0) << "%;"; - } - - allcss_fout << "line-height:" << (a-d) << ";"; - - allcss_fout << "}" << endl; -} - -std::string HTMLRenderer::general_font_family(GfxFont * font) -{ - if(font -> isFixedWidth()) - return "monospace"; - else if (font -> isSerif()) - return "serif"; - else - return "sans-serif"; -} - -void HTMLRenderer::export_font_size (long long fs_id, double font_size) -{ - allcss_fout << boost::format(".s%|1$x|{font-size:%2%px;}") % fs_id % font_size << endl; -} - -void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) -{ - allcss_fout << boost::format(".w%|1$x|{width:%2%px;}") % ws_id % ws_width << endl; -} - -void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) -{ - allcss_fout << boost::format(".t%|1$x|{") % tm_id; - - // TODO: recognize common matices - if(_tm_equal(tm, id_matrix)) - { - // no need to output anything - } - else - { - for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) - { - // PDF use a different coordinate system from Web - allcss_fout << prefix << "transform:matrix(" - << tm[0] << ',' - << -tm[1] << ',' - << -tm[2] << ',' - << tm[3] << ','; - - if(prefix == "-moz-") - allcss_fout << boost::format("%1%px,%2%px);") % tm[4] % -tm[5]; - else - allcss_fout << boost::format("%1%,%2%);") % tm[4] % -tm[5]; - } - } - allcss_fout << "}" << endl; -} - -void HTMLRenderer::export_color(long long color_id, const GfxRGB * rgb) -{ - allcss_fout << boost::format(".c%|1$x|{color:rgb(%2%,%3%,%4%);}") - % color_id % (int)colToByte(rgb->r) % (int)colToByte(rgb->g) % (int)colToByte(rgb->b) - << endl; -} - -void HTMLRenderer::check_state_change(GfxState * state) -{ - bool close_line = false; - - if(all_changed || text_pos_changed) - { - if(!(std::abs(cur_ty - draw_ty) * draw_scale < param->v_eps)) - { - close_line = true; - draw_ty = cur_ty; - draw_tx = cur_tx; - } - } - - // TODO, we may use nested span if only color has been changed - if(all_changed || color_changed) - { - GfxRGB new_color; - state->getFillRGB(&new_color); - if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b))) - { - close_line = true; - cur_color = new_color; - cur_color_id = install_color(&new_color); - } - } - - bool need_rescale_font = false; - if(all_changed || font_changed) - { - long long new_fn_id = install_font(state->getFont()); - - if(!(new_fn_id == cur_fn_id)) - { - close_line = true; - cur_fn_id = new_fn_id; - } - - if(!_equal(cur_font_size, state->getFontSize())) - { - cur_font_size = state->getFontSize(); - need_rescale_font = true; - } - } - - // TODO - // Rise, HorizScale etc - if(all_changed || text_mat_changed || ctm_changed) - { - double new_ctm[6]; - double * m1 = state->getCTM(); - double * m2 = state->getTextMat(); - new_ctm[0] = m1[0] * m2[0] + m1[2] * m2[1]; - new_ctm[1] = m1[1] * m2[0] + m1[3] * m2[1]; - new_ctm[2] = m1[0] * m2[2] + m1[2] * m2[3]; - new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3]; - new_ctm[4] = new_ctm[5] = 0; - - if(!_tm_equal(new_ctm, cur_ctm)) - { - need_rescale_font = true; - memcpy(cur_ctm, new_ctm, sizeof(cur_ctm)); - } - } - - if(need_rescale_font) - { - double new_draw_ctm[6]; - memcpy(new_draw_ctm, cur_ctm, sizeof(new_draw_ctm)); - - draw_scale = std::sqrt(new_draw_ctm[2] * new_draw_ctm[2] + new_draw_ctm[3] * new_draw_ctm[3]); - - double new_draw_font_size = cur_font_size; - if(_is_positive(draw_scale)) - { - new_draw_font_size *= draw_scale; - for(int i = 0; i < 4; ++i) - new_draw_ctm[i] /= draw_scale; - } - else - { - draw_scale = 1.0; - } - - if(!(_equal(new_draw_font_size, draw_font_size))) - { - draw_font_size = new_draw_font_size; - cur_fs_id = install_font_size(draw_font_size); - close_line = true; - } - if(!(_tm_equal(new_draw_ctm, draw_ctm))) - { - memcpy(draw_ctm, new_draw_ctm, sizeof(draw_ctm)); - cur_tm_id = install_transform_matrix(draw_ctm); - close_line = true; - } - } - - // TODO: track these - /* - if(!(_equal(s1->getCharSpace(), s2->getCharSpace()) && _equal(s1->getWordSpace(), s2->getWordSpace()) - && _equal(s1->getHorizScaling(), s2->getHorizScaling()))) - return false; - */ - - reset_state_track(); - - if(close_line) - close_cur_line(); -} - -void HTMLRenderer::reset_state_track() -{ - all_changed = false; - text_pos_changed = false; - ctm_changed = false; - text_mat_changed = false; - font_changed = false; - color_changed = false; -} diff --git a/src/HTMLRenderer.h b/src/HTMLRenderer.h index e362489..037df8f 100644 --- a/src/HTMLRenderer.h +++ b/src/HTMLRenderer.h @@ -16,10 +16,7 @@ #include #include -#include #include -#include -#include #include #include #include @@ -99,9 +96,8 @@ class HTMLRenderer : public OutputDev protected: void close_cur_line(); - // return the mapped font name - long long install_font(GfxFont * font); + long long install_font(GfxFont * font); static void output_to_file(void * outf, const char * data, int len); std::string dump_embedded_font (GfxFont * font, long long fn_id); @@ -122,8 +118,6 @@ class HTMLRenderer : public OutputDev void export_remote_font(long long fn_id, const string & suffix, const string & format, GfxFont * font); void export_remote_default_font(long long fn_id); void export_local_font(long long fn_id, GfxFont * font, const string & original_font_name, const string & cssfont); - std::string general_font_family(GfxFont * font); - void export_font_size(long long fs_id, double font_size); void export_whitespace(long long ws_id, double ws_width); void export_transform_matrix(long long tm_id, const double * tm); @@ -168,7 +162,7 @@ class HTMLRenderer : public OutputDev GfxRGB cur_color; bool color_changed; - // optmize for web + // optimize for web // we try to render the final font size directly // to reduce the effect of ctm as much as possible diff --git a/src/HTMLRenderer/export.cc b/src/HTMLRenderer/export.cc new file mode 100644 index 0000000..f326a46 --- /dev/null +++ b/src/HTMLRenderer/export.cc @@ -0,0 +1,122 @@ +/* + * export.cc + * + * Export styles to HTML + * + * by WangLu + * 2012.08.14 + */ + +#include "HTMLRenderer.h" + +#include +#include + +void HTMLRenderer::export_remote_font(long long fn_id, const string & suffix, const string & format, GfxFont * font) +{ + allcss_fout << boost::format("@font-face{font-family:f%|1$x|;src:url(f%|1$x|%2%)format(\"%3%\");}.f%|1$x|{font-family:f%|1$x|;") % fn_id % suffix % format; + + double a = font->getAscent(); + double d = font->getDescent(); + double r = _is_positive(a-d) ? (a/(a-d)) : 1.0; + + for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) + { + allcss_fout << prefix << "transform-origin:0% " << (r*100.0) << "%;"; + } + + allcss_fout << "line-height:" << (a-d) << ";"; + + allcss_fout << "}" << endl; +} + +static std::string general_font_family(GfxFont * font) +{ + if(font -> isFixedWidth()) + return "monospace"; + else if (font -> isSerif()) + return "serif"; + else + return "sans-serif"; +} + +// TODO: this function is called when some font is unable to process, may use the name there as a hint +void HTMLRenderer::export_remote_default_font(long long fn_id) +{ + allcss_fout << boost::format(".f%|1$x|{font-family:sans-serif;color:transparent;visibility:hidden;}")%fn_id << endl; +} + +void HTMLRenderer::export_local_font(long long fn_id, GfxFont * font, const string & original_font_name, const string & cssfont) +{ + allcss_fout << boost::format(".f%|1$x|{") % fn_id; + allcss_fout << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";"; + + if(font->isBold()) + allcss_fout << "font-weight:bold;"; + + if(boost::algorithm::ifind_first(original_font_name, "oblique")) + allcss_fout << "font-style:oblique;"; + else if(font->isItalic()) + allcss_fout << "font-style:italic;"; + + double a = font->getAscent(); + double d = font->getDescent(); + double r = _is_positive(a-d) ? (a/(a-d)) : 1.0; + + for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) + { + allcss_fout << prefix << "transform-origin:0% " << (r*100.0) << "%;"; + } + + allcss_fout << "line-height:" << (a-d) << ";"; + + allcss_fout << "}" << endl; +} + + +void HTMLRenderer::export_font_size (long long fs_id, double font_size) +{ + allcss_fout << boost::format(".s%|1$x|{font-size:%2%px;}") % fs_id % font_size << endl; +} + +void HTMLRenderer::export_whitespace (long long ws_id, double ws_width) +{ + allcss_fout << boost::format(".w%|1$x|{width:%2%px;}") % ws_id % ws_width << endl; +} + +void HTMLRenderer::export_transform_matrix (long long tm_id, const double * tm) +{ + allcss_fout << boost::format(".t%|1$x|{") % tm_id; + + // TODO: recognize common matices + if(_tm_equal(tm, id_matrix)) + { + // no need to output anything + } + else + { + for(const std::string & prefix : {"", "-ms-", "-moz-", "-webkit-", "-o-"}) + { + // PDF use a different coordinate system from Web + allcss_fout << prefix << "transform:matrix(" + << tm[0] << ',' + << -tm[1] << ',' + << -tm[2] << ',' + << tm[3] << ','; + + if(prefix == "-moz-") + allcss_fout << boost::format("%1%px,%2%px);") % tm[4] % -tm[5]; + else + allcss_fout << boost::format("%1%,%2%);") % tm[4] % -tm[5]; + } + } + allcss_fout << "}" << endl; +} + +void HTMLRenderer::export_color(long long color_id, const GfxRGB * rgb) +{ + allcss_fout << boost::format(".c%|1$x|{color:rgb(%2%,%3%,%4%);}") + % color_id % (int)colToByte(rgb->r) % (int)colToByte(rgb->g) % (int)colToByte(rgb->b) + << endl; +} + diff --git a/src/HTMLRenderer/install.cc b/src/HTMLRenderer/install.cc new file mode 100644 index 0000000..1ece909 --- /dev/null +++ b/src/HTMLRenderer/install.cc @@ -0,0 +1,385 @@ +/* + * install.cc + * + * maintaining all known styles + * + * by WangLu + * 2012.08.14 + */ + +#include + +#include + +#include + +#include "HTMLRenderer.h" + +long long HTMLRenderer::install_font(GfxFont * font) +{ + assert(sizeof(long long) == 2*sizeof(int)); + + long long fn_id = (font == nullptr) ? 0 : *reinterpret_cast(font->getID()); + + auto iter = font_name_map.find(fn_id); + if(iter != font_name_map.end()) + return iter->second.fn_id; + + long long new_fn_id = font_name_map.size(); + + font_name_map.insert(std::make_pair(fn_id, FontInfo({new_fn_id}))); + + if(font == nullptr) + { + export_remote_default_font(new_fn_id); + return new_fn_id; + } + + if(param->debug) + { + std::cerr << "Install font: (" << (font->getID()->num) << ' ' << (font->getID()->gen) << ") -> " << boost::format("f%|1$x|")%new_fn_id << std::endl; + } + + if(font->getType() == fontType3) { + std::cerr << "Type 3 fonts are unsupported and will be rendered as Image" << std::endl; + export_remote_default_font(new_fn_id); + return new_fn_id; + } + if(font->getWMode()) { + std::cerr << "Writing mode is unsupported and will be rendered as Image" << std::endl; + export_remote_default_font(new_fn_id); + return new_fn_id; + } + + auto * font_loc = font->locateFont(xref, gTrue); + if(font_loc != nullptr) + { + switch(font_loc -> locType) + { + case gfxFontLocEmbedded: + { + std::string suffix = dump_embedded_font(font, new_fn_id); + if(suffix != "") + { + install_embedded_font(font, suffix, new_fn_id); + } + else + { + export_remote_default_font(new_fn_id); + } + } + break; + case gfxFontLocExternal: + install_external_font(font, new_fn_id); + break; + case gfxFontLocResident: + install_base_font(font, font_loc, new_fn_id); + break; + default: + std::cerr << "TODO: other font loc" << std::endl; + export_remote_default_font(new_fn_id); + break; + } + delete font_loc; + } + else + { + export_remote_default_font(new_fn_id); + } + + return new_fn_id; + +} + +std::string HTMLRenderer::dump_embedded_font (GfxFont * font, long long fn_id) +{ + // mupdf consulted + + Object ref_obj, font_obj, font_obj2, fontdesc_obj; + Object obj, obj1, obj2; + Dict * dict = nullptr; + + std::string suffix, subtype; + + char buf[1024]; + int len; + + ofstream outf; + + auto * id = font->getID(); + ref_obj.initRef(id->num, id->gen); + ref_obj.fetch(xref, &font_obj); + ref_obj.free(); + + if(!font_obj.isDict()) + { + std::cerr << "Font object is not a dictionary" << std::endl; + goto err; + } + + dict = font_obj.getDict(); + if(dict->lookup("DescendantFonts", &font_obj2)->isArray()) + { + if(font_obj2.arrayGetLength() == 0) + { + std::cerr << "Warning: empty DescendantFonts array" << std::endl; + } + else + { + if(font_obj2.arrayGetLength() > 1) + std::cerr << "TODO: multiple entries in DescendantFonts array" << std::endl; + + if(font_obj2.arrayGet(0, &obj2)->isDict()) + { + dict = obj2.getDict(); + } + } + } + + if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict()) + { + std::cerr << "Cannot find FontDescriptor " << std::endl; + goto err; + } + + dict = fontdesc_obj.getDict(); + + if(dict->lookup("FontFile3", &obj)->isStream()) + { + if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName()) + { + subtype = obj1.getName(); + if(subtype == "Type1C") + { + suffix = ".cff"; + } + else if (subtype == "CIDFontType0C") + { + suffix = ".cid"; + } + else + { + std::cerr << "Unknown subtype: " << subtype << std::endl; + goto err; + } + } + else + { + std::cerr << "Invalid subtype in font descriptor" << std::endl; + goto err; + } + } + else if (dict->lookup("FontFile2", &obj)->isStream()) + { + suffix = ".ttf"; + } + else if (dict->lookup("FontFile", &obj)->isStream()) + { + suffix = ".ttf"; + } + else + { + std::cerr << "Cannot find FontFile for dump" << std::endl; + goto err; + } + + if(suffix == "") + { + std::cerr << "Font type unrecognized" << std::endl; + goto err; + } + + obj.streamReset(); + outf.open((boost::format("%1%/f%|2$x|%3%")%TMP_DIR%fn_id%suffix).str().c_str(), ofstream::binary); + while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0) + { + outf.write(buf, len); + } + outf.close(); + obj.streamClose(); + +err: + obj2.free(); + obj1.free(); + obj.free(); + + fontdesc_obj.free(); + font_obj2.free(); + font_obj.free(); + return suffix; +} + +void HTMLRenderer::install_embedded_font(GfxFont * font, const std::string & suffix, long long fn_id) +{ + // TODO Should use standard way to handle CID fonts + /* + * How it works: + * + * 1.dump the font file directly from the font descriptor and put the glyphs into the correct slots + * + * for nonCID + * nothing need to do + * + * for CID + nonTrueType + * Flatten the font + * + * for CID Truetype + * Just use glyph order, and later we'll map GID (instead of char code) to Unicode + * + * + * 2. map charcode (or GID for CID truetype) to Unicode + * + * generate an encoding file and let fontforge handle it. + */ + + std::string fn = (boost::format("f%|1$x|") % fn_id).str(); + + fontscript_fout << boost::format("Open(\"%1%/%2%%3%\",1)") % TMP_DIR % fn % suffix << endl; + + auto ctu = font->getToUnicode(); + int * code2GID = nullptr; + if(ctu) + { + // TODO: ctu could be CID2Unicode for CID fonts + int maxcode = 0; + + if(!font->isCIDFont()) + { + maxcode = 0xff; + } + else + { + maxcode = 0xffff; + if(suffix != ".ttf") + { + fontscript_fout << "CIDFlatten()" << endl; + } + else + { + fontscript_fout << boost::format("Reencode(\"original\")") << endl; + int len; + // code2GID has been stored for embedded CID fonts + code2GID = dynamic_cast(font)->getCodeToGIDMap(nullptr, &len); + } + } + + if(maxcode > 0) + { + ofstream map_fout((boost::format("%1%/%2%.encoding") % TMP_DIR % fn).str().c_str()); + int cnt = 0; + for(int i = 0; i <= maxcode; ++i) + { + Unicode * u; + auto n = ctu->mapToUnicode(i, &u); + // not sure what to do when n > 1 + if(n > 0) + { + ++cnt; + map_fout << boost::format("0x%|1$X|") % (code2GID ? code2GID[i] : i); + for(int j = 0; j < n; ++j) + map_fout << boost::format(" 0x%|1$X|") % u[j]; + map_fout << boost::format(" # 0x%|1$X|") % i << endl; + } + } + + if(cnt > 0) + { + fontscript_fout << boost::format("LoadEncodingFile(\"%1%/%2%.encoding\", \"%2%\")") % TMP_DIR % fn << endl; + fontscript_fout << boost::format("Reencode(\"%1%\", 1)") % fn << endl; + } + } + + ctu->decRefCnt(); + } + + fontscript_fout << boost::format("Generate(\"%1%.ttf\")") % fn << endl; + + export_remote_font(fn_id, ".ttf", "truetype", font); +} + +void HTMLRenderer::install_base_font(GfxFont * font, GfxFontLoc * font_loc, long long fn_id) +{ + std::string psname(font_loc->path->getCString()); + string basename = psname.substr(0, psname.find('-')); + string cssfont; + auto iter = BASE_14_FONT_CSS_FONT_MAP.find(basename); + if(iter == BASE_14_FONT_CSS_FONT_MAP.end()) + { + std::cerr << "PS Font: " << basename << " not found in the base 14 font map" << std::endl; + cssfont = ""; + } + else + cssfont = iter->second; + + export_local_font(fn_id, font, psname, cssfont); +} + +void HTMLRenderer::install_external_font( GfxFont * font, long long fn_id) +{ + std::string fontname(font->getName()->getCString()); + + // resolve bad encodings in GB + auto iter = GB_ENCODED_FONT_NAME_MAP.find(fontname); + if(iter != GB_ENCODED_FONT_NAME_MAP.end()) + { + fontname = iter->second; + std::cerr << "Warning: workaround for font names in bad encodings." << std::endl; + } + + export_local_font(fn_id, font, fontname, ""); +} + +long long HTMLRenderer::install_font_size(double font_size) +{ + auto iter = font_size_map.lower_bound(font_size - EPS); + if((iter != font_size_map.end()) && (_equal(iter->first, font_size))) + return iter->second; + + long long new_fs_id = font_size_map.size(); + font_size_map.insert(std::make_pair(font_size, new_fs_id)); + export_font_size(new_fs_id, font_size); + return new_fs_id; +} + +long long HTMLRenderer::install_whitespace(double ws_width, double & actual_width) +{ + auto iter = whitespace_map.lower_bound(ws_width - param->h_eps); + if((iter != whitespace_map.end()) && (std::abs(iter->first - ws_width) < param->h_eps)) + { + actual_width = iter->first; + return iter->second; + } + + actual_width = ws_width; + long long new_ws_id = whitespace_map.size(); + whitespace_map.insert(std::make_pair(ws_width, new_ws_id)); + export_whitespace(new_ws_id, ws_width); + return new_ws_id; +} + +long long HTMLRenderer::install_transform_matrix(const double * tm) +{ + TM m(tm); + auto iter = transform_matrix_map.lower_bound(m); + if((iter != transform_matrix_map.end()) && (m == (iter->first))) + return iter->second; + + long long new_tm_id = transform_matrix_map.size(); + transform_matrix_map.insert(std::make_pair(m, new_tm_id)); + export_transform_matrix(new_tm_id, tm); + return new_tm_id; +} + +long long HTMLRenderer::install_color(const GfxRGB * rgb) +{ + Color c(rgb); + auto iter = color_map.lower_bound(c); + if((iter != color_map.end()) && (c == (iter->first))) + return iter->second; + + long long new_color_id = color_map.size(); + color_map.insert(std::make_pair(c, new_color_id)); + export_color(new_color_id, rgb); + return new_color_id; +} + diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc new file mode 100644 index 0000000..c575da7 --- /dev/null +++ b/src/HTMLRenderer/state.cc @@ -0,0 +1,178 @@ +/* + * state.cc + * + * track the current state + * + * by WangLu + * 2012.08.14 + */ + + +#include "HTMLRenderer.h" + +void HTMLRenderer::check_state_change(GfxState * state) +{ + bool close_line = false; + + if(all_changed || text_pos_changed) + { + if(!(std::abs(cur_ty - draw_ty) * draw_scale < param->v_eps)) + { + close_line = true; + draw_ty = cur_ty; + draw_tx = cur_tx; + } + } + + // TODO, we may use nested span if only color has been changed + if(all_changed || color_changed) + { + GfxRGB new_color; + state->getFillRGB(&new_color); + if(!((new_color.r == cur_color.r) && (new_color.g == cur_color.g) && (new_color.b == cur_color.b))) + { + close_line = true; + cur_color = new_color; + cur_color_id = install_color(&new_color); + } + } + + bool need_rescale_font = false; + if(all_changed || font_changed) + { + long long new_fn_id = install_font(state->getFont()); + + if(!(new_fn_id == cur_fn_id)) + { + close_line = true; + cur_fn_id = new_fn_id; + } + + if(!_equal(cur_font_size, state->getFontSize())) + { + cur_font_size = state->getFontSize(); + need_rescale_font = true; + } + } + + // TODO + // Rise, HorizScale etc + if(all_changed || text_mat_changed || ctm_changed) + { + double new_ctm[6]; + double * m1 = state->getCTM(); + double * m2 = state->getTextMat(); + new_ctm[0] = m1[0] * m2[0] + m1[2] * m2[1]; + new_ctm[1] = m1[1] * m2[0] + m1[3] * m2[1]; + new_ctm[2] = m1[0] * m2[2] + m1[2] * m2[3]; + new_ctm[3] = m1[1] * m2[2] + m1[3] * m2[3]; + new_ctm[4] = new_ctm[5] = 0; + + if(!_tm_equal(new_ctm, cur_ctm)) + { + need_rescale_font = true; + memcpy(cur_ctm, new_ctm, sizeof(cur_ctm)); + } + } + + if(need_rescale_font) + { + double new_draw_ctm[6]; + memcpy(new_draw_ctm, cur_ctm, sizeof(new_draw_ctm)); + + draw_scale = std::sqrt(new_draw_ctm[2] * new_draw_ctm[2] + new_draw_ctm[3] * new_draw_ctm[3]); + + double new_draw_font_size = cur_font_size; + if(_is_positive(draw_scale)) + { + new_draw_font_size *= draw_scale; + for(int i = 0; i < 4; ++i) + new_draw_ctm[i] /= draw_scale; + } + else + { + draw_scale = 1.0; + } + + if(!(_equal(new_draw_font_size, draw_font_size))) + { + draw_font_size = new_draw_font_size; + cur_fs_id = install_font_size(draw_font_size); + close_line = true; + } + if(!(_tm_equal(new_draw_ctm, draw_ctm))) + { + memcpy(draw_ctm, new_draw_ctm, sizeof(draw_ctm)); + cur_tm_id = install_transform_matrix(draw_ctm); + close_line = true; + } + } + + // TODO: track these + /* + if(!(_equal(s1->getCharSpace(), s2->getCharSpace()) && _equal(s1->getWordSpace(), s2->getWordSpace()) + && _equal(s1->getHorizScaling(), s2->getHorizScaling()))) + return false; + */ + + reset_state_track(); + + if(close_line) + close_cur_line(); +} +void HTMLRenderer::reset_state_track() +{ + all_changed = false; + text_pos_changed = false; + ctm_changed = false; + text_mat_changed = false; + font_changed = false; + color_changed = false; +} +void HTMLRenderer::close_cur_line() +{ + if(line_opened) + { + html_fout << "" << endl; + line_opened = false; + } +} + +void HTMLRenderer::updateAll(GfxState * state) +{ + all_changed = true; + updateTextPos(state); +} + +void HTMLRenderer::updateFont(GfxState * state) +{ + font_changed = true; +} + +void HTMLRenderer::updateTextMat(GfxState * state) +{ + text_mat_changed = true; +} + +void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) +{ + ctm_changed = true; +} + +void HTMLRenderer::updateTextPos(GfxState * state) +{ + text_pos_changed = true; + cur_tx = state->getLineX(); + cur_ty = state->getLineY(); +} + +void HTMLRenderer::updateTextShift(GfxState * state, double shift) +{ + text_pos_changed = true; + cur_tx -= shift * 0.001 * state->getFontSize() * state->getHorizScaling(); +} + +void HTMLRenderer::updateFillColor(GfxState * state) +{ + color_changed = true; +} diff --git a/src/util.h b/src/util.h index 8d9af9e..54fe828 100644 --- a/src/util.h +++ b/src/util.h @@ -17,6 +17,13 @@ #include "Consts.h" +// mute gcc +namespace +{ + template + void dummy1(){ auto _ = &mapUCS2; } +} + static inline bool _equal(double x, double y) { return std::abs(x-y) < EPS; } static inline bool _is_positive(double x) { return x > EPS; } static inline bool _tm_equal(const double * tm1, const double * tm2, int size = 6)