UNPKG

epubinator

Version:

NPM package to generate EPUB files from a URL

199 lines (198 loc) 6.61 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.generateLink = exports.getMain = exports.removeTitle = exports.getTitle = exports.getBodyHtmlFromDom = exports.removeToc = exports.getArticle = exports.getDom = void 0; var tslib_1 = require("tslib"); var node_fetch_1 = require("node-fetch"); var jsdom_1 = require("jsdom"); var jsdom_2 = require("./util/jsdom"); var logger_1 = require("./logger"); var url_1 = require("./url"); var ramda_1 = require("ramda"); /** * getDom * * @param url * @returns Promise<JSDOM> */ function getDom(url) { return tslib_1.__awaiter(this, void 0, void 0, function () { var response, html, dom, pictures, images; return tslib_1.__generator(this, function (_a) { switch (_a.label) { case 0: return [4 /*yield*/, (0, node_fetch_1.default)(url)]; case 1: response = _a.sent(); return [4 /*yield*/, response.text()]; case 2: html = _a.sent(); dom = new jsdom_1.JSDOM(html); pictures = Array.from(dom.window.document.documentElement.querySelectorAll('picture')); replacePicturesWithImages(pictures); images = Array.from(dom.window.document.documentElement.querySelectorAll('img')); (0, ramda_1.compose)(withoutAttributes(['srcset', 'loading']), linkImgSrcToCompleteLink(url))(images); removeSvgImg(images); return [2 /*return*/, dom]; } }); }); } exports.getDom = getDom; var removeSvgImg = function (images) { images.forEach(function (image) { return /data:image\/svg/i.test(image.src) && image.remove(); }); }; var replacePicturesWithImages = function (pictures) { pictures.forEach(function (picture) { var image = picture.querySelector('img'); var parentNode = picture.parentNode; parentNode.insertBefore(image.cloneNode(true), picture); picture.remove(); }); }; var withoutAttributes = function (attributes) { return function (images) { var removeFrom = function (image) { return function (attribute) { return image.removeAttribute(attribute); }; }; images.forEach(function (image) { attributes.forEach(removeFrom(image)); }); 
return images; }; }; /** * getArticle * * @param dom * @param context * @returns JSDOM object */ function getArticle(dom, context) { if (context === void 0) { context = {}; } var document = (0, jsdom_2.getDocument)(dom); var article = document.querySelector('article') || document.querySelector('body'); if (!article) { throw new Error("cannot find article. describe the article explicitly ".concat(JSON.stringify(context))); } return new jsdom_1.JSDOM(article.outerHTML); } exports.getArticle = getArticle; function removeToc(dom) { var document = (0, jsdom_2.getDocument)(dom); var toc = document.querySelector('#toc') || document.querySelector('aside'); if (toc) toc.remove(); return new jsdom_1.JSDOM(document.documentElement.outerHTML); } exports.removeToc = removeToc; /** * getBodyHtmlFromDom * * @param {JSDOM} dom * @returns {string} */ function getBodyHtmlFromDom(dom) { if (!dom) return ''; return dom.window.document.querySelector('body').innerHTML; } exports.getBodyHtmlFromDom = getBodyHtmlFromDom; /** * getTitle * * @param {JSDOM} dom * @param {ContextType} context={} * @returns {string} */ function getTitle(dom, context) { if (context === void 0) { context = {}; } var document = (0, jsdom_2.getDocument)(dom); var titleElement = document.querySelector('h1'); if (!titleElement) { console.log("cannot find title at ".concat(context.url)); return ''; } return titleElement.innerHTML; } exports.getTitle = getTitle; /** * removeTitle * * @param {JSDOM} dom * @returns {JSDOM} */ function removeTitle(dom) { // TODO: immutability var document = (0, jsdom_2.getDocument)(dom); var titleElement = document.querySelector('h1'); if (!titleElement) { return dom; } titleElement.remove(); return dom; } exports.removeTitle = removeTitle; function getFallbackTitleContent(dom) { var title = (0, jsdom_2.getDocument)(dom).querySelector('h1') || (0, jsdom_2.getDocument)(dom).querySelector('h2'); return title === null || title === void 0 ? 
void 0 : title.parentElement; } /** * getMain * * @param {JSDOM} dom * @param {ContextType} context={} * @returns {JSDOM} */ function getMain(dom, context) { if (context === void 0) { context = {}; } var document = (0, jsdom_2.getDocument)(dom); var fallbackTitleContent = getFallbackTitleContent(dom); var main = document.querySelector('main') || document.querySelector('.content') || document.querySelector("[role='main']") || document.querySelector('#main') || fallbackTitleContent || document.querySelector('body'); if (!main) { throw new Error("cannot find main. describe the main explicitly at ".concat(context.url, " ").concat(dom.window.document.documentElement.outerHTML)); } return new jsdom_1.JSDOM(main.outerHTML); } exports.getMain = getMain; var linkImgSrcToCompleteLink = function (url) { return function (images) { Array.from(images).forEach(function (image) { if ((0, url_1.isAbsoluteHref)(image.src)) { image.src = (0, url_1.absoluteToLink)({ url: url })(image.src); } return image; }); return images; }; }; /** * generateLink * * @param {string} origin * @param {string} link * @returns {string} */ function generateLink(url, link) { if (!link) return; (0, logger_1.log)((0, logger_1.info)('Generating link at'), (0, logger_1.success)('origin:'), (0, logger_1.emphasizedInfo)((0, url_1.origin)(url)), logger_1.lineBreak, (0, logger_1.success)('link:'), (0, logger_1.emphasizedInfo)(link)); (0, logger_1.log)((0, logger_1.error)(url)); try { new URL(link); return link; } catch (e) { if ((0, url_1.isAbsoluteHref)(link)) { return (0, url_1.absoluteToLink)({ url: url })(link); } return "".concat(stripCurrentPageFromUrl(url), "/").concat(link); } } exports.generateLink = generateLink; var stripCurrentPageFromUrl = function (url) { return (url // .replace(/(^\w+:|^)\/\//, '') .split('/') .slice(0, -1) .join('/')); };