// jokkerr
// Version:
// Node package and CLI tool for saving a web page as a single HTML file
// 518 lines • 20.7 kB
// JavaScript
;
/**
 * Drives a generator function to completion and exposes the result as a
 * promise, so that `yield` inside the generator behaves like `await`.
 * This appears to be the standard helper emitted by the TypeScript compiler
 * for down-levelled async functions; an already-present `this.__awaiter` is
 * reused instead of defining a new one.
 *
 * @param {*} thisArg `this` value the generator function is applied with
 * @param {Array} _arguments arguments forwarded to the generator function (an empty list when falsy)
 * @param {Function} P promise constructor to use; falls back to the global `Promise` when falsy
 * @param {Function} generator generator function whose yielded values are awaited one after another
 * @return {Promise} resolves with the generator's return value, rejects when the generator throws
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap anything that is not already an instance of P so `.then` can be called on it.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-raise a rejection inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Either finish with the return value or wait for the next yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and kick off the first iteration.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.HTML = void 0;
const utils_1 = require("./utils");
const css_1 = require("./css");
const uri_1 = require("./uri");
const jsdom_1 = require("jsdom");
/**
 * Rewrites a fetched HTML page so that it no longer depends on remote
 * resources: a restrictive Content-Security-Policy is injected, the
 * configured element kinds are stripped, URLs are made absolute and the
 * remaining sub-resources (styles, scripts, embeds, media) are inlined.
 *
 * Relies on the module-level imports `jsdom_1.JSDOM`, `css_1.CSS`,
 * `uri_1.URI` and the helpers exposed by `utils_1`.
 *
 * @see https://html.spec.whatwg.org/multipage/semantics.html
 */
class HTML {
    /**
     * @param {object} [opt] feature switches; this class reads `disableJS`,
     *   `disableCSS`, `disableEmbeds` and `disableMedias` (each compared
     *   against `true`)
     */
    constructor(opt = {}) {
        this.opt = opt;
        this.rx = {
            // The first three expressions are only used with `.test()`. They
            // deliberately carry no `g` flag: a global regex keeps `lastIndex`
            // between calls and would report false negatives on every other use.
            lazyImageSrc: /^\s*\S+(jpg|jpeg|png|webp|gif)\S*\s*$/m,
            lazyImageSrcset: /(jpg|jpeg|png|webp|gif)\s+\d/m,
            // Exclude SVG, because SVG can have a meaningful image in under 133 bytes.
            B64DataURL: /data:(?!(image\/svg\+xml)).*?(;(.*?)),/m,
            // Needs `g`: consumed through `matchAll` and `replace`.
            srcsetURL: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/gm,
        };
    }

    /**
     * Process assets within a webpage.
     *
     * @param {{content: *, uri: *}} page `content` is handed to JSDOM as the
     *   markup to parse, `uri` is used as the base URL for every sub-resource
     * @return {Promise<string>} the serialized, rewritten document
     * @api public
     */
    async process(page) {
        const { content, uri } = page;
        const dom = new jsdom_1.JSDOM(content);
        const doc = dom.window.document;

        // Prepare the document by doing these steps:
        // - Set Content-Security-Policy to make sure no unwanted request happens
        // - Apply configuration to the document
        // - Replace all noscript elements with divs, so their content is processed as well
        // - Remove all comments
        // - Convert data-src and data-srcset attributes of lazy images to src and srcset
        // - Convert relative URLs into absolute URLs
        // - Remove the subresource integrity attribute from links
        // - Convert Open Graph metadata
        this.setContentSecurityPolicy(doc);
        this.applyConfiguration(doc);
        this.convertNoScriptToDiv(doc, true);
        this.removeComments(doc);
        this.convertLazyImageAttrs(doc);
        this.convertRelativeURLs(doc, uri);
        this.removeLinkIntegrityAttr(doc);
        this.convertOpenGraph(doc);

        // Collect every node which might have a sub-resource, i.e. a node that
        // - has an inline style,
        // - is a link for an icon or a stylesheet, or
        // - is one of the tags listed below.
        const tags = 'link,style,script,iframe,embed,object,img,picture,figure,video,audio,source';
        const rels = ['icon', 'stylesheet', 'shortcut icon', 'mask-icon', 'apple-touch-icon-precomposed'];
        const nodes = [];
        doc.querySelectorAll(`${tags},[style]`).forEach((candidate) => {
            if (typeof candidate.tagName !== 'string') {
                return;
            }
            if (candidate.tagName.toLowerCase() === 'link') {
                // Links are only interesting when they point to an icon or a stylesheet.
                if (rels.includes(candidate.getAttribute('rel'))) {
                    nodes.push(candidate);
                }
                return;
            }
            nodes.push(candidate);
        });

        // Resources are inlined one after another because each step mutates the shared DOM.
        const css = new css_1.CSS();
        for (const node of nodes) {
            if (node.hasAttributes() && node.getAttribute('style')) {
                await css.process(node, uri);
            }
            switch (node.tagName.toLowerCase()) {
                case 'style':
                    await css.process(node, uri);
                    break;
                case 'link':
                    await this.processLinkNode(node, uri);
                    break;
                case 'script':
                    await this.processScriptNode(node, uri);
                    break;
                case 'iframe':
                case 'embed':
                case 'object':
                    await this.processEmbedNode(node, uri);
                    break;
                case 'img':
                case 'picture':
                case 'figure':
                case 'video':
                case 'audio':
                case 'source':
                    await this.processMediaNode(node, uri);
                    break;
                default:
                    // Element selected only because of its inline style; handled above.
                    break;
            }
        }

        // Revert the converted noscripts
        this.revertConvertedNoScript(doc);
        // Return the document back as a string
        return dom.serialize();
    }

    /**
     * Prevents browsers from requesting any remote resource by setting a
     * Content-Security-Policy that only allows inline elements and data URLs.
     * Existing CSP meta elements are removed first; every policy is emitted as
     * its own meta element prepended to the head.
     *
     * @param {Document} doc JSDOM.window.document
     * @api private
     */
    setContentSecurityPolicy(doc) {
        // Remove existing CSP
        doc.querySelectorAll('meta[http-equiv="Content-Security-Policy"]').forEach((e) => utils_1.removeChild(e));

        const policies = ["default-src 'unsafe-inline' data:;", "connect-src 'none';"];
        if (this.opt.disableJS === true) {
            policies.push("script-src 'none';");
        }
        if (this.opt.disableCSS === true) {
            policies.push("style-src 'none';");
        }
        if (this.opt.disableEmbeds === true) {
            policies.push("frame-src 'none'; child-src 'none';");
        }
        if (this.opt.disableMedias === true) {
            // The fetch directive for images is `img-src`; `image-src` does not exist.
            policies.push("img-src 'none'; media-src 'none';");
        }

        // Append the new CSP
        const head = doc.head;
        for (const policy of policies) {
            const meta = doc.createElement('meta');
            meta.httpEquiv = 'Content-Security-Policy';
            meta.content = policy;
            head.prepend(meta);
        }
    }

    /**
     * Removes or replaces elements following the configuration.
     *
     * @param {Document} doc JSDOM.window.document
     * @api private
     */
    applyConfiguration(doc) {
        if (this.opt.disableJS === true) {
            // Remove script tags
            doc.querySelectorAll('script').forEach((e) => utils_1.removeChild(e));
            // Neutralise links with a javascript URL scheme
            doc.querySelectorAll('a[href*="javascript:"]').forEach((e) => e.setAttribute('href', '#'));
            // Convert noscript to div
            this.convertNoScriptToDiv(doc, false);
        }
        if (this.opt.disableCSS === true) {
            // Remove style tags
            doc.querySelectorAll('style').forEach((e) => utils_1.removeChild(e));
            // Remove inline style
            doc.querySelectorAll('[style]').forEach((e) => e.removeAttribute('style'));
        }
        if (this.opt.disableEmbeds === true) {
            doc.querySelectorAll('embed,object,iframe').forEach((e) => utils_1.removeChild(e));
        }
        if (this.opt.disableMedias === true) {
            doc.querySelectorAll('img,picture,figure,video,audio,source').forEach((e) => utils_1.removeChild(e));
        }
    }

    /**
     * Converts every noscript element to a div carrying the same inner HTML.
     *
     * @param {Document} doc JSDOM.window.document
     * @param {boolean} [markNewDiv] when true the div is tagged with
     *   `data-cairn-noscript="true"` so it can be turned back later
     * @api private
     */
    convertNoScriptToDiv(doc, markNewDiv = false) {
        doc.querySelectorAll('noscript').forEach((e) => {
            const div = doc.createElement('div');
            div.innerHTML = e.innerHTML;
            if (markNewDiv) {
                div.setAttribute('data-cairn-noscript', 'true');
            }
            e.parentNode.replaceChild(div, e);
        });
    }

    /**
     * Finds all comments in the document and removes them.
     *
     * @param {Document} doc JSDOM.window.document
     * @api private
     */
    removeComments(doc) {
        const nodeIterator = doc.createNodeIterator(doc, 128); // NodeFilter.SHOW_COMMENT
        let currentNode;
        while ((currentNode = nodeIterator.nextNode())) {
            currentNode.remove();
        }
    }

    /**
     * Copies attributes that look like lazy-loading image sources (such as
     * data-src and data-srcset) into the plain src / srcset attributes, so
     * the images can be loaded without JS.
     *
     * @param {Document} doc JSDOM.window.document
     * @api private
     */
    convertLazyImageAttrs(doc) {
        doc.querySelectorAll('img,picture,figure').forEach((e) => {
            const src = e.src;
            const srcset = e.srcset;
            const tagName = e.tagName.toLowerCase();

            // An element whose src already holds a non-SVG data URL is left untouched.
            // NOTE(review): some sites put a tiny placeholder image as data URI in src;
            // the original comment here suggests such placeholders were meant to be
            // dropped, which is not implemented — TODO confirm intended behaviour.
            if (src !== undefined && src.length > 0 && this.rx.B64DataURL.test(src)) {
                return;
            }
            // Natively lazy-loaded elements that already have a source need no help.
            if ((src !== '' || srcset !== '') && e.getAttribute('loading') === 'lazy') {
                return;
            }

            const attrs = e.attributes;
            // Iterate over a copy: attributes are removed while looping.
            for (const attr of [...attrs]) {
                if (attr.name === undefined) {
                    continue;
                }
                if (['src', 'srcset'].includes(attr.name.toLowerCase())) {
                    continue;
                }
                const attrVal = attr.value;
                let copyTo = '';
                if (this.rx.lazyImageSrcset.test(attrVal)) {
                    copyTo = 'srcset';
                }
                else if (this.rx.lazyImageSrc.test(attrVal)) {
                    copyTo = 'src';
                }
                if (copyTo === '' || !utils_1.isValidURL(attrVal)) {
                    continue;
                }
                if (['img', 'picture'].includes(tagName)) {
                    e.setAttribute(copyTo, attrVal);
                }
                else if (tagName === 'figure') {
                    // A figure cannot carry the source itself, so give it an image child.
                    const img = doc.createElement('img');
                    img.setAttribute(copyTo, attrVal);
                    e.appendChild(img);
                }
                e.removeAttribute(attr.name);
            }

            // NOTE(review): a figure left without attributes receives an empty <img>;
            // kept as-is, but this looks unintended — TODO confirm.
            if (tagName === 'figure' && attrs.length === 0) {
                e.appendChild(doc.createElement('img'));
            }
        });
    }

    /**
     * Decodes a URI without letting malformed percent escapes abort the
     * whole run.
     *
     * @param {string} value URI to decode
     * @return {string} the decoded URI, or `value` unchanged when `decodeURI`
     *   raises a URIError
     * @api private
     */
    decodeURISafely(value) {
        try {
            return decodeURI(value);
        }
        catch (err) {
            if (err instanceof URIError) {
                return value;
            }
            throw err;
        }
    }

    /**
     * Converts relative URLs of elements below the body into absolute URLs.
     * Handled elements: a, link, embed, script, iframe, object, img, picture,
     * figure, video, audio and source.
     *
     * @param {Document} doc JSDOM.window.document
     * @param {string} [url] original request url
     * @api private
     */
    convertRelativeURLs(doc, url) {
        // URL-carrying attribute of the non-media elements.
        const slugs = {
            a: 'href',
            link: 'href',
            embed: 'src',
            script: 'src',
            iframe: 'src',
            object: 'data',
        };
        const mediaList = ['img', 'picture', 'figure', 'video', 'audio', 'source'];
        const allowList = [...Object.keys(slugs), ...mediaList];

        const toAbsolute = (value) => this.decodeURISafely(utils_1.createAbsoluteURL(value, url));
        const convert = (node, attrName) => {
            const oriURI = node.getAttribute(attrName);
            if (typeof oriURI === 'string') {
                node.setAttribute(attrName, toAbsolute(oriURI));
            }
        };

        const nodeIterator = doc.createNodeIterator(doc.body);
        let currentNode;
        while ((currentNode = nodeIterator.nextNode())) {
            const tagName = currentNode.tagName;
            // Text nodes and the like have no tag name.
            if (typeof tagName !== 'string' || currentNode.hasAttributes() === false) {
                continue;
            }
            const name = tagName.toLowerCase();
            if (allowList.includes(name) === false) {
                continue;
            }
            if (slugs[name]) {
                convert(currentNode, slugs[name]);
            }
            if (mediaList.includes(name)) {
                convert(currentNode, 'src');
                convert(currentNode, 'poster');
                const srcset = currentNode.getAttribute('srcset');
                if (typeof srcset === 'string') {
                    // A srcset is a list of candidates, so each URL is resolved on its
                    // own; descriptor and separator are carried over untouched.
                    const newSrcset = srcset.replace(this.rx.srcsetURL, (whole, candidateURL, descriptor, separator) => `${toAbsolute(candidateURL)}${descriptor || ''}${separator}`);
                    currentNode.setAttribute('srcset', newSrcset);
                }
            }
        }
    }

    /**
     * Removes integrity attributes from link tags.
     *
     * @param {Document} doc JSDOM.window.document
     * @api private
     */
    removeLinkIntegrityAttr(doc) {
        doc.querySelectorAll('link[integrity]').forEach((e) => {
            e.removeAttribute('integrity');
        });
    }

    /**
     * Duplicates every `og:*` meta element of the head as a meta element
     * without the `og:` prefix, and uses og:title as the document title when
     * the title element is empty.
     *
     * @param {Document} doc JSDOM.window.document
     * @api private
     */
    convertOpenGraph(doc) {
        const title = doc.head.querySelector('title');
        doc.querySelectorAll('head > meta').forEach((e) => {
            const attr = e.getAttribute('property');
            if (!attr || typeof attr !== 'string' || !attr.startsWith('og:')) {
                return;
            }
            // A missing content attribute yields null; fall back to an empty string
            // so the literal text "null" is never written into the document.
            const rawContent = e.getAttribute('content');
            const content = typeof rawContent === 'string' ? rawContent : '';
            // real property
            const property = attr.substring(3);
            const meta = doc.createElement('meta');
            meta.setAttribute('property', property);
            meta.setAttribute('content', content);
            e.parentNode.appendChild(meta);
            // replace the title if it is empty
            if (title && title.innerHTML.trim().length < 1 && property.toLowerCase() === 'title') {
                title.textContent = content;
            }
        });
    }

    /**
     * Inlines the resource referenced by a link element: icons get their href
     * replaced through `processURLNode`, stylesheets are replaced by a style
     * element holding the fetched content.
     *
     * @param {Element} doc the link element (named `doc` for historical reasons)
     * @param {string} [baseURL] base URL used to resolve the href
     * @return {Promise<void>}
     * @api private
     */
    async processLinkNode(doc, baseURL = '') {
        if (!doc.hasAttribute('href')) {
            return;
        }
        const href = doc.getAttribute('href');
        if (!href || typeof href !== 'string') {
            return;
        }
        const rel = doc.getAttribute('rel');
        if (typeof rel !== 'string') {
            return;
        }
        if (rel.includes('icon')) {
            await this.processURLNode(doc, 'href', baseURL);
            return;
        }
        // Replace <link> to <style>
        // NOTE(review): `preload` links are treated as stylesheets regardless of
        // their `as` attribute — TODO confirm this is intended.
        if (['preload', 'stylesheet'].includes(rel.toLowerCase())) {
            const data = await new uri_1.URI().process(href, baseURL);
            const styleNode = new jsdom_1.JSDOM(`<style type="text/css">${data}</style>`);
            doc.outerHTML = styleNode.window.document.head.querySelector('style').outerHTML;
        }
    }

    /**
     * Replaces the URL held by `attrName` with the value produced by
     * `utils_1.convertToData`. The attribute is left untouched when the
     * conversion yields nothing usable.
     *
     * @param {Element} node element to update
     * @param {string} attrName attribute holding the URL
     * @param {string} baseURL base URL used to resolve the attribute value
     * @return {Promise<void>}
     * @api private
     */
    async processURLNode(node, attrName, baseURL) {
        if (!node.hasAttribute(attrName)) {
            return;
        }
        const url = node.getAttribute(attrName);
        if (typeof url !== 'string' || url.trim().length < 1) {
            return;
        }
        const assetURL = utils_1.createAbsoluteURL(url, baseURL);
        const data = await utils_1.convertToData(assetURL);
        // Blank results must not wipe out the original URL.
        if (data && typeof data === 'string' && data.trim().length > 0) {
            node.setAttribute(attrName, data);
        }
    }

    /**
     * Inlines an external script: the src attribute is removed and the
     * fetched content becomes the text of the script element.
     *
     * @param {Element} node script element
     * @param {string} baseURL base URL used to resolve the src
     * @return {Promise<void>}
     * @api private
     */
    async processScriptNode(node, baseURL) {
        const src = node.getAttribute('src');
        if (!src || typeof src !== 'string' || src.trim().length < 1) {
            return;
        }
        const data = await new uri_1.URI().process(src, baseURL);
        node.removeAttribute('src');
        node.textContent = data;
    }

    /**
     * Inlines the resource of an iframe, embed or object element. Objects
     * keep their URL in `data`, the other elements in `src`.
     *
     * @param {Element} node element to update
     * @param {string} baseURL base URL used to resolve the resource
     * @return {Promise<void>}
     * @api private
     */
    async processEmbedNode(node, baseURL) {
        const attrName = node.tagName === 'OBJECT' ? 'data' : 'src';
        const url = node.getAttribute(attrName);
        if (!url || typeof url !== 'string' || url.trim().length < 1) {
            return;
        }
        const assetURL = utils_1.createAbsoluteURL(url, baseURL);
        const data = await utils_1.convertToData(assetURL);
        if (data && typeof data === 'string' && data.trim().length > 0) {
            node.removeAttribute(attrName);
            node.setAttribute(attrName, data);
        }
    }

    /**
     * Inlines the src, poster and srcset resources of a media element.
     *
     * @param {Element} node media element
     * @param {string} baseURL base URL used to resolve the resources
     * @return {Promise<void>}
     * @api private
     */
    async processMediaNode(node, baseURL) {
        for (const attrName of ['src', 'poster']) {
            const value = node.getAttribute(attrName);
            if (value && typeof value === 'string' && value.trim().length > 0) {
                await this.processURLNode(node, attrName, baseURL);
            }
        }

        const srcset = node.getAttribute('srcset');
        if (!srcset || typeof srcset !== 'string' || srcset.trim().length < 1) {
            return;
        }
        const newSets = [];
        for (const parts of this.decodeURISafely(srcset).matchAll(this.rx.srcsetURL)) {
            const oldURL = parts[1];
            // The descriptor group is optional and undefined when absent.
            const descriptor = parts[2] || '';
            const assetURL = utils_1.createAbsoluteURL(oldURL, baseURL);
            const data = await utils_1.convertToData(assetURL);
            // Keep the original URL when nothing could be inlined.
            const inlined = typeof data === 'string' && data.length > 0 ? data : oldURL;
            newSets.push(`${inlined}${descriptor}`);
        }
        node.setAttribute('srcset', newSets.join(','));
    }

    /**
     * Turns the divs tagged with `data-cairn-noscript="true"` back into
     * noscript elements whose text is the inner HTML of the div.
     *
     * @param {Document} doc JSDOM.window.document
     * @api private
     */
    revertConvertedNoScript(doc) {
        // getElementsByTagName returns a live collection; iterate over a snapshot
        // because replacing a div while walking the live list skips its successor.
        const divs = Array.from(doc.getElementsByTagName('div'));
        for (const div of divs) {
            if (div.getAttribute('data-cairn-noscript') !== 'true' || !div.parentNode) {
                continue;
            }
            const noscript = doc.createElement('noscript');
            noscript.textContent = div.innerHTML;
            div.parentNode.replaceChild(noscript, div);
        }
    }
}
// Expose the HTML class as a named CommonJS export.
exports.HTML = HTML;
//# sourceMappingURL=html.js.map