UNPKG

website-scrap-engine

Version:
143 lines 6.05 kB
import { parseSrcset, stringifySrcset } from 'srcset';
import { load } from 'cheerio';
import { sources as defaultSources } from '../sources.js';
import { ResourceType } from '../resource.js';
import { processCssText } from './process-css.js';
import { error, skip } from '../logger/logger.js';
import { parseHtml } from './adapters.js';

/**
 * Runs a single link found in the document through the pipeline stages
 * (linkRedirect -> detectResourceType -> createResource ->
 * processBeforeDownload) and submits the resulting resource for download
 * unless it is flagged `shouldBeDiscardedFromDownload`.
 *
 * @param {string} originalLink - the link exactly as it appears in the document
 * @param {*} type - resource type declared by the matching source definition
 * @param {*} elem - the cheerio selection holding the element that owns the link
 * @param {object} ctx - values shared by every link of the document being
 *   processed: `options`, `res`, `pipeline`, `depth`, `refUrl`, `savePath`
 *   and `submit`, as received by `processHtmlDoc`
 * @returns {Promise<object|undefined>} the resource returned by
 *   `processBeforeDownload`, or `undefined` when any stage skipped the link
 */
async function resolveLink(originalLink, type, elem, ctx) {
  const { options, res, pipeline, depth, refUrl, savePath, submit } = ctx;

  const link = await pipeline.linkRedirect(originalLink, elem, res);
  if (!link) {
    if (skip.isTraceEnabled()) {
      skip.trace('skip linkRedirect', originalLink, refUrl);
    }
    return undefined;
  }

  const linkType = await pipeline.detectResourceType(link, type, elem, res);
  if (!linkType) {
    if (skip.isTraceEnabled()) {
      skip.trace('skip detectResourceType', originalLink, link, refUrl);
    }
    return undefined;
  }

  let resource = await pipeline.createResource(
    linkType,
    depth,
    link,
    refUrl,
    res.localRoot,
    options.encoding[linkType],
    savePath,
    res.type,
  );
  resource = await pipeline.processBeforeDownload(resource, elem, res, options);
  if (!resource) {
    if (skip.isTraceEnabled()) {
      skip.trace('skip processBeforeDownload', originalLink, link, linkType, refUrl);
    }
    return undefined;
  }

  if (!resource.shouldBeDiscardedFromDownload) {
    submit(resource);
  }
  return resource;
}

/**
 * Walks every source definition (`options.sources`, falling back to the
 * default sources) over a parsed document, rewrites the matched links in place
 * to each resource's `replacePath`, and submits the discovered resources.
 * `iframe[srcdoc]` documents are parsed and processed recursively.
 *
 * @param {object} options - scrape options; reads `sources`, `encoding` and
 *   `cheerioSerialize`
 * @param {*} doc - cheerio root function of the document to process
 * @param {object} res - the HTML resource the document belongs to
 * @param {object} pipeline - pipeline executor providing the link stages
 * @param {number} depth - depth assigned to resources created from this document
 * @param {Array} resources - accumulator handed to `processCssText`
 *   (presumably filled with resources found in inline css; confirm in process-css.js)
 * @param {string} refUrl - url the links of this document are resolved against
 * @param {string|undefined} savePath - save path forwarded to `createResource`
 * @param {Function} submit - callback receiving resources to download
 * @returns {Promise<void>}
 */
async function processHtmlDoc(options, doc, res, pipeline, depth, resources, refUrl, savePath, submit) {
  const sources = options.sources || defaultSources;
  const ctx = { options, res, pipeline, depth, refUrl, savePath, submit };

  for (const { selector, attr, type } of sources) {
    const elements = doc(selector);
    for (let index = 0; index < elements.length; index++) {
      const elem = elements.eq(index);

      if (!attr) {
        // style block: the css lives in the element body, not in an attribute
        if (type === ResourceType.CssInline) {
          const css = elem.html();
          if (css) {
            const processed = await processCssText(css, res, options, pipeline, depth, resources);
            elem.html(processed);
          }
        }
        continue;
      }

      const attrValue = elem.attr(attr);
      if (!attrValue) {
        // An empty or missing attribute has nothing to rewrite. In particular
        // an empty inline-css attribute (e.g. style="") must NOT fall back to
        // treating the element's inner html as css.
        continue;
      }

      if (type === ResourceType.CssInline) {
        const processed = await processCssText(attrValue, res, options, pipeline, depth, resources);
        elem.attr(attr, processed);
        continue;
      }

      if (attr === 'srcset') {
        let candidates;
        try {
          candidates = parseSrcset(attrValue);
        } catch (e) {
          error.info('skipping invalid srcset', attrValue, e);
          // open question: should an invalid srcset be removed?
          continue;
        }
        for (const candidate of candidates) {
          // skip empty links
          if (!candidate.url) {
            continue;
          }
          const resource = await resolveLink(candidate.url, type, elem, ctx);
          if (resource) {
            // 20241005: mutating the parsed entries in place is fine,
            // nothing in srcset 5.0.1 prevents the returned value from changing
            candidate.url = resource.replacePath;
          }
        }
        elem.attr(attr, stringifySrcset(candidates));
        continue;
      }

      // single-link attribute; the original value is written back when the link is skipped
      let replaceValue = attrValue;
      const resource = await resolveLink(attrValue, type, elem, ctx);
      if (resource) {
        replaceValue = resource.replacePath;
        // historical workaround here
        if (replaceValue === '.html' || replaceValue === '/.html') {
          replaceValue = '';
        }
      }
      elem.attr(attr, replaceValue);
    }
  }

  const iframeSrcDocs = doc('iframe[srcdoc]');
  for (let index = 0; index < iframeSrcDocs.length; index++) {
    const elem = iframeSrcDocs.eq(index);
    const attrValue = elem.attr('srcdoc');
    if (!attrValue) {
      continue;
    }
    try {
      // the srcdoc attribute carries a whole html document: parse it,
      // process it like the outer document, then serialize it back
      const iframeDoc = load(attrValue);
      await processHtmlDoc(options, iframeDoc, res, pipeline, depth, resources, refUrl, savePath, submit);
      const html = options.cheerioSerialize
        ? iframeDoc.html(options.cheerioSerialize)
        : iframeDoc.html();
      elem.attr('srcdoc', html);
    } catch (e) {
      // best effort: a failing srcdoc is logged and left as it was
      error.info('can not parse iframe srcdoc', res.url, res.rawUrl, e);
    }
  }
}

/**
 * Processes a downloaded html resource: parses it (caching the parsed document
 * on `res.meta.doc`), rewrites its links and submits the resources it
 * references. Resources of any other type are returned untouched.
 *
 * @param {object} res - the downloaded resource
 * @param {Function} submit - callback receiving resources to download
 * @param {object} options - scrape options
 * @param {object} pipeline - pipeline executor
 * @returns {Promise<object>} the same `res` object
 */
export async function processHtml(res, submit, options, pipeline) {
  if (res.type !== ResourceType.Html) {
    return res;
  }
  const refUrl = res.redirectedUrl || res.url;
  // the save path is only forwarded when the resource was not redirected
  const savePath = refUrl === res.url ? res.savePath : undefined;
  // an extra pipeline.linkRedirect(refUrl, null, res) pass is not needed here
  // since processRedirectedUrl is enabled by default
  const depth = res.depth + 1;
  let doc = res.meta.doc;
  if (!doc) {
    doc = parseHtml(res, options);
    res.meta.doc = doc;
  }
  // resources from inline css
  const resources = [];
  await processHtmlDoc(options, doc, res, pipeline, depth, resources, refUrl, savePath, submit);
  if (resources.length) {
    submit(resources);
  }
  return res;
}
//# sourceMappingURL=process-html.js.map