UNPKG

website-scrap-engine

Version:
366 lines 13.5 kB
import URI from 'urijs';
import * as path from 'node:path';
import { escapePath, isUrlHttp, orderUrlSearch, simpleHashString } from './util.js';
import { error as log } from './logger/logger.js';

/**
 * Kinds of resources handled by the engine.
 * Compiled enum: maps name -> number and number -> name.
 */
export var ResourceType;
(function (ResourceType) {
  /**
   * Binary resource, not parsed nor processed
   */
  ResourceType[ResourceType["Binary"] = 1] = "Binary";
  /**
   * Html resource
   */
  ResourceType[ResourceType["Html"] = 2] = "Html";
  /**
   * Css resource
   */
  ResourceType[ResourceType["Css"] = 3] = "Css";
  /**
   * Inline css resource in html,
   * currently only style blocks and style attributes are processed
   */
  ResourceType[ResourceType["CssInline"] = 4] = "CssInline";
  /**
   * Very limited support of site-maps, urls in it are not replaced.
   */
  ResourceType[ResourceType["SiteMap"] = 5] = "SiteMap";
  /**
   * Standalone svg image
   * https://github.com/website-local/website-scrap-engine/issues/3
   */
  ResourceType[ResourceType["Svg"] = 6] = "Svg";
  /**
   * Large binary, which would be streamed directly to disk,
   * {@link Resource.type} must be explicitly set to this value to use streaming.
   * @see downloadStreamingResource
   * @see https://github.com/website-local/website-scrap-engine/issues/2
   */
  ResourceType[ResourceType["StreamingBinary"] = 7] = "StreamingBinary";
})(ResourceType || (ResourceType = {}));

/**
 * Build a shallow copy of a resource that keeps only plain values.
 *
 * Kept: every non-object value (including `null`), a filtered copy of `meta`
 * (its `headers` entry plus every non-object entry) and a binary `body`
 * (ArrayBuffer or any ArrayBuffer view, which includes Buffer).
 * Every other object-valued property (e.g. `uri`, `refUri`, `replaceUri`)
 * is omitted from the copy.
 *
 * @param res the resource to copy, it is not modified
 * @return a new object holding the kept properties
 */
export function prepareResourceForClone(res) {
  const clone = {};
  for (const key of Object.keys(res)) {
    const value = Reflect.get(res, key);
    // typeof null === 'object', so null must be tested first; otherwise
    // null fields (e.g. the null encoding of binary resources) get dropped
    // and a null meta would crash in Object.keys below
    if (value === null || typeof value !== 'object') {
      Reflect.set(clone, key, value);
      continue;
    }
    if (key === 'meta') {
      const props = clone[key] = {};
      for (const prop of Object.keys(value)) {
        const item = value[prop];
        // headers can be cloned safely
        if (prop === 'headers' || item === null || typeof item !== 'object') {
          props[prop] = item;
        }
      }
    } else if (key === 'body' &&
      (value instanceof ArrayBuffer || ArrayBuffer.isView(value))) {
      // string bodies never reach this branch (they are not objects),
      // and a Buffer is an ArrayBuffer view
      clone[key] = value;
    }
  }
  return clone;
}

/**
 * Generate save path from HTTP/HTTPS absolute uri
 * @param uri the HTTP/HTTPS absolute uri
 * @param isHtml should the savePath endsWith .html
 * @param keepSearch keep url search params in file name
 * @param localSrcRoot local source path to download from
 * @return string must return non-empty string
 * @throws Error if uri is relative (and not a file uri),
 * or if uri is a file uri and localSrcRoot is empty
 */
export function generateSavePath(uri, isHtml, keepSearch, localSrcRoot) {
  if (uri.is('relative') && uri.protocol() !== 'file') {
    throw new Error('generateSavePath: uri can not be relative: ' + uri.toString());
  }
  let savePath;
  if (uri.protocol() === 'file') {
    if (!localSrcRoot) {
      // separator added so the uri does not run into the message text
      throw new Error('generateSavePath: using file protocol without localSrcRoot: ' +
        uri.toString());
    }
    if (process.platform === 'win32' && localSrcRoot.match(/^[a-z]:\//i)) {
      // windows absolute fix
      // NOTE(review): the extra character skipped here is presumably the
      // leading '/' before the drive letter in the pathname - confirm
      savePath = uri.pathname().slice(localSrcRoot.length + 1);
      if (savePath[0] === '/') {
        savePath = savePath.slice(1);
      }
    } else {
      savePath = uri.pathname().slice(localSrcRoot.length);
    }
  } else {
    const host = uri.hostname();
    savePath = path.join(host || '', escapePath(uri.path()));
  }
  if (isHtml && !savePath.endsWith('.html')) {
    const isDirectory = (uri.protocol() === 'file' && savePath === '') ||
      savePath.endsWith('/') ||
      savePath.endsWith('\\');
    if (isDirectory) {
      savePath += 'index.html';
    } else if (savePath.endsWith('.htm')) {
      savePath += 'l';
    } else {
      savePath += '.html';
    }
  }
  if (keepSearch) {
    let search = uri.search();
    if (search && search.length > 0) {
      if (search.length > 43) {
        const ordered = orderUrlSearch(search);
        const hashed = simpleHashString(ordered);
        log.debug('search too long, replacing with hash', ordered, hashed);
        // avoid too long search
        search = '_' + hashed;
      } else {
        // order it
        search = escapePath(orderUrlSearch(search));
      }
      // put the search part before the file extension, if there is one
      const ext = path.extname(savePath);
      if (ext) {
        savePath = savePath.slice(0, -ext.length) + search + ext;
      } else {
        savePath += search;
      }
    }
  }
  return savePath;
}

/**
 * Turn a save path into a file url, using forward slashes only.
 * @param savePath the save path, may contain backslashes
 * @return the save path prefixed with file:///
 */
export const urlOfSavePath = (savePath) => {
  // replace is a no-op when the path contains no backslash
  return `file:///${savePath.replace(/\\/g, '/')}`;
};

/**
 * Check an absolute uri
 * @param uri {@link RawResource.uri}
 * @param refUri {@link RawResource.refUri}
 * @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
 * @param url {@link CreateResourceArgument.url}
 * @param refUrl {@link CreateResourceArgument.refUrl}
 * @param type {@link CreateResourceArgument.type}
 * @throws Error if {@link skipReplacePathError} === false and check fail
 * @return true if {@link skipReplacePathError} === true and check fail
 */
export function checkAbsoluteUri(uri, refUri, skipReplacePathError, url, refUrl, type) {
  let replacePathHasError = false;
  const protocol = uri.protocol().toLowerCase();
  const protocolSupported = protocol === 'http' ||
    protocol === 'https' ||
    protocol === 'file' ||
    protocol === refUri.protocol().toLowerCase();
  if (!protocolSupported) {
    // the warning is logged whether or not the error is thrown
    log.warn('protocol not supported, skipping', protocol, url, refUrl, type);
    if (!skipReplacePathError) {
      throw new Error(`protocol ${protocol} not supported`);
    }
    replacePathHasError = true;
  }
  if (protocol !== 'file' && !uri.host()) {
    log.warn('empty host for non-file uri not supported, skipping', protocol, url, refUrl, type);
    if (!skipReplacePathError) {
      throw new Error('empty host for non-file uri not supported');
    }
    replacePathHasError = true;
  }
  return replacePathHasError;
}

const FILE_PROTOCOL_PREFIX = 'file:///';

/**
 * Resolve an url found in a local (file) source into an absolute file url
 * located under localSrcRoot. Http urls are returned unchanged.
 * @param url the url to resolve
 * @param refUrl the url of the referencing resource
 * @param localSrcRoot local source path to download from
 * @param skipReplacePathError return an empty string instead of throwing
 * @throws Error if the check fails and skipReplacePathError is falsy
 * @return the resolved url, or an empty string
 * if the check fails and skipReplacePathError is truthy
 */
export function resolveFileUrl(url, refUrl, localSrcRoot, skipReplacePathError) {
  if (isUrlHttp(url)) {
    return url;
  }
  let error;
  if (!localSrcRoot) {
    error = 'can not use file url without localSrcRoot';
  }
  // unix absolute path
  if (localSrcRoot && localSrcRoot[0] === '/') {
    localSrcRoot = localSrcRoot.slice(1);
  }
  if (!error && localSrcRoot && url.startsWith(FILE_PROTOCOL_PREFIX) &&
    !url.slice(FILE_PROTOCOL_PREFIX.length).startsWith(localSrcRoot)) {
    error = 'file url not starting with localSrcRoot is forbidden';
  }
  if (!error && localSrcRoot && refUrl.startsWith(FILE_PROTOCOL_PREFIX) &&
    !refUrl.slice(FILE_PROTOCOL_PREFIX.length).startsWith(localSrcRoot)) {
    error = 'file refUrl not starting with localSrcRoot is forbidden';
  }
  if (!error && localSrcRoot) {
    if (localSrcRoot.endsWith('/')) {
      localSrcRoot = localSrcRoot.slice(0, -1);
    }
    if (url.startsWith('//')) {
      // keep a single leading slash of the url
      url = FILE_PROTOCOL_PREFIX + localSrcRoot + url.slice(1);
    } else if (url.startsWith('/')) {
      url = FILE_PROTOCOL_PREFIX + localSrcRoot + url;
    } else if (!url.startsWith(FILE_PROTOCOL_PREFIX)) {
      // relative url
      // resolved against refUrl with localSrcRoot stripped,
      // then localSrcRoot is put back in front of the result
      const absoluteRefUri = URI(FILE_PROTOCOL_PREFIX +
        refUrl.slice(FILE_PROTOCOL_PREFIX.length + localSrcRoot.length));
      const uri = URI(url).absoluteTo(absoluteRefUri);
      url = FILE_PROTOCOL_PREFIX + localSrcRoot + uri.pathname() + uri.hash();
    }
  }
  if (error) {
    // the warning is logged whether or not the error is thrown
    log.warn(error, url, refUrl, localSrcRoot);
    if (skipReplacePathError) {
      return '';
    }
    throw new Error(error);
  }
  return url;
}

/**
 * Create a resource
 * @param type {@link CreateResourceArgument.type}
 * @param depth {@link CreateResourceArgument.depth}
 * @param url {@link CreateResourceArgument.rawUrl}
 * @param refUrl {@link CreateResourceArgument.refUrl}
 * @param refSavePath {@link CreateResourceArgument.refSavePath}
 * @param refType {@link CreateResourceArgument.refType}
 * @param localRoot {@link CreateResourceArgument.localRoot}
 * @param localSrcRoot {@link CreateResourceArgument.localSrcRoot}
 * @param encoding {@link CreateResourceArgument.encoding}
 * @param keepSearch {@link CreateResourceArgument.keepSearch}
 * @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
 * @param generateSavePathFn {@link CreateResourceArgument.generateSavePathFn}
 * @return the resource
 */
export function createResource({
  type,
  depth,
  url,
  refUrl,
  refSavePath,
  refType,
  localRoot,
  localSrcRoot,
  encoding,
  keepSearch,
  skipReplacePathError,
  generateSavePathFn
}) {
  const rawUrl = url;
  const refUri = URI(refUrl);
  let replacePathHasError = false;
  if (url.startsWith(FILE_PROTOCOL_PREFIX) || refUrl.startsWith(FILE_PROTOCOL_PREFIX)) {
    // file url should never have search
    keepSearch = false;
    url = resolveFileUrl(url, refUrl, localSrcRoot, skipReplacePathError);
    if (!url) {
      // resolveFileUrl returns an empty string on a skipped error,
      // fall back to the raw url and mark the resource as broken
      replacePathHasError = true;
      url = rawUrl;
    }
  }
  if (!replacePathHasError && url.startsWith('//')) {
    // url with the same protocol
    url = refUri.protocol() + ':' + url;
  } else if (!replacePathHasError && url[0] === '/') {
    // absolute path
    url = refUri.protocol() + '://' + refUri.host() + url;
  }
  let uri = URI(url);
  if (!replacePathHasError && uri.is('relative')) {
    uri = uri.absoluteTo(refUri);
    url = uri.toString();
  }
  if (!replacePathHasError &&
    checkAbsoluteUri(uri, refUri, skipReplacePathError, url, refUrl, type)) {
    replacePathHasError = true;
  }
  let downloadLink;
  if (uri.protocol() === 'file') {
    // file downloadLink contains no search
    downloadLink = uri.clone().search('').hash('').toString();
  } else {
    downloadLink = uri.clone().hash('').toString();
  }
  const implGenerateSavePath = generateSavePathFn || generateSavePath;
  // make savePath and replaceUri
  const savePath = replacePathHasError
    ? rawUrl
    : implGenerateSavePath(uri, type === ResourceType.Html, keepSearch, localSrcRoot);
  if (!refSavePath) {
    refSavePath = implGenerateSavePath(refUri, refType === ResourceType.Html, false, localSrcRoot);
  }
  const replaceUri = replacePathHasError
    ? URI(rawUrl)
    : URI(urlOfSavePath(savePath)).relativeTo(urlOfSavePath(refSavePath));
  // recover hash
  if (uri.hash()) {
    replaceUri.hash(uri.hash());
  }
  // remove search if not keepSearch
  if (!keepSearch && uri.search()) {
    uri.search('');
    url = uri.toString();
  }
  const resource = {
    type,
    depth,
    encoding: encoding || (type === ResourceType.Binary ? null : 'utf8'),
    url,
    rawUrl,
    downloadLink,
    refUrl,
    refSavePath,
    savePath,
    localRoot,
    replacePath: replaceUri.toString(),
    createTimestamp: Date.now(),
    body: undefined,
    meta: {},
    uri,
    refUri,
    replaceUri,
    host: uri.hostname()
  };
  if (replacePathHasError) {
    // urls with parser errors should never be downloaded
    resource.shouldBeDiscardedFromDownload = true;
  }
  return resource;
}

/**
 * Fill in the derived fields of a resource, in place:
 * uri, refUri, replaceUri, host, waitTime and downloadTime are set
 * when they are missing, and a binary body is converted to a Buffer.
 * @param res the resource to complete, it is modified
 * @return the same object as res
 */
export function normalizeResource(res) {
  const resource = res;
  if (!resource.uri) {
    resource.uri = URI(resource.url);
  }
  if (!resource.refUri) {
    resource.refUri = URI(resource.refUrl);
  }
  if (!resource.replaceUri) {
    resource.replaceUri = URI(resource.replacePath);
  }
  if (!resource.host) {
    resource.host = resource.uri == null ? undefined : resource.uri.hostname();
  }
  if (!resource.waitTime && resource.downloadStartTimestamp) {
    resource.waitTime = resource.downloadStartTimestamp - resource.createTimestamp;
  }
  if (!resource.downloadTime && resource.finishTimestamp && resource.downloadStartTimestamp) {
    resource.downloadTime = resource.finishTimestamp - resource.downloadStartTimestamp;
  }
  const body = resource.body;
  if (body instanceof ArrayBuffer || body instanceof Uint8Array) {
    resource.body = Buffer.from(body);
  } else if (ArrayBuffer.isView(body)) {
    // other views: wrap only the bytes covered by the view
    resource.body = Buffer.from(body.buffer, body.byteOffset, body.byteLength);
  }
  return resource;
}
//# sourceMappingURL=resource.js.map