/*
 * website-scrap-engine
 * Version: not recorded
 * Configurable website scraper in TypeScript
 * 366 lines • 13.5 kB • JavaScript
 */
import URI from 'urijs';
import * as path from 'node:path';
import { escapePath, isUrlHttp, orderUrlSearch, simpleHashString } from './util.js';
import { error as log } from './logger/logger.js';
/**
 * Kinds of resources handled by the scraper.
 *
 * The object maps in both directions: member name to numeric value
 * (ResourceType.Html === 2) and numeric value back to member name
 * (ResourceType[2] === 'Html').
 */
export const ResourceType = {};
{
    const members = [
        // Binary resource, not parsed nor processed
        ['Binary', 1],
        // Html resource
        ['Html', 2],
        // Css resource
        ['Css', 3],
        // Inline css resource in html,
        // currently only style blocks and style attributes are processed
        ['CssInline', 4],
        // Very limited support of site-maps, urls in it are not replaced.
        ['SiteMap', 5],
        // Standalone svg image
        // https://github.com/website-local/website-scrap-engine/issues/3
        ['Svg', 6],
        // Large binary, which would be streamed directly to disk,
        // Resource.type must be explicitly set to this value to use streaming.
        // See downloadStreamingResource and
        // https://github.com/website-local/website-scrap-engine/issues/2
        ['StreamingBinary', 7]
    ];
    for (const [memberName, memberValue] of members) {
        // forward mapping: name -> value
        ResourceType[memberName] = memberValue;
        // reverse mapping: value -> name
        ResourceType[memberValue] = memberName;
    }
}
/**
 * Build a shallow copy of a resource that keeps only values which are safe
 * to hand over by cloning.
 *
 * - Non-object values (including null) are copied as-is.
 * - `meta` is copied one level deep: its `headers` entry and every
 *   non-object (or null) entry are kept, other object entries are dropped.
 * - `body` is kept only when it is an ArrayBuffer, a typed array / DataView
 *   or a Buffer (string bodies are non-object values and are always kept).
 * - Every other object-valued property (for example parsed uri objects)
 *   is omitted from the copy.
 *
 * @param res the resource to copy, it is not modified
 * @return a new plain object containing the retained properties
 */
export function prepareResourceForClone(res) {
    const clone = {};
    for (const key of Object.keys(res)) {
        const value = Reflect.get(res, key);
        // typeof null === 'object', but null is a plain value: copy it
        // instead of dropping it (e.g. `encoding: null` of binary resources)
        // or crashing in Object.keys(null) when `meta` is null.
        if (value === null || typeof value !== 'object') {
            Reflect.set(clone, key, value);
            continue;
        }
        if (key === 'meta') {
            const props = {};
            for (const prop of Object.keys(value)) {
                const item = value[prop];
                // headers can be cloned safely
                if (prop === 'headers' || item === null || typeof item !== 'object') {
                    props[prop] = item;
                }
            }
            clone[key] = props;
        }
        else if (key === 'body' && (value instanceof ArrayBuffer ||
            ArrayBuffer.isView(value) ||
            Buffer.isBuffer(value))) {
            clone[key] = value;
        }
        // any other object-valued property is intentionally left out
    }
    return clone;
}
/**
 * Generate save path from an absolute HTTP/HTTPS uri or a file uri
 *
 * For file uris the path is the uri pathname with the leading
 * localSrcRoot-sized prefix cut off; for other uris it is the host name
 * joined with the escaped uri path.
 *
 * @param uri the absolute uri, relative uris are only accepted
 * when the protocol is file
 * @param isHtml should the savePath endsWith .html
 * @param keepSearch keep url search params in file name,
 * searches longer than 43 characters are replaced with a hash
 * @param localSrcRoot local source path to download from,
 * required for file uris
 * @return string must return non-empty string
 * @throws Error if the uri is relative and not a file uri,
 * or if a file uri is given without localSrcRoot
 */
export function generateSavePath(uri, isHtml, keepSearch, localSrcRoot) {
    // searches longer than this are replaced with a hash of the ordered search
    const maxSearchLength = 43;
    const isFileUri = uri.protocol() === 'file';
    if (!isFileUri && uri.is('relative')) {
        throw new Error('generateSavePath: uri can not be relative: '
            + uri.toString());
    }
    let savePath;
    if (isFileUri) {
        if (!localSrcRoot) {
            throw new Error('generateSavePath: using file protocol without localSrcRoot: '
                + uri.toString());
        }
        if (process.platform === 'win32' && /^[a-z]:\//i.test(localSrcRoot)) {
            // windows absolute fix: the pathname carries one extra leading
            // character in front of the drive letter
            savePath = uri.pathname().slice(localSrcRoot.length + 1);
            if (savePath[0] === '/') {
                savePath = savePath.slice(1);
            }
        }
        else {
            savePath = uri.pathname().slice(localSrcRoot.length);
        }
    }
    else {
        savePath = path.join(uri.hostname() || '', escapePath(uri.path()));
    }
    if (isHtml && !savePath.endsWith('.html')) {
        const isDirectoryLike = (isFileUri && savePath === '') ||
            savePath.endsWith('/') || savePath.endsWith('\\');
        if (isDirectoryLike) {
            savePath += 'index.html';
        }
        else if (savePath.endsWith('.htm')) {
            // .htm -> .html
            savePath += 'l';
        }
        else {
            savePath += '.html';
        }
    }
    const rawSearch = keepSearch ? uri.search() : '';
    if (rawSearch) {
        let suffix;
        if (rawSearch.length > maxSearchLength) {
            const ordered = orderUrlSearch(rawSearch);
            const hashed = simpleHashString(ordered);
            log.debug('search too long, replacing with hash', ordered, hashed);
            // avoid too long search
            suffix = '_' + hashed;
        }
        else {
            // order it
            suffix = escapePath(orderUrlSearch(rawSearch));
        }
        // insert the search in front of the extension, if there is one
        const ext = path.extname(savePath);
        savePath = ext
            ? savePath.slice(0, -ext.length) + suffix + ext
            : savePath + suffix;
    }
    return savePath;
}
/**
 * Turn a save path into a file url by prefixing it with file:///,
 * converting every backslash to a forward slash on the way.
 * @param savePath the save path, may contain backslashes
 * @return the file url of the save path
 */
export const urlOfSavePath = (savePath) => {
    const forwardSlashed = savePath.includes('\\')
        ? savePath.replace(/\\/g, '/')
        : savePath;
    return `file:///${forwardSlashed}`;
};
/**
 * Validate an absolute uri: its protocol must be http, https, file or the
 * protocol of the referring uri, and non-file uris must have a host.
 * Every failed check is logged as a warning.
 * @param uri {@link RawResource.uri}
 * @param refUri {@link RawResource.refUri}
 * @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
 * @param url {@link CreateResourceArgument.url}, only used for logging
 * @param refUrl {@link CreateResourceArgument.refUrl}, only used for logging
 * @param type {@link CreateResourceArgument.type}, only used for logging
 * @throws Error if {@link skipReplacePathError} is falsy and a check fails
 * @return true if {@link skipReplacePathError} is truthy and a check fails,
 * false if all checks pass
 */
export function checkAbsoluteUri(uri, refUri, skipReplacePathError, url, refUrl, type) {
    const scheme = uri.protocol().toLowerCase();
    let failed = false;
    // log the problem, then either throw or remember the failure
    const reject = (logMessage, errorMessage) => {
        log.warn(logMessage, scheme, url, refUrl, type);
        if (!skipReplacePathError) {
            throw new Error(errorMessage);
        }
        failed = true;
    };
    const schemeSupported = scheme === 'http' ||
        scheme === 'https' ||
        scheme === 'file' ||
        scheme === refUri.protocol().toLowerCase();
    if (!schemeSupported) {
        reject('protocol not supported, skipping', `protocol ${scheme} not supported`);
    }
    const hostMissing = scheme !== 'file' && !uri.host();
    if (hostMissing) {
        reject('empty host for non-file uri not supported, skipping', 'empty host for non-file uri not supported');
    }
    return failed;
}
const FILE_PROTOCOL_PREFIX = 'file:///';
/**
 * Resolve a url found in a local (file protocol) source into an absolute
 * file url located under localSrcRoot.
 *
 * Urls accepted by isUrlHttp are returned untouched. Otherwise the url is
 * resolved as follows (localSrcRoot is used without a leading slash and
 * without a trailing slash):
 * - urls starting with // or / are treated as paths below localSrcRoot
 * - urls already starting with file:/// are returned as-is
 * - anything else is resolved relative to refUrl, keeping pathname and hash
 *
 * @param url the url to resolve
 * @param refUrl the url of the referring resource
 * @param localSrcRoot local source path to download from
 * @param skipReplacePathError return an empty string instead of throwing
 * when the url can not be resolved
 * @throws Error if skipReplacePathError is falsy and localSrcRoot is missing,
 * or a file url / file refUrl does not start with localSrcRoot
 * @return the resolved url, or an empty string on a skipped error
 */
export function resolveFileUrl(url, refUrl, localSrcRoot, skipReplacePathError) {
    if (isUrlHttp(url)) {
        return url;
    }
    let failure;
    if (!localSrcRoot) {
        failure = 'can not use file url without localSrcRoot';
    }
    else if (localSrcRoot[0] === '/') {
        // unix absolute path
        localSrcRoot = localSrcRoot.slice(1);
    }
    // true for file urls which are not located under localSrcRoot
    const isOutsideRoot = (candidate) => candidate.startsWith(FILE_PROTOCOL_PREFIX) &&
        !candidate.slice(FILE_PROTOCOL_PREFIX.length).startsWith(localSrcRoot);
    if (!failure && localSrcRoot) {
        if (isOutsideRoot(url)) {
            failure = 'file url not starting with localSrcRoot is forbidden';
        }
        else if (isOutsideRoot(refUrl)) {
            failure = 'file refUrl not starting with localSrcRoot is forbidden';
        }
    }
    if (failure) {
        log.warn(failure, url, refUrl, localSrcRoot);
        if (skipReplacePathError) {
            return '';
        }
        throw new Error(failure);
    }
    if (!localSrcRoot) {
        // nothing left of the root after dropping the leading slash
        return url;
    }
    const root = localSrcRoot.endsWith('/') ? localSrcRoot.slice(0, -1) : localSrcRoot;
    if (url.startsWith('//')) {
        return FILE_PROTOCOL_PREFIX + root + url.slice(1);
    }
    if (url.startsWith('/')) {
        return FILE_PROTOCOL_PREFIX + root + url;
    }
    if (url.startsWith(FILE_PROTOCOL_PREFIX)) {
        return url;
    }
    // relative url: resolve against the part of refUrl below the root
    const refBelowRoot = URI(FILE_PROTOCOL_PREFIX +
        refUrl.slice(FILE_PROTOCOL_PREFIX.length + root.length));
    const resolved = URI(url).absoluteTo(refBelowRoot);
    return FILE_PROTOCOL_PREFIX + root + resolved.pathname() + resolved.hash();
}
/**
* Create a resource from a url found in a referring resource.
*
* The url is resolved against refUrl into an absolute uri, validated with
* {@link checkAbsoluteUri}, and a save path plus a replace path relative to
* the save path of the referring resource are derived from it.
* If resolving or validation fails while skipReplacePathError is truthy,
* the raw url is used as both savePath and replacePath and the resource is
* flagged with shouldBeDiscardedFromDownload.
*
* @param type {@link CreateResourceArgument.type}
* @param depth {@link CreateResourceArgument.depth}
* @param url {@link CreateResourceArgument.rawUrl},
* kept unchanged as rawUrl of the created resource
* @param refUrl {@link CreateResourceArgument.refUrl}
* @param refSavePath {@link CreateResourceArgument.refSavePath},
* generated from refUrl if falsy
* @param refType {@link CreateResourceArgument.refType}
* @param localRoot {@link CreateResourceArgument.localRoot}
* @param localSrcRoot {@link CreateResourceArgument.localSrcRoot}
* @param encoding {@link CreateResourceArgument.encoding},
* if falsy, null is used for binary resources and utf8 for all others
* @param keepSearch {@link CreateResourceArgument.keepSearch},
* forced to false if url or refUrl is a file url
* @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
* @param generateSavePathFn {@link CreateResourceArgument.generateSavePathFn},
* defaults to {@link generateSavePath}
* @return the resource
*/
export function createResource({ type, depth, url, refUrl, refSavePath, refType, localRoot, localSrcRoot, encoding, keepSearch, skipReplacePathError, generateSavePathFn }) {
// keep the url exactly as it was passed in
const rawUrl = url;
const refUri = URI(refUrl);
let replacePathHasError = false;
if (url.startsWith(FILE_PROTOCOL_PREFIX) ||
refUrl.startsWith(FILE_PROTOCOL_PREFIX)) {
// file url should never have search
keepSearch = false;
url = resolveFileUrl(url, refUrl, localSrcRoot, skipReplacePathError);
// an empty result means resolving failed and the error was skipped
if (!url) {
replacePathHasError = true;
url = rawUrl;
}
}
if (!replacePathHasError && url.startsWith('//')) {
// url with the same protocol
url = refUri.protocol() + ':' + url;
}
else if (!replacePathHasError && url[0] === '/') {
// absolute path
url = refUri.protocol() + '://' + refUri.host() + url;
}
let uri = URI(url);
// resolve remaining relative urls against the referring uri
if (!replacePathHasError && uri.is('relative')) {
uri = uri.absoluteTo(refUri);
url = uri.toString();
}
// throws on failure unless skipReplacePathError is set
if (!replacePathHasError &&
checkAbsoluteUri(uri, refUri, skipReplacePathError, url, refUrl, type)) {
replacePathHasError = true;
}
// computed before the search is removed from uri below,
// the hash is never part of the downloadLink
let downloadLink;
if (uri.protocol() === 'file') {
// file downloadLink contains no search
downloadLink = uri.clone().search('').hash('').toString();
}
else {
downloadLink = uri.clone().hash('').toString();
}
const implGenerateSavePath = generateSavePathFn || generateSavePath;
// make savePath and replaceUri
const savePath = replacePathHasError ? rawUrl : implGenerateSavePath(uri, type === ResourceType.Html, keepSearch, localSrcRoot);
if (!refSavePath) {
// the save path of the referring resource never keeps the search
refSavePath = implGenerateSavePath(refUri, refType === ResourceType.Html, false, localSrcRoot);
}
// relative path from the save path of the referring resource to this one
const replaceUri = replacePathHasError ? URI(rawUrl) :
URI(urlOfSavePath(savePath)).relativeTo(urlOfSavePath(refSavePath));
// recover hash
if (uri.hash()) {
replaceUri.hash(uri.hash());
}
// remove search if not keepSearch
if (!keepSearch && uri.search()) {
// this modifies uri in place, url is updated to match
uri.search('');
url = uri.toString();
}
const resource = {
type,
depth,
encoding: encoding || (type === ResourceType.Binary ? null : 'utf8'),
url,
rawUrl,
downloadLink,
refUrl,
refSavePath,
savePath,
localRoot,
replacePath: replaceUri.toString(),
createTimestamp: Date.now(),
body: undefined,
meta: {},
uri,
refUri,
replaceUri,
host: uri.hostname()
};
if (replacePathHasError) {
// urls with parser errors should never be downloaded
resource.shouldBeDiscardedFromDownload = true;
}
return resource;
}
/**
 * Fill in the derived fields a resource may be missing, modifying the
 * given object in place:
 * - uri, refUri and replaceUri are parsed from url, refUrl and replacePath
 * - host is taken from the uri
 * - waitTime and downloadTime are computed from the timestamps
 * - a binary body (ArrayBuffer, typed array or DataView) becomes a Buffer
 * Fields which already hold a truthy value are left untouched.
 * @param res the resource to normalize
 * @return the same object as res
 */
export function normalizeResource(res) {
    const target = res;
    if (!target.uri) {
        target.uri = URI(target.url);
    }
    if (!target.refUri) {
        target.refUri = URI(target.refUrl);
    }
    if (!target.replaceUri) {
        target.replaceUri = URI(target.replacePath);
    }
    if (!target.host) {
        const parsedUri = target.uri;
        target.host = parsedUri == null ? undefined : parsedUri.hostname();
    }
    const startedAt = target.downloadStartTimestamp;
    if (startedAt) {
        // time between creation and start of the download
        if (!target.waitTime) {
            target.waitTime = startedAt - target.createTimestamp;
        }
        // time between start and finish of the download
        if (!target.downloadTime && target.finishTimestamp) {
            target.downloadTime = target.finishTimestamp - startedAt;
        }
    }
    const { body } = target;
    if (body instanceof ArrayBuffer || body instanceof Uint8Array) {
        target.body = Buffer.from(body);
    }
    else if (ArrayBuffer.isView(body)) {
        // other views: wrap exactly the bytes the view covers
        target.body = Buffer.from(body.buffer, body.byteOffset, body.byteLength);
    }
    return target;
}
//# sourceMappingURL=resource.js.map