website-scrap-engine
Version:
Configurable website scraper in typescript
279 lines • 9.07 kB
TypeScript
import URI from 'urijs';
import type { IncomingHttpHeaders } from 'node:http';
import type { CheerioStatic } from './types.js';
/**
* Types of resource; the comment on each member describes
* how resources of that type are handled.
*/
export declare enum ResourceType {
/**
* Binary resource, neither parsed nor processed
*/
Binary = 1,
/**
* Html resource
*/
Html = 2,
/**
* Css resource
*/
Css = 3,
/**
* Inline css resource in html,
* currently only style blocks and style attributes are processed
*/
CssInline = 4,
/**
* Very limited support for site-maps, urls inside them are not replaced.
*/
SiteMap = 5,
/**
* Standalone svg image
* https://github.com/website-local/website-scrap-engine/issues/3
*/
Svg = 6,
/**
* Large binary, which is streamed directly to disk.
* {@link Resource.type} must be explicitly set to this value to use streaming.
* @see downloadStreamingResource
* @see https://github.com/website-local/website-scrap-engine/issues/2
*/
StreamingBinary = 7
}
/**
* Character encoding of the content of a resource,
* see {@link RawResource.encoding}.
*
* null means no character encoding, which is what
* {@link ResourceType.Binary} resources should use.
*/
export type ResourceEncoding = null | BufferEncoding;
/**
* Accepted representations of the downloaded content of a resource,
* see {@link RawResource.body}.
*/
export type ResourceBody = Buffer | ArrayBuffer | ArrayBufferView | string;
/**
* Base shape of a resource: its type, urls, save paths, timestamps,
* downloaded body and meta info.
*
* {@link Resource} extends this interface with optional {@link URI} instances.
*/
export interface RawResource {
/**
* The type of this resource
*/
type: ResourceType | number;
/**
* Recursive depth from the root resource
*/
depth: number;
/**
* Character encoding of the content of this resource.
*
* For the {@link ResourceType.Binary} type, this property should be null
*/
encoding: ResourceEncoding;
/**
* URL of this resource
*
* Used in de-duplicating and relative-path resolving.
* May not be the real url.
*/
url: string;
/**
* The value of {@link RawResource.url} when this object was created,
* should never change.
*/
readonly rawUrl: string;
/**
* The absolute url to download from.
*/
downloadLink: string;
/**
* The url of the {@link RawResource} creating this resource.
*
* Should also be the referer url
*/
refUrl: string;
/**
* The relative path where this resource should be saved to
*/
savePath: string;
/**
* The relative path where the {@link RawResource}
* creating this resource should be saved to.
* This is used to generate the {@link RawResource.replacePath}
* See https://github.com/website-local/website-scrap-engine/issues/139
*/
refSavePath: string;
/**
* The absolute path which {@link RawResource.savePath} is relative to
*/
localRoot: string;
/**
* The path which should replace the url of the link in
* the {@link RawResource} creating this resource,
* making the link work after being saved to local disk.
*/
replacePath: string;
/**
* Timestamp of the creation of this object.
*/
createTimestamp: number;
/**
* Timestamp when downloading starts.
*/
downloadStartTimestamp?: number;
/**
* Time waited before downloading started:
* {@link RawResource.downloadStartTimestamp} - {@link RawResource.createTimestamp}
*/
waitTime?: number;
/**
* Timestamp after downloading finished.
*/
finishTimestamp?: number;
/**
* Time taken by downloading:
* {@link RawResource.finishTimestamp} - {@link RawResource.downloadStartTimestamp}
*/
downloadTime?: number;
/**
* Downloaded content, if downloaded
*/
body?: ResourceBody;
/**
* Redirected url after being downloaded
*/
redirectedUrl?: string;
/**
* Redirected {@link RawResource.savePath} after being downloaded,
* if this is provided, the saveToDisk life cycle would be affected.
*
* https://github.com/website-local/website-scrap-engine/issues/139
* https://github.com/website-local/website-scrap-engine/issues/157
* https://github.com/website-local/website-scrap-engine/issues/171
*/
redirectedSavePath?: string;
/**
* Meta info of this resource: the parsed document, the response headers
* and any other custom entries.
*/
meta: {
/**
* Parsed html content for {@link RawResource.type} === {@link ResourceType.Html}
* or {@link RawResource.type} === {@link ResourceType.Svg}
* after being downloaded and parsed, content may differ from {@link RawResource.body}
*/
doc?: CheerioStatic;
/**
* Response headers after download
*/
headers?: IncomingHttpHeaders;
/**
* Other custom meta info for this resource
*/
[key: string]: unknown;
};
}
/**
* A {@link RawResource} with optional {@link URI} instances
* mirroring its string urls, plus the download-discard flag.
*/
export interface Resource extends RawResource {
/**
* If present, this should be the {@link URI} instance
* with the same content as {@link RawResource.url}
*/
uri?: URI;
/**
* If present, this should be the {@link URI} instance
* with the same content as {@link RawResource.refUrl}
*/
refUri?: URI;
/**
* If present, this should be the {@link URI} instance
* with the same content as {@link RawResource.replacePath}
*/
replaceUri?: URI;
/**
* Host name of this resource, {@link Resource.uri}.hostname()
*/
host?: string;
/**
* True if the url of this resource should be replaced
* but the resource itself should not be downloaded
*/
shouldBeDiscardedFromDownload?: boolean;
}
/**
* Convert a {@link Resource} into a {@link RawResource}.
*
* NOTE(review): the implementation is not visible in this declaration file;
* presumably this drops the {@link URI} instances declared on {@link Resource}
* so the result can be cloned - confirm against the implementation.
* @param res the resource to convert
* @return the resource as a {@link RawResource}
*/
export declare function prepareResourceForClone(res: Resource): RawResource;
/**
* The argument type of {@link createResource}
*/
export interface CreateResourceArgument {
/**
* See {@link RawResource.type}
*/
type: ResourceType;
/**
* See {@link RawResource.depth}
*/
depth: number;
/**
* See {@link RawResource.rawUrl}
*/
url: string;
/**
* See {@link RawResource.refUrl}
*/
refUrl: string;
/**
* See {@link RawResource.refSavePath}
*/
refSavePath?: string;
/**
* The {@link RawResource.type} of the {@link RawResource} creating this resource.
*/
refType?: ResourceType;
/**
* See {@link RawResource.localRoot}
*/
localRoot: string;
/**
* Local source path to download from,
* if empty or undefined, file:// urls would not be accepted
* https://github.com/website-local/website-scrap-engine/issues/126
*/
localSrcRoot?: string;
/**
* See {@link RawResource.encoding}
*/
encoding?: ResourceEncoding;
/**
* Keep url search params in the file name
* in {@link Resource.replacePath} and {@link Resource.savePath}
* See commit c8e270c6421ca8a9d1c519737949ad04c09fcb99
*/
keepSearch?: boolean;
/**
* True to skip replacePath processing
* in case of a parser error
* https://github.com/website-local/website-scrap-engine/issues/107
*/
skipReplacePathError?: boolean;
/**
* Set this to use a custom implementation of {@link generateSavePath}
*/
generateSavePathFn?: GenerateSavePathFn | void;
}
/**
* Generate the save path from an HTTP/HTTPS absolute uri
* @param uri the HTTP/HTTPS absolute uri
* @param isHtml whether the save path should end with .html
* @param keepSearch keep url search params in the file name
* @param localSrcRoot local source path to download from
* @return the save path, must be a non-empty string
*/
export declare function generateSavePath(uri: URI, isHtml?: boolean, keepSearch?: boolean, localSrcRoot?: string): string;
/**
* Signature of {@link generateSavePath}, which custom implementations passed as
* {@link CreateResourceArgument.generateSavePathFn} have to match.
*/
export type GenerateSavePathFn = typeof generateSavePath;
/**
* Derive a url string from a save path.
*
* NOTE(review): the exact transformation is not visible in this
* declaration file - confirm against the implementation.
* @param savePath the save path to convert
* @return the url string for the given save path
*/
export declare const urlOfSavePath: (savePath: string) => string;
/**
* Check an absolute uri
*
* NOTE(review): the return value when the check passes is not documented here,
* presumably false - confirm against the implementation.
* @param uri {@link Resource.uri}
* @param refUri {@link Resource.refUri}
* @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
* @param url {@link CreateResourceArgument.url}
* @param refUrl {@link CreateResourceArgument.refUrl}
* @param type {@link CreateResourceArgument.type}
* @throws Error if skipReplacePathError === false and the check fails
* @return true if skipReplacePathError === true and the check fails
*/
export declare function checkAbsoluteUri(uri: URI, refUri: URI, skipReplacePathError: boolean | undefined, url: string, refUrl: string, type: ResourceType): boolean;
/**
* Resolve a file url.
*
* NOTE(review): the implementation is not visible in this declaration file;
* presumably url is resolved against refUrl and checked against localSrcRoot
* - confirm against the implementation.
* @param url {@link CreateResourceArgument.url}
* @param refUrl {@link CreateResourceArgument.refUrl}
* @param localSrcRoot {@link CreateResourceArgument.localSrcRoot}
* @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
* @return the resolved url
*/
export declare function resolveFileUrl(url: string, refUrl: string, localSrcRoot?: string, skipReplacePathError?: boolean): string;
/**
* Create a resource from a single {@link CreateResourceArgument} object
* @param type {@link CreateResourceArgument.type}
* @param depth {@link CreateResourceArgument.depth}
* @param url {@link CreateResourceArgument.url}
* @param refUrl {@link CreateResourceArgument.refUrl}
* @param refSavePath {@link CreateResourceArgument.refSavePath}
* @param refType {@link CreateResourceArgument.refType}
* @param localRoot {@link CreateResourceArgument.localRoot}
* @param localSrcRoot {@link CreateResourceArgument.localSrcRoot}
* @param encoding {@link CreateResourceArgument.encoding}
* @param keepSearch {@link CreateResourceArgument.keepSearch}
* @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
* @param generateSavePathFn {@link CreateResourceArgument.generateSavePathFn}
* @return the resource
*/
export declare function createResource({ type, depth, url, refUrl, refSavePath, refType, localRoot, localSrcRoot, encoding, keepSearch, skipReplacePathError, generateSavePathFn }: CreateResourceArgument): Resource;
/**
* Convert a {@link RawResource} into a {@link Resource}.
*
* NOTE(review): the implementation is not visible in this declaration file;
* presumably this is the counterpart of {@link prepareResourceForClone} and
* re-creates the {@link URI} instances declared on {@link Resource}
* - confirm against the implementation.
* @param res the raw resource to convert
* @return the resource as a {@link Resource}
*/
export declare function normalizeResource(res: RawResource): Resource;
//# sourceMappingURL=resource.d.ts.map