UNPKG

website-scrap-engine

Version:
279 lines 9.07 kB
import URI from 'urijs';
import type { IncomingHttpHeaders } from 'node:http';
import type { CheerioStatic } from './types.js';

/**
 * The types a resource can have.
 */
export declare enum ResourceType {
  /** Binary resource; it is neither parsed nor processed. */
  Binary = 1,
  /** Html resource. */
  Html = 2,
  /** Css resource. */
  Css = 3,
  /**
   * Css embedded in html.
   * Currently only style blocks and style attributes are processed.
   */
  CssInline = 4,
  /** Site-map with very limited support: the urls in it are not replaced. */
  SiteMap = 5,
  /**
   * Standalone svg image.
   * @see https://github.com/website-local/website-scrap-engine/issues/3
   */
  Svg = 6,
  /**
   * Large binary which would be streamed directly to disk.
   * {@link Resource.type} must be explicitly set to this value to use streaming.
   * @see downloadStreamingResource
   * @see https://github.com/website-local/website-scrap-engine/issues/2
   */
  StreamingBinary = 7
}

/**
 * Character encoding of a resource's content, or null when there is none
 * (see {@link RawResource.encoding}).
 */
export type ResourceEncoding = null | BufferEncoding;

/**
 * The forms the downloaded content of a resource may take
 * (see {@link RawResource.body}).
 */
export type ResourceBody = Buffer | ArrayBuffer | ArrayBufferView | string;

/**
 * A resource described with plain values only; {@link Resource} extends it
 * with optional {@link URI} instances.
 */
export interface RawResource {
  /** The type of this resource. */
  type: ResourceType | number;

  /** Recursive depth, counted from the root resource. */
  depth: number;

  /**
   * Character encoding of this resource's content.
   *
   * Should be null for the {@link ResourceType.Binary} type.
   */
  encoding: ResourceEncoding;

  /**
   * URL of the resource.
   *
   * Used for de-duplicating and for resolving relative paths.
   * May not be the real url.
   */
  url: string;

  /**
   * The value of {@link RawResource.url} when this object was created;
   * should never change.
   */
  readonly rawUrl: string;

  /** The absolute url to download. */
  downloadLink: string;

  /**
   * The url of the {@link RawResource} creating this resource.
   *
   * Should also be the referer url.
   */
  refUrl: string;

  /** The relative path where this resource should be saved to. */
  savePath: string;

  /**
   * The relative path where the {@link RawResource} creating this resource
   * should be saved to.
   * This is used to generate the {@link RawResource.replacePath}.
   * @see https://github.com/website-local/website-scrap-engine/issues/139
   */
  refSavePath: string;

  /** The absolute path which {@link RawResource.savePath} is relative to. */
  localRoot: string;

  /**
   * The path that should replace the url of the link in the
   * {@link RawResource} creating this resource, making the link work
   * after being saved to local disk.
   */
  replacePath: string;

  /** Timestamp of the creation of this object. */
  createTimestamp: number;

  /** Timestamp when downloading starts. */
  downloadStartTimestamp?: number;

  /**
   * {@link RawResource.downloadStartTimestamp} -
   * {@link RawResource.createTimestamp}
   */
  waitTime?: number;

  /** Timestamp after downloading finished. */
  finishTimestamp?: number;

  /**
   * {@link RawResource.finishTimestamp} -
   * {@link RawResource.downloadStartTimestamp}
   */
  downloadTime?: number;

  /** Downloaded content, if downloaded. */
  body?: ResourceBody;

  /** Redirected url after downloaded. */
  redirectedUrl?: string;

  /**
   * Redirected {@link RawResource.savePath} after downloaded.
   * If this is provided, the saveToDisk life cycle would be affected.
   *
   * @see https://github.com/website-local/website-scrap-engine/issues/139
   * @see https://github.com/website-local/website-scrap-engine/issues/157
   * @see https://github.com/website-local/website-scrap-engine/issues/171
   */
  redirectedSavePath?: string;

  /** Meta info attached to this resource. */
  meta: {
    /**
     * Parsed html content, for {@link RawResource.type} equal to
     * {@link ResourceType.Html} or {@link ResourceType.Svg}.
     * After being downloaded and parsed, the content may differ from
     * {@link RawResource.body}.
     */
    doc?: CheerioStatic;

    /** Response headers after download. */
    headers?: IncomingHttpHeaders;

    /** Other custom meta info for the resource. */
    [key: string]: unknown;
  };
}

/**
 * A {@link RawResource} extended with optional {@link URI} instances
 * and related optional fields.
 */
export interface Resource extends RawResource {
  /**
   * If present, this should be the {@link URI} instance
   * holding the same content as {@link RawResource.url}.
   */
  uri?: URI;

  /**
   * If present, this should be the {@link URI} instance
   * holding the same content as {@link RawResource.refUrl}.
   */
  refUri?: URI;

  /**
   * If present, this should be the {@link URI} instance
   * holding the same content as {@link RawResource.replacePath}.
   */
  replaceUri?: URI;

  /** {@link Resource.uri}.hostname() */
  host?: string;

  /** True if the url of this resource should be replaced and not downloaded. */
  shouldBeDiscardedFromDownload?: boolean;
}

/**
 * Produces a {@link RawResource} from a {@link Resource}.
 * The implementation is not part of this declaration file.
 * @param res the resource to prepare
 * @returns the resulting raw resource
 */
export declare function prepareResourceForClone(res: Resource): RawResource;

/**
 * The argument type of {@link createResource}.
 */
export interface CreateResourceArgument {
  /** {@link RawResource.type} */
  type: ResourceType;

  /** {@link RawResource.depth} */
  depth: number;

  /** {@link RawResource.rawUrl} */
  url: string;

  /** {@link RawResource.refUrl} */
  refUrl: string;

  /** {@link RawResource.refSavePath} */
  refSavePath?: string;

  /**
   * The {@link RawResource.type} of the {@link RawResource}
   * creating this resource.
   */
  refType?: ResourceType;

  /** {@link RawResource.localRoot} */
  localRoot: string;

  /**
   * Local source path to download from.
   * If empty or undefined, file:// urls would not be accepted.
   * @see https://github.com/website-local/website-scrap-engine/issues/126
   */
  localSrcRoot?: string;

  /** {@link RawResource.encoding} */
  encoding?: ResourceEncoding;

  /**
   * Keep url search params in the file name
   * in {@link Resource.replacePath} and {@link Resource.savePath}.
   * See commit c8e270c6421ca8a9d1c519737949ad04c09fcb99
   */
  keepSearch?: boolean;

  /**
   * True to skip replacePath processing in case of parser error.
   * @see https://github.com/website-local/website-scrap-engine/issues/107
   */
  skipReplacePathError?: boolean;

  /** Set this to use a custom implementation of {@link generateSavePath}. */
  generateSavePathFn?: GenerateSavePathFn | void;
}

/**
 * Generate a save path from an HTTP/HTTPS absolute uri.
 * @param uri the HTTP/HTTPS absolute uri
 * @param isHtml whether the save path should end with .html
 * @param keepSearch keep url search params in the file name
 * @param localSrcRoot local source path to download from
 * @returns the save path; must be a non-empty string
 */
export declare function generateSavePath(
  uri: URI,
  isHtml?: boolean,
  keepSearch?: boolean,
  localSrcRoot?: string
): string;

/**
 * Signature of {@link generateSavePath}, for custom implementations
 * supplied via {@link CreateResourceArgument.generateSavePathFn}.
 */
export type GenerateSavePathFn = typeof generateSavePath;

/**
 * Maps a save path to a url string.
 * The exact mapping is defined by the implementation, which is not part
 * of this declaration file.
 */
export declare const urlOfSavePath: (savePath: string) => string;

/**
 * Check an absolute uri.
 * @param uri {@link Resource.uri}
 * @param refUri {@link Resource.refUri}
 * @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
 * @param url {@link CreateResourceArgument.url}
 * @param refUrl {@link CreateResourceArgument.refUrl}
 * @param type {@link CreateResourceArgument.type}
 * @throws Error if skipReplacePathError === false and the check fails
 * @returns true if skipReplacePathError === true and the check fails
 */
export declare function checkAbsoluteUri(
  uri: URI,
  refUri: URI,
  skipReplacePathError: boolean | undefined,
  url: string,
  refUrl: string,
  type: ResourceType
): boolean;

/**
 * Resolves a file url; the resolution rules are defined by the
 * implementation, which is not part of this declaration file.
 * @param url {@link CreateResourceArgument.url}
 * @param refUrl {@link CreateResourceArgument.refUrl}
 * @param localSrcRoot {@link CreateResourceArgument.localSrcRoot}
 * @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
 * @returns the resolved url as a string
 */
export declare function resolveFileUrl(
  url: string,
  refUrl: string,
  localSrcRoot?: string,
  skipReplacePathError?: boolean
): string;

/**
 * Create a resource.
 * @param type {@link CreateResourceArgument.type}
 * @param depth {@link CreateResourceArgument.depth}
 * @param url {@link CreateResourceArgument.url}
 * @param refUrl {@link CreateResourceArgument.refUrl}
 * @param refSavePath {@link CreateResourceArgument.refSavePath}
 * @param refType {@link CreateResourceArgument.refType}
 * @param localRoot {@link CreateResourceArgument.localRoot}
 * @param localSrcRoot {@link CreateResourceArgument.localSrcRoot}
 * @param encoding {@link CreateResourceArgument.encoding}
 * @param keepSearch {@link CreateResourceArgument.keepSearch}
 * @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
 * @param generateSavePathFn {@link CreateResourceArgument.generateSavePathFn}
 * @returns the resource
 */
export declare function createResource({
  type,
  depth,
  url,
  refUrl,
  refSavePath,
  refType,
  localRoot,
  localSrcRoot,
  encoding,
  keepSearch,
  skipReplacePathError,
  generateSavePathFn
}: CreateResourceArgument): Resource;

/**
 * Produces a {@link Resource} from a {@link RawResource}.
 * The implementation is not part of this declaration file.
 * @param res the raw resource to normalize
 * @returns the resulting resource
 */
export declare function normalizeResource(res: RawResource): Resource;
//# sourceMappingURL=resource.d.ts.map