website-scrap-engine
Version:
Configurable website scraper in typescript
151 lines • 4.98 kB
TypeScript
import type { RequestError, RetryFunction, TimeoutError } from 'got';
import type { ResourceEncoding, ResourceType } from './resource.js';
import type { ProcessingLifeCycle, RequestOptions } from './life-cycle/types.js';
import { configureLogger } from './logger/config-logger.js';
import type { DownloaderWithMeta } from './downloader/types.js';
import type { SourceDefinition } from './sources.js';
import type { CheerioOptionsInterface } from './types.js';
/**
* Extra options for custom life cycle.
*
* Any string key is accepted; values are limited to string, number,
* boolean or void (undefined). The keys declared below are the ones
* this type names explicitly.
*/
export interface StaticDownloadMeta extends Record<string, string | number | boolean | void> {
/**
* Usually '</html>' or '</body>', but any string is accepted by the type.
* NOTE(review): presumably a marker whose absence flags a truncated
* html document - confirm against the life cycle step that reads it.
*/
detectIncompleteHtml?: '</html>' | '</body>' | string;
/**
* NOTE(review): presumably enables a warning when a resource expected
* to be html is not - confirm against the life cycle step that reads it.
*/
warnForNonHtml?: boolean;
}
/**
* Options which should not be changed at runtime and are safe to clone.
*
* Keep this object free of functions and class instances;
* those belong in {@link DownloadOptions}.
*/
export interface StaticDownloadOptions {
/**
* @see Resource.localRoot
*/
localRoot: string;
/**
* Local source path to download from.
*
* If empty or undefined, file:// urls are not accepted.
* The value itself must not start with file://
*
* Note: must use slash (/) as the path separator on windows, see
* https://github.com/website-local/website-scrap-engine/issues/126
*/
localSrcRoot?: string;
/**
* Maximum recursive depth
* @see Resource.depth
*/
maxDepth: number;
/**
* Downloading concurrency
*/
concurrency: number;
/**
* Default encoding of a resource, keyed by resource type.
*
* The encoding of an individual resource can still be changed at
* {@link ProcessingLifeCycle.processBeforeDownload}
*/
encoding: Record<ResourceType, ResourceEncoding>;
/**
* Upper bound for the worker pool size:
*
* WorkerPool.coreSize = Math.min(
* {@link concurrency},
* {@link workerCount}
* )
*/
workerCount?: number;
/**
* Minimum concurrency, for {@link DownloadOptions.adjustConcurrencyFunc}
*/
minConcurrency?: number;
/**
* Whether url search params should be stripped.
* If false, the saved file name will contain the search string
* or a hash of the search string.
*/
deduplicateStripSearch?: boolean;
/**
* Optional parse options for cheerio
*/
cheerioParse?: CheerioOptionsInterface;
/**
* Optional serialize options for cheerio
*/
cheerioSerialize?: CheerioOptionsInterface;
/**
* Custom html sources
*/
sources?: SourceDefinition[];
/**
* Got options
*
* Never include functions or class instances
* in {@link StaticDownloadOptions}.
* Configure functions or class instances
* using {@link DownloadOptions} only.
* @see RequestOptions
* @see got.mergeOptions
*/
req?: RequestOptions;
/**
* Urls pushed to the pipeline with depth 0,
* each using the url itself as its refUrl
*/
initialUrl?: string[];
/**
* @see DownloadOptions.configureLogger
*/
logSubDir?: string;
/**
* Extra options for custom life cycle
* @see StaticDownloadMeta
*/
meta: StaticDownloadMeta;
/**
* Set to true to skip replacePath processing in case of a parser error, see
* https://github.com/website-local/website-scrap-engine/issues/107
* @see createResource
*/
skipReplacePathError?: boolean;
/**
* Wait for this.init in method onIdle.
* See https://github.com/website-local/website-scrap-engine/issues/152
* @deprecated since 0.8.2
*/
waitForInitBeforeIdle?: boolean;
/**
* Set the last modified time of the locally saved file to the value of
* the Last-Modified http header, if it is available in the response.
*
* https://developer.mozilla.org/docs/Web/HTTP/Headers/Last-Modified
* https://github.com/website-local/website-scrap-engine/issues/174
*/
preferRemoteLastModifiedTime?: boolean;
}
/**
* Complete downloader options: every field of {@link StaticDownloadOptions}
* plus the {@link ProcessingLifeCycle} members and the runtime-only
* settings declared here.
*
* Unlike {@link StaticDownloadOptions}, this is where functions and
* class instances are allowed.
*/
export interface DownloadOptions extends StaticDownloadOptions, ProcessingLifeCycle {
/**
* Got options, required here (optional in {@link StaticDownloadOptions}).
*
* Functions or class callbacks can only be here.
*
* @see StaticDownloadOptions.req
*/
req: RequestOptions;
/**
* Adjust downloading concurrency at runtime.
*
* Note: this would not affect worker_threads
*
* NOTE(review): presumably the interval at which
* {@link adjustConcurrencyFunc} is invoked; the unit is not stated
* in this file - confirm (likely milliseconds).
*/
adjustConcurrencyPeriod?: number;
/**
* Callback used to adjust concurrency at runtime.
* It receives the downloader and returns nothing.
*
* @see StaticDownloadOptions.minConcurrency
*/
adjustConcurrencyFunc?: (downloader: DownloaderWithMeta) => void;
/**
* Use a custom function to configure logger.
* Must have the same signature as the configureLogger
* exported from ./logger/config-logger.js
*/
configureLogger: typeof configureLogger;
}
/**
* A got TimeoutError or RequestError carrying an extra
* retryLimitExceeded flag.
*
* A retry function sets the flag when the attempt count
* exceeds the retry limit, or when it decides retrying should end.
*/
export type ExtendedError = (TimeoutError | RequestError) & {
retryLimitExceeded: boolean;
};
/**
* Retry function typed as got's RetryFunction.
* The implementation is not part of this declaration file.
*
* If you would like to implement it yourself,
* set error.retryLimitExceeded to 1 or true
* if attemptCount > retryOptions.limit
* or you think retry should end.
*
* NOTE(review): {@link ExtendedError} types retryLimitExceeded as
* boolean, so prefer true over 1 in TypeScript code.
*/
export declare const calculateFastDelay: RetryFunction;
/**
* Produces a complete {@link DownloadOptions} from a life cycle
* combined with any subset of the download options.
*
* NOTE(review): presumably fills in defaults for the omitted fields;
* the implementation is not visible here - confirm which defaults apply.
*
* @param options the life cycle plus the options to set explicitly
* @returns the complete options object
*/
export declare function defaultDownloadOptions(options: ProcessingLifeCycle & Partial<DownloadOptions>): DownloadOptions;
/**
* Takes a {@link DownloadOptions} and returns a {@link DownloadOptions}.
*
* NOTE(review): presumably validates the options; whether it throws on
* invalid input or returns the same object is not visible here - confirm.
*
* @param options the options to check
* @returns the options after checking
*/
export declare function checkDownloadOptions(options: DownloadOptions): DownloadOptions;
/**
* Combines download options with optional static overrides.
*
* NOTE(review): presumably fields in overrideOptions take precedence
* over those in options; the merge strategy (shallow or deep) is not
* visible here - confirm.
*
* @param options the base options, or a factory function returning them
* @param overrideOptions optional subset of {@link StaticDownloadOptions}
* to apply on top of the base options
* @returns the resulting options object
*/
export declare function mergeOverrideOptions(options: DownloadOptions | (() => DownloadOptions), overrideOptions?: Partial<StaticDownloadOptions>): DownloadOptions;
//# sourceMappingURL=options.d.ts.map