UNPKG

crawler-ts

Version:

Lightweight crawler written in TypeScript using ES6 generators.

95 lines (85 loc) 3.21 kB
/** A value that may be delivered synchronously or through a promise. */
type ValueOrPromise<T> = T | Promise<T>;

/**
 * Minimal logging contract used by the crawler.
 *
 * The global `console` object satisfies this interface.
 */
export interface Logger {
  // `any[]` is kept on purpose: narrowing it would reject logger
  // implementations that callers already pass in.
  info: (...args: any[]) => void;
  error: (...args: any[]) => void;
}

/** Data available to the callbacks that run before a response has been parsed. */
export interface PreParseProps<L, R> {
  /** The location that was requested. */
  location: L;
  /** The response the requester returned for `location`. */
  response: R;
}

/** Data available to the callbacks that run after a response has been parsed. */
export interface PostParseProps<L, R, P> extends PreParseProps<L, R> {
  /** The result the parser produced from `response`. */
  parsed: P;
}

/**
 * Callbacks that define how the crawler requests, parses, yields and follows locations.
 *
 * @typeParam L - The type of the locations to crawl, e.g. `URL` or a `string` that represents a path.
 * @typeParam R - The type of the response at the location that is crawled, e.g. a Cheerio object or file system `fs.Stats`.
 * @typeParam P - The intermediate parsed result that can be parsed from the response and is generated by the crawler.
 */
export interface Options<L, R, P> {
  /**
   * This function should return the response for the given location.
   * Returning `undefined` makes the crawler skip the location.
   */
  requester(location: L): ValueOrPromise<R | undefined>;

  /**
   * This function should return true if the crawler should parse the response, or false if not.
   */
  shouldParse(props: PreParseProps<L, R>): ValueOrPromise<boolean>;

  /**
   * This function should parse the response and convert the response to the parsed type.
   * Returning `undefined` makes the crawler skip the location.
   */
  parser(props: PreParseProps<L, R>): ValueOrPromise<P | undefined>;

  /**
   * This function should return true if the crawler should yield the parsed result, or false if not.
   */
  shouldYield(props: PostParseProps<L, R, P>): ValueOrPromise<boolean>;

  /**
   * This function should yield all the locations to follow in the given parsed result.
   */
  follower(props: PostParseProps<L, R, P>): AsyncGenerator<L>;

  /**
   * This function should return true if the crawler should queue the location for crawling, or false if not.
   * `location` is the candidate to crawl next; `origin`, `response` and `parsed` describe the
   * location on which the candidate was found.
   */
  shouldQueue(props: { location: L; origin: L; response: R; parsed: P }): ValueOrPromise<boolean>;

  /**
   * The logger can be set to `console` to output debug information to the `console`.
   *
   * @default undefined
   */
  logger?: Logger;
}

/**
 * Creates a crawler: a function that takes a start location and returns an async
 * generator over every parsed result for which `shouldYield` returned true.
 *
 * For each location the crawler requests it, optionally parses and yields it, and then
 * recurses depth-first into every location produced by `follower` that `shouldQueue`
 * accepts. The crawler keeps no record of visited locations; `shouldQueue` is the only
 * thing that stops a location from being crawled again.
 *
 * Errors thrown by the callbacks are reported through `options.logger` (when set) and
 * are not rethrown: a failure while handling a location ends the crawl of that location,
 * and a failure in `shouldQueue` only skips the candidate it was asked about.
 *
 * @param options - The callbacks that drive the crawl, see {@link Options}.
 * @returns A function that starts a crawl at the given location.
 */
export function createCrawler<L, R, P>(
  options: Options<L, R, P>,
): (start: L) => AsyncGenerator<PostParseProps<L, R, P>> {
  const { requester, shouldParse, parser, shouldYield, follower, shouldQueue, logger } = options;

  return async function* gen(location: L): AsyncGenerator<PostParseProps<L, R, P>> {
    try {
      logger?.info(`Requesting ${location}`);
      const response = await requester(location);
      // Only a missing response means "skip". A truthiness test would also drop
      // valid falsy responses such as `0`, `''` or `false`.
      if (response == null || !(await shouldParse({ location, response }))) {
        return;
      }

      logger?.info(`Parsing ${location}`);
      const parsed = await parser({ location, response });
      // Same reasoning as above: only a missing parse result ends the visit.
      if (parsed == null) {
        return;
      }

      if (await shouldYield({ location, response, parsed })) {
        logger?.info(`Yielding ${location}`);
        yield { location, response, parsed };
      }

      for await (const next of follower({ location, response, parsed })) {
        try {
          if (await shouldQueue({ location: next, origin: location, response, parsed })) {
            logger?.info(`Queueing ${next}`);
            // Depth-first: the nested crawl is drained before the next followed location.
            yield* gen(next);
          }
        } catch (e) {
          logger?.error(`Cannot queue ${next}`);
          logger?.error(e);
        }
      }
    } catch (e) {
      logger?.error(`Cannot visit ${location}`);
      logger?.error(e);
    }
  };
}