crawler-ts
Version:
Lightweight crawler written in TypeScript using ES6 generators.
95 lines (85 loc) • 3.21 kB
text/typescript
/**
* A value that is either available immediately or delivered through a promise.
* Used as the return type of the crawler callbacks so each of them may be synchronous or asynchronous.
*/
type ValueOrPromise<T> = T | Promise<T>;
/**
* Minimal logging sink accepted by the crawler. The global `console` has a compatible shape.
*/
export interface Logger {
/** Receives progress messages, e.g. when a location is requested, parsed, yielded or queued. */
info: (...args: any[]) => void;
/** Receives a failure message and, in a separate call, the caught error when a location cannot be visited or queued. */
error: (...args: any[]) => void;
}
/**
* Data available for a location before its response has been parsed.
* Passed to the `shouldParse` and `parser` callbacks.
*/
export interface PreParseProps<L, R> {
/** The location that was requested. */
location: L;
/** The response the requester returned for `location`. */
response: R;
}
/**
* Data available for a location after its response has been parsed.
* Passed to the `shouldYield` and `follower` callbacks, and also the shape of the items the crawler yields.
*/
export interface PostParseProps<L, R, P> extends PreParseProps<L, R> {
/** The result the parser produced from `response`. */
parsed: P;
}
/**
* The callbacks that define how a crawler requests, parses, yields and follows locations.
*
* @typeParam L The type of the locations to crawl, e.g. `URL` or a `string` that represents a path.
* @typeParam R The type of the response at the location that is crawled, e.g. a Cheerio object or file system `fs.Stats`.
* @typeParam P The intermediate parsed result that can be parsed from the response and is yielded by the crawler.
*/
export interface Options<L, R, P> {
/**
* This function should return the response for the given location.
* Returning `undefined` makes the crawler skip the location.
*/
requester(location: L): ValueOrPromise<R | undefined>;
/**
* This function should return true if the crawler should parse the response, or false if not.
* When it returns false the location is neither yielded nor followed.
*/
shouldParse(props: PreParseProps<L, R>): ValueOrPromise<boolean>;
/**
* This function should parse the response and convert the response to the parsed type.
* Returning `undefined` makes the crawler stop processing the location: nothing is yielded and nothing is followed.
*/
parser(props: PreParseProps<L, R>): ValueOrPromise<P | undefined>;
/**
* This function should return true if the crawler should yield the parsed result, or false if not.
* The locations found by `follower` are visited regardless of the answer.
*/
shouldYield(props: PostParseProps<L, R, P>): ValueOrPromise<boolean>;
/**
* This function should yield all the locations to follow in the given parsed result.
*/
follower(props: PostParseProps<L, R, P>): AsyncGenerator<L>;
/**
* This function should return true if the crawler should queue the location for crawling, or false if not.
*
* `location` is the candidate produced by `follower`; `origin`, `response` and `parsed` belong to the
* location on which the candidate was found.
*/
shouldQueue(props: { location: L; origin: L; response: R; parsed: P }): ValueOrPromise<boolean>;
/**
* The logger can be set to `console` to output debug information to the `console`.
* When omitted, nothing is logged.
*
* @default undefined
*/
logger?: Logger;
}
/**
 * Creates a crawler from the given callbacks.
 *
 * The returned function is an async generator. For the location it is given it:
 *
 * 1. requests the location with `requester`;
 * 2. parses the response with `parser`, provided `shouldParse` agrees;
 * 3. yields `{ location, response, parsed }`, provided `shouldYield` agrees;
 * 4. iterates `follower` and recursively crawls, depth-first, every location that
 *    `shouldQueue` accepts, before asking `follower` for the next one.
 *
 * A location is skipped only when `requester` or `parser` produce no value
 * (`undefined`, or `null` from untyped callers). Falsy but valid values such as
 * an empty-string response or a parsed `0` are processed like any other value.
 *
 * Errors are never propagated to the consumer. A failure while visiting a location
 * is reported through `logger.error` and ends the crawl of that location; a failure
 * while deciding whether to queue a followed location is reported and the crawler
 * moves on to the next followed location.
 *
 * The crawler keeps no record of visited locations; `shouldQueue` is the place to
 * reject locations that must not be crawled again.
 *
 * @param options The callbacks (and optional logger) that drive the crawl.
 * @returns A function that starts crawling at `start` and asynchronously yields the accepted results.
 */
export function createCrawler<L, R, P>(
  options: Options<L, R, P>,
): (start: L) => AsyncGenerator<PostParseProps<L, R, P>> {
  const { requester, shouldParse, parser, shouldYield, follower, shouldQueue, logger } = options;

  return async function* gen(location: L): AsyncGenerator<PostParseProps<L, R, P>> {
    try {
      logger?.info(`Requesting ${location}`);
      const response = await requester(location);
      // Nullish check instead of truthiness: an empty-string (or otherwise falsy)
      // response is still a response and must reach `shouldParse`.
      if (response == null || !(await shouldParse({ location, response }))) {
        return;
      }

      logger?.info(`Parsing ${location}`);
      const parsed = await parser({ location, response });
      // Same reasoning: a parsed `0`, `''` or `false` is a valid result.
      if (parsed == null) {
        return;
      }

      if (await shouldYield({ location, response, parsed })) {
        logger?.info(`Yielding ${location}`);
        yield { location, response, parsed };
      }

      for await (const next of follower({ location, response, parsed })) {
        try {
          if (await shouldQueue({ location: next, origin: location, response, parsed })) {
            logger?.info(`Queueing ${next}`);
            // Depth-first: the nested crawl finishes before the next followed location is considered.
            yield* gen(next);
          }
        } catch (e) {
          // A single bad candidate must not stop the remaining followed locations.
          logger?.error(`Cannot queue ${next}`);
          logger?.error(e);
        }
      }
    } catch (e) {
      logger?.error(`Cannot visit ${location}`);
      logger?.error(e);
    }
  };
}