/*
 * scrapefrom — Scrape data from any webpage.
 *
 * Package-listing metadata captured with this file:
 * version not recorded; 60 lines (59 loc), 1.62 kB; TypeScript.
 */
import { LaunchOptions, CookieData, GoToOptions, WaitForSelectorOptions } from "puppeteer";
import { CheerioAPI, Cheerio } from "cheerio";
import type { AnyNode } from "domhandler";
/**
 * A dictionary with arbitrary string keys and string values.
 *
 * NOTE(review): presumably each value is a path into a response object and
 * each key is the name the looked-up value is stored under — confirm against
 * the implementation, which is not visible here.
 */
export type KeyPath = Record<string, string>;
/**
 * Options (and, apparently, state) describing a single scrape request.
 *
 * Only `url` is required; every other member is optional.
 *
 * NOTE(review): `response`, `result` and `error` read like outputs that the
 * library fills in rather than caller-supplied options — confirm against the
 * implementation, which is not visible here.
 */
export interface Config {
/** NOTE(review): presumably the position of this config within a batch — confirm. */
index?: number;
/** Address of the page or resource to scrape. The only required member. */
url: string;
/** Optional label for this request. NOTE(review): exact use not visible here. */
name?: string;
/** Either `"fetch"` or `"puppeteer"`; presumably selects how the URL is loaded — confirm. */
use?: "fetch" | "puppeteer";
/** NOTE(review): presumably toggles logging — confirm. */
log?: boolean;
/**
 * Accepts either a plain number or a `NodeJS.Timeout` timer handle.
 * NOTE(review): the unit of the numeric form and when the handle form is
 * used are not visible here — confirm.
 */
timeout?: number | NodeJS.Timeout;
/** Typed as the standard `RequestInit`; presumably forwarded to `fetch()` — confirm. */
fetch?: RequestInit;
/** Either `"json"` or `"text"`; presumably how a fetched body is decoded — confirm. */
parser?: "json" | "text";
/** Puppeteer `LaunchOptions`; presumably passed when launching the browser — confirm. */
launch?: LaunchOptions;
/** Puppeteer `CookieData` entries; presumably set before navigation — confirm. */
cookies?: CookieData[];
/** Puppeteer `GoToOptions`; presumably passed to the page navigation call — confirm. */
pageGoTo?: GoToOptions;
/** A selector string; presumably awaited on the page before extraction — confirm. */
waitForSelector?: string;
/** Puppeteer `WaitForSelectorOptions`; presumably paired with `waitForSelector` — confirm. */
waitForSelectorOptions?: WaitForSelectorOptions;
/** A list of strings. NOTE(review): meaning not visible here — confirm. */
select?: string[];
/** A list of string lists; looks like the plural form of `select` — confirm. */
selects?: string[][];
/** Untyped response payload; must be narrowed before use. */
response?: unknown;
/** String-to-string map; see {@link KeyPath}. */
keyPath?: KeyPath;
/**
 * Callback receiving either an untyped value or a `CheerioAPI` handle and
 * returning an untyped value. Note that `unknown | CheerioAPI` collapses to
 * `unknown` for the type checker, so the union is documentation only.
 */
extractor?: (res: unknown | CheerioAPI) => unknown;
/** A single extraction description; see {@link ExtractConfig}. */
extract?: ExtractConfig;
/** Several extraction descriptions; see {@link ExtractConfig}. */
extracts?: ExtractConfig[];
/** A string or an explicit `null`. NOTE(review): what is delimited is not visible here. */
delimiter?: string | null;
/** Untyped result payload; must be narrowed before use. */
result?: unknown;
/** Error text. NOTE(review): presumably set when the request fails — confirm. */
error?: string;
/** NOTE(review): presumably controls whether `response` is kept in the output — confirm. */
includeResponse?: boolean;
/** NOTE(review): presumably controls whether `timeout` is kept in the output — confirm. */
includeTimeout?: boolean;
}
/**
 * Describes one extraction step. Every member is optional.
 *
 * The type is recursive: `extract` and `extracts` hold further
 * `ExtractConfig` values.
 *
 * NOTE(review): how the members combine (precedence between `selector`,
 * `keyPath` and `extractor`, for instance) is not visible here — confirm
 * against the implementation.
 */
export interface ExtractConfig {
/** Optional label. NOTE(review): presumably the key the extracted value is stored under — confirm. */
name?: string;
/** A string or an explicit `null`. NOTE(review): what is delimited is not visible here. */
delimiter?: string | null;
/** A selector string. NOTE(review): presumably a CSS selector evaluated with Cheerio — confirm. */
selector?: string;
/** NOTE(review): presumably the name of an element attribute to read — confirm. */
attribute?: string;
/** NOTE(review): presumably requests that the extracted text be parsed as JSON — confirm. */
json?: boolean;
/** Predicate that receives an untyped value and returns a boolean. */
filter?: (res: unknown) => boolean;
/** String-to-string map; see {@link KeyPath}. */
keyPath?: KeyPath;
/** A single nested extraction description. */
extract?: ExtractConfig;
/** Several nested extraction descriptions. */
extracts?: ExtractConfig[];
/**
 * Custom callback. Receives the `CheerioAPI` handle and, optionally, a
 * Cheerio selection of nodes; its return value is untyped (`any`).
 */
extractor?: ($: CheerioAPI, parentNode?: Cheerio<AnyNode>) => any;
}
/**
 * An HTML document expressed as {@link JsonNode} trees plus a lookup helper.
 *
 * NOTE(review): how `map` relates to the paths accepted by `extract` is not
 * visible here — confirm against the implementation.
 */
export interface HTMLData {
/** Node tree for the `head` member. */
head: JsonNode;
/** Node tree for the `body` member. */
body: JsonNode;
/** A list of strings. NOTE(review): presumably the available paths into the trees — confirm. */
map: string[];
/** Takes a path string and returns an untyped value that must be narrowed before use. */
extract: (path: string) => unknown;
}
/**
 * One node of a JSON tree. The shape is recursive: `children` holds further
 * `JsonNode` values.
 */
export interface JsonNode {
  /** Tag name, or `null` when the node has none. */
  tag: string | null;
  /** Attribute bag keyed by string; values are untyped. */
  attributes: Record<string, any>;
  /** Child nodes, in array order. */
  children: JsonNode[];
  /** Text content, or `null` when the node has none. */
  textContent: string | null;
}