xscrape
Version:
A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas.
67 lines (62 loc) • 2.37 kB
text/typescript
import { z } from 'zod';
import { Element } from 'domhandler';
/**
* Callback accepted as the `value` of an {@link ExtractDescriptor}.
*
* @param el - A domhandler `Element`. NOTE(review): presumably the node matched by the
* descriptor's `selector` — confirm against the implementation.
* @param key - A string key. NOTE(review): presumably the property name currently being
* extracted — confirm.
* @param obj - A string-keyed record of unknown values. NOTE(review): presumably the object
* being assembled for the current extraction — confirm.
* @returns Any value; the return type is left as `unknown` and is not constrained here.
*/
type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
/**
* Object form of an extraction rule: a required `selector` plus an optional `value`.
*/
interface ExtractDescriptor {
/** Selector string. NOTE(review): presumably a CSS selector evaluated against the parsed HTML — confirm. */
selector: string;
/**
* Optional. May be a plain string, an {@link ExtractDescriptorFn} callback, or a nested
* {@link ExtractMap}. How each form is interpreted is not visible from this declaration.
*/
value?: string | ExtractDescriptorFn | ExtractMap;
}
/**
* A single extraction rule. It is one of:
* - a plain string,
* - an {@link ExtractDescriptor} object, or
* - a tuple containing exactly one string or descriptor.
*
* NOTE(review): the one-element tuple presumably marks the rule as "collect every match
* into an array" — confirm against the implementation. The type permits exactly one
* element in that tuple, not an arbitrary-length array.
*/
type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
/**
* Dictionary from arbitrary string keys to extraction rules.
*
* It is the type of `ScraperConfig.extract` and may also appear nested as the `value`
* of an {@link ExtractDescriptor}.
*/
interface ExtractMap {
/** Extraction rule for the property named by the key. */
[key: string]: ExtractValue;
}
/** Identifier of a supported validation library. `'zod'` is currently the only member. */
type ValidatorType = 'zod';
/** Type of the `z` export of the `zod` package, i.e. the object used to build zod schemas. */
type ZodBuilder = typeof z;
/**
* Resolves the schema-builder type for a validator identifier.
*
* @template V - The validator identifier. `'zod'` resolves to {@link ZodBuilder}; any
* other case resolves to `never`.
*/
type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
/**
* Function that receives the builder for validator `V` and returns a schema for `T`.
*
* @template V - The validator identifier; determines the builder parameter type.
* @template T - The data shape the returned schema describes.
*
* For `'zod'` the return type is `z.ZodSchema<T>`; any other case resolves to `never`.
*/
type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
/**
* Configuration object accepted by `defineScraper`.
*
* @template T - The shape of the extracted data; must be a string-keyed record.
* @template V - The validator identifier.
* @template R - The type produced by `transform`; must extend `T` and defaults to `T`.
*/
type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
/** Validator identifier; it fixes the builder type passed to `schema`. */
validator: V;
/** Function that builds the schema for `T` from the validator's builder. */
schema: SchemaFunction<V, T>;
/** Extraction rules keyed by property name. */
extract: ExtractMap;
/**
* Optional mapping from `T` to `R`; may return the value directly or a promise of it.
* NOTE(review): presumably applied to the data after validation — confirm against the implementation.
*/
transform?: (data: T) => Promise<R> | R;
};
/** Options shared by every leaf field configuration (intersected into `LeafFieldConfig`). */
type BaseFieldOptions = {
/** Optional string. NOTE(review): presumably the name of an HTML attribute to read from the matched element — confirm. */
attribute?: string;
};
/**
* Configuration for a leaf field: the shared {@link BaseFieldOptions} plus exactly one of
* `selector` or `selectorAll`.
*
* The two union branches make the selectors mutually exclusive: whichever one is supplied
* must be a string, and the other is typed as optional `never`, so it cannot be given a
* value alongside it.
*
* NOTE(review): `selectorAll` presumably requests every match rather than a single one —
* confirm against the implementation.
*/
type LeafFieldConfig = BaseFieldOptions &
  (
    | { selector: string; selectorAll?: never }
    | { selectorAll: string; selector?: never }
  );
/**
* Selects the configuration shape for a value of type `T`:
*
* - mutable array and tuple types resolve to `LeafFieldConfig`;
* - every other object type resolves to `{ fields: Fields<T> }`, recursing into its properties;
* - non-object types resolve to `LeafFieldConfig`.
*
* `T` is checked as a naked type parameter, so a union is resolved member by member.
*/
type FieldConfig<T> = T extends unknown[]
  ? LeafFieldConfig
  : T extends object
    ? { fields: Fields<T> }
    : LeafFieldConfig;
/**
* Maps every property of `T` to the field configuration chosen by `FieldConfig` for that
* property's type. Because the mapping iterates `keyof T` directly, each property keeps
* the optional/readonly modifiers it has on `T`.
*/
type Fields<T> = {
[K in keyof T]: FieldConfig<T[K]>;
};
/**
* Outcome of a validation step.
*
* `data` and `error` are both optional and are not tied to `success` at the type level,
* so consumers must check for their presence themselves.
*/
type ValidationResult<T> = {
/** Whether validation succeeded. */
success: boolean;
/** The validated value, when present. */
data?: T;
/** The validation failure, when present; left as `unknown`, so narrow before use. */
error?: unknown;
};
/**
* Value produced by a scraper function returned from `defineScraper`.
*
* Both fields are optional and the type does not make them mutually exclusive, so
* consumers should check each one explicitly.
*/
type ScraperResult<T> = {
/** The scraped data, when present. */
data?: T;
/** The failure, when present; left as `unknown`, so narrow before use. */
error?: unknown;
};
/**
* Defines a scraper with the provided configuration.
*
* @template T - The shape of the extracted data.
* @template V - The type of the validator used for validation.
* @template R - The type of the result after optional transformation; must extend T and defaults to T.
*
* @param config - The configuration object for the scraper.
* @returns A function that takes an HTML string and always returns a promise of a scraper
* result (`Promise<ScraperResult<R>>`); the result is never returned synchronously.
* NOTE(review): whether failures surface through the result's `error` field or through a
* rejected promise is not visible from this declaration — confirm against the implementation.
*/
declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
export { type FieldConfig, type Fields, type LeafFieldConfig, type SchemaBuilder, type SchemaFunction, type ScraperConfig, type ScraperResult, type ValidationResult, type ValidatorType, defineScraper };