UNPKG

fandomscraper

Version:

A package to scrape the character pages of Fandom wikis. It only scrapes each character's info section and the list of all catalogued characters.

356 lines (350 loc) 13.6 kB
/**
 * Names of the wikis available to the scraper.
 */
declare const availableWikis: string[];
/**
 * Name of a wiki.
 * NOTE(review): `availableWikis` is declared as `string[]`, so this alias resolves to plain
 * `string` here and does not enumerate the individual wiki names.
 */
type TAvailableWikis = typeof availableWikis[number];
/**
 * Options accepted by the deprecated `FandomScraper.getAll` method.
 */
interface IGetCharactersOptions {
    /**
     * The limit of characters to get (optional). Default: 100000
     * NOTE(review): described as optional but declared as a required property — confirm which is intended.
     */
    limit: number;
    /**
     * The offset of characters to get (optional). Default: 0
     * NOTE(review): described as optional but declared as a required property — confirm which is intended.
     */
    offset: number;
    /**
     * If the scraper should get all the characters recursively (optional). Default: false
     */
    recursive?: boolean;
    /**
     * If the scraper should get the images in base64 (optional). Default: false
     */
    base64?: boolean;
    /**
     * If the scraper should get the id of the character (optional). The id is the pageId of the wikia. Default: false
     */
    withId?: boolean;
    /**
     * The substrings to ignore in the characters names (optional). Default: []
     */
    ignore?: string[];
    /**
     * NOTE(review): the previous comment on this property was a copy of the `ignore` description.
     * Presumably the attributes to get for each character (optional), as for
     * `IGetCharacterOptions.attributes` — confirm against the implementation.
     */
    attributes?: string[];
}
/**
 * Options accepted by the deprecated `FandomScraper.getByName` and `FandomScraper.getById` methods.
 */
interface IGetCharacterOptions {
    /**
     * The name of the character you want to get.
     */
    name?: string;
    /**
     * If the scraper should get the images in base64 (optional). Default: false
     */
    base64?: boolean;
    /**
     * If the scraper should get the id of the character (optional). The id is the pageId of the wikia. Default: true
     */
    withId?: boolean;
    /**
     * The attributes to get in the character (optional). Default are the attributes of the schema.
     */
    attributes?: string[];
}
/**
 * Metadata about a wiki, as resolved by `FandomScraper.getMetadata`.
 */
interface IMetaData {
    name: string;
    language: 'en' | 'fr';
    attributes: string[];
    count?: number;
    availableLanguages: string[];
    url: string;
}
/**
 * A wiki name paired with a language. Element type of the array returned by
 * `FandomScraper.getAvailableWikis`.
 */
interface WikiaParameters {
    name: TAvailableWikis;
    lang: 'en' | 'fr';
}
/**
 * FandomScraper is a class that allows you to scrape a Fandom wiki, and get all the characters of a fiction.
 * The list of available wikis can be found in the TAvailableWikis type.
 */
declare class FandomScraper {
    protected _schema: ISchema;
    // Private members below carry no type information in this declaration file.
    private _CharactersPage;
    private method;
    private name;
    private wikiaParameters;
    private id;
    private isOldVersion;
    private _legacyWarningShown;
    private pageFetcher;
    private characterParser;
    private dataExtractor;
    private queryBuilder;
    /**
     * Constructs a FandomScraper instance.
     * @param name - The name of the wiki to scrape.
     * @param options - Optional settings; `lang` is the language of the wiki ('en', 'fr' or null).
     * @throws Error if an invalid wiki name is provided.
     * @example
     * ```ts
     * const scraper = new FandomScraper('dragon-ball', { lang: 'fr' });
     * ```
     */
    constructor(name: TAvailableWikis, options?: { lang: 'en' | 'fr' | null; });
    /**
     * Get the schema of the current wiki.
     * @returns The schema of the wiki.
     */
    getSchema(): ISchema;
    /**
     * Get metadata about the current wiki (available attributes, language, etc.).
     * @param options - Optional settings. NOTE(review): `withCount` presumably controls whether
     * the optional `count` field of the metadata is filled in — confirm.
     * @returns A promise resolving to the metadata of the wiki.
     */
    getMetadata(options?: { withCount: boolean; }): Promise<IMetaData>;
    /**
     * Set the url of the characters page of the wiki in the schema.
     * @param url - The url of the characters page.
     * @returns The FandomScraper instance.
     */
    setCharactersPage(url: string): this;
    /**
     * Set the limit of characters to get. Default: 50
     * NOTE(review): `IGetCharactersOptions.limit` documents a default of 100000 — confirm which
     * default applies to which code path.
     * @param limit - The limit of characters to get.
     * @returns The FandomScraper instance.
     * @throws Error if the limit is less than 1.
     * @example
     * ```ts
     * await scraper.findAll({ base64: true, recursive: true, withId: true }).limit(100).exec();
     * ```
     */
    limit(limit: number): this;
    /**
     * Set the offset of characters to get. Default: 0
     * @param offset - The offset of characters to get.
     * @returns The FandomScraper instance.
     * @throws Error if the offset is less than 0.
     * @example
     * ```ts
     * await scraper.findAll({ base64: true, recursive: true, withId: true }).offset(100).exec();
     * ```
     */
    offset(offset: number): this;
    /**
     * Set the language of the current wiki instance.
     * @param lang - The language to set.
     * @returns The FandomScraper instance.
     * @throws Error if the language is not available for this wiki.
     * @example
     * ```ts
     * scraper.setLanguage('fr');
     * ```
     */
    setLanguage(lang: 'en' | 'fr'): this;
    /**
     * Set the ignored substrings in the characters names. Default: []
     * @param ignore - The substrings to ignore in the characters names.
     * @returns The FandomScraper instance.
     * @throws Error if the ignore parameter is not an array.
     * @example
     * ```ts
     * await scraper.findAll({ base64: true, recursive: true, withId: true }).ignore(['(Dragon Ball Heroes)']).exec();
     * ```
     */
    ignore(ignore: string[]): this;
    /**
     * Set the attributes to get in the characters. Default are the attributes of the schema.
     * @param attributes - The attributes to get in the characters, as a single string
     * (the example below separates the names with spaces).
     * @returns The FandomScraper instance.
     * @throws Error if the attributes parameter is not a string.
     * @example
     * ```ts
     * await scraper.findAll({ base64: true, recursive: true, withId: true }).attr('name images age kanji').exec();
     * ```
     */
    attr(attributes: string): this;
    /**
     * Set the keys of the attributes that should be converted to an array instead of a string. Default: []
     * @param attributes - The keys of the attributes that should be converted to an array instead
     * of a string, as a single string (the example below separates the keys with spaces).
     * @returns The FandomScraper instance.
     * @throws Error if the attributes parameter is not a string.
     * @example
     * ```ts
     * await scraper.findAll({ base64: true, recursive: true, withId: true }).attrToArray('age height voiceActor').exec();
     * ```
     */
    attrToArray(attributes: string): this;
    /**
     * Get the characters page of the current wiki.
     * Private: the signature is not visible in this declaration file; the description below is
     * carried over from the original comment.
     * @param url - The url of the characters page.
     * @returns The characters page of the wiki.
     * @example
     * ```ts
     * await scraper.getCharactersPage('https://kimetsu-no-yaiba.fandom.com/fr/wiki/Catégorie:Personnages');
     * ```
     */
    private getCharactersPage;
    /**
     * Get all the characters of the current wiki, considering the options provided.
     * @param options - The options of the getAll method.
     * @returns A promise resolving to the characters of the wiki.
     * @throws Error if the limit is less than 1.
     * @throws Error if the offset is less than 0.
     * @example
     * ```ts
     * const characters = await scraper.getAll({ limit: 100, offset: 0, recursive: true, base64: true, withId: true });
     * ```
     * @deprecated Use the findAll method instead.
     */
    getAll(options?: IGetCharactersOptions): Promise<any[]>;
    /**
     * Prepare a query for all the characters of the current wiki, considering the options provided.
     * Must be called before the exec method and any other method.
     * @param options - The options of the query (`base64`, `recursive`, `withId`).
     * NOTE(review): the original comment marked this parameter as optional, but it is declared as required.
     * @returns The FandomScraper instance; call `exec()` to obtain the characters of the wiki.
     * @example
     * ```ts
     * const characters = await scraper.findAll({ base64: true, recursive: true, withId: true }).exec();
     * ```
     */
    findAll(options: { base64: boolean; recursive: boolean; withId: boolean; }): this;
    /**
     * Prepare a query for a character of the current wiki according to its name, considering the options provided.
     * Must be called before the exec method and any other method.
     * @param name - The name of the character to get.
     * @param options - The options of the query (`base64`, `withId`).
     * NOTE(review): the original comment marked this parameter as optional, but it is declared as required.
     * @returns The FandomScraper instance; call `exec()` to obtain the character.
     * @throws Error if the name is not provided.
     * @example
     * ```ts
     * const character = await scraper.findByName('Tanjiro Kamado', { base64: true, withId: true }).exec();
     * ```
     */
    findByName(name: string, options: { base64: boolean; withId: boolean; }): this;
    /**
     * Prepare a query for a character of the current wiki according to its id, considering the options provided.
     * Must be called before the exec method and any other method.
     * @param id - The id of the character to get.
     * @param options - The options of the query (`base64`). Optional.
     * @returns The FandomScraper instance; call `exec()` to obtain the character.
     * @throws Error if the id is less than 1.
     * @example
     * ```ts
     * const character = await scraper.findById(1, { base64: true }).exec();
     * ```
     */
    findById(id: number, options?: { base64: boolean; }): this;
    /**
     * Execute the method previously called. Must be called after all the methods to get the result.
     * @returns A promise resolving to the result of the method previously called.
     * @throws Error if the method is not valid.
     * @example
     * ```ts
     * const characters = await scraper.findAll({ base64: true, recursive: true, withId: true }).limit(100).attr('name images').exec();
     * ```
     */
    exec(): Promise<any>;
    /**
     * Get a character of the current wiki according to its name, considering the options provided.
     * @param options - The options of the getByName method, including the `name` of the character.
     * @returns A promise resolving to the character of the wiki (the declared type also allows `undefined`).
     * @throws Error if the name is not provided.
     * @throws Error if the character is not found.
     * @example
     * ```ts
     * const character = await scraper.getByName({ name: 'Goku', base64: true, withId: true });
     * ```
     * @deprecated Use the findByName method instead.
     */
    getByName(options?: IGetCharacterOptions): Promise<IData | undefined>;
    private _getByName;
    /**
     * Get a character of the current wiki by its id, considering the options provided.
     * @param id - The id of the character.
     * @param options - The options of the getById method.
     * @returns A promise resolving to the character of the wiki.
     * @throws Error if the id is less than 1.
     * @throws Error if the character does not exist.
     * @example
     * ```ts
     * const scraper = new FandomScraper('dragon-ball');
     * const character = await scraper.getById(1, { base64: true, withId: true });
     * ```
     * @deprecated Use the findById method instead.
     */
    getById(id: number, options?: IGetCharacterOptions): Promise<any>;
    private _getById;
    /**
     * Get all the available wikis of the FandomScraper class.
     * @returns The available wikis.
     */
    getAvailableWikis(): WikiaParameters[];
    private _getOne;
    private formatCharacterData;
    /**
     * Get all the characters of the current wiki, considering the options provided.
     * Works only for the classic characters page format.
     * @param options - The options of the getCharacters method.
     * @returns The characters of the wiki.
     */
    private _getAll;
    /**
     * Count the number of characters of the current wiki and return the number.
     * @returns A promise resolving to the number of characters of the wiki.
     */
    count(): Promise<number>;
    /**
     * Fetches a webpage from the specified URL and extracts quotes from it.
     *
     * The method retrieves the page content using the provided URL and extracts quote data
     * by using either a schema-defined selector or by querying for <blockquote> elements.
     * It then processes the found elements using an extraction method, handling both string
     * and array formats of the quote content, and returns a list of quotes as strings.
     *
     * @param url - The URL of the webpage from which to extract quotes.
     * @returns A promise that resolves to an array of quote strings.
     *
     * @throws Will throw an error if fetching the page or processing the quote extraction fails.
     */
    getQuotes(url: string): Promise<string[]>;
    private isValidCharacterPageInternal;
    private getWikiUrlInternal;
    /**
     * Whether the schema qualifies for the faster MediaWiki generator API path.
     * Requires both a `category` field and a Fandom wiki URL.
     */
    private _useMediaWikiPath;
    /**
     * Emit a one-time deprecation warning when the legacy HTML scraping path is used.
     * Only triggers for 'classic' page format schemas that have no `category` set.
     */
    private _warnLegacyIfNeeded;
    /**
     * Maximum number of character pages fetched in parallel when `recursive: true`.
     * High enough to benefit from parallelism, low enough to avoid Fandom rate limits.
     */
    private static readonly CONCURRENT_FETCHES;
    /**
     * Fetch the character window [offset, offset+limit) via the MediaWiki generator API,
     * applying `ignore` filtering on the fly (offset/limit are counted against the filtered
     * stream, not raw API entries). When `recursive` is true, individual character pages
     * are fetched in a bounded-concurrency pool instead of sequentially.
     */
    private _getAllViaMediaWiki;
    /**
     * Count all characters in the category using the MediaWiki generator API.
     */
    private _countViaMediaWiki;
}
/**
 * This class allows you to define your own schema for a fandom wiki scraper.
 */
declare class FandomPersonalScraper extends FandomScraper {
    /**
     * @param schema - Your own schema for the wiki to scrape.
     */
    constructor(schema: ISchema);
}
export {
    FandomPersonalScraper,
    FandomScraper,
    type IGetCharacterOptions,
    type IGetCharactersOptions,
    type IMetaData,
    type TAvailableWikis,
    type WikiaParameters,
    availableWikis
};