UNPKG

hammer-scrape

Version:

Unifies Cheerio and Puppeteer for the most streamlined scraping experience

210 lines (209 loc) 8.42 kB
import EngineMode from './engine_mode';
import EngineType from './engine_type';
import EngineCoreType from './engine_core_type';
/**
 * A standardized abstract class that represents what we want in our web scraping engines.
 *
 * `PCore` is the type of the core used for parsing and `MCore` is the type of the core
 * used for manipulation; both are exposed as nullable through the accessors below.
 * This is a declaration only: implementations are not visible here.
 */
export declare abstract class WebScrapingEngine<PCore, MCore> {
    /** Engine type; presumably the value returned by getEngineType() (implementation not visible). */
    private readonly engineType;
    /** Engine core type; presumably the value returned by getEngineCoreType() (implementation not visible). */
    private readonly engineCore;
    /** Engine mode; presumably read by getEngineMode() and written by setEngineMode() (implementation not visible). */
    private engineMode;
    /** The core used for parsing, or null when none is available. */
    protected parsingCore: PCore | null;
    /** The core used for manipulation, or null when none is available. */
    protected manipulationCore: MCore | null;
    /**
     * Construct our basic implementation of our engine.
     * @param engineType The type of engine we are aiming for
     * @param engineCoreType The core type that we want to utilize
     */
    constructor(engineType: EngineType, engineCoreType: EngineCoreType);
    /**
     * Gets the mode the engine is currently in.
     * @returns The engine's current mode
     */
    getEngineMode(): EngineMode;
    /**
     * Gets the engine type that best represents this engine.
     * @returns The engine type
     */
    getEngineType(): EngineType;
    /**
     * Gets the core types associated with this engine.
     * @returns The engine core type
     */
    getEngineCoreType(): EngineCoreType;
    /**
     * Determine if the engine is currently running or not.
     * @returns Whether the engine is running
     */
    isRunning(): boolean;
    /**
     * Decides what mode the engine should be in.
     * The engine should only be in one mode at a time: either parsing or manipulating.
     * @param newMode The new mode that we want to lock the engine into
     */
    protected setEngineMode(newMode: EngineMode): void;
    /**
     * Determines if the engine is in the expected mode.
     * @param targetMode The mode we are expecting to be in
     * @returns Whether the engine is in `targetMode`
     */
    protected isCorrectEngineMode(targetMode: EngineMode): boolean;
    /**
     * Load things into the engine.
     * This is automatically called by this.startup.
     */
    protected abstract load(): Promise<void>;
    /**
     * Start the engine up and get ready to process what comes in.
     */
    startup(): Promise<void>;
    /**
     * Processes the url while the engine is running and doing its thing.
     * @param url The url we want to target
     */
    abstract process(url: string): Promise<void>;
    /**
     * Shut the engine off and free up any resources.
     */
    abstract shutoff(): Promise<void>;
    /**
     * Get the parsing core that we have decided to utilize.
     * @returns The parsing core, or null when none is available
     */
    getParsingCore(): PCore | null;
    /**
     * Get the manipulation core that we have decided to utilize.
     * @returns The manipulation core, or null when none is available
     */
    getManipulationCore(): MCore | null;
    /**
     * Safely enters manipulation mode.
     * @param callback Lets you safely manipulate the page this engine is processing;
     * it is handed the manipulation core
     */
    manipulate(callback: (core: MCore) => Promise<void>): Promise<void>;
    /**
     * Provides a mechanism to safely parse the core contents.
     * @param callback This is where you will be able to safely parse the contents;
     * it is handed the parsing core
     */
    parse(callback: (core: PCore) => Promise<void>): Promise<void>;
}
/**
 * The contract shared by the cores in this file; both ManipulationCore and ParsingCore implement it.
 *
 * `PageType` is the raw page object a core exposes through raw(), and
 * `ExpectedInitializeObjectType` is the data a core needs in order to initialize.
 */
export interface Core<PageType, ExpectedInitializeObjectType> {
    /** Get the url associated with this core. */
    getUrl(): string;
    /** Get the raw page object this core works with. */
    raw(): PageType;
    /**
     * Initialize and load in anything for the core.
     * @param data The data required to initialize the core
     */
    initialize(data: ExpectedInitializeObjectType): Promise<void>;
    /** Frees up any resources. */
    dispose(): Promise<void>;
}
/**
 * Defines the methods that should be implemented in a core that handles manipulation.
 */
export declare abstract class ManipulationCore<PageType, ExpectedInitializeObjectType> implements Core<PageType, ExpectedInitializeObjectType> {
    /**
     * This core provides methods to actually manipulate the page.
     * But it can be null if the core has not been initialized yet.
     */
    protected core: PageType | null;
    /** Url of this core; presumably the constructor argument returned by getUrl() (implementation not visible). */
    private url;
    /**
     * @param url The url this core is associated with
     */
    constructor(url: string);
    /**
     * Get the url associated with this core.
     * NOTE(review): presumably the url passed to the constructor, as documented on
     * ParsingCore.getUrl; confirm against the implementation.
     */
    getUrl(): string;
    /**
     * Initialize and load in anything for the core.
     * @param data Based on what was specified with the manipulation core design, this data allows us to properly initialize whatever core design we want
     */
    abstract initialize(data: ExpectedInitializeObjectType): Promise<void>;
    /**
     * Set the value into the select element.
     * @param querySelector The selector that targets the select element
     * @param value The value to attempt to set
     */
    abstract select(querySelector: string, value: string): Promise<void>;
    /**
     * Click on the element that matches the query selector.
     * @param querySelector The selector that targets the element we want to click
     */
    abstract click(querySelector: string): Promise<void>;
    /**
     * Type a specific value into the field that can accept typing.
     * @param querySelector The element(s) that we are targeting
     * @param value The value we want to simulate typing in
     */
    abstract type(querySelector: string, value: string): Promise<void>;
    /**
     * Gets the raw page access method used for manipulating.
     */
    abstract raw(): PageType;
    /** Frees up any resources. */
    abstract dispose(): Promise<void>;
    /**
     * Get the entire html of the page and return it in a single string.
     */
    abstract getDocumentHtml(): Promise<string>;
}
/**
 * Defines the methods that should be implemented in a core that supports parsing the page.
 */
export declare abstract class ParsingCore<PageType, ExpectedInitializeObjectType> implements Core<PageType, ExpectedInitializeObjectType> {
    /**
     * The underlying page object that this core reads from when parsing.
     * But it can be null if the core has not been initialized yet.
     */
    protected core: PageType | null;
    /** Url of this core; presumably the constructor argument returned by getUrl() (implementation not visible). */
    private url;
    /**
     * @param url The url this core is associated with
     */
    constructor(url: string);
    /**
     * Get the url that was passed at the time of creation.
     */
    getUrl(): string;
    /**
     * Initialize and load in anything for the core.
     * @param data Based on what was specified with the core design, this data allows us to properly initialize whatever core design we want
     */
    abstract initialize(data: ExpectedInitializeObjectType): Promise<void>;
    /**
     * Gets the text off the desired selector.
     * Note: If the selector finds more than one element, this method will only return the very first text contents.
     * @param querySelector The selector that targets the element to read
     */
    abstract getText(querySelector: string): Promise<string>;
    /**
     * Get the text from the desired selector.
     * This variation always returns an array, so this is multi element friendly.
     * @param querySelector The selector that targets the element(s) to read
     */
    abstract getTextAll(querySelector: string): Promise<string[]>;
    /**
     * Gets the attribute value off the selector with the specified attribute name.
     * Note: If the selector finds more than one element, this method will only return the very first attribute value.
     * @param querySelector The selector that targets the element to read
     * @param attributeName The name of the attribute to read
     */
    abstract getAttribute(querySelector: string, attributeName: string): Promise<string>;
    /**
     * Gets the attribute value off the selector with the specified attribute name.
     * This method is multi element friendly, it will always return an array of attribute values that match the attribute name.
     * @param querySelector The selector that targets the element(s) to read
     * @param attributeName The name of the attribute to read
     */
    abstract getAttributeAll(querySelector: string, attributeName: string): Promise<string[]>;
    /**
     * Gets the html off the selector.
     * This method only works on the very first element found that matches the selector.
     * @param querySelector The selector that targets the element to read
     */
    abstract getHtml(querySelector: string): Promise<string>;
    /**
     * Gets the html off the elements that match the selector.
     * This method is multi element friendly.
     * @param querySelector The selector that targets the element(s) to read
     */
    abstract getHtmlAll(querySelector: string): Promise<string[]>;
    /**
     * Determines if an element does exist on the current DOM Tree.
     * @param querySelector The selector that targets the element to look for
     * @param timeout Optional. NOTE(review): units and default are not visible in this
     * declaration; presumably how long to wait for the element, in milliseconds. Confirm.
     */
    abstract elementExist(querySelector: string, timeout?: number): Promise<boolean>;
    /**
     * Wait for a timeout operation, return a promise that completes when the timeout is done.
     * @param timeoutInterval How long to wait. NOTE(review): units are not visible in this
     * declaration; presumably milliseconds. Confirm.
     */
    waitForTimeout(timeoutInterval: number): Promise<void>;
    /**
     * Counts the total amount of elements that match the selector.
     * @param querySelector The selector that targets the element(s) to count
     */
    abstract elementCount(querySelector: string): Promise<number>;
    /**
     * Gets all the option elements that can be found in the select element matching the query selector.
     * Each option is reported as a `{ text, value }` pair.
     * @param querySelector The query selector that targets the select element
     */
    abstract getSelectOptions(querySelector: string): Promise<{ text: string; value: string; }[]>;
    /**
     * Get the entire html of the page and return it in a single string.
     */
    abstract getDocumentHtml(): Promise<string>;
    /**
     * Grab the raw page type to read from.
     */
    abstract raw(): PageType;
    /** Frees up any resources. */
    abstract dispose(): Promise<void>;
}
export default WebScrapingEngine;