hammer-scrape
Version:
Unifies Cheerio and Puppeteer for the most streamlined scraping experience
210 lines (209 loc) • 8.42 kB
TypeScript
import EngineMode from './engine_mode';
import EngineType from './engine_type';
import EngineCoreType from './engine_core_type';
/**
* A standardized abstract class that represents what we want in our web scraping engines.
*
* The class is generic over two core types:
* - PCore: the type of the core used for parsing
* - MCore: the type of the core used for manipulation
*
* Subclasses must implement load, process and shutoff.
*/
export declare abstract class WebScrapingEngine<PCore, MCore> {
/** Private state; its type is erased in this declaration. Presumably backs getEngineType - confirm in the implementation */
private readonly engineType;
/** Private state; its type is erased in this declaration. Presumably backs getEngineCoreType - confirm in the implementation */
private readonly engineCore;
/** Private state; its type is erased in this declaration. Presumably backs getEngineMode / setEngineMode - confirm in the implementation */
private engineMode;
/** The core used for parsing. It is typed as nullable, so it may not be available at all times */
protected parsingCore: PCore | null;
/** The core used for manipulation. It is typed as nullable, so it may not be available at all times */
protected manipulationCore: MCore | null;
/**
* Construct our basic implementation of our engine
* @param engineType The type of engine we are aiming for
* @param engineCoreType The core type that we want to utilize
*/
constructor(engineType: EngineType, engineCoreType: EngineCoreType);
/**
* Gets the mode that the engine is currently set to
* @returns The engine mode
*/
getEngineMode(): EngineMode;
/**
* Gets the engine type that best represents this engine
* @returns The engine type
*/
getEngineType(): EngineType;
/**
* Gets the core types associated with this engine
* @returns The engine core type
*/
getEngineCoreType(): EngineCoreType;
/**
* Determine if the engine is currently running or not.
* @returns A boolean describing whether the engine is running
*/
isRunning(): boolean;
/**
* Decides what mode the engine should be in.
* The engine should only be in one mode at a time: either parsing or manipulating
* @param newMode The new mode that we want to lock the engine into
*/
protected setEngineMode(newMode: EngineMode): void;
/**
* Determines if the engine is currently in the target mode
* @param targetMode The mode we are expecting to be in
* @returns A boolean describing whether the engine is in the expected mode
*/
protected isCorrectEngineMode(targetMode: EngineMode): boolean;
/**
* Load things into the engine. This is automatically called by this.startup
* Must be implemented by subclasses.
*/
protected abstract load(): Promise<void>;
/**
* Start the engine up and get ready to process what comes in
* @returns A promise that completes once startup is done
*/
startup(): Promise<void>;
/**
* Processes the url while the engine is running and doing its thing
* Must be implemented by subclasses.
* @param url The url we want to target
*/
abstract process(url: string): Promise<void>;
/**
* Shut the engine off and free up any resources
* Must be implemented by subclasses.
*/
abstract shutoff(): Promise<void>;
/**
* Get the parsing core that we have decided to utilize
* @returns The parsing core, or null when none is available
*/
getParsingCore(): PCore | null;
/**
* Get the manipulation core that we have decided to utilize
* @returns The manipulation core, or null when none is available
*/
getManipulationCore(): MCore | null;
/**
* Safely enters manipulation mode
* @param callback Lets you safely manipulate the page this engine is processing. It receives the manipulation core (MCore) and returns a promise
*/
manipulate(callback: (core: MCore) => Promise<void>): Promise<void>;
/**
* Provides a mechanism to safely parse the core contents
* @param callback This is where you will be able to safely parse the contents. It receives the parsing core (PCore) and returns a promise
*/
parse(callback: (core: PCore) => Promise<void>): Promise<void>;
}
/**
* The common contract shared by the cores in this file (ManipulationCore and ParsingCore both implement it).
*
* PageType is the type handed back by raw(); ExpectedInitializeObjectType is the type of
* the data object accepted by initialize().
*/
export interface Core<PageType, ExpectedInitializeObjectType> {
/** Get the url associated with this core */
getUrl(): string;
/** Get the raw underlying page object of type PageType */
raw(): PageType;
/**
* Initialize the core
* @param data The data object used to initialize the core
*/
initialize(data: ExpectedInitializeObjectType): Promise<void>;
/** Dispose of the core. The implementing classes describe this as freeing up any resources */
dispose(): Promise<void>;
}
/**
* Defines the methods that should be implemented in a core that handles manipulation
*
* PageType is the type of the underlying page object; ExpectedInitializeObjectType is the
* type of the data object accepted by initialize().
*/
export declare abstract class ManipulationCore<PageType, ExpectedInitializeObjectType> implements Core<PageType, ExpectedInitializeObjectType> {
/** This core provides methods to actually manipulate the page. But it can be null if the core has not been initialized yet */
protected core: PageType | null;
/** Private state; its type is erased in this declaration. Presumably holds the url given to the constructor - confirm in the implementation */
private url;
/**
* Construct the manipulation core
* @param url The url for this core
*/
constructor(url: string);
/**
* Get the url for this core (presumably the one passed to the constructor - confirm in the implementation)
*/
getUrl(): string;
/**
* Initialize and load in anything for the core
* @param data Based on what was specified with the manipulation core design, this data allows us to properly initialize whatever core design we want
*/
abstract initialize(data: ExpectedInitializeObjectType): Promise<void>;
/**
* Set the value into the select element
* @param querySelector The selector that targets the select element
* @param value The value to attempt to set
*/
abstract select(querySelector: string, value: string): Promise<void>;
/**
* Click on the element that matches the query selector
* @param querySelector The selector that targets the element we want to click
*/
abstract click(querySelector: string): Promise<void>;
/**
* Type a specific value into the field that can accept typing.
* @param querySelector The element(s) that we are targeting
* @param value The value we want to simulate typing in
*/
abstract type(querySelector: string, value: string): Promise<void>;
/**
* Gets the raw page access method used for manipulating
* @returns The underlying page object of type PageType
*/
abstract raw(): PageType;
/** Frees up any resources */
abstract dispose(): Promise<void>;
/**
* Get the entire html of the page and return it in a single string
* @returns A promise for the html string
*/
abstract getDocumentHtml(): Promise<string>;
}
/**
* Defines the methods that should be implemented in a core that supports parsing the page
*
* PageType is the type of the underlying page object; ExpectedInitializeObjectType is the
* type of the data object accepted by initialize().
*/
export declare abstract class ParsingCore<PageType, ExpectedInitializeObjectType> implements Core<PageType, ExpectedInitializeObjectType> {
/** This core provides methods to actually parse the page. But it can be null if the core has not been initialized yet */
protected core: PageType | null;
/** Private state; its type is erased in this declaration. Presumably holds the url given to the constructor - confirm in the implementation */
private url;
/**
* Construct the parsing core
* @param url The url for this core
*/
constructor(url: string);
/**
* Get the url that was passed at the time of creation
*/
getUrl(): string;
/**
* Initialize and load in anything for the core
* @param data Based on what was specified with the parsing core design, this data allows us to properly initialize whatever core design we want
*/
abstract initialize(data: ExpectedInitializeObjectType): Promise<void>;
/**
* Gets the text off the desired selector.
* Note: If the selector finds more than one element, this method will only return the very first text contents
*/
abstract getText(querySelector: string): Promise<string>;
/**
* Get the text from the desired selector.
* This variation always returns an array, so this is multi element friendly
*/
abstract getTextAll(querySelector: string): Promise<string[]>;
/**
* Gets the attribute value off the selector with the specified attribute name.
* Note: If the selector finds more than one element, this method will only return the very first attribute value
*/
abstract getAttribute(querySelector: string, attributeName: string): Promise<string>;
/**
* Gets the attribute value off the selector with the specified attribute name.
* This method is multi element friendly, it will always return an array of attribute values that match the attribute name
*/
abstract getAttributeAll(querySelector: string, attributeName: string): Promise<string[]>;
/**
* Gets the html off the selector.
* This method only works on the very first element found that matches the selector
*/
abstract getHtml(querySelector: string): Promise<string>;
/**
* Gets the html off the elements that match the selector.
* This method is multi element friendly
*/
abstract getHtmlAll(querySelector: string): Promise<string[]>;
/**
* Determines if an element does exist on the current DOM Tree
* @param querySelector The selector that targets the element we are looking for
* @param timeout Optional timeout for the check. Units and default are not visible here (presumably milliseconds - confirm in the implementation)
*/
abstract elementExist(querySelector: string, timeout?: number): Promise<boolean>;
/**
* Wait for a timeout operation, return a promise that completes when the timeout is done
* Unlike most members of this class, this method is not abstract.
* @param timeoutInterval How long to wait. Units are not visible here (presumably milliseconds - confirm in the implementation)
*/
waitForTimeout(timeoutInterval: number): Promise<void>;
/**
* Counts the total amount of elements that match the selector
*/
abstract elementCount(querySelector: string): Promise<number>;
/**
* Gets all the option elements that can be found in the select element matching the query selector
* @param querySelector The query selector that targets the select element
* @returns A promise for an array with one { text, value } entry per option
*/
abstract getSelectOptions(querySelector: string): Promise<{
text: string;
value: string;
}[]>;
/**
* Get the entire html of the page and return it in a single string
*/
abstract getDocumentHtml(): Promise<string>;
/**
* Grab the raw page type to read from
*/
abstract raw(): PageType;
/** Frees up any resources */
abstract dispose(): Promise<void>;
}
/** WebScrapingEngine is also exposed as this module's default export, in addition to its named export */
export default WebScrapingEngine;