UNPKG

parsera-ts

Version:

Official TypeScript SDK for Parsera.org API - Extract structured data from any webpage

300 lines (299 loc) 9.6 kB
/**
 * Retry policy applied to failed API requests.
 */
export interface ParseraRetryOptions {
  /**
   * Maximum number of retry attempts for failed requests
   * Applies to network errors, timeouts, and rate limits
   * @default 3
   */
  maxRetries?: number;
  /**
   * Multiplier for exponential backoff between retries
   * Each retry will wait: initialDelay * (backoffFactor ^ retryCount)
   * Example: 1000ms, then 2000ms, then 4000ms
   * @default 2
   */
  backoffFactor?: number;
  /**
   * Initial delay (in milliseconds) before the first retry
   * This delay will be multiplied by backoffFactor for subsequent retries
   * @default 1000 (1 second)
   */
  initialDelay?: number;
}

/**
 * A single field to extract from a webpage.
 */
export interface ParseraAttribute {
  /**
   * Name of the attribute to extract
   * This will be the key in the returned data object
   */
  name: string;
  /**
   * Natural language description of what to extract
   * Be as specific as possible about the data you want
   */
  description: string;
}

/**
 * A cookie sent along with an extraction request.
 *
 * Because of the string index signature, every property value on a cookie
 * object must be a string; `sameSite` is additionally required and
 * restricted to the three literal values below.
 */
export interface ParseraCookie {
  /**
   * Cookie properties as key-value pairs
   */
  [key: string]: string;
  /**
   * SameSite attribute for the cookie
   * Controls how the cookie behaves with cross-site requests
   */
  sameSite: 'None' | 'Lax' | 'Strict';
}

/**
 * Options accepted by the {@link Parsera} constructor.
 */
export interface ParseraOptions {
  /**
   * Your Parsera API key
   * Required for authentication with the API
   */
  apiKey: string;
  /**
   * Base URL for the Parsera API
   * Only change this if you're using a custom API endpoint
   * @default "https://api.parsera.org/v1"
   */
  baseUrl?: string;
  /**
   * Default country code for proxy servers
   * Used when no specific proxyCountry is provided in the request
   * @example "random" | "UnitedStates" | "UnitedKingdom" | "Germany" | "Japan" | "France" | "Canada"
   * @default "UnitedStates"
   * @see https://api.parsera.org/v1/proxy-countries for full list of supported countries
   */
  defaultProxyCountry?: string;
  /**
   * Maximum time (in milliseconds) to wait for each API request
   * If a request takes longer, it will be aborted and retried
   * @default 30000 (30 seconds)
   */
  timeout?: number;
  /**
   * Configuration for retry behavior on failed requests
   * @see ParseraRetryOptions
   */
  retryOptions?: ParseraRetryOptions;
}

/**
 * Per-request options for {@link Parsera.extract}.
 */
export interface ExtractOptions {
  /**
   * URL of the webpage to extract data from
   * Must be a valid HTTP/HTTPS URL
   */
  url: string;
  /**
   * Attributes to extract from the webpage
   * Can be either an array of ParseraAttribute objects
   * or a Record of name-description pairs
   */
  attributes: ParseraAttribute[] | Record<string, string>;
  /**
   * Country code for proxy server location
   * Overrides the defaultProxyCountry setting
   * @example "UnitedStates" | "UnitedKingdom" | "Germany" | "Japan"
   */
  proxyCountry?: string;
  /**
   * Cookies to be sent with the request
   * Useful for accessing pages that require authentication
   */
  cookies?: ParseraCookie[];
  /**
   * Enable precision mode for more accurate extractions
   * May increase processing time
   * @default false
   */
  precisionMode?: boolean;
  /**
   * AbortSignal for request cancellation
   * Allows the request to be aborted if it exceeds timeout
   * or if cancellation is needed
   */
  signal?: AbortSignal;
}

/**
 * Name of an event that can be subscribed to with {@link Parsera.on}.
 *
 * The string literals list the built-in event names. The trailing `string`
 * member deliberately admits arbitrary custom event names as well, so the
 * type as a whole accepts any string.
 */
export type ParseraEventType =
  | 'request:start'
  | 'request:end'
  | 'request:retry'
  | 'request:error'
  | 'extract:start'
  | 'extract:complete'
  | 'extract:error'
  | 'rateLimit'
  | 'timeout'
  | string;

/**
 * Object passed to every event handler.
 *
 * @typeParam T - Shape of the optional `data` payload; `unknown` unless the
 * subscriber specifies it.
 */
export interface ParseraEvent<T = unknown> {
  /** Name of the event being delivered */
  type: ParseraEventType;
  /** Numeric timestamp of the event (NOTE(review): presumably epoch milliseconds — confirm against the implementation) */
  timestamp: number;
  /** Optional event payload */
  data?: T;
  /** Optional error associated with the event */
  error?: Error;
  /** Optional retry attempt counter */
  retryCount?: number;
}

/**
 * Callback invoked with a {@link ParseraEvent}; may be synchronous or
 * return a promise.
 */
export type ParseraEventHandler<T = unknown> = (event: ParseraEvent<T>) => void | Promise<void>;

/**
 * Options controlling how a registered event handler is invoked.
 */
export interface ParseraEventOptions {
  /**
   * Whether to handle the event asynchronously
   * When true, event handlers won't block the main execution
   * @default false
   */
  async?: boolean;
  /**
   * Whether to catch errors in event handlers
   * When true, errors in handlers won't affect the main execution
   * @default true
   */
  catchErrors?: boolean;
}

/**
 * Client for the Parsera API.
 *
 * Exposes `extract` (plus the `run` / `arun` aliases) for data extraction
 * and `on` / `off` / `removeAllListeners` for event subscriptions.
 */
export declare class Parsera {
  private readonly apiKey;
  private readonly baseUrl;
  private readonly defaultProxyCountry;
  private readonly timeout;
  private readonly retryOptions;
  private lastRequestTime;
  private readonly minRequestInterval;
  private readonly eventHandlers;
  private readonly eventOptions;
  /**
   * Creates a new Parsera client instance.
   *
   * @example
   * ```typescript
   * const parsera = new Parsera({
   *   apiKey: "your-api-key",
   *   timeout: 60000, // 60 second timeout
   *   retryOptions: {
   *     maxRetries: 3,
   *     backoffFactor: 2,
   *     initialDelay: 1000,
   *   }
   * });
   * ```
   */
  constructor({ apiKey, baseUrl, defaultProxyCountry, timeout, retryOptions }: ParseraOptions);
  private validateApiKey;
  private validateUrl;
  private enforceRateLimit;
  private fetchWithTimeout;
  private retryableRequest;
  private isRetryableError;
  /**
   * Converts a Record<string, string> to ParseraAttribute[]
   */
  private convertToAttributes;
  /**
   * Registers an event handler for a specific event type
   *
   * @param eventType - Type of event to listen for
   * @param handler - Function to handle the event
   * @param options - Configuration options for event handling
   *
   * @example
   * ```typescript
   * // The type argument states the payload shape the caller expects;
   * // without it `event.data` is `unknown` and cannot be inspected.
   * parsera.on<unknown[]>('extract:complete', (event) => {
   *   console.log(`Extraction completed with ${event.data?.length ?? 0} items`);
   * });
   *
   * parsera.on('request:retry', (event) => {
   *   console.log(`Retrying request (attempt ${event.retryCount})`);
   * });
   *
   * // Custom event
   * parsera.on('my:custom:event', (event) => {
   *   console.log('Custom event data:', event.data);
   * });
   * ```
   */
  on<T = unknown>(eventType: ParseraEventType, handler: ParseraEventHandler<T>, options?: ParseraEventOptions): void;
  /**
   * Removes an event handler for a specific event type
   *
   * @param eventType - Type of event the handler was registered for
   * @param handler - The handler to remove
   */
  off<T = unknown>(eventType: ParseraEventType, handler: ParseraEventHandler<T>): void;
  /**
   * Removes all event handlers for a specific event type
   *
   * @param eventType - Event type whose handlers should be removed.
   * NOTE(review): the parameter is optional; omitting it presumably clears
   * the handlers of every event type — confirm against the implementation.
   */
  removeAllListeners(eventType?: ParseraEventType): void;
  private emit;
  /**
   * Extracts data from a webpage using the Parsera API.
   *
   * @param options - Configuration options for the extraction
   * @returns Promise resolving to an array of extracted data objects
   *
   * @throws {Error} When API key is invalid
   * @throws {Error} When URL is invalid
   * @throws {Error} When request times out
   * @throws {Error} When rate limit is exceeded (after retries)
   * @throws {Error} When no data is found
   *
   * @example
   * ```typescript
   * // Basic usage with attribute record
   * const results = await parsera.extract({
   *   url: "https://example.com/products",
   *   attributes: {
   *     title: "Extract the product title",
   *     price: "Get the product price",
   *   }
   * });
   *
   * // Advanced usage with all options
   * const detailedResults = await parsera.extract({
   *   url: "https://example.com/products",
   *   attributes: [
   *     { name: "title", description: "Extract the product title" },
   *     { name: "price", description: "Get the product price" }
   *   ],
   *   proxyCountry: "UnitedKingdom",
   *   cookies: [
   *     { name: "session", value: "abc123", sameSite: "Lax" }
   *   ],
   *   precisionMode: true,
   *   signal: abortController.signal
   * });
   *
   * // With request cancellation
   * const controller = new AbortController();
   * const promise = parsera.extract({
   *   url: "https://example.com",
   *   attributes: { title: "Extract the title" },
   *   signal: controller.signal
   * });
   *
   * // Cancel the request after 5 seconds
   * setTimeout(() => controller.abort(), 5000);
   * ```
   *
   * @example
   * // Example return value:
   * [
   *   {
   *     "title": "Product Name",
   *     "price": "$99.99"
   *   },
   *   {
   *     "title": "Another Product",
   *     "price": "$149.99"
   *   }
   * ]
   */
  extract({ url, attributes, proxyCountry, cookies, precisionMode, signal }: ExtractOptions): Promise<Record<string, string>[]>;
  /**
   * Alias for extract method to match Python library interface.
   *
   * @see {@link extract} for full documentation and examples
   *
   * @example
   * ```typescript
   * const results = await parsera.run({
   *   url: "https://example.com",
   *   attributes: { title: "Extract the title" }
   * });
   * ```
   */
  run(options: ExtractOptions): Promise<Record<string, string>[]>;
  /**
   * Alias for extract method to match Python library interface.
   *
   * @see {@link extract} for full documentation and examples
   *
   * @example
   * ```typescript
   * const results = await parsera.arun({
   *   url: "https://example.com",
   *   attributes: { title: "Extract the title" }
   * });
   * ```
   */
  arun(options: ExtractOptions): Promise<Record<string, string>[]>;
  private handleError;
}