UNPKG

parsera-ts

Version:

Official TypeScript SDK for Parsera.org API - Extract structured data from any webpage

382 lines (381 loc) 13.3 kB
/**
 * Client for the Parsera.org extraction API.
 *
 * Wraps `fetch` with a per-request timeout, a minimum spacing between
 * requests, retry with exponential backoff, and a small event emitter that
 * reports request/extraction lifecycle events.
 */
export class Parsera {
    /**
     * Creates a new Parsera client instance.
     *
     * @param options - Client configuration
     * @param options.apiKey - Parsera API key (string of at least 32 characters)
     * @param options.baseUrl - API base URL
     * @param options.defaultProxyCountry - Proxy country used when a request does not specify one
     * @param options.timeout - Per-request timeout in milliseconds
     * @param options.retryOptions - Retry tuning: `maxRetries`, `backoffFactor`, `initialDelay` (ms)
     *
     * @throws {Error} When the API key is missing or malformed
     *
     * @example
     * ```typescript
     * const parsera = new Parsera({
     *   apiKey: "your-api-key",
     *   timeout: 60000, // 60 second timeout
     *   retryOptions: {
     *     maxRetries: 3,
     *     backoffFactor: 2,
     *     initialDelay: 1000,
     *   }
     * });
     * ```
     */
    constructor({
        apiKey,
        baseUrl = 'https://api.parsera.org/v1',
        defaultProxyCountry = 'UnitedStates',
        timeout = 30000,
        retryOptions = {}
    } = {}) {
        // The `= {}` default makes `new Parsera()` fail with the API-key
        // error below instead of an opaque destructuring TypeError.
        this.lastRequestTime = 0;
        this.minRequestInterval = 100; // 100ms between requests
        this.eventHandlers = new Map();
        this.eventOptions = new Map();
        this.validateApiKey(apiKey);
        this.apiKey = apiKey;
        this.baseUrl = baseUrl;
        this.defaultProxyCountry = defaultProxyCountry;
        this.timeout = timeout;
        this.retryOptions = {
            maxRetries: retryOptions.maxRetries ?? 3,
            backoffFactor: retryOptions.backoffFactor ?? 2,
            initialDelay: retryOptions.initialDelay ?? 1000
        };
    }

    /**
     * Throws unless `apiKey` is a string of at least 32 characters.
     *
     * @throws {Error} 'Invalid API key format'
     */
    validateApiKey(apiKey) {
        if (!apiKey || typeof apiKey !== 'string' || apiKey.length < 32) {
            throw new Error('Invalid API key format');
        }
    }

    /**
     * Throws unless `url` can be parsed by the `URL` constructor.
     *
     * @throws {Error} 'Invalid URL format'
     */
    validateUrl(url) {
        try {
            new URL(url);
        }
        catch {
            throw new Error('Invalid URL format');
        }
    }

    /**
     * Resolves after `ms` milliseconds.
     */
    sleep(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }

    /**
     * Backoff delay (ms) before retry number `retryCount + 1`:
     * `initialDelay * backoffFactor ** retryCount`.
     */
    backoffDelay(retryCount) {
        const { initialDelay, backoffFactor } = this.retryOptions;
        return initialDelay * Math.pow(backoffFactor, retryCount);
    }

    /**
     * Waits until at least `minRequestInterval` ms have passed since the
     * previously scheduled request.
     */
    async enforceRateLimit() {
        const now = Date.now();
        // Reserve the slot synchronously, before the first await, so that
        // concurrent callers queue up behind each other instead of all
        // reading the same stale lastRequestTime.
        const scheduledTime = Math.max(now, this.lastRequestTime + this.minRequestInterval);
        this.lastRequestTime = scheduledTime;
        const waitMs = scheduledTime - now;
        if (waitMs > 0) {
            await this.sleep(waitMs);
        }
    }

    /**
     * Performs `fetch(url, options)` and aborts it after `options.timeout`
     * ms (defaults to the client timeout) or when `options.signal` aborts.
     *
     * @returns The fetch `Response`
     * @throws {Error} An error named 'TimeoutError' with message
     *   'Request timed out' when the request is aborted (by the timer or by
     *   the caller's signal); the underlying abort error is attached as
     *   `cause`. Any other fetch error is rethrown unchanged.
     */
    async fetchWithTimeout(url, options) {
        const { timeout = this.timeout, signal: callerSignal, ...fetchOptions } = options;
        const controller = new AbortController();
        const timeoutId = setTimeout(() => {
            controller.abort();
        }, timeout);
        const forwardAbort = () => {
            controller.abort();
        };
        if (callerSignal) {
            if (callerSignal.aborted) {
                // An already-aborted signal never dispatches 'abort' again,
                // so a listener alone would let the request go through.
                controller.abort();
            }
            else {
                callerSignal.addEventListener('abort', forwardAbort, { once: true });
            }
        }
        try {
            return await fetch(url, {
                ...fetchOptions,
                signal: controller.signal
            });
        }
        catch (error) {
            if (error instanceof Error && error.name === 'AbortError') {
                const timeoutError = new Error('Request timed out');
                timeoutError.name = 'TimeoutError';
                timeoutError.cause = error;
                throw timeoutError;
            }
            throw error;
        }
        finally {
            clearTimeout(timeoutId);
            // Detach from the caller's signal so a long-lived, reused signal
            // does not accumulate one listener per request.
            callerSignal?.removeEventListener('abort', forwardAbort);
        }
    }

    /**
     * Runs `requestFn`, retrying with exponential backoff when the response
     * status is 429 or when it throws an error accepted by
     * `isRetryableError`, up to `retryOptions.maxRetries` retries.
     *
     * Emits 'rateLimit', 'request:retry', 'timeout' and 'request:error'.
     * Retry-related events carry the attempt index both as
     * `event.retryCount` and as `event.data.retryCount`.
     *
     * @param requestFn - Function returning a promise of a Response-like object
     * @param retryCount - Number of retries already performed
     */
    async retryableRequest(requestFn, retryCount = 0) {
        try {
            await this.enforceRateLimit();
            const response = await requestFn();
            if (response.status === 429 && retryCount < this.retryOptions.maxRetries) {
                await this.emit('rateLimit', { retryCount }, undefined, retryCount);
                await this.emit('request:retry', { retryCount }, undefined, retryCount);
                await this.sleep(this.backoffDelay(retryCount));
                // Deliberately not awaited: failures of the nested attempt
                // are reported by that attempt, not re-handled here.
                return this.retryableRequest(requestFn, retryCount + 1);
            }
            return response;
        }
        catch (error) {
            if (error instanceof Error) {
                // fetchWithTimeout surfaces aborts as 'TimeoutError'; a raw
                // 'AbortError' can still come from a custom requestFn.
                if (error.name === 'AbortError' || error.name === 'TimeoutError') {
                    await this.emit('timeout', undefined, error);
                }
                await this.emit('request:error', undefined, error);
                if (retryCount < this.retryOptions.maxRetries && this.isRetryableError(error)) {
                    await this.emit('request:retry', { retryCount }, undefined, retryCount);
                    await this.sleep(this.backoffDelay(retryCount));
                    return this.retryableRequest(requestFn, retryCount + 1);
                }
            }
            throw error;
        }
    }

    /**
     * Returns true when the error message (case-insensitive) mentions one of
     * the transient-failure markers listed below.
     */
    isRetryableError(error) {
        if (!(error instanceof Error)) {
            return false;
        }
        const message = error.message.toLowerCase();
        return (message.includes('network') ||
            message.includes('timeout') ||
            message.includes('rate limit') ||
            message.includes('too many requests') ||
            message.includes('econnreset') ||
            message.includes('socket hang up'));
    }

    /**
     * Converts a Record<string, string> to ParseraAttribute[]
     */
    convertToAttributes(attrs) {
        return Object.entries(attrs).map(([name, description]) => ({
            name,
            description
        }));
    }

    /**
     * Registers an event handler for a specific event type.
     *
     * Options are stored per event type, so the options passed to the most
     * recent `on` call for a type apply to all handlers of that type.
     *
     * @param eventType - Type of event to listen for
     * @param handler - Function to handle the event
     * @param options - Configuration options for event handling
     * @param options.async - Fire handlers without awaiting them (default false)
     * @param options.catchErrors - Swallow handler errors (default true)
     *
     * @example
     * ```typescript
     * parsera.on('extract:complete', (event) => {
     *   console.log(`Extraction completed with ${event.data.length} items`);
     * });
     *
     * parsera.on('request:retry', (event) => {
     *   console.log(`Retrying request (attempt ${event.retryCount})`);
     * });
     *
     * // Custom event
     * parsera.on('my:custom:event', (event) => {
     *   console.log('Custom event data:', event.data);
     * });
     * ```
     */
    on(eventType, handler, options = {}) {
        if (!this.eventHandlers.has(eventType)) {
            this.eventHandlers.set(eventType, new Set());
        }
        this.eventHandlers.get(eventType).add(handler);
        this.eventOptions.set(eventType, {
            async: options.async ?? false,
            catchErrors: options.catchErrors ?? true
        });
    }

    /**
     * Removes an event handler for a specific event type
     */
    off(eventType, handler) {
        const handlers = this.eventHandlers.get(eventType);
        if (handlers) {
            handlers.delete(handler);
        }
    }

    /**
     * Removes all event handlers for a specific event type, or for every
     * event type when `eventType` is omitted. The stored per-type options
     * are dropped together with the handlers.
     */
    removeAllListeners(eventType) {
        if (eventType) {
            this.eventHandlers.delete(eventType);
            this.eventOptions.delete(eventType);
        }
        else {
            this.eventHandlers.clear();
            this.eventOptions.clear();
        }
    }

    /**
     * Dispatches an event to every handler registered for `eventType`.
     *
     * The event object always has `type` and `timestamp`; `data`, `error`
     * and `retryCount` are included only when provided. In async mode the
     * handlers are started without being awaited and their failures are
     * ignored; otherwise all handlers are awaited together.
     */
    async emit(eventType, data, error, retryCount) {
        const handlers = this.eventHandlers.get(eventType);
        const options = this.eventOptions.get(eventType) ?? {
            catchErrors: true,
            async: false
        };
        if (!handlers?.size)
            return;
        const event = {
            type: eventType,
            timestamp: Date.now(),
            ...(data !== undefined && { data }),
            ...(error && { error }),
            ...(retryCount !== undefined && { retryCount })
        };
        const handleEvent = async (handler) => {
            try {
                await handler(event);
            }
            catch (handlerError) {
                if (!options.catchErrors) {
                    throw handlerError;
                }
            }
        };
        if (options.async) {
            handlers.forEach((handler) => {
                // Fire-and-forget by design: async listeners must not be
                // able to fail or delay the request that emitted the event.
                handleEvent(handler).catch(() => { });
            });
        }
        else {
            await Promise.all(Array.from(handlers).map((handler) => handleEvent(handler)));
        }
    }

    /**
     * Extracts data from a webpage using the Parsera API.
     *
     * @param options - Configuration options for the extraction
     * @returns Promise resolving to an array of extracted data objects
     *
     * @throws {Error} When API key is invalid
     * @throws {Error} When URL is invalid
     * @throws {Error} When request times out
     * @throws {Error} When rate limit is exceeded (after retries)
     * @throws {Error} When no data is found
     *
     * @example
     * ```typescript
     * // Basic usage with attribute record
     * const results = await parsera.extract({
     *   url: "https://example.com/products",
     *   attributes: {
     *     title: "Extract the product title",
     *     price: "Get the product price",
     *   }
     * });
     *
     * // Advanced usage with all options
     * const results = await parsera.extract({
     *   url: "https://example.com/products",
     *   attributes: [
     *     { name: "title", description: "Extract the product title" },
     *     { name: "price", description: "Get the product price" }
     *   ],
     *   proxyCountry: "GB",
     *   cookies: [
     *     { name: "session", value: "abc123", sameSite: "Lax" }
     *   ],
     *   precisionMode: true,
     *   signal: abortController.signal
     * });
     *
     * // With request cancellation
     * const controller = new AbortController();
     * const promise = parsera.extract({
     *   url: "https://example.com",
     *   attributes: { title: "Extract the title" },
     *   signal: controller.signal
     * });
     *
     * // Cancel the request after 5 seconds
     * setTimeout(() => controller.abort(), 5000);
     * ```
     *
     * @example
     * // Example return value:
     * [
     *   {
     *     "title": "Product Name",
     *     "price": "$99.99"
     *   },
     *   {
     *     "title": "Another Product",
     *     "price": "$149.99"
     *   }
     * ]
     */
    async extract({ url, attributes, proxyCountry, cookies, precisionMode, signal }) {
        await this.emit('extract:start', {
            url,
            attributes,
            proxyCountry,
            cookies,
            precisionMode,
            signal
        });
        this.validateUrl(url);
        try {
            const requestBody = {
                url,
                attributes: Array.isArray(attributes)
                    ? attributes
                    : this.convertToAttributes(attributes),
                proxy_country: proxyCountry || this.defaultProxyCountry
            };
            if (cookies) {
                requestBody.cookies = cookies;
            }
            if (precisionMode) {
                requestBody.mode = 'precision';
            }
            const response = await this.retryableRequest(() => this.fetchWithTimeout(`${this.baseUrl}/extract`, {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                    'X-API-KEY': this.apiKey
                },
                body: JSON.stringify(requestBody),
                signal
            }));
            if (!response.ok) {
                await this.handleError(response);
            }
            const data = (await response.json());
            if (!data.data?.length) {
                throw new Error(data.message ||
                    'No data returned from Parsera API. Make sure the website contains the data and the attribute descriptions are clear.');
            }
            await this.emit('extract:complete', data);
            return data.data;
        }
        catch (error) {
            if (error instanceof Error) {
                await this.emit('extract:error', undefined, error);
                // Timeouts are passed through untouched so callers can match
                // on the exact message / TimeoutError name.
                if (error.message === 'Request timed out') {
                    throw error;
                }
                const wrapped = new Error(`Failed to extract data: ${error.message}`);
                // Keep the original error (and its stack) reachable.
                wrapped.cause = error;
                throw wrapped;
            }
            throw new Error('Failed to extract data: Unknown error');
        }
    }

    /**
     * Alias for extract method to match Python library interface.
     *
     * @see {@link extract} for full documentation and examples
     *
     * @example
     * ```typescript
     * const results = await parsera.run({
     *   url: "https://example.com",
     *   attributes: { title: "Extract the title" }
     * });
     * ```
     */
    async run(options) {
        return this.extract(options);
    }

    /**
     * Alias for extract method to match Python library interface.
     *
     * @see {@link extract} for full documentation and examples
     *
     * @example
     * ```typescript
     * const results = await parsera.arun({
     *   url: "https://example.com",
     *   attributes: { title: "Extract the title" }
     * });
     * ```
     */
    async arun(options) {
        return this.extract(options);
    }

    /**
     * Converts a non-OK API response into a thrown Error whose message
     * depends on the HTTP status. Always throws.
     *
     * @throws {Error} Status-specific message for 401, 429 and 400; a generic
     *   'Parsera API error: …' message otherwise
     */
    async handleError(response) {
        const status = response.status;
        let errorData = null;
        try {
            errorData = (await response.json());
        }
        catch {
            // Error bodies are not guaranteed to be JSON (e.g. an HTML page
            // from a gateway); fall back to the status-based message rather
            // than surfacing a JSON parse error.
            errorData = null;
        }
        switch (status) {
            case 401:
                throw new Error('Invalid Parsera API key. Please check your credentials.');
            case 429:
                throw new Error('Rate limit exceeded. Please try again later.');
            case 400:
                throw new Error(`Bad request: ${errorData?.message || 'Unknown error'}`);
            default:
                throw new Error(`Parsera API error: ${errorData?.message || response.statusText}`);
        }
    }
}