parsera-ts
Version:
Official TypeScript SDK for Parsera.org API - Extract structured data from any webpage
382 lines (381 loc) • 13.3 kB
JavaScript
/**
 * Client for the Parsera extraction API.
 *
 * Wraps `fetch` with a per-request timeout, a minimum spacing between
 * requests, retry with exponential backoff, and a small event emitter
 * that reports the lifecycle of each extraction.
 */
export class Parsera {
  /**
   * Creates a new Parsera client instance.
   *
   * @param config - Client configuration
   * @param config.apiKey - API key; must be a string of at least 32 characters
   * @param config.baseUrl - API base URL (default `https://api.parsera.org/v1`)
   * @param config.defaultProxyCountry - Proxy country used when a call does not supply one
   * @param config.timeout - Per-request timeout in milliseconds (default 30000)
   * @param config.retryOptions - `maxRetries` (default 3), `backoffFactor` (default 2),
   *   `initialDelay` in milliseconds (default 1000)
   * @throws {Error} When the API key fails validation
   *
   * @example
   * ```typescript
   * const parsera = new Parsera({
   *   apiKey: "your-api-key",
   *   timeout: 60000, // 60 second timeout
   *   retryOptions: {
   *     maxRetries: 3,
   *     backoffFactor: 2,
   *     initialDelay: 1000,
   *   }
   * });
   * ```
   */
  constructor({ apiKey, baseUrl = 'https://api.parsera.org/v1', defaultProxyCountry = 'UnitedStates', timeout = 30000, retryOptions = {} }) {
    this.lastRequestTime = 0;
    this.minRequestInterval = 100; // 100ms between requests
    this.eventHandlers = new Map();
    this.eventOptions = new Map();
    this.validateApiKey(apiKey);
    this.apiKey = apiKey;
    this.baseUrl = baseUrl;
    this.defaultProxyCountry = defaultProxyCountry;
    this.timeout = timeout;
    this.retryOptions = {
      maxRetries: retryOptions.maxRetries ?? 3,
      backoffFactor: retryOptions.backoffFactor ?? 2,
      initialDelay: retryOptions.initialDelay ?? 1000,
    };
  }

  /**
   * Checks that the API key is a non-empty string of at least 32 characters.
   *
   * @throws {Error} 'Invalid API key format' otherwise
   */
  validateApiKey(apiKey) {
    if (!apiKey || typeof apiKey !== 'string' || apiKey.length < 32) {
      throw new Error('Invalid API key format');
    }
  }

  /**
   * Checks that `url` can be parsed by the `URL` constructor.
   *
   * @throws {Error} 'Invalid URL format' otherwise
   */
  validateUrl(url) {
    try {
      new URL(url);
    } catch {
      throw new Error('Invalid URL format');
    }
  }

  /**
   * Waits until at least `minRequestInterval` milliseconds have passed since
   * the previously recorded request, then records the current time.
   */
  async enforceRateLimit() {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequestTime;
    if (timeSinceLastRequest < this.minRequestInterval) {
      await new Promise((resolve) => setTimeout(resolve, this.minRequestInterval - timeSinceLastRequest));
    }
    this.lastRequestTime = Date.now();
  }

  /**
   * Calls `fetch` with an internal abort controller that fires either when the
   * timeout elapses or when the caller-supplied `signal` aborts.
   *
   * @param url - Request URL
   * @param options - `fetch` options plus optional `timeout` (ms) and `signal`
   * @returns The `fetch` response
   * @throws {Error} An error named 'TimeoutError' with message 'Request timed out'
   *   when the request is aborted (by the timeout or by the caller's signal);
   *   any other `fetch` failure is rethrown unchanged
   */
  async fetchWithTimeout(url, options = {}) {
    const { timeout = this.timeout, signal: callerSignal, ...fetchOptions } = options;
    const controller = new AbortController();
    const timeoutId = setTimeout(() => {
      controller.abort();
    }, timeout);
    const onCallerAbort = () => {
      clearTimeout(timeoutId);
      controller.abort();
    };
    if (callerSignal) {
      if (callerSignal.aborted) {
        // An already-aborted signal never dispatches 'abort' again, so a
        // listener alone would let the request go out; cancel up front.
        onCallerAbort();
      } else {
        callerSignal.addEventListener('abort', onCallerAbort, { once: true });
      }
    }
    try {
      return await fetch(url, {
        ...fetchOptions,
        signal: controller.signal,
      });
    } catch (error) {
      if (error instanceof Error && error.name === 'AbortError') {
        // Caller-initiated aborts are reported the same way as timeouts;
        // extract() relies on this exact message to rethrow unwrapped.
        const timeoutError = new Error('Request timed out');
        timeoutError.name = 'TimeoutError';
        throw timeoutError;
      }
      throw error;
    } finally {
      clearTimeout(timeoutId);
      // Detach so a long-lived caller signal does not accumulate listeners.
      callerSignal?.removeEventListener('abort', onCallerAbort);
    }
  }

  /**
   * Runs `requestFn`, retrying with exponential backoff on HTTP 429 responses
   * and on errors accepted by `isRetryableError`, up to `maxRetries` times.
   *
   * Emits 'rateLimit' and 'request:retry' before a 429 retry, 'timeout' when
   * the error is an abort/timeout, 'request:error' for every caught Error, and
   * 'request:retry' before an error retry. Retry events carry the attempt
   * index both as `event.retryCount` and in `event.data.retryCount`.
   *
   * @param requestFn - Function returning a promise of a response
   * @param retryCount - Zero-based index of the current attempt
   * @returns The first response that is not retried
   */
  async retryableRequest(requestFn, retryCount = 0) {
    const { maxRetries, backoffFactor, initialDelay } = this.retryOptions;
    const canRetry = retryCount < maxRetries;
    const backoff = () => new Promise((resolve) => {
      setTimeout(resolve, initialDelay * Math.pow(backoffFactor, retryCount));
    });
    try {
      await this.enforceRateLimit();
      const response = await requestFn();
      if (response.status === 429 && canRetry) {
        await this.emit('rateLimit', { retryCount }, undefined, retryCount);
        await this.emit('request:retry', { retryCount }, undefined, retryCount);
        await backoff();
        // Returned without `await` on purpose: a failure of the next attempt
        // is handled by that attempt, not re-processed by this catch block.
        return this.retryableRequest(requestFn, retryCount + 1);
      }
      return response;
    } catch (error) {
      if (error instanceof Error) {
        // fetchWithTimeout rethrows aborts as 'TimeoutError'; accept both names
        // so the 'timeout' event actually fires for timed-out requests.
        if (error.name === 'AbortError' || error.name === 'TimeoutError') {
          await this.emit('timeout', undefined, error);
        }
        await this.emit('request:error', undefined, error);
        if (canRetry && this.isRetryableError(error)) {
          await this.emit('request:retry', { retryCount }, undefined, retryCount);
          await backoff();
          return this.retryableRequest(requestFn, retryCount + 1);
        }
      }
      throw error;
    }
  }

  /**
   * Decides whether an error is worth retrying, based on keywords in its
   * lower-cased message.
   *
   * @returns `true` when the message mentions a network, timeout, rate-limit,
   *   connection-reset or socket-hang-up condition; `false` otherwise
   */
  isRetryableError(error) {
    if (error instanceof Error) {
      const message = error.message.toLowerCase();
      return (message.includes('network') ||
        message.includes('timeout') ||
        message.includes('rate limit') ||
        message.includes('too many requests') ||
        message.includes('econnreset') ||
        message.includes('socket hang up'));
    }
    return false;
  }

  /**
   * Converts a Record<string, string> to ParseraAttribute[]
   */
  convertToAttributes(attrs) {
    return Object.entries(attrs).map(([name, description]) => ({
      name,
      description,
    }));
  }

  /**
   * Registers an event handler for a specific event type
   *
   * Options are stored per event type, so the options of the most recent
   * registration apply to every handler of that type.
   *
   * @param eventType - Type of event to listen for
   * @param handler - Function to handle the event
   * @param options - Configuration options for event handling:
   *   `async` (default false) dispatches handlers without awaiting them;
   *   `catchErrors` (default true) suppresses errors thrown by handlers
   *
   * @example
   * ```typescript
   * parsera.on('extract:complete', (event) => {
   *   console.log(`Extraction completed with ${event.data.data.length} items`);
   * });
   *
   * parsera.on('request:retry', (event) => {
   *   console.log(`Retrying request (attempt ${event.retryCount})`);
   * });
   *
   * // Custom event
   * parsera.on('my:custom:event', (event) => {
   *   console.log('Custom event data:', event.data);
   * });
   * ```
   */
  on(eventType, handler, options = {}) {
    if (!this.eventHandlers.has(eventType)) {
      this.eventHandlers.set(eventType, new Set());
    }
    this.eventHandlers.get(eventType).add(handler);
    this.eventOptions.set(eventType, {
      async: options.async ?? false,
      catchErrors: options.catchErrors ?? true,
    });
  }

  /**
   * Removes an event handler for a specific event type
   */
  off(eventType, handler) {
    const handlers = this.eventHandlers.get(eventType);
    if (handlers) {
      handlers.delete(handler);
    }
  }

  /**
   * Removes all event handlers for a specific event type, or for every event
   * type when none is given. The stored dispatch options are dropped as well.
   */
  removeAllListeners(eventType) {
    if (eventType) {
      this.eventHandlers.delete(eventType);
      this.eventOptions.delete(eventType);
    } else {
      this.eventHandlers.clear();
      this.eventOptions.clear();
    }
  }

  /**
   * Dispatches an event to every handler registered for `eventType`.
   *
   * The event object always has `type` and `timestamp`; `data`, `error` and
   * `retryCount` are included only when provided. Does nothing when no
   * handler is registered.
   *
   * @param eventType - Event name
   * @param data - Optional payload
   * @param error - Optional error associated with the event
   * @param retryCount - Optional zero-based retry attempt index
   */
  async emit(eventType, data, error, retryCount) {
    const handlers = this.eventHandlers.get(eventType);
    const options = this.eventOptions.get(eventType) ?? { catchErrors: true, async: false };
    if (!handlers?.size) {
      return;
    }
    const event = {
      type: eventType,
      timestamp: Date.now(),
      ...(data !== undefined && { data }),
      ...(error && { error }),
      ...(retryCount !== undefined && { retryCount }),
    };
    const handleEvent = async (handler) => {
      try {
        await handler(event);
      } catch (handlerError) {
        if (!options.catchErrors) {
          throw handlerError;
        }
      }
    };
    if (options.async) {
      // Fire-and-forget: handler failures are intentionally not propagated.
      handlers.forEach((handler) => {
        handleEvent(handler).catch(() => { });
      });
    } else {
      await Promise.all(Array.from(handlers).map((handler) => handleEvent(handler)));
    }
  }

  /**
   * Extracts data from a webpage using the Parsera API.
   *
   * Emits 'extract:start' with the call options, 'extract:complete' with the
   * full response body on success, and 'extract:error' on failure inside the
   * request/parse phase. URL validation happens before that phase, so an
   * invalid URL throws 'Invalid URL format' directly.
   *
   * @param options - Configuration options for the extraction
   * @returns Promise resolving to an array of extracted data objects
   *
   * @throws {Error} When API key is invalid
   * @throws {Error} When URL is invalid
   * @throws {Error} When request times out
   * @throws {Error} When rate limit is exceeded (after retries)
   * @throws {Error} When no data is found
   *
   * @example
   * ```typescript
   * // Basic usage with attribute record
   * const results = await parsera.extract({
   *   url: "https://example.com/products",
   *   attributes: {
   *     title: "Extract the product title",
   *     price: "Get the product price",
   *   }
   * });
   *
   * // Advanced usage with all options
   * const results = await parsera.extract({
   *   url: "https://example.com/products",
   *   attributes: [
   *     { name: "title", description: "Extract the product title" },
   *     { name: "price", description: "Get the product price" }
   *   ],
   *   proxyCountry: "GB",
   *   cookies: [
   *     { name: "session", value: "abc123", sameSite: "Lax" }
   *   ],
   *   precisionMode: true,
   *   signal: abortController.signal
   * });
   *
   * // With request cancellation
   * const controller = new AbortController();
   * const promise = parsera.extract({
   *   url: "https://example.com",
   *   attributes: { title: "Extract the title" },
   *   signal: controller.signal
   * });
   *
   * // Cancel the request after 5 seconds
   * setTimeout(() => controller.abort(), 5000);
   * ```
   *
   * @example
   * // Example return value:
   * [
   *   {
   *     "title": "Product Name",
   *     "price": "$99.99"
   *   },
   *   {
   *     "title": "Another Product",
   *     "price": "$149.99"
   *   }
   * ]
   */
  async extract({ url, attributes, proxyCountry, cookies, precisionMode, signal }) {
    await this.emit('extract:start', {
      url,
      attributes,
      proxyCountry,
      cookies,
      precisionMode,
      signal,
    });
    this.validateUrl(url);
    try {
      const requestBody = {
        url,
        attributes: Array.isArray(attributes) ? attributes : this.convertToAttributes(attributes),
        proxy_country: proxyCountry || this.defaultProxyCountry,
      };
      if (cookies) {
        requestBody.cookies = cookies;
      }
      if (precisionMode) {
        requestBody.mode = 'precision';
      }
      const response = await this.retryableRequest(() => this.fetchWithTimeout(`${this.baseUrl}/extract`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'X-API-KEY': this.apiKey,
        },
        body: JSON.stringify(requestBody),
        signal,
      }));
      if (!response.ok) {
        // handleError always throws.
        await this.handleError(response);
      }
      const data = (await response.json());
      // `data` itself may be null when the body is the JSON literal `null`.
      if (!data?.data?.length) {
        throw new Error(data?.message ||
          'No data returned from Parsera API. Make sure the website contains the data and the attribute descriptions are clear.');
      }
      await this.emit('extract:complete', data);
      return data.data;
    } catch (error) {
      if (error instanceof Error) {
        await this.emit('extract:error', undefined, error);
        // Timeouts are surfaced as-is so callers can match on the message.
        if (error.message === 'Request timed out') {
          throw error;
        }
        // Keep the original error reachable for debugging.
        throw new Error(`Failed to extract data: ${error.message}`, { cause: error });
      }
      throw new Error('Failed to extract data: Unknown error');
    }
  }

  /**
   * Alias for extract method to match Python library interface.
   *
   * @see {@link extract} for full documentation and examples
   *
   * @example
   * ```typescript
   * const results = await parsera.run({
   *   url: "https://example.com",
   *   attributes: { title: "Extract the title" }
   * });
   * ```
   */
  async run(options) {
    return this.extract(options);
  }

  /**
   * Alias for extract method to match Python library interface.
   *
   * @see {@link extract} for full documentation and examples
   *
   * @example
   * ```typescript
   * const results = await parsera.arun({
   *   url: "https://example.com",
   *   attributes: { title: "Extract the title" }
   * });
   * ```
   */
  async arun(options) {
    return this.extract(options);
  }

  /**
   * Turns a non-OK response into a thrown Error with a status-specific message.
   *
   * The response body is read as JSON on a best-effort basis: when it is not
   * valid JSON the status-based message is still produced.
   *
   * @param response - The failed response
   * @throws {Error} Always
   */
  async handleError(response) {
    const status = response.status;
    let errorData;
    try {
      errorData = (await response.json());
    } catch {
      // Non-JSON error bodies must not mask the status-specific error below.
      errorData = undefined;
    }
    switch (status) {
      case 401:
        throw new Error('Invalid Parsera API key. Please check your credentials.');
      case 429:
        throw new Error('Rate limit exceeded. Please try again later.');
      case 400:
        throw new Error(`Bad request: ${errorData?.message || 'Unknown error'}`);
      default:
        throw new Error(`Parsera API error: ${errorData?.message || response.statusText}`);
    }
  }
}