// firecrawl: JavaScript/TypeScript SDK for the Firecrawl API, as published on UNPKG.

import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios";
import * as zt from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
import { TypedEventTarget } from "typescript-event-target";

/**
 * Configuration interface for FirecrawlApp.
 * @param apiKey - Optional API key for authentication.
 * @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
 */
export interface FirecrawlAppConfig {
  apiKey?: string | null;
  apiUrl?: string | null;
}

/**
 * Metadata for a Firecrawl document.
 * Includes various optional properties for document metadata.
 */
export interface FirecrawlDocumentMetadata {
  title?: string;
  description?: string;
  language?: string;
  keywords?: string;
  robots?: string;
  ogTitle?: string;
  ogDescription?: string;
  ogUrl?: string;
  ogImage?: string;
  ogAudio?: string;
  ogDeterminer?: string;
  ogLocale?: string;
  ogLocaleAlternate?: string[];
  ogSiteName?: string;
  ogVideo?: string;
  dctermsCreated?: string;
  dcDateCreated?: string;
  dcDate?: string;
  dctermsType?: string;
  dcType?: string;
  dctermsAudience?: string;
  dctermsSubject?: string;
  dcSubject?: string;
  dcDescription?: string;
  dctermsKeywords?: string;
  modifiedTime?: string;
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;
  sourceURL?: string;
  statusCode?: number;
  error?: string;
  proxyUsed?: "basic" | "stealth";
  cacheState?: "miss" | "hit";
  cachedAt?: string;
  [key: string]: any; // Allows for additional metadata properties not explicitly defined.
}

/**
 * Document interface for Firecrawl.
 * Represents a document retrieved or processed by Firecrawl.
 */
export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | never) = never> {
  url?: string;
  markdown?: string;
  html?: string;
  rawHtml?: string;
  links?: string[];
  extract?: T;
  json?: T;
  screenshot?: string;
  metadata?: FirecrawlDocumentMetadata;
  actions: ActionsSchema;
  changeTracking?: {
    previousScrapeAt: string | null;
    changeStatus: "new" | "same" | "changed" | "removed";
    visibility: "visible" | "hidden";
    diff?: {
      text: string;
      json: {
        files: Array<{
          from: string | null;
          to: string | null;
          chunks: Array<{
            content: string;
            changes: Array<{
              type: string;
              normal?: boolean;
              ln?: number;
              ln1?: number;
              ln2?: number;
              content: string;
            }>;
          }>;
        }>;
      };
    };
    json?: any;
  };
  // v1 search only
  title?: string;
  description?: string;
}
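
// Illustrative sketch (not part of the SDK source): a small helper that
// consumes a FirecrawlDocument, preferring metadata over raw fields. The
// function name is hypothetical.
//
//   function summarize(doc: FirecrawlDocument): string {
//     const title = doc.metadata?.title ?? doc.url ?? "untitled";
//     return `${title}: ${doc.markdown?.slice(0, 80) ?? "(no markdown)"}`;
//   }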

/**
 * Parameters for scraping operations.
 * Defines the options and configurations available for scraping web content.
 */
export interface CrawlScrapeOptions {
  formats?: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json" | "changeTracking")[];
  headers?: Record<string, string>;
  includeTags?: string[];
  excludeTags?: string[];
  onlyMainContent?: boolean;
  waitFor?: number;
  timeout?: number;
  location?: {
    country?: string;
    languages?: string[];
  };
  mobile?: boolean;
  skipTlsVerification?: boolean;
  removeBase64Images?: boolean;
  blockAds?: boolean;
  proxy?: "basic" | "stealth" | "auto";
  storeInCache?: boolean;
  maxAge?: number;
  parsePDF?: boolean;
}

export type Action = {
  type: "wait",
  milliseconds?: number,
  selector?: string,
} | {
  type: "click",
  selector: string,
  all?: boolean,
} | {
  type: "screenshot",
  fullPage?: boolean,
  quality?: number,
} | {
  type: "write",
  text: string,
} | {
  type: "press",
  key: string,
} | {
  type: "scroll",
  direction?: "up" | "down",
  selector?: string,
} | {
  type: "scrape",
} | {
  type: "executeJavascript",
  script: string,
};

export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema extends (Action[] | undefined) = undefined> extends CrawlScrapeOptions {
  extract?: {
    prompt?: string;
    schema?: LLMSchema;
    systemPrompt?: string;
  };
  jsonOptions?: {
    prompt?: string;
    schema?: LLMSchema;
    systemPrompt?: string;
  };
  changeTrackingOptions?: {
    prompt?: string;
    schema?: any;
    modes?: ("json" | "git-diff")[];
    tag?: string | null;
  };
  actions?: ActionsSchema;
  agent?: AgentOptions;
  zeroDataRetention?: boolean;
}

export interface ActionsResult {
  screenshots: string[];
  scrapes: ({
    url: string;
    html: string;
  })[];
  javascriptReturns: {
    type: string;
    value: unknown;
  }[];
}

/**
 * Response interface for scraping operations.
 * Defines the structure of the response received after a scraping operation.
 */
export interface ScrapeResponse<LLMResult = any, ActionsSchema extends (ActionsResult | never) = never> extends FirecrawlDocument<LLMResult, ActionsSchema> {
  success: true;
  warning?: string;
  error?: string;
}

/**
 * Parameters for crawling operations.
 * Includes options for both scraping and mapping during a crawl.
 */
export interface CrawlParams {
  includePaths?: string[];
  excludePaths?: string[];
  maxDepth?: number;
  maxDiscoveryDepth?: number;
  limit?: number;
  allowBackwardLinks?: boolean;
  crawlEntireDomain?: boolean;
  allowExternalLinks?: boolean;
  ignoreSitemap?: boolean;
  scrapeOptions?: CrawlScrapeOptions;
  webhook?: string | {
    url: string;
    headers?: Record<string, string>;
    metadata?: Record<string, string>;
    events?: ["completed", "failed", "page", "started"][number][];
  };
  deduplicateSimilarURLs?: boolean;
  ignoreQueryParameters?: boolean;
  regexOnFullURL?: boolean;
  /**
   * Delay in seconds between scrapes. This helps respect website rate limits.
   * If not provided, the crawler may use the robots.txt crawl delay if available.
   */
  delay?: number;
  allowSubdomains?: boolean;
  maxConcurrency?: number;
  zeroDataRetention?: boolean;
}

/**
 * Response interface for crawling operations.
 * Defines the structure of the response received after initiating a crawl.
 */
export interface CrawlResponse {
  id?: string;
  url?: string;
  success: true;
  error?: string;
}

/**
 * Response interface for batch scrape operations.
 * Defines the structure of the response received after initiating a batch scrape.
 */
export interface BatchScrapeResponse {
  id?: string;
  url?: string;
  success: true;
  error?: string;
  invalidURLs?: string[];
}
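
// Illustrative sketch (not part of the SDK source): a ScrapeParams value that
// combines page-interaction actions with markdown output. The selector and
// timings are hypothetical.
//
//   const params: ScrapeParams<any, Action[]> = {
//     formats: ["markdown", "screenshot"],
//     onlyMainContent: true,
//     actions: [
//       { type: "wait", milliseconds: 1000 },
//       { type: "click", selector: "#load-more" },
//       { type: "screenshot", fullPage: true },
//     ],
//   };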

/**
 * Response interface for job status checks.
 * Provides detailed status of a crawl job including progress and results.
 */
export interface CrawlStatusResponse {
  success: true;
  status: "scraping" | "completed" | "failed" | "cancelled";
  completed: number;
  total: number;
  creditsUsed: number;
  expiresAt: Date;
  next?: string;
  data: FirecrawlDocument<undefined>[];
};

/**
 * Response interface for batch scrape job status checks.
 * Provides detailed status of a batch scrape job including progress and results.
 */
export interface BatchScrapeStatusResponse {
  success: true;
  status: "scraping" | "completed" | "failed" | "cancelled";
  completed: number;
  total: number;
  creditsUsed: number;
  expiresAt: Date;
  next?: string;
  data: FirecrawlDocument<undefined>[];
};

/**
 * Parameters for mapping operations.
 * Defines options for mapping URLs during a crawl.
 */
export interface MapParams {
  search?: string;
  ignoreSitemap?: boolean;
  includeSubdomains?: boolean;
  sitemapOnly?: boolean;
  limit?: number;
  timeout?: number;
  useIndex?: boolean;
}

/**
 * Response interface for mapping operations.
 * Defines the structure of the response received after a mapping operation.
 */
export interface MapResponse {
  success: true;
  links?: string[];
  error?: string;
}

/**
 * Options for the agent used during scraping operations.
 */
export interface AgentOptions {
  model?: string;
  prompt?: string;
  sessionId?: string;
}

/**
 * Options for the agent used during extract operations.
 */
export interface AgentOptionsExtract {
  model?: string;
  sessionId?: string;
}

/**
 * Parameters for extracting information from URLs.
 * Defines options for extracting information from URLs.
 */
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
  prompt?: string;
  schema?: LLMSchema | object;
  systemPrompt?: string;
  allowExternalLinks?: boolean;
  enableWebSearch?: boolean;
  includeSubdomains?: boolean;
  origin?: string;
  showSources?: boolean;
  scrapeOptions?: CrawlScrapeOptions;
  agent?: AgentOptionsExtract;
}

/**
 * Response interface for extracting information from URLs.
 * Defines the structure of the response received after extracting information from URLs.
 */
export interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
  success: boolean;
  data: LLMSchema;
  error?: string;
  warning?: string;
  sources?: string[];
}

/**
 * Error response interface.
 * Defines the structure of the response received when an error occurs.
 */
export interface ErrorResponse {
  success: false;
  error: string;
}

/**
 * Custom error class for Firecrawl.
 * Extends the built-in Error class to include a status code.
 */
export class FirecrawlError extends Error {
  statusCode: number;
  details?: any;

  constructor(message: string, statusCode: number, details?: any) {
    super(message);
    this.statusCode = statusCode;
    this.details = details;
  }
}

/**
 * Parameters for search operations.
 * Defines options for searching and scraping search results.
 */
export interface SearchParams {
  limit?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  origin?: string;
  timeout?: number;
  scrapeOptions?: ScrapeParams;
}

/**
 * Response interface for search operations.
 * Defines the structure of the response received after a search operation.
 */
export interface SearchResponse {
  success: boolean;
  data: FirecrawlDocument<undefined>[];
  warning?: string;
  error?: string;
}
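
// Illustrative sketch (not part of the SDK source): narrowing the
// `MapResponse | ErrorResponse` union that SDK methods return, using the
// `success` discriminant. The helper name is hypothetical.
//
//   function linksOrThrow(res: MapResponse | ErrorResponse): string[] {
//     if (!res.success) throw new FirecrawlError(res.error, 500);
//     return res.links ?? [];
//   }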

/**
 * Response interface for crawl/batch scrape error monitoring.
 */
export interface CrawlErrorsResponse {
  /**
   * Scrapes that errored out + error details
   */
  errors: {
    id: string,
    timestamp?: string,
    url: string,
    error: string,
  }[];
  /**
   * URLs blocked by robots.txt
   */
  robotsBlocked: string[];
};

/**
 * Parameters for deep research operations.
 * Defines options for conducting deep research on a query.
 */
export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
  /**
   * Maximum depth of research iterations (1-10)
   * @default 7
   */
  maxDepth?: number;
  /**
   * Time limit in seconds (30-300)
   * @default 270
   */
  timeLimit?: number;
  /**
   * Maximum number of URLs to analyze (1-1000)
   * @default 20
   */
  maxUrls?: number;
  /**
   * The prompt to use for the final analysis
   */
  analysisPrompt?: string;
  /**
   * The system prompt to use for the research agent
   */
  systemPrompt?: string;
  /**
   * The formats to use for the final analysis
   */
  formats?: ("markdown" | "json")[];
  /**
   * The JSON options to use for the final analysis
   */
  jsonOptions?: {
    prompt?: string;
    schema?: LLMSchema;
    systemPrompt?: string;
  };
  /**
   * Experimental flag for streaming steps
   */
  // __experimental_streamSteps?: boolean;
}

/**
 * Response interface for deep research operations.
 */
export interface DeepResearchResponse {
  success: boolean;
  id: string;
}

/**
 * Status response interface for deep research operations.
 */
export interface DeepResearchStatusResponse {
  success: boolean;
  data: {
    finalAnalysis: string;
    activities: Array<{
      type: string;
      status: string;
      message: string;
      timestamp: string;
      depth: number;
    }>;
    sources: Array<{
      url: string;
      title: string;
      description: string;
    }>;
  };
  status: "processing" | "completed" | "failed";
  error?: string;
  expiresAt: string;
  currentDepth: number;
  maxDepth: number;
  activities: Array<{
    type: string;
    status: string;
    message: string;
    timestamp: string;
    depth: number;
  }>;
  sources: Array<{
    url: string;
    title: string;
    description: string;
  }>;
  summaries: string[];
}

/**
 * Parameters for LLMs.txt generation operations.
 */
export interface GenerateLLMsTextParams {
  /**
   * Maximum number of URLs to process (1-100)
   * @default 10
   */
  maxUrls?: number;
  /**
   * Whether to show the full LLMs-full.txt in the response
   * @default false
   */
  showFullText?: boolean;
  /**
   * Whether to use cached content if available
   * @default true
   */
  cache?: boolean;
  /**
   * Experimental flag for streaming
   */
  __experimental_stream?: boolean;
}

/**
 * Response interface for LLMs.txt generation operations.
 */
export interface GenerateLLMsTextResponse {
  success: boolean;
  id: string;
}

/**
 * Status response interface for LLMs.txt generation operations.
 */
export interface GenerateLLMsTextStatusResponse {
  success: boolean;
  data: {
    llmstxt: string;
    llmsfulltxt?: string;
  };
  status: "processing" | "completed" | "failed";
  error?: string;
  expiresAt: string;
}
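
// Illustrative sketch (not part of the SDK source): a DeepResearchParams
// value requesting JSON output. The schema shape is hypothetical.
//
//   import { z } from "zod";
//
//   const researchParams: DeepResearchParams = {
//     maxDepth: 3,
//     timeLimit: 120,
//     maxUrls: 15,
//     formats: ["json"],
//     jsonOptions: { schema: z.object({ keyFindings: z.array(z.string()) }) },
//   };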

/**
 * Main class for interacting with the Firecrawl API.
 * Provides methods for scraping, searching, crawling, and mapping web content.
 */
export default class FirecrawlApp {
  public apiKey: string;
  public apiUrl: string;
  public version: string = "1.25.1";

  private isCloudService(url: string): boolean {
    return url.includes('api.firecrawl.dev');
  }

  private async getVersion(): Promise<string> {
    try {
      const packageJson = await import('../package.json', { assert: { type: 'json' } });
      return packageJson.default.version;
    } catch (error) {
      console.error("Error getting version:", error);
      return "1.25.1";
    }
  }

  private async init() {
    this.version = await this.getVersion();
  }

  /**
   * Initializes a new instance of the FirecrawlApp class.
   * @param config - Configuration options for the FirecrawlApp instance.
   */
  constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
    const baseUrl = apiUrl || "https://api.firecrawl.dev";
    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
      throw new FirecrawlError("No API key provided", 401);
    }
    this.apiKey = apiKey || '';
    this.apiUrl = baseUrl;
    this.init();
  }

  /**
   * Scrapes a URL using the Firecrawl API.
   * @param url - The URL to scrape.
   * @param params - Additional parameters for the scrape request.
   * @returns The response from the scrape operation.
   */
  async scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(
    url: string,
    params?: ScrapeParams<T, ActionsSchema>
  ): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse> {
    const headers: AxiosRequestHeaders = {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.apiKey}`,
    } as AxiosRequestHeaders;
    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
    if (jsonData?.extract?.schema) {
      let schema = jsonData.extract.schema;
      // Try parsing the schema as a Zod schema
      try {
        schema = zodToJsonSchema(schema);
      } catch (error) {
        // Not a Zod schema; assume it is already a plain JSON schema object.
      }
      jsonData = {
        ...jsonData,
        extract: {
          ...jsonData.extract,
          schema: schema,
        },
      };
    }
    if (jsonData?.jsonOptions?.schema) {
      let schema = jsonData.jsonOptions.schema;
      // Try parsing the schema as a Zod schema
      try {
        schema = zodToJsonSchema(schema);
      } catch (error) {
        // Not a Zod schema; assume it is already a plain JSON schema object.
      }
      jsonData = {
        ...jsonData,
        jsonOptions: {
          ...jsonData.jsonOptions,
          schema: schema,
        },
      };
    }
    try {
      const response: AxiosResponse = await axios.post(
        this.apiUrl + `/v1/scrape`,
        jsonData,
        { headers, timeout: params?.timeout !== undefined ? (params.timeout + 5000) : undefined },
      );
      if (response.status === 200) {
        const responseData = response.data;
        if (responseData.success) {
          return {
            success: true,
            warning: responseData.warning,
            error: responseData.error,
            ...responseData.data
          };
        } else {
          throw new FirecrawlError(`Failed to scrape URL. Error: ${responseData.error}`, response.status);
        }
      } else {
        this.handleError(response, "scrape URL");
      }
    } catch (error: any) {
      this.handleError(error.response, "scrape URL");
    }
    return { success: false, error: "Internal server error." };
  }
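
  // Usage sketch (illustrative, not part of the SDK source): scraping a page
  // to markdown. The URL and environment variable name are hypothetical.
  //
  //   const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
  //   const res = await app.scrapeUrl("https://firecrawl.dev", { formats: ["markdown"] });
  //   if (res.success) console.log(res.markdown);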

  /**
   * Searches using the Firecrawl API and optionally scrapes the results.
   * @param query - The search query string.
   * @param params - Optional parameters for the search request.
   * @returns The response from the search operation.
   */
  async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
    const headers: AxiosRequestHeaders = {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.apiKey}`,
    } as AxiosRequestHeaders;
    let jsonData: any = {
      query,
      limit: params?.limit ?? 5,
      tbs: params?.tbs,
      filter: params?.filter,
      lang: params?.lang ?? "en",
      country: params?.country ?? "us",
      location: params?.location,
      origin: `js-sdk@${this.version}`,
      timeout: params?.timeout ?? 60000,
      scrapeOptions: params?.scrapeOptions ?? { formats: [] },
    };
    if (jsonData?.scrapeOptions?.extract?.schema) {
      let schema = jsonData.scrapeOptions.extract.schema;
      // Try parsing the schema as a Zod schema
      try {
        schema = zodToJsonSchema(schema);
      } catch (error) {
        // Not a Zod schema; assume it is already a plain JSON schema object.
      }
      jsonData = {
        ...jsonData,
        scrapeOptions: {
          ...jsonData.scrapeOptions,
          extract: {
            ...jsonData.scrapeOptions.extract,
            schema: schema,
          },
        },
      };
    }
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/search`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        const responseData = response.data;
        if (responseData.success) {
          return {
            success: true,
            data: responseData.data as FirecrawlDocument<any>[],
            warning: responseData.warning,
          };
        } else {
          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
        }
      } else {
        this.handleError(response, "search");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error.", data: [] };
  }

  /**
   * Initiates a crawl job for a URL using the Firecrawl API.
   * @param url - The URL to crawl.
   * @param params - Additional parameters for the crawl request.
   * @param pollInterval - Time in seconds for job status checks.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns The response from the crawl operation.
   */
  async crawlUrl(
    url: string,
    params?: CrawlParams,
    pollInterval: number = 2,
    idempotencyKey?: string
  ): Promise<CrawlStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/crawl`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        const id: string = response.data.id;
        return this.monitorJobStatus(id, headers, pollInterval);
      } else {
        this.handleError(response, "start crawl job");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  async asyncCrawlUrl(
    url: string,
    params?: CrawlParams,
    idempotencyKey?: string
  ): Promise<CrawlResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/crawl`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start crawl job");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }
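
  // Usage sketch (illustrative, not part of the SDK source): a blocking crawl
  // that polls every 5 seconds. The URL is hypothetical.
  //
  //   const crawl = await app.crawlUrl("https://firecrawl.dev", { limit: 10 }, 5);
  //   if (crawl.success && crawl.status === "completed") {
  //     for (const doc of crawl.data) console.log(doc.metadata?.sourceURL);
  //   }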

  /**
   * Checks the status of a crawl job using the Firecrawl API.
   * @param id - The ID of the crawl operation.
   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
   * @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
   * @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
   * @param limit - How many entries to return. Only used when `getAllData = false`.
   * @returns The response containing the job status.
   */
  async checkCrawlStatus(id?: string, getAllData = false, nextURL?: string, skip?: number, limit?: number): Promise<CrawlStatusResponse | ErrorResponse> {
    if (!id) {
      throw new FirecrawlError("No crawl ID provided", 400);
    }
    const headers: AxiosRequestHeaders = this.prepareHeaders();
    const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/crawl/${id}`);
    if (skip !== undefined) {
      targetURL.searchParams.set("skip", skip.toString());
    }
    if (limit !== undefined) {
      targetURL.searchParams.set("limit", limit.toString());
    }
    try {
      const response: AxiosResponse = await this.getRequest(
        targetURL.href,
        headers
      );
      if (response.status === 200) {
        let allData = response.data.data;
        if (getAllData && response.data.status === "completed") {
          let statusData = response.data;
          if ("data" in statusData) {
            let data = statusData.data;
            while (typeof statusData === 'object' && 'next' in statusData) {
              if (data.length === 0) {
                break;
              }
              statusData = (await this.getRequest(statusData.next, headers)).data;
              data = data.concat(statusData.data);
            }
            allData = data;
          }
        }
        let resp: CrawlStatusResponse | ErrorResponse = {
          success: response.data.success,
          status: response.data.status,
          total: response.data.total,
          completed: response.data.completed,
          creditsUsed: response.data.creditsUsed,
          next: getAllData ? undefined : response.data.next,
          expiresAt: new Date(response.data.expiresAt),
          data: allData
        };
        if (!response.data.success && response.data.error) {
          resp = {
            ...resp,
            success: false,
            error: response.data.error
          } as ErrorResponse;
        }
        if (response.data.next) {
          (resp as CrawlStatusResponse).next = response.data.next;
        }
        return resp;
      } else {
        this.handleError(response, "check crawl status");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Returns information about crawl errors.
   * @param id - The ID of the crawl operation.
   * @returns Information about crawl errors.
   */
  async checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.deleteRequest(
        `${this.apiUrl}/v1/crawl/${id}/errors`,
        headers
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "check crawl errors");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }
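
  // Usage sketch (illustrative, not part of the SDK source): manual
  // pagination over crawl results with `next`, versus fetching everything at
  // once with `getAllData = true`. The `crawlId` value is hypothetical.
  //
  //   let page = await app.checkCrawlStatus(crawlId, false, undefined, 0, 100);
  //   while (page.success && page.next) {
  //     page = await app.checkCrawlStatus(crawlId, false, page.next);
  //   }
  //   // or simply:
  //   const all = await app.checkCrawlStatus(crawlId, true);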

  /**
   * Cancels a crawl job using the Firecrawl API.
   * @param id - The ID of the crawl operation.
   * @returns The response from the cancel crawl operation.
   */
  async cancelCrawl(id: string): Promise<ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.deleteRequest(
        `${this.apiUrl}/v1/crawl/${id}`,
        headers
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "cancel crawl job");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Initiates a crawl job and returns a CrawlWatcher to monitor the job via WebSocket.
   * @param url - The URL to crawl.
   * @param params - Additional parameters for the crawl request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns A CrawlWatcher instance to monitor the crawl job.
   */
  async crawlUrlAndWatch(
    url: string,
    params?: CrawlParams,
    idempotencyKey?: string,
  ) {
    const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
    if (crawl.success && crawl.id) {
      const id = crawl.id;
      return new CrawlWatcher(id, this);
    }
    throw new FirecrawlError("Crawl job failed to start", 400);
  }

  /**
   * Maps a URL using the Firecrawl API.
   * @param url - The URL to map.
   * @param params - Additional parameters for the map request.
   * @returns The response from the map operation.
   */
  async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/map`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        return response.data as MapResponse;
      } else {
        this.handleError(response, "map");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }
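
  // Usage sketch (illustrative, not part of the SDK source): mapping a site
  // to discover URLs that match a search term. The domain is hypothetical.
  //
  //   const map = await app.mapUrl("https://firecrawl.dev", { search: "docs", limit: 100 });
  //   if (map.success) console.log(map.links?.length, "links found");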

  /**
   * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
   * @param urls - The URLs to scrape.
   * @param params - Additional parameters for the scrape request.
   * @param pollInterval - Time in seconds for job status checks.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @param webhook - Optional webhook for the batch scrape.
   * @param ignoreInvalidURLs - Optional flag to ignore invalid URLs.
   * @returns The response from the batch scrape operation.
   */
  async batchScrapeUrls(
    urls: string[],
    params?: ScrapeParams,
    pollInterval: number = 2,
    idempotencyKey?: string,
    webhook?: CrawlParams["webhook"],
    ignoreInvalidURLs?: boolean,
    maxConcurrency?: number,
  ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: any = { urls, webhook, ignoreInvalidURLs, maxConcurrency, ...params, origin: `js-sdk@${this.version}` };
    if (jsonData?.extract?.schema) {
      let schema = jsonData.extract.schema;
      // Try parsing the schema as a Zod schema
      try {
        schema = zodToJsonSchema(schema);
      } catch (error) {
        // Not a Zod schema; assume it is already a plain JSON schema object.
      }
      jsonData = {
        ...jsonData,
        extract: {
          ...jsonData.extract,
          schema: schema,
        },
      };
    }
    if (jsonData?.jsonOptions?.schema) {
      let schema = jsonData.jsonOptions.schema;
      // Try parsing the schema as a Zod schema
      try {
        schema = zodToJsonSchema(schema);
      } catch (error) {
        // Not a Zod schema; assume it is already a plain JSON schema object.
      }
      jsonData = {
        ...jsonData,
        jsonOptions: {
          ...jsonData.jsonOptions,
          schema: schema,
        },
      };
    }
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/batch/scrape`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        const id: string = response.data.id;
        return this.monitorJobStatus(id, headers, pollInterval);
      } else {
        this.handleError(response, "start batch scrape job");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  async asyncBatchScrapeUrls(
    urls: string[],
    params?: ScrapeParams,
    idempotencyKey?: string,
    webhook?: CrawlParams["webhook"],
    ignoreInvalidURLs?: boolean,
  ): Promise<BatchScrapeResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/batch/scrape`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start batch scrape job");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
   * @param urls - The URLs to scrape.
   * @param params - Additional parameters for the scrape request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns A CrawlWatcher instance to monitor the batch scrape job.
   */
  async batchScrapeUrlsAndWatch(
    urls: string[],
    params?: ScrapeParams,
    idempotencyKey?: string,
    webhook?: CrawlParams["webhook"],
    ignoreInvalidURLs?: boolean,
  ) {
    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
    if (crawl.success && crawl.id) {
      const id = crawl.id;
      return new CrawlWatcher(id, this);
    }
    throw new FirecrawlError("Batch scrape job failed to start", 400);
  }
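
  // Usage sketch (illustrative, not part of the SDK source): batch scraping a
  // few hypothetical URLs and waiting for completion.
  //
  //   const batch = await app.batchScrapeUrls(
  //     ["https://firecrawl.dev", "https://docs.firecrawl.dev"],
  //     { formats: ["markdown"] },
  //     2,
  //   );
  //   if (batch.success && batch.status === "completed") {
  //     console.log(`${batch.completed}/${batch.total} pages scraped`);
  //   }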

  /**
   * Checks the status of a batch scrape job using the Firecrawl API.
   * @param id - The ID of the batch scrape operation.
   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
   * @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
   * @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
   * @param limit - How many entries to return. Only used when `getAllData = false`.
   * @returns The response containing the job status.
   */
  async checkBatchScrapeStatus(id?: string, getAllData = false, nextURL?: string, skip?: number, limit?: number): Promise<BatchScrapeStatusResponse | ErrorResponse> {
    if (!id) {
      throw new FirecrawlError("No batch scrape ID provided", 400);
    }
    const headers: AxiosRequestHeaders = this.prepareHeaders();
    const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/batch/scrape/${id}`);
    if (skip !== undefined) {
      targetURL.searchParams.set("skip", skip.toString());
    }
    if (limit !== undefined) {
      targetURL.searchParams.set("limit", limit.toString());
    }
    try {
      const response: AxiosResponse = await this.getRequest(
        targetURL.href,
        headers
      );
      if (response.status === 200) {
        let allData = response.data.data;
        if (getAllData && response.data.status === "completed") {
          let statusData = response.data;
          if ("data" in statusData) {
            let data = statusData.data;
            while (typeof statusData === 'object' && 'next' in statusData) {
              if (data.length === 0) {
                break;
              }
              statusData = (await this.getRequest(statusData.next, headers)).data;
              data = data.concat(statusData.data);
            }
            allData = data;
          }
        }
        let resp: BatchScrapeStatusResponse | ErrorResponse = {
          success: response.data.success,
          status: response.data.status,
          total: response.data.total,
          completed: response.data.completed,
          creditsUsed: response.data.creditsUsed,
          next: getAllData ? undefined : response.data.next,
          expiresAt: new Date(response.data.expiresAt),
          data: allData
        };
        if (!response.data.success && response.data.error) {
          resp = {
            ...resp,
            success: false,
            error: response.data.error
          } as ErrorResponse;
        }
        if (response.data.next) {
          (resp as BatchScrapeStatusResponse).next = response.data.next;
        }
        return resp;
      } else {
        this.handleError(response, "check batch scrape status");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Returns information about batch scrape errors.
   * @param id - The ID of the batch scrape operation.
   * @returns Information about batch scrape errors.
   */
  async checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.deleteRequest(
        `${this.apiUrl}/v1/batch/scrape/${id}/errors`,
        headers
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "check batch scrape errors");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }
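
  // Usage sketch (illustrative, not part of the SDK source): inspecting
  // failures after a batch scrape. The `batchId` value is hypothetical.
  //
  //   const errs = await app.checkBatchScrapeErrors(batchId);
  //   if ("errors" in errs) {
  //     for (const e of errs.errors) console.error(e.url, e.error);
  //     console.warn("robots.txt blocked:", errs.robotsBlocked.length);
  //   }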

  /**
   * Extracts information from URLs using the Firecrawl API.
   * Currently in Beta. Expect breaking changes on future minor versions.
   * @param urls - The URLs to extract information from. Optional if using other methods for data extraction.
   * @param params - Additional parameters for the extract request.
   * @returns The response from the extract operation.
   */
  async extract<T extends zt.ZodSchema = any>(urls?: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
    const headers = this.prepareHeaders();
    let jsonData: { urls?: string[] } & ExtractParams<T> = { urls: urls, ...params };
    let jsonSchema: any;
    try {
      if (!params?.schema) {
        jsonSchema = undefined;
      } else {
        try {
          // Try parsing the schema as a Zod schema
          jsonSchema = zodToJsonSchema(params.schema as zt.ZodType);
        } catch (_) {
          // Not a Zod schema; assume it is already a plain JSON schema object.
          jsonSchema = params.schema;
        }
      }
    } catch (error: any) {
      throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
    }
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/extract`,
        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
        headers
      );
      if (response.status === 200) {
        const jobId = response.data.id;
        let extractStatus;
        do {
          const statusResponse: AxiosResponse = await this.getRequest(
            `${this.apiUrl}/v1/extract/${jobId}`,
            headers
          );
          extractStatus = statusResponse.data;
          if (extractStatus.status === "completed") {
            if (extractStatus.success) {
              return {
                success: true,
                data: extractStatus.data,
                warning: extractStatus.warning,
                error: extractStatus.error,
                sources: extractStatus?.sources || undefined,
              };
            } else {
              throw new FirecrawlError(`Failed to extract data. Error: ${extractStatus.error}`, statusResponse.status);
            }
          } else if (extractStatus.status === "failed" || extractStatus.status === "cancelled") {
            throw new FirecrawlError(`Extract job ${extractStatus.status}. Error: ${extractStatus.error}`, statusResponse.status);
          }
          await new Promise(resolve => setTimeout(resolve, 1000)); // Polling interval
        } while (extractStatus.status !== "completed");
      } else {
        this.handleError(response, "extract");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Initiates an asynchronous extract job for one or more URLs using the Firecrawl API.
   * @param urls - The URLs to extract data from.
   * @param params - Additional parameters for the extract request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns The response from the extract operation.
   */
  async asyncExtract(
    urls: string[],
    params?: ExtractParams,
    idempotencyKey?: string
  ): Promise<ExtractResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: any = { urls, ...params };
    let jsonSchema: any;
    try {
      if (!params?.schema) {
        jsonSchema = undefined;
      } else {
        try {
          // Try parsing the schema as a Zod schema
          jsonSchema = zodToJsonSchema(params.schema as zt.ZodType);
        } catch (_) {
          // Not a Zod schema; assume it is already a plain JSON schema object.
          jsonSchema = params.schema;
        }
      }
    } catch (error: any) {
      throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
    }
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/extract`,
        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
        headers
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start extract job");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
    }
    return { success: false, error: "Internal server error." };
  }
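
  // Usage sketch (illustrative, not part of the SDK source): schema-guided
  // extraction across pages. The schema shape and URL are hypothetical.
  //
  //   import { z } from "zod";
  //
  //   const schema = z.object({
  //     pricing: z.array(z.object({ plan: z.string(), usd: z.number() })),
  //   });
  //   const out = await app.extract(["https://firecrawl.dev/pricing"], {
  //     prompt: "Extract the pricing plans",
  //     schema,
  //   });
  //   if (out.success) console.log(out.data);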

  /**
   * Retrieves the status of an extract job.
   * @param jobId - The ID of the extract job.
   * @returns The status of the extract job.
   */
  async getExtractStatus(jobId: string): Promise<any> {
    try {
      const response: AxiosResponse = await this.getRequest(
        `${this.apiUrl}/v1/extract/${jobId}`,
        this.prepareHeaders()
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "get extract status");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
  }

  /**
   * Prepares the headers for an API request.
   * @param idempotencyKey - Optional key to ensure idempotency.
   * @returns The prepared headers.
   */
  prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
    return {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.apiKey}`,
      ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
    } as AxiosRequestHeaders & { "x-idempotency-key"?: string };
  }

  /**
   * Sends a POST request to the specified URL.
   * @param url - The URL to send the request to.
   * @param data - The data to send in the request.
   * @param headers - The headers for the request.
   * @returns The response from the POST request.
   */
  postRequest(
    url: string,
    data: any,
    headers: AxiosRequestHeaders
  ): Promise<AxiosResponse> {
    return axios.post(url, data, { headers, timeout: (data?.timeout ? (data.timeout + 5000) : undefined) });
  }

  /**
   * Sends a GET request to the specified URL.
   * @param url - The URL to send the request to.
   * @param headers - The headers for the request.
   * @returns The response from the GET request.
   */
  async getRequest(
    url: string,
    headers: AxiosRequestHeaders
  ): Promise<AxiosResponse> {
    try {
      return await axios.get(url, { headers });
    } catch (error) {
      if (error instanceof AxiosError && error.response) {
        return error.response as AxiosResponse;
      } else {
        throw error;
      }
    }
  }

  /**
   * Sends a DELETE request to the specified URL.
   * @param url - The URL to send the request to.
   * @param headers - The headers for the request.
   * @returns The response from the DELETE request.
   */
  async deleteRequest(
    url: string,
    headers: AxiosRequestHeaders
  ): Promise<AxiosResponse> {
    try {
      return await axios.delete(url, { headers });
    } catch (error) {
      if (error instanceof AxiosError && error.response) {
        return error.response as AxiosResponse;
      } else {
        throw error;
      }
    }
  }
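
  // Usage sketch (illustrative, not part of the SDK source): fire-and-forget
  // extract with manual status polling. The declared ExtractResponse type does
  // not expose `id`, so the job object is typed `any` here; names are
  // hypothetical.
  //
  //   const job: any = await app.asyncExtract(["https://firecrawl.dev"], { prompt: "Summarize" });
  //   if (job.success && job.id) {
  //     const status = await app.getExtractStatus(job.id);
  //     console.log(status.status); // "processing" | "completed" | "failed"
  //   }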

  /**
   * Monitors the status of a crawl job until completion or failure.
   * @param id - The ID of the crawl operation.
   * @param headers - The headers for the request.
   * @param checkInterval - Interval in seconds for job status checks.
   * @returns The final job status or data.
   */
  async monitorJobStatus(
    id: string,
    headers: AxiosRequestHeaders,
    checkInterval: number
  ): Promise<CrawlStatusResponse | ErrorResponse> {
    let failedTries = 0;
    let networkRetries = 0;
    const maxNetworkRetries = 3;
    while (true) {
      try {
        let statusResponse: AxiosResponse = await this.getRequest(
          `${this.apiUrl}/v1/crawl/${id}`,
          headers
        );
        if (statusResponse.status === 200) {
          failedTries = 0;
          networkRetries = 0;
          let statusData = statusResponse.data;
          if (statusData.status === "completed") {
            if ("data" in statusData) {
              let data = statusData.data;
              while (typeof statusData === 'object' && 'next' in statusData) {
                if (data.length === 0) {
                  break;
                }
                statusResponse = await this.getRequest(statusData.next, headers);
                statusData = statusResponse.data;
                data = data.concat(statusData.data);
              }
              statusData.data = data;
              return statusData;
            } else {
              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
            }
          } else if (
            ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
          ) {
            checkInterval = Math.max(checkInterval, 2);
            await new Promise((resolve) =>
              setTimeout(resolve, checkInterval * 1000)
            );
          } else {
            throw new FirecrawlError(
              `Crawl job failed or was stopped. Status: ${statusData.status}`,
              500
            );
          }
        } else {
          failedTries++;
          if (failedTries >= 3) {
            this.handleError(statusResponse, "check crawl status");
          }
        }
      } catch (error: any) {
        if (this.isRetryableError(error) && networkRetries < maxNetworkRetries) {
          networkRetries++;
          const backoffDelay = Math.min(1000 * Math.pow(2, networkRetries - 1), 10000);
          await new Promise((resolve) => setTimeout(resolve, backoffDelay));
          continue;
        }
        throw new FirecrawlError(error, 500);
      }
    }
  }

  /**
   * Determines if an error is retryable (transient network error)
   * @param error - The error to check
   * @returns True if the error should be retried
   */
  private isRetryableError(error: any): boolean {
    if (error instanceof AxiosError) {
      if (!error.response) {
        const code = error.code;
        const message = error.message?.toLowerCase() || '';
        return (
          code === 'ECONNRESET' ||
          code === 'ETIMEDOUT' ||
          code === 'ENOTFOUND' ||
          code === 'ECONNREFUSED' ||
          message.includes('socket hang up') ||
          message.includes('network error') ||
          message.includes('timeout')
        );
      }
      if (error.response?.status === 408 || error.response?.status === 504) {
        return true;
      }
    }
    if (error && typeof error === 'object') {
      const code = error.code;
      const message = error.message?.toLowerCase() || '';
      if (code === 'ECONNRESET' || code === 'ETIMEDOUT' || code === 'ENOTFOUND' || code === 'ECONNREFUSED' ||
          message.includes('socket hang up') || message.includes('network error') || message.includes('timeout')) {
        return true;
      }
      if (error.response?.status === 408 || error.response?.status === 504) {
        return true;
      }
    }
    return false;
  }

  /**
   * Handles errors from API responses.
   * @param {AxiosResponse} response - The response from the API.
   * @param {string} action - The action being performed when the error occurred.
   */
  handleError(response: AxiosResponse, action: string): void {
    if (!response) {
      throw new FirecrawlError(
        `No response received while trying to ${action}. This may be a network error or the server is unreachable.`,
        0
      );
    }
    if ([400, 402, 403, 408, 409, 500].includes(response.status)) {
      const errorMessage: string = response.data.error || "Unknown error occurred";
      const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : '';
      throw new FirecrawlError(
        `Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}${details}`,
        response.status,
        response?.data?.details
      );
    } else {
      throw new FirecrawlError(
        `Unexpected error occurred while trying to ${action}. Status code: ${response.status}`,
        response.status
      );
    }
  }

  /**
   * Initiates a deep research operation on a given query and polls until completion.
   * @param query - The query to research.
   * @param params - Parameters for the deep research operation.
   * @param onActivity - Optional callback to receive activity updates in real-time.
   * @param onSource - Optional callback to receive source updates in real-time.
   * @returns The final research results.
   */
  async deepResearch(
    query: string,
    params: DeepResearchParams<zt.ZodSchema>,
    onActivity?: (activity: {
      type: string;
      status: string;
      message: string;
      timestamp: string;
      depth: number;
    }) => void,
    onSource?: (source: {
      url: string;
      title?: string;
      description?: string;
      icon?: string;
    }) => void
  ): Promise<DeepResearchStatusResponse | ErrorResponse> {
    try {
      const response = await this.asyncDeepResearch(query, params);
      if (!response.success || 'error' in response) {
        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
      }
      if (!response.id) {
        throw new FirecrawlError(`Failed to start research. No job ID returned.`, 500);
      }
      const jobId = response.id;
      let researchStatus;
      let lastActivityCount = 0;
      let lastSourceCount = 0;
      while (true) {
        researchStatus = await this.checkDeepResearchStatus(jobId);
        if ('error' in researchStatus && !resea