// @mendable/firecrawl-js: bundled TypeScript type declarations (via UNPKG)
import * as zt from 'zod';
import { ZodTypeAny } from 'zod';
import { AxiosResponse, AxiosRequestHeaders } from 'axios';
import { EventEmitter } from 'events';
import { TypedEventTarget } from 'typescript-event-target';

type FormatString = 'markdown' | 'html' | 'rawHtml' | 'links' | 'images' | 'screenshot' | 'summary' | 'changeTracking' | 'json' | 'attributes' | 'branding';

interface Viewport { width: number; height: number; }
interface Format { type: FormatString; }
interface JsonFormat extends Format {
  type: 'json';
  prompt?: string;
  schema?: Record<string, unknown> | ZodTypeAny;
}
interface ScreenshotFormat {
  type: 'screenshot';
  fullPage?: boolean;
  quality?: number;
  viewport?: Viewport | { width: number; height: number; };
}
interface ChangeTrackingFormat extends Format {
  type: 'changeTracking';
  modes: ('git-diff' | 'json')[];
  schema?: Record<string, unknown>;
  prompt?: string;
  tag?: string;
}
interface AttributesFormat extends Format {
  type: 'attributes';
  selectors: Array<{ selector: string; attribute: string; }>;
}
type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat;

interface LocationConfig$1 { country?: string; languages?: string[]; }

interface WaitAction { type: 'wait'; milliseconds?: number; selector?: string; }
interface ScreenshotAction { type: 'screenshot'; fullPage?: boolean; quality?: number; viewport?: Viewport | { width: number; height: number; }; }
interface ClickAction { type: 'click'; selector: string; }
interface WriteAction { type: 'write'; text: string; }
interface PressAction { type: 'press'; key: string; }
interface ScrollAction { type: 'scroll'; direction: 'up' | 'down'; selector?: string; }
interface ScrapeAction { type: 'scrape'; }
interface ExecuteJavascriptAction { type: 'executeJavascript'; script: string; }
interface PDFAction { type: 'pdf'; format?: 'A0' | 'A1' | 'A2' | 'A3' | 'A4' | 'A5' | 'A6' | 'Letter' | 'Legal' | 'Tabloid' | 'Ledger'; landscape?: boolean; scale?: number; }
type ActionOption = WaitAction | ScreenshotAction | ClickAction | WriteAction | PressAction | ScrollAction | ScrapeAction | ExecuteJavascriptAction | PDFAction;

interface ScrapeOptions {
  formats?: FormatOption[];
  headers?: Record<string, string>;
  includeTags?: string[];
  excludeTags?: string[];
  onlyMainContent?: boolean;
  timeout?: number;
  waitFor?: number;
  mobile?: boolean;
  parsers?: Array<string | { type: 'pdf'; maxPages?: number; }>;
  actions?: ActionOption[];
  location?: LocationConfig$1;
  skipTlsVerification?: boolean;
  removeBase64Images?: boolean;
  fastMode?: boolean;
  useMock?: string;
  blockAds?: boolean;
  proxy?: 'basic' | 'stealth' | 'auto' | string;
  maxAge?: number;
  minAge?: number;
  storeInCache?: boolean;
  integration?: string;
}

interface WebhookConfig {
  url: string;
  headers?: Record<string, string>;
  metadata?: Record<string, string>;
  events?: Array<'completed' | 'failed' | 'page' | 'started'>;
}
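/*
 * Usage sketch (illustrative, not part of the declarations): building a
 * ScrapeOptions value that combines several formats with page actions. The
 * URL selector and zod schema below are made-up examples.
 *
 *   import { z } from 'zod';
 *
 *   const options: ScrapeOptions = {
 *     formats: [
 *       'markdown',
 *       { type: 'json', prompt: 'Extract the product name and price', schema: z.object({ name: z.string(), price: z.number() }) },
 *       { type: 'screenshot', fullPage: true, quality: 80 },
 *     ],
 *     actions: [
 *       { type: 'wait', milliseconds: 500 },
 *       { type: 'click', selector: '#load-more' },
 *       { type: 'scroll', direction: 'down' },
 *     ],
 *     onlyMainContent: true,
 *     location: { country: 'US', languages: ['en'] },
 *   };
 */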
interface BrandingProfile {
  colorScheme?: 'light' | 'dark';
  logo?: string | null;
  fonts?: Array<{ family: string; [key: string]: unknown; }>;
  colors?: {
    primary?: string;
    secondary?: string;
    accent?: string;
    background?: string;
    textPrimary?: string;
    textSecondary?: string;
    link?: string;
    success?: string;
    warning?: string;
    error?: string;
    [key: string]: string | undefined;
  };
  typography?: {
    fontFamilies?: { primary?: string; heading?: string; code?: string; [key: string]: string | undefined; };
    fontStacks?: { primary?: string[]; heading?: string[]; body?: string[]; paragraph?: string[]; [key: string]: string[] | undefined; };
    fontSizes?: { h1?: string; h2?: string; h3?: string; body?: string; small?: string; [key: string]: string | undefined; };
    lineHeights?: { heading?: number; body?: number; [key: string]: number | undefined; };
    fontWeights?: { light?: number; regular?: number; medium?: number; bold?: number; [key: string]: number | undefined; };
  };
  spacing?: {
    baseUnit?: number;
    padding?: Record<string, number>;
    margins?: Record<string, number>;
    gridGutter?: number;
    borderRadius?: string;
    [key: string]: number | string | Record<string, number> | undefined;
  };
  components?: {
    buttonPrimary?: { background?: string; textColor?: string; borderColor?: string; borderRadius?: string; [key: string]: string | undefined; };
    buttonSecondary?: { background?: string; textColor?: string; borderColor?: string; borderRadius?: string; [key: string]: string | undefined; };
    input?: { borderColor?: string; focusBorderColor?: string; borderRadius?: string; [key: string]: string | undefined; };
    [key: string]: unknown;
  };
  icons?: { style?: string; primaryColor?: string; [key: string]: string | undefined; };
  images?: { logo?: string | null; favicon?: string | null; ogImage?: string | null; [key: string]: string | null | undefined; };
  animations?: { transitionDuration?: string; easing?: string; [key: string]: string | undefined; };
  layout?: {
    grid?: { columns?: number; maxWidth?: string; [key: string]: number | string | undefined; };
    headerHeight?: string;
    footerHeight?: string;
    [key: string]: number | string | Record<string, number | string | undefined> | undefined;
  };
  tone?: { voice?: string; emojiUsage?: string; [key: string]: string | undefined; };
  personality?: {
    tone: 'professional' | 'playful' | 'modern' | 'traditional' | 'minimalist' | 'bold';
    energy: 'low' | 'medium' | 'high';
    targetAudience: string;
  };
  [key: string]: unknown;
}

interface DocumentMetadata {
  title?: string;
  description?: string;
  url?: string;
  language?: string;
  keywords?: string | string[];
  robots?: string;
  ogTitle?: string;
  ogDescription?: string;
  ogUrl?: string;
  ogImage?: string;
  ogAudio?: string;
  ogDeterminer?: string;
  ogLocale?: string;
  ogLocaleAlternate?: string[];
  ogSiteName?: string;
  ogVideo?: string;
  favicon?: string;
  dcTermsCreated?: string;
  dcDateCreated?: string;
  dcDate?: string;
  dcTermsType?: string;
  dcType?: string;
  dcTermsAudience?: string;
  dcTermsSubject?: string;
  dcSubject?: string;
  dcDescription?: string;
  dcTermsKeywords?: string;
  modifiedTime?: string;
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;
  sourceURL?: string;
  statusCode?: number;
  scrapeId?: string;
  numPages?: number;
  contentType?: string;
  timezone?: string;
  proxyUsed?: 'basic' | 'stealth';
  cacheState?: 'hit' | 'miss';
  cachedAt?: string;
  creditsUsed?: number;
  concurrencyLimited?: boolean;
  concurrencyQueueDurationMs?: number;
  error?: string;
  [key: string]: unknown;
}

interface Document {
  markdown?: string;
  html?: string;
  rawHtml?: string;
  json?: unknown;
  summary?: string;
  metadata?: DocumentMetadata;
  links?: string[];
  images?: string[];
  screenshot?: string;
  attributes?: Array<{ selector: string; attribute: string; values: string[]; }>;
  actions?: Record<string, unknown>;
  warning?: string;
  changeTracking?: Record<string, unknown>;
  branding?: BrandingProfile;
}

interface PaginationConfig {
  /** When true (default), automatically follow `next` links and aggregate all documents. */
  autoPaginate?: boolean;
  /** Maximum number of additional pages to fetch after the first response. */
  maxPages?: number;
  /** Maximum total number of documents to return across all pages. */
  maxResults?: number;
  /** Maximum time to spend fetching additional pages (in seconds). */
  maxWaitTime?: number;
}
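/*
 * Usage sketch (illustrative): controlling pagination when reading job status.
 * `client` is a FirecrawlClient (declared further down) and `jobId` is a
 * made-up crawl job id.
 *
 *   // First page only; follow `next` yourself.
 *   const firstPage = await client.getCrawlStatus(jobId, { autoPaginate: false });
 *
 *   // Auto-paginate, but cap the work done.
 *   const capped = await client.getCrawlStatus(jobId, { maxPages: 2, maxResults: 100, maxWaitTime: 30 });
 */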
interface SearchResultWeb { url: string; title?: string; description?: string; category?: string; }
interface SearchResultNews {
  title?: string;
  url?: string;
  snippet?: string;
  date?: string;
  imageUrl?: string;
  position?: number;
  category?: string;
}
interface SearchResultImages {
  title?: string;
  imageUrl?: string;
  imageWidth?: number;
  imageHeight?: number;
  url?: string;
  position?: number;
}
interface SearchData {
  web?: Array<SearchResultWeb | Document>;
  news?: Array<SearchResultNews | Document>;
  images?: Array<SearchResultImages | Document>;
}
interface CategoryOption { type: 'github' | 'research' | 'pdf'; }
interface SearchRequest {
  query: string;
  sources?: Array<'web' | 'news' | 'images' | { type: 'web' | 'news' | 'images'; }>;
  categories?: Array<'github' | 'research' | 'pdf' | CategoryOption>;
  limit?: number;
  tbs?: string;
  location?: string;
  ignoreInvalidURLs?: boolean;
  timeout?: number;
  scrapeOptions?: ScrapeOptions;
  integration?: string;
}

interface CrawlOptions {
  prompt?: string | null;
  excludePaths?: string[] | null;
  includePaths?: string[] | null;
  maxDiscoveryDepth?: number | null;
  sitemap?: 'skip' | 'include';
  ignoreQueryParameters?: boolean;
  limit?: number | null;
  crawlEntireDomain?: boolean;
  allowExternalLinks?: boolean;
  allowSubdomains?: boolean;
  delay?: number | null;
  maxConcurrency?: number | null;
  webhook?: string | WebhookConfig | null;
  scrapeOptions?: ScrapeOptions | null;
  zeroDataRetention?: boolean;
  integration?: string;
}
interface CrawlResponse$1 { id: string; url: string; }
interface CrawlJob {
  id: string;
  status: 'scraping' | 'completed' | 'failed' | 'cancelled';
  total: number;
  completed: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}

interface BatchScrapeOptions {
  options?: ScrapeOptions;
  webhook?: string | WebhookConfig;
  appendToId?: string;
  ignoreInvalidURLs?: boolean;
  maxConcurrency?: number;
  zeroDataRetention?: boolean;
  idempotencyKey?: string;
  integration?: string;
}
interface BatchScrapeResponse$1 { id: string; url: string; invalidURLs?: string[]; }
interface BatchScrapeJob {
  id: string;
  status: 'scraping' | 'completed' | 'failed' | 'cancelled';
  completed: number;
  total: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}

interface MapData { links: SearchResultWeb[]; }
interface MapOptions {
  search?: string;
  sitemap?: 'only' | 'include' | 'skip';
  includeSubdomains?: boolean;
  ignoreQueryParameters?: boolean;
  limit?: number;
  timeout?: number;
  integration?: string;
  location?: LocationConfig$1;
}

interface ExtractResponse$1 {
  success?: boolean;
  id?: string;
  status?: 'processing' | 'completed' | 'failed' | 'cancelled';
  data?: unknown;
  error?: string;
  warning?: string;
  sources?: Record<string, unknown>;
  expiresAt?: string;
  creditsUsed?: number;
}
interface AgentResponse { success: boolean; id: string; error?: string; }
interface AgentStatusResponse {
  success: boolean;
  status: 'processing' | 'completed' | 'failed';
  error?: string;
  data?: unknown;
  expiresAt: string;
  creditsUsed?: number;
}
interface AgentOptions$1 { model: 'FIRE-1' | 'v3-beta'; }

interface ConcurrencyCheck { concurrency: number; maxConcurrency: number; }
interface CreditUsage {
  remainingCredits: number;
  planCredits?: number;
  billingPeriodStart?: string | null;
  billingPeriodEnd?: string | null;
}
interface TokenUsage {
  remainingTokens: number;
  planTokens?: number;
  billingPeriodStart?: string | null;
  billingPeriodEnd?: string | null;
}
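/*
 * Usage sketch (illustrative): typed request objects for search and crawl.
 * `client` is a FirecrawlClient (declared further down); the query, paths,
 * and webhook URL are examples only.
 *
 *   const results = await client.search('firecrawl sdk', {
 *     sources: ['web', 'news'],
 *     limit: 5,
 *     scrapeOptions: { formats: ['markdown'] },
 *   });
 *
 *   const crawlOpts: CrawlOptions = {
 *     includePaths: ['/docs/.*'],
 *     limit: 50,
 *     scrapeOptions: { formats: ['markdown', 'links'] },
 *     webhook: { url: 'https://example.com/hooks/firecrawl', events: ['completed', 'page'] },
 *   };
 */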
interface CreditUsageHistoricalPeriod {
  startDate: string | null;
  endDate: string | null;
  apiKey?: string;
  creditsUsed: number;
}
interface CreditUsageHistoricalResponse { success: boolean; periods: CreditUsageHistoricalPeriod[]; }
interface TokenUsageHistoricalPeriod {
  startDate: string | null;
  endDate: string | null;
  apiKey?: string;
  tokensUsed: number;
}
interface TokenUsageHistoricalResponse { success: boolean; periods: TokenUsageHistoricalPeriod[]; }

interface CrawlErrorsResponse$1 {
  errors: {
    id: string;
    timestamp?: string;
    url: string;
    code?: string;
    error: string;
  }[];
  robotsBlocked: string[];
}

interface ActiveCrawl {
  id: string;
  teamId: string;
  url: string;
  options?: Record<string, unknown> | null;
}
interface ActiveCrawlsResponse { success: boolean; crawls: ActiveCrawl[]; }

interface ErrorDetails {
  code?: string;
  message: string;
  details?: Record<string, unknown>;
  status?: number;
}
declare class SdkError extends Error {
  status?: number;
  code?: string;
  details?: unknown;
  jobId?: string;
  constructor(message: string, status?: number, code?: string, details?: unknown, jobId?: string);
}
declare class JobTimeoutError extends SdkError {
  timeoutSeconds: number;
  constructor(jobId: string, timeoutSeconds: number, jobType?: 'batch' | 'crawl');
}

interface QueueStatusResponse$1 {
  success: boolean;
  jobsInQueue: number;
  activeJobsInQueue: number;
  waitingJobsInQueue: number;
  maxConcurrency: number;
  mostRecentSuccess: string | null;
}

interface HttpClientOptions {
  apiKey: string;
  apiUrl: string;
  timeoutMs?: number;
  maxRetries?: number;
  backoffFactor?: number;
}
declare class HttpClient {
  private instance;
  private readonly apiKey;
  private readonly apiUrl;
  private readonly maxRetries;
  private readonly backoffFactor;
  constructor(options: HttpClientOptions);
  getApiUrl(): string;
  getApiKey(): string;
  private request;
  private sleep;
  post<T = any>(endpoint: string, body: Record<string, unknown>, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
  get<T = any>(endpoint: string, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
  delete<T = any>(endpoint: string, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
  prepareHeaders(idempotencyKey?: string): Record<string, string>;
}

declare function prepareExtractPayload(args: {
  urls?: string[];
  prompt?: string;
  schema?: Record<string, unknown> | ZodTypeAny;
  systemPrompt?: string;
  allowExternalLinks?: boolean;
  enableWebSearch?: boolean;
  showSources?: boolean;
  scrapeOptions?: ScrapeOptions;
  ignoreInvalidURLs?: boolean;
  integration?: string;
  agent?: AgentOptions$1;
}): Record<string, unknown>;
declare function startExtract(http: HttpClient, args: Parameters<typeof prepareExtractPayload>[0]): Promise<ExtractResponse$1>;
declare function prepareAgentPayload(args: {
  urls?: string[];
  prompt: string;
  schema?: Record<string, unknown> | ZodTypeAny;
  integration?: string;
  maxCredits?: number;
  strictConstrainToURLs?: boolean;
}): Record<string, unknown>;
declare function startAgent(http: HttpClient, args: Parameters<typeof prepareAgentPayload>[0]): Promise<AgentResponse>;

type JobKind = "crawl" | "batch";
interface WatcherOptions { kind?: JobKind; pollInterval?: number; timeout?: number; }
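/*
 * Usage sketch (illustrative): handling SDK errors. SdkError carries the HTTP
 * status and API error code; JobTimeoutError's jobType of 'batch' | 'crawl'
 * suggests it signals a waiter/polling timeout, though that is an assumption
 * from its shape. `client` is a FirecrawlClient (declared further down).
 *
 *   try {
 *     const doc = await client.scrape('https://example.com');
 *   } catch (err) {
 *     if (err instanceof JobTimeoutError) {
 *       console.error(`job ${err.jobId} timed out after ${err.timeoutSeconds}s`);
 *     } else if (err instanceof SdkError) {
 *       console.error(err.status, err.code, err.message);
 *     } else {
 *       throw err;
 *     }
 *   }
 */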
declare class Watcher extends EventEmitter {
  private readonly http;
  private readonly jobId;
  private readonly kind;
  private readonly pollInterval;
  private readonly timeout?;
  private ws?;
  private closed;
  private readonly emittedDocumentKeys;
  constructor(http: HttpClient, jobId: string, opts?: WatcherOptions);
  private buildWsUrl;
  start(): Promise<void>;
  private attachWsHandlers;
  private documentKey;
  private emitDocuments;
  private emitSnapshot;
  private pollLoop;
  close(): void;
}

type ExtractJsonSchemaFromFormats<Formats> = Formats extends readonly any[] ? Extract<Formats[number], {
  type: "json";
  schema?: unknown;
}>["schema"] : never;
type InferredJsonFromOptions<Opts> = Opts extends { formats?: infer Fmts; } ? ExtractJsonSchemaFromFormats<Fmts> extends zt.ZodTypeAny ? zt.infer<ExtractJsonSchemaFromFormats<Fmts>> : unknown : unknown;

/**
 * Configuration for the v2 client transport.
 */
interface FirecrawlClientOptions {
  /** API key (falls back to FIRECRAWL_API_KEY). */
  apiKey?: string | null;
  /** API base URL (falls back to FIRECRAWL_API_URL or https://api.firecrawl.dev). */
  apiUrl?: string | null;
  /** Per-request timeout in milliseconds (optional). */
  timeoutMs?: number;
  /** Max automatic retries for transient failures (optional). */
  maxRetries?: number;
  /** Exponential backoff factor for retries (optional). */
  backoffFactor?: number;
}

/**
 * Firecrawl v2 client. Provides typed access to all v2 endpoints and utilities.
 */
declare class FirecrawlClient {
  private readonly http;
  private isCloudService;
  /**
   * Create a v2 client.
   * @param options Transport configuration (API key, base URL, timeouts, retries).
   */
  constructor(options?: FirecrawlClientOptions);
  /**
   * Scrape a single URL.
   * @param url Target URL.
   * @param options Optional scrape options (formats, headers, etc.).
   * @returns Resolved document with requested formats.
   */
  scrape<Opts extends ScrapeOptions>(url: string, options: Opts): Promise<Omit<Document, "json"> & {
    json?: InferredJsonFromOptions<Opts>;
  }>;
  scrape(url: string, options?: ScrapeOptions): Promise<Document>;
  /**
   * Search the web and optionally scrape each result.
   * @param query Search query string.
   * @param req Additional search options (sources, limit, scrapeOptions, etc.).
   * @returns Structured search results.
   */
  search(query: string, req?: Omit<SearchRequest, "query">): Promise<SearchData>;
  /**
   * Map a site to discover URLs (sitemap-aware).
   * @param url Root URL to map.
   * @param options Mapping options (sitemap mode, includeSubdomains, limit, timeout).
   * @returns Discovered links.
   */
  map(url: string, options?: MapOptions): Promise<MapData>;
  /**
   * Start a crawl job (async).
   * @param url Root URL to crawl.
   * @param req Crawl configuration (paths, limits, scrapeOptions, webhook, etc.).
   * @returns Job id and url.
   */
  startCrawl(url: string, req?: CrawlOptions): Promise<CrawlResponse$1>;
  /**
   * Get the status and partial data of a crawl job.
   * @param jobId Crawl job id.
   */
  getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
  /**
   * Cancel a crawl job.
   * @param jobId Crawl job id.
   * @returns True if cancelled.
   */
  cancelCrawl(jobId: string): Promise<boolean>;
  /**
   * Convenience waiter: start a crawl and poll until it finishes.
   * @param url Root URL to crawl.
   * @param req Crawl configuration plus waiter controls (pollInterval, timeout seconds).
   * @returns Final job snapshot.
   */
  crawl(url: string, req?: CrawlOptions & { pollInterval?: number; timeout?: number; }): Promise<CrawlJob>;
  /**
   * Retrieve crawl errors and robots.txt blocks.
   * @param crawlId Crawl job id.
   */
  getCrawlErrors(crawlId: string): Promise<CrawlErrorsResponse$1>;
  /** List active crawls for the authenticated team. */
  getActiveCrawls(): Promise<ActiveCrawlsResponse>;
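  /*
   * Usage sketch (illustrative): scraping with JSON-format type inference, then
   * crawling with the convenience waiter. The zod schema and URLs are examples.
   *
   *   const client = new FirecrawlClient({ apiKey: process.env.FIRECRAWL_API_KEY });
   *
   *   const doc = await client.scrape('https://example.com', {
   *     formats: [{ type: 'json', schema: z.object({ title: z.string() }) }],
   *   });
   *   doc.json?.title; // inferred from the zod schema via InferredJsonFromOptions
   *
   *   const job = await client.crawl('https://example.com', { limit: 10, pollInterval: 2, timeout: 120 });
   *   console.log(job.status, job.data.length);
   */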
  /**
   * Preview normalized crawl parameters produced by a natural-language prompt.
   * @param url Root URL.
   * @param prompt Natural-language instruction.
   */
  crawlParamsPreview(url: string, prompt: string): Promise<Record<string, unknown>>;
  /**
   * Start a batch scrape job for multiple URLs (async).
   * @param urls URLs to scrape.
   * @param opts Batch options (scrape options, webhook, concurrency, idempotency key, etc.).
   * @returns Job id and url.
   */
  startBatchScrape(urls: string[], opts?: BatchScrapeOptions): Promise<BatchScrapeResponse$1>;
  /**
   * Get the status and partial data of a batch scrape job.
   * @param jobId Batch job id.
   */
  getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
  /**
   * Retrieve batch scrape errors and robots.txt blocks.
   * @param jobId Batch job id.
   */
  getBatchScrapeErrors(jobId: string): Promise<CrawlErrorsResponse$1>;
  /**
   * Cancel a batch scrape job.
   * @param jobId Batch job id.
   * @returns True if cancelled.
   */
  cancelBatchScrape(jobId: string): Promise<boolean>;
  /**
   * Convenience waiter: start a batch scrape and poll until it finishes.
   * @param urls URLs to scrape.
   * @param opts Batch options plus waiter controls (pollInterval, timeout seconds).
   * @returns Final job snapshot.
   */
  batchScrape(urls: string[], opts?: BatchScrapeOptions & { pollInterval?: number; timeout?: number; }): Promise<BatchScrapeJob>;
  /**
   * Start an extract job (async).
   * @param args Extraction request (urls, schema or prompt, flags).
   * @returns Job id or processing state.
   */
  startExtract(args: Parameters<typeof startExtract>[1]): Promise<ExtractResponse$1>;
  /**
   * Get extract job status/data.
   * @param jobId Extract job id.
   */
  getExtractStatus(jobId: string): Promise<ExtractResponse$1>;
  /**
   * Convenience waiter: start an extract and poll until it finishes.
   * @param args Extraction request plus waiter controls (pollInterval, timeout seconds).
   * @returns Final extract response.
   */
  extract(args: Parameters<typeof startExtract>[1] & { pollInterval?: number; timeout?: number; }): Promise<ExtractResponse$1>;
  /**
   * Start an agent job (async).
   * @param args Agent request (urls, prompt, schema).
   * @returns Job id or processing state.
   */
  startAgent(args: Parameters<typeof startAgent>[1]): Promise<AgentResponse>;
  /**
   * Get agent job status/data.
   * @param jobId Agent job id.
   */
  getAgentStatus(jobId: string): Promise<AgentStatusResponse>;
  /**
   * Convenience waiter: start an agent and poll until it finishes.
   * @param args Agent request plus waiter controls (pollInterval, timeout seconds).
   * @returns Final agent response.
   */
  agent(args: Parameters<typeof startAgent>[1] & { pollInterval?: number; timeout?: number; }): Promise<AgentStatusResponse>;
  /**
   * Cancel an agent job.
   * @param jobId Agent job id.
   * @returns True if cancelled.
   */
  cancelAgent(jobId: string): Promise<boolean>;
  /** Current concurrency usage. */
  getConcurrency(): Promise<ConcurrencyCheck>;
  /** Current credit usage. */
  getCreditUsage(): Promise<CreditUsage>;
  /** Recent token usage. */
  getTokenUsage(): Promise<TokenUsage>;
  /** Historical credit usage by month; set byApiKey to true to break down by API key. */
  getCreditUsageHistorical(byApiKey?: boolean): Promise<CreditUsageHistoricalResponse>;
  /** Historical token usage by month; set byApiKey to true to break down by API key. */
  getTokenUsageHistorical(byApiKey?: boolean): Promise<TokenUsageHistoricalResponse>;
  /** Metrics about the team's scrape queue. */
  getQueueStatus(): Promise<QueueStatusResponse$1>;
  /**
   * Create a watcher for a crawl or batch job. Emits: `document`, `snapshot`, `done`, `error`.
   * @param jobId Job id.
   * @param opts Watcher options (kind, pollInterval, timeout seconds).
   */
  watcher(jobId: string, opts?: WatcherOptions): Watcher;
}
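/*
 * Usage sketch (illustrative): starting a batch scrape and watching progress.
 * The event names come from the watcher() JSDoc above; the payload shapes in
 * the callbacks are assumptions, since the Watcher events are untyped here.
 *
 *   const { id } = await client.startBatchScrape(
 *     ['https://example.com/a', 'https://example.com/b'],
 *     { options: { formats: ['markdown'] } },
 *   );
 *
 *   const w = client.watcher(id, { kind: 'batch', pollInterval: 2, timeout: 120 });
 *   w.on('document', (doc) => console.log('got', doc?.metadata?.sourceURL));
 *   w.on('done', (snapshot) => console.log('finished:', snapshot?.status));
 *   w.on('error', (err) => console.error(err));
 *   await w.start();
 */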
/**
 * Configuration interface for FirecrawlApp.
 * @param apiKey - Optional API key for authentication.
 * @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
 */
interface FirecrawlAppConfig { apiKey?: string | null; apiUrl?: string | null; }

/**
 * Metadata for a Firecrawl document.
 * Includes various optional properties for document metadata.
 */
interface FirecrawlDocumentMetadata {
  title?: string;
  description?: string;
  language?: string;
  keywords?: string;
  robots?: string;
  ogTitle?: string;
  ogDescription?: string;
  ogUrl?: string;
  ogImage?: string;
  ogAudio?: string;
  ogDeterminer?: string;
  ogLocale?: string;
  ogLocaleAlternate?: string[];
  ogSiteName?: string;
  ogVideo?: string;
  dctermsCreated?: string;
  dcDateCreated?: string;
  dcDate?: string;
  dctermsType?: string;
  dcType?: string;
  dctermsAudience?: string;
  dctermsSubject?: string;
  dcSubject?: string;
  dcDescription?: string;
  dctermsKeywords?: string;
  modifiedTime?: string;
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;
  sourceURL?: string;
  statusCode?: number;
  timezone?: string;
  error?: string;
  proxyUsed?: "basic" | "stealth";
  cacheState?: "miss" | "hit";
  cachedAt?: string;
  creditsUsed?: number;
  concurrencyLimited?: boolean;
  concurrencyQueueDurationMs?: number;
  [key: string]: any;
}

/**
 * Document interface for Firecrawl.
 * Represents a document retrieved or processed by Firecrawl.
 */
interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | never) = never> {
  url?: string;
  markdown?: string;
  html?: string;
  rawHtml?: string;
  links?: string[];
  extract?: T;
  json?: T;
  screenshot?: string;
  metadata?: FirecrawlDocumentMetadata;
  actions: ActionsSchema;
  changeTracking?: {
    previousScrapeAt: string | null;
    changeStatus: "new" | "same" | "changed" | "removed";
    visibility: "visible" | "hidden";
    diff?: {
      text: string;
      json: {
        files: Array<{
          from: string | null;
          to: string | null;
          chunks: Array<{
            content: string;
            changes: Array<{ type: string; normal?: boolean; ln?: number; ln1?: number; ln2?: number; content: string; }>;
          }>;
        }>;
      };
    };
    json?: any;
  };
  title?: string;
  description?: string;
}
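/*
 * Usage sketch (illustrative): reading change-tracking data from a v1 document.
 * `doc` is assumed to be a FirecrawlDocument returned with the changeTracking
 * format enabled.
 *
 *   if (doc.changeTracking?.changeStatus === 'changed' && doc.changeTracking.diff) {
 *     console.log(doc.changeTracking.diff.text);
 *     for (const file of doc.changeTracking.diff.json.files) {
 *       console.log(file.from, '->', file.to, `(${file.chunks.length} chunks)`);
 *     }
 *   }
 */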
/** Location configuration for proxy location */
interface LocationConfig { country?: string; languages?: string[]; }

/**
 * Parameters for scraping operations.
 * Defines the options and configurations available for scraping web content.
 */
interface CrawlScrapeOptions {
  formats?: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json" | "changeTracking")[];
  headers?: Record<string, string>;
  includeTags?: string[];
  excludeTags?: string[];
  onlyMainContent?: boolean;
  waitFor?: number;
  timeout?: number;
  location?: LocationConfig;
  mobile?: boolean;
  skipTlsVerification?: boolean;
  removeBase64Images?: boolean;
  blockAds?: boolean;
  proxy?: "basic" | "stealth" | "auto";
  storeInCache?: boolean;
  maxAge?: number;
  parsePDF?: boolean;
}

type Action =
  | { type: "wait"; milliseconds?: number; selector?: string; }
  | { type: "click"; selector: string; all?: boolean; }
  | { type: "screenshot"; fullPage?: boolean; quality?: number; }
  | { type: "write"; text: string; }
  | { type: "press"; key: string; }
  | { type: "scroll"; direction?: "up" | "down"; selector?: string; }
  | { type: "scrape"; }
  | { type: "executeJavascript"; script: string; };

interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema extends (Action[] | undefined) = undefined> extends CrawlScrapeOptions {
  extract?: { prompt?: string; schema?: LLMSchema; systemPrompt?: string; };
  jsonOptions?: { prompt?: string; schema?: LLMSchema; systemPrompt?: string; };
  changeTrackingOptions?: { prompt?: string; schema?: any; modes?: ("json" | "git-diff")[]; tag?: string | null; };
  actions?: ActionsSchema;
  agent?: AgentOptions;
  zeroDataRetention?: boolean;
}

interface ActionsResult {
  screenshots: string[];
  scrapes: ({ url: string; html: string; })[];
  javascriptReturns: { type: string; value: unknown; }[];
}

/**
 * Response interface for scraping operations.
 * Defines the structure of the response received after a scraping operation.
 */
interface ScrapeResponse<LLMResult = any, ActionsSchema extends (ActionsResult | never) = never> extends FirecrawlDocument<LLMResult, ActionsSchema> {
  success: true;
  warning?: string;
  error?: string;
}

/**
 * Parameters for crawling operations.
 * Includes options for both scraping and mapping during a crawl.
 */
interface CrawlParams {
  includePaths?: string[];
  excludePaths?: string[];
  maxDepth?: number;
  maxDiscoveryDepth?: number;
  limit?: number;
  allowBackwardLinks?: boolean;
  crawlEntireDomain?: boolean;
  allowExternalLinks?: boolean;
  ignoreSitemap?: boolean;
  scrapeOptions?: CrawlScrapeOptions;
  webhook?: string | {
    url: string;
    headers?: Record<string, string>;
    metadata?: Record<string, string>;
    events?: ["completed", "failed", "page", "started"][number][];
  };
  deduplicateSimilarURLs?: boolean;
  ignoreQueryParameters?: boolean;
  regexOnFullURL?: boolean;
  /**
   * Delay in seconds between scrapes. This helps respect website rate limits.
   * If not provided, the crawler may use the robots.txt crawl delay if available.
   */
  delay?: number;
  allowSubdomains?: boolean;
  maxConcurrency?: number;
  zeroDataRetention?: boolean;
}

/**
 * Response interface for crawling operations.
 * Defines the structure of the response received after initiating a crawl.
 */
interface CrawlResponse { id?: string; url?: string; success: true; error?: string; }

/**
 * Response interface for batch scrape operations.
 * Defines the structure of the response received after initiating a batch scrape.
 */
interface BatchScrapeResponse { id?: string; url?: string; success: true; error?: string; invalidURLs?: string[]; }
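/*
 * Usage sketch (illustrative): v1 scrape parameters combining jsonOptions with
 * page actions. The zod schema is an example; note that ScrapeParams needs the
 * Action[] type argument for `actions` to be accepted, since ActionsSchema
 * defaults to undefined.
 *
 *   const schema = z.object({ title: z.string() });
 *   const params: ScrapeParams<typeof schema, Action[]> = {
 *     formats: ['markdown', 'json'],
 *     jsonOptions: { schema, prompt: 'Extract the page title' },
 *     actions: [
 *       { type: 'wait', milliseconds: 250 },
 *       { type: 'scroll', direction: 'down' },
 *     ],
 *   };
 */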
/**
 * Response interface for job status checks.
 * Provides detailed status of a crawl job including progress and results.
 */
interface CrawlStatusResponse {
  success: true;
  status: "scraping" | "completed" | "failed" | "cancelled";
  completed: number;
  total: number;
  creditsUsed: number;
  expiresAt: Date;
  next?: string;
  data: FirecrawlDocument<undefined>[];
}

/**
 * Response interface for batch scrape job status checks.
 * Provides detailed status of a batch scrape job including progress and results.
 */
interface BatchScrapeStatusResponse {
  success: true;
  status: "scraping" | "completed" | "failed" | "cancelled";
  completed: number;
  total: number;
  creditsUsed: number;
  expiresAt: Date;
  next?: string;
  data: FirecrawlDocument<undefined>[];
}

/**
 * Parameters for mapping operations.
 * Defines options for mapping URLs during a crawl.
 */
interface MapParams {
  search?: string;
  ignoreSitemap?: boolean;
  includeSubdomains?: boolean;
  sitemapOnly?: boolean;
  limit?: number;
  timeout?: number;
  useIndex?: boolean;
  location?: LocationConfig;
}

/**
 * Response interface for mapping operations.
 * Defines the structure of the response received after a mapping operation.
 */
interface MapResponse { success: true; links?: string[]; error?: string; }

/** Options for the agent used during scraping operations. */
interface AgentOptions { model?: string; prompt?: string; sessionId?: string; }

/** Options for the agent used during extract operations. */
interface AgentOptionsExtract { model?: string; sessionId?: string; }

/**
 * Parameters for extracting information from URLs.
 * Defines options for extracting information from URLs.
 */
interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
  prompt?: string;
  schema?: LLMSchema | object;
  systemPrompt?: string;
  allowExternalLinks?: boolean;
  enableWebSearch?: boolean;
  includeSubdomains?: boolean;
  origin?: string;
  showSources?: boolean;
  scrapeOptions?: CrawlScrapeOptions;
  agent?: AgentOptionsExtract;
}

/**
 * Response interface for extracting information from URLs.
 * Defines the structure of the response received after extracting information from URLs.
 */
interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
  success: boolean;
  data: LLMSchema;
  error?: string;
  warning?: string;
  sources?: string[];
  creditsUsed?: number;
}

/**
 * Error response interface.
 * Defines the structure of the response received when an error occurs.
 */
interface ErrorResponse { success: false; error: string; }

/**
 * Parameters for search operations.
 * Defines options for searching and scraping search results.
 */
interface SearchParams {
  limit?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  origin?: string;
  timeout?: number;
  scrapeOptions?: ScrapeParams;
}

/**
 * Response interface for search operations.
 * Defines the structure of the response received after a search operation.
 */
interface SearchResponse {
  success: boolean;
  data: FirecrawlDocument<undefined>[];
  warning?: string;
  error?: string;
}

/**
 * Response interface for crawl/batch scrape error monitoring.
 */
interface CrawlErrorsResponse {
  /** Scrapes that errored out + error details */
  errors: {
    id: string;
    timestamp?: string;
    url: string;
    code?: string;
    error: string;
  }[];
  /** URLs blocked by robots.txt */
  robotsBlocked: string[];
}
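/*
 * Usage sketch (illustrative): v1 extract parameters with a zod schema. The
 * schema and prompt are examples; `app` is a FirecrawlApp instance (declared
 * further down).
 *
 *   const contactSchema = z.object({ email: z.string() });
 *   const extractParams: ExtractParams<typeof contactSchema> = {
 *     prompt: 'Extract the support email address',
 *     schema: contactSchema,
 *     enableWebSearch: true,
 *   };
 *   // const res = await app.extract(['https://example.com'], extractParams);
 */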
/**
 * Parameters for deep research operations.
 * Defines options for conducting deep research on a query.
 */
interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
  /**
   * Maximum depth of research iterations (1-10)
   * @default 7
   */
  maxDepth?: number;
  /**
   * Time limit in seconds (30-300)
   * @default 270
   */
  timeLimit?: number;
  /**
   * Maximum number of URLs to analyze (1-1000)
   * @default 20
   */
  maxUrls?: number;
  /** The prompt to use for the final analysis */
  analysisPrompt?: string;
  /** The system prompt to use for the research agent */
  systemPrompt?: string;
  /** The formats to use for the final analysis */
  formats?: ("markdown" | "json")[];
  /** The JSON options to use for the final analysis */
  jsonOptions?: { prompt?: string; schema?: LLMSchema; systemPrompt?: string; };
}

/** Response interface for deep research operations. */
interface DeepResearchResponse { success: boolean; id: string; }

/** Status response interface for deep research operations. */
interface DeepResearchStatusResponse {
  success: boolean;
  data: {
    finalAnalysis: string;
    activities: Array<{ type: string; status: string; message: string; timestamp: string; depth: number; }>;
    sources: Array<{ url: string; title: string; description: string; }>;
  };
  status: "processing" | "completed" | "failed";
  error?: string;
  expiresAt: string;
  currentDepth: number;
  maxDepth: number;
  activities: Array<{ type: string; status: string; message: string; timestamp: string; depth: number; }>;
  sources: Array<{ url: string; title: string; description: string; }>;
  summaries: string[];
}

/** Parameters for LLMs.txt generation operations. */
interface GenerateLLMsTextParams {
  /**
   * Maximum number of URLs to process (1-100)
   * @default 10
   */
  maxUrls?: number;
  /**
   * Whether to show the full LLMs-full.txt in the response
   * @default false
   */
  showFullText?: boolean;
  /**
   * Whether to use cached content if available
   * @default true
   */
  cache?: boolean;
  /** Experimental flag for streaming */
  __experimental_stream?: boolean;
}

/** Response interface for LLMs.txt generation operations. */
interface GenerateLLMsTextResponse { success: boolean; id: string; }

/** Status response interface for LLMs.txt generation operations. */
interface GenerateLLMsTextStatusResponse {
  success: boolean;
  data: { llmstxt: string; llmsfulltxt?: string; };
  status: "processing" | "completed" | "failed";
  error?: string;
  expiresAt: string;
}

/** Response interface for queue status operations. */
interface QueueStatusResponse {
  success: boolean;
  jobsInQueue: number;
  activeJobsInQueue: number;
  waitingJobsInQueue: number;
  maxConcurrency: number;
  /**
   * ISO timestamp of the most recent successful scrape in the past 24 hours. Will be null if no successful scrape has occurred in the past 24 hours.
   */
  mostRecentSuccess: string | null;
}

/** Credit usage for v1 API (snake_case fields as returned by API). */
interface CreditUsageResponseV1 {
  success: boolean;
  data: {
    remaining_credits: number;
    plan_credits: number;
    billing_period_start: string | null;
    billing_period_end: string | null;
  };
}
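/*
 * Usage sketch (illustrative): parameter objects for deep research and
 * LLMs.txt generation, using values within the documented ranges above.
 *
 *   const research: DeepResearchParams = {
 *     maxDepth: 3,
 *     timeLimit: 120,
 *     maxUrls: 15,
 *     formats: ['markdown'],
 *   };
 *
 *   const llmsTxt: GenerateLLMsTextParams = { maxUrls: 10, showFullText: false, cache: true };
 */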
/** Token usage for v1 API (snake_case fields as returned by API). */
interface TokenUsageResponseV1 {
  success: boolean;
  data: {
    remaining_tokens: number;
    plan_tokens: number;
    billing_period_start: string | null;
    billing_period_end: string | null;
  };
}
interface CreditUsageHistoricalResponseV1 {
  success: boolean;
  periods: { startDate: string | null; endDate: string | null; apiKey?: string; creditsUsed: number; }[];
}
interface TokenUsageHistoricalResponseV1 {
  success: boolean;
  periods: { startDate: string | null; endDate: string | null; apiKey?: string; tokensUsed: number; }[];
}

/**
 * Main class for interacting with the Firecrawl API.
 * Provides methods for scraping, searching, crawling, and mapping web content.
 */
declare class FirecrawlApp {
  apiKey: string;
  apiUrl: string;
  version: string;
  private isCloudService;
  private getVersion;
  private init;
  /**
   * Initializes a new instance of the FirecrawlApp class.
   * @param config - Configuration options for the FirecrawlApp instance.
   */
  constructor({ apiKey, apiUrl }: FirecrawlAppConfig);
  /**
   * Scrapes a URL using the Firecrawl API.
   * @param url - The URL to scrape.
   * @param params - Additional parameters for the scrape request.
   * @returns The response from the scrape operation.
   */
  scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(url: string, params?: ScrapeParams<T, ActionsSchema>): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse>;
  /**
   * Searches using the Firecrawl API and optionally scrapes the results.
   * @param query - The search query string.
   * @param params - Optional parameters for the search request.
   * @returns The response from the search operation.
   */
  search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse>;
  /**
   * Initiates a crawl job for a URL using the Firecrawl API.
   * @param url - The URL to crawl.
   * @param params - Additional parameters for the crawl request.
   * @param pollInterval - Time in seconds for job status checks.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns The response from the crawl operation.
   */
  crawlUrl(url: string, params?: CrawlParams, pollInterval?: number, idempotencyKey?: string): Promise<CrawlStatusResponse | ErrorResponse>;
  asyncCrawlUrl(url: string, params?: CrawlParams, idempotencyKey?: string): Promise<CrawlResponse | ErrorResponse>;
  /**
   * Checks the status of a crawl job using the Firecrawl API.
   * @param id - The ID of the crawl operation.
   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
   * @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
   * @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
   * @param limit - How many entries to return. Only used when `getAllData = false`.
   * @returns The response containing the job status.
   */
  checkCrawlStatus(id?: string, getAllData?: boolean, nextURL?: string, skip?: number, limit?: number): Promise<CrawlStatusResponse | ErrorResponse>;
  /**
   * Returns information about crawl errors.
   * @param id - The ID of the crawl operation.
   * @returns Information about crawl errors.
   */
  checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse>;
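  /*
   * Usage sketch (illustrative): v1 client with a simple scrape. The URL is an
   * example; `success` discriminates ScrapeResponse from ErrorResponse.
   *
   *   const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
   *   const res = await app.scrapeUrl('https://example.com', { formats: ['markdown'] });
   *   if (res.success) console.log(res.markdown);
   *   else console.error(res.error);
   */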
  /**
   * Cancels a crawl job using the Firecrawl API.
   * @param id - The ID of the crawl operation.
   * @returns The response from the cancel crawl operation.
   */
  cancelCrawl(id: string): Promise<ErrorResponse>;
  /**
   * Initiates a crawl job and returns a CrawlWatcher to monitor the job via WebSocket.
   * @param url - The URL to crawl.
   * @param params - Additional parameters for the crawl request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns A CrawlWatcher instance to monitor the crawl job.
   */
  crawlUrlAndWatch(url: string, params?: CrawlParams, idempotencyKey?: string): Promise<CrawlWatcher>;
  /**
   * Maps a URL using the Firecrawl API.
   * @param url - The URL to map.
   * @param params - Additional parameters for the map request.
   * @returns The response from the map operation.
   */
  mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse>;
  /**
   * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
   * @param urls - The URLs to scrape.
   * @param params - Additional parameters for the scrape request.
   * @param pollInterval - Time in seconds for job status checks.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @param webhook - Optional webhook for the batch scrape.
   * @param ignoreInvalidURLs - Optional flag to ignore invalid URLs.
   * @param maxConcurrency - Optional maximum concurrency for the batch scrape.
   * @returns The response from the batch scrape operation.
   */
  batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean, maxConcurrency?: number): Promise<BatchScrapeStatusResponse | ErrorResponse>;
  asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeResponse | ErrorResponse>;
  /**
   * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
   * @param urls - The URLs to scrape.
   * @param params - Additional parameters for the scrape request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns A CrawlWatcher instance to monitor the batch scrape job.
   */
  batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<CrawlWatcher>;
  /**
   * Checks the status of a batch scrape job using the Firecrawl API.
   * @param id - The ID of the batch scrape operation.
   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
   * @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
   * @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
   * @param limit - How many entries to return. Only used when `getAllData = false`.
   * @returns The response containing the job status.
   */
  checkBatchScrapeStatus(id?: string, getAllData?: boolean, nextURL?: string, skip?: number, limit?: number): Promise<BatchScrapeStatusResponse | ErrorResponse>;
  /**
   * Returns information about batch scrape errors.
   * @param id - The ID of the batch scrape operation.
   * @returns Information about batch scrape errors.
   */
  checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse>;
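  /*
   * Usage sketch (illustrative): v1 crawl with polling, and an async batch
   * scrape. URLs are examples; `app` is the FirecrawlApp instance from above.
   *
   *   const crawl = await app.crawlUrl(
   *     'https://example.com',
   *     { limit: 25, scrapeOptions: { formats: ['markdown'] } },
   *     2, // poll every 2 seconds
   *   );
   *   if (crawl.success) console.log(crawl.status, crawl.data.length);
   *
   *   const batch = await app.asyncBatchScrapeUrls(
   *     ['https://example.com/a', 'https://example.com/b'],
   *     { formats: ['markdown'] },
   *   );
   *   if (batch.success) console.log('batch job id:', batch.id);
   */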
  /**
   * Extracts information from URLs using the Firecrawl API.
   * Currently in Beta. Expect breaking changes on future minor versions.
   * @param urls - The URLs to extract information from. Optional if using other methods for data extraction.
   * @param params - Additional parameters for the extract request.
   * @returns The response from the extract operation.
   */
  extract<T extends zt.ZodSchema = any>(urls?: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse>;
  /**
   * Initiates an asynchronous extract job for URLs using the Firecrawl API.
   * @param urls - The URLs to extract data from.
   * @param params - Additional parameters for the extract request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns The response from the extract operation.
   */
  asyncExtract(urls: string[], params?: ExtractParams, idempotencyKey?: string): Promise<ExtractResponse | ErrorResponse>;
  /**
   * Retrieves the status of an extract job.
   * @param jobId - The ID of the extract job.
   * @returns The status of the extract job.
   */
  getExtractStatus(jobId: string): Promise<any>;
  /**
   * Prepares the headers for an API request.
   * @param idempotencyKey - Optional key to ensure idempotency.
   * @returns The prepared headers.
   */
  prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders;
  /**
   * Sends a POST request to the specified URL.
   * @param url - The URL to send the request to.
   * @param data - The data to send in the request.
   * @param headers - The headers for the request.
   * @returns The response from the POST request.
   */
  postRequest(url: string, data: any, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
  /**
   * Sends a GET request to the specified URL.
   * @param url - The URL to send the request to.
   * @param headers - The headers for the request.
   * @returns The response from the GET request.
   */
  getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
  /**
   * Sends a DELETE request to the specified URL.
   * @param url - The URL to send the request to.
   * @param headers - The headers for the request.
   * @returns The response from the DELETE request.
   */
  deleteRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
  /**
   * Monitors the status of a crawl job until completion or failure.
   * @param id - The ID of the crawl operation.
   * @param headers - The headers for the request.
   * @param checkInterval - Interval in seconds for job status checks.
   * @returns The final job status or data.
   */
  monitorJobStatus(id: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<CrawlStatusResponse | ErrorResponse>;
  /**
   * Determines if an error is retryable (transient network error)
   * @param error - The error to check
   * @returns True if the error should be retried
   */
  private isRetryableError;
  /**
   * Handles errors from API responses.
   * @param {AxiosResponse} response - The response from the API.
   * @param {string} action - The action being performed when the error occurred.
   */
  handleError(response: AxiosResponse, action: string): never;
  /**
   * Initiate