
firecrawl

import {
  type ActiveCrawlsResponse,
  type CrawlErrorsResponse,
  type CrawlJob,
  type CrawlResponse,
  type Document,
  type CrawlOptions,
  type PaginationConfig,
  JobTimeoutError,
  SdkError,
} from "../types";
import { HttpClient } from "../utils/httpClient";
import { ensureValidScrapeOptions } from "../utils/validation";
import { normalizeAxiosError, throwForBadResponse, isRetryableError } from "../utils/errorHandler";
import type { HttpClient as _Http } from "../utils/httpClient";
import { fetchAllPages } from "../utils/pagination";

// Crawl options plus the root URL to start crawling from.
export type CrawlRequest = CrawlOptions & {
  url: string;
};

// Builds the POST body for /v2/crawl, copying only the options that were explicitly provided.
function prepareCrawlPayload(request: CrawlRequest): Record<string, unknown> {
  if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
  const data: Record<string, unknown> = { url: request.url.trim() };
  if (request.prompt) data.prompt = request.prompt;
  if (request.excludePaths) data.excludePaths = request.excludePaths;
  if (request.includePaths) data.includePaths = request.includePaths;
  if (request.maxDiscoveryDepth != null) data.maxDiscoveryDepth = request.maxDiscoveryDepth;
  if (request.sitemap != null) data.sitemap = request.sitemap;
  if (request.ignoreQueryParameters != null) data.ignoreQueryParameters = request.ignoreQueryParameters;
  if (request.limit != null) data.limit = request.limit;
  if (request.crawlEntireDomain != null) data.crawlEntireDomain = request.crawlEntireDomain;
  if (request.allowExternalLinks != null) data.allowExternalLinks = request.allowExternalLinks;
  if (request.allowSubdomains != null) data.allowSubdomains = request.allowSubdomains;
  if (request.delay != null) data.delay = request.delay;
  if (request.maxConcurrency != null) data.maxConcurrency = request.maxConcurrency;
  if (request.webhook != null) data.webhook = request.webhook;
  if (request.integration != null && request.integration.trim()) data.integration = request.integration.trim();
  if (request.scrapeOptions) {
    ensureValidScrapeOptions(request.scrapeOptions);
    data.scrapeOptions = request.scrapeOptions;
  }
  if (request.zeroDataRetention != null) data.zeroDataRetention = request.zeroDataRetention;
  return data;
}

// Starts an asynchronous crawl job and returns its id and status URL.
export async function startCrawl(http: HttpClient, request: CrawlRequest): Promise<CrawlResponse> {
  const payload = prepareCrawlPayload(request);
  try {
    const res = await http.post<{ success: boolean; id: string; url: string; error?: string }>("/v2/crawl", payload);
    if (res.status !== 200 || !res.data?.success) {
      throwForBadResponse(res, "start crawl");
    }
    return { id: res.data.id, url: res.data.url };
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "start crawl");
    throw err;
  }
}

// Fetches the current status of a crawl job. When autoPaginate is enabled (the default),
// follows `next` links and aggregates all documents into a single response.
export async function getCrawlStatus(
  http: HttpClient,
  jobId: string,
  pagination?: PaginationConfig
): Promise<CrawlJob> {
  try {
    const res = await http.get<{
      success: boolean;
      status: CrawlJob["status"];
      completed?: number;
      total?: number;
      creditsUsed?: number;
      expiresAt?: string;
      next?: string | null;
      data?: Document[];
    }>(`/v2/crawl/${jobId}`);
    if (res.status !== 200 || !res.data?.success) {
      throwForBadResponse(res, "get crawl status");
    }
    const body = res.data;
    const initialDocs = (body.data || []) as Document[];
    const auto = pagination?.autoPaginate ?? true;
    if (!auto || !body.next) {
      return {
        id: jobId,
        status: body.status,
        completed: body.completed ?? 0,
        total: body.total ?? 0,
        creditsUsed: body.creditsUsed,
        expiresAt: body.expiresAt,
        next: body.next ?? null,
        data: initialDocs,
      };
    }
    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
    return {
      id: jobId,
      status: body.status,
      completed: body.completed ?? 0,
      total: body.total ?? 0,
      creditsUsed: body.creditsUsed,
      expiresAt: body.expiresAt,
      next: null,
      data: aggregated,
    };
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
    throw err;
  }
}

// Cancels a running crawl job; resolves to true when the API reports it as cancelled.
export async function cancelCrawl(http: HttpClient, jobId: string): Promise<boolean> {
  try {
    const res = await http.delete<{ status: string }>(`/v2/crawl/${jobId}`);
    if (res.status !== 200) throwForBadResponse(res, "cancel crawl");
    return res.data?.status === "cancelled";
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "cancel crawl");
    throw err;
  }
}

// Polls a crawl job until it reaches a terminal state (completed, failed, or cancelled),
// or until the optional timeout (in seconds) elapses.
export async function waitForCrawlCompletion(
  http: HttpClient,
  jobId: string,
  pollInterval = 2,
  timeout?: number
): Promise<CrawlJob> {
  const start = Date.now();
  while (true) {
    try {
      const status = await getCrawlStatus(http, jobId);
      if (["completed", "failed", "cancelled"].includes(status.status)) {
        return status;
      }
    } catch (err: any) {
      // Don't retry on permanent errors (4xx) - re-throw immediately with jobId context
      if (!isRetryableError(err)) {
        // Create new error with jobId for better debugging (non-retryable errors like 404)
        if (err instanceof SdkError) {
          const errorWithJobId = new SdkError(err.message, err.status, err.code, err.details, jobId);
          throw errorWithJobId;
        }
        throw err;
      }
      // Otherwise, retry after delay - error might be transient (network issue, timeout, 5xx, etc.)
    }
    if (timeout != null && Date.now() - start > timeout * 1000) {
      throw new JobTimeoutError(jobId, timeout, "crawl");
    }
    await new Promise((r) => setTimeout(r, Math.max(1000, pollInterval * 1000)));
  }
}

// Convenience wrapper: starts a crawl and waits for it to finish.
export async function crawl(http: HttpClient, request: CrawlRequest, pollInterval = 2, timeout?: number): Promise<CrawlJob> {
  const started = await startCrawl(http, request);
  return waitForCrawlCompletion(http, started.id, pollInterval, timeout);
}

// Retrieves per-page errors and robots.txt-blocked URLs for a crawl.
export async function getCrawlErrors(http: HttpClient, crawlId: string): Promise<CrawlErrorsResponse> {
  try {
    const res = await http.get<{
      success?: boolean;
      data?: { errors: Array<Record<string, string>>; robotsBlocked: string[] };
    }>(`/v2/crawl/${crawlId}/errors`);
    if (res.status !== 200) throwForBadResponse(res, "get crawl errors");
    const payload = res.data?.data ?? (res.data as any);
    return { errors: payload.errors || [], robotsBlocked: payload.robotsBlocked || [] };
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl errors");
    throw err;
  }
}

// Lists crawl jobs that are currently active for the authenticated team.
export async function getActiveCrawls(http: HttpClient): Promise<ActiveCrawlsResponse> {
  try {
    const res = await http.get<{
      success: boolean;
      crawls: Array<{ id: string; teamId?: string; team_id?: string; url: string; options?: any }>;
    }>(`/v2/crawl/active`);
    if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get active crawls");
    const crawlsIn = res.data?.crawls || [];
    const crawls = crawlsIn.map((c) => ({
      id: c.id,
      teamId: (c as any).teamId ?? (c as any).team_id,
      url: c.url,
      options: c.options ?? null,
    }));
    return { success: true, crawls };
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "get active crawls");
    throw err;
  }
}

// Previews the crawl parameters the API would derive from a natural-language prompt,
// without starting a crawl.
export async function crawlParamsPreview(http: HttpClient, url: string, prompt: string): Promise<Record<string, unknown>> {
  if (!url || !url.trim()) throw new Error("URL cannot be empty");
  if (!prompt || !prompt.trim()) throw new Error("Prompt cannot be empty");
  try {
    const res = await http.post<{ success: boolean; data?: Record<string, unknown>; warning?: string }>(
      "/v2/crawl/params-preview",
      { url: url.trim(), prompt }
    );
    if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "crawl params preview");
    const data = res.data.data || {};
    if (res.data.warning) (data as any).warning = res.data.warning;
    return data;
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "crawl params preview");
    throw err;
  }
}
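
/*
 * Usage sketch for the exported helpers above. Illustrative only: how HttpClient is
 * constructed is an assumption here (in the SDK it is normally created by the Firecrawl
 * client wrapper with an API key; its constructor options are not shown in this file).
 *
 *   const http = new HttpClient(...); // assumed setup, see note above
 *   const job = await crawl(http, { url: "https://example.com", limit: 10 }, 2, 120);
 *   if (job.status === "completed") {
 *     console.log(`Scraped ${job.data.length} pages, used ${job.creditsUsed ?? 0} credits`);
 *   } else {
 *     const { errors, robotsBlocked } = await getCrawlErrors(http, job.id);
 *     console.log(errors, robotsBlocked);
 *   }
 */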