@mendable/firecrawl-js

import {
  type BatchScrapeJob,
  type BatchScrapeResponse,
  type CrawlErrorsResponse,
  type Document,
  type BatchScrapeOptions,
  type PaginationConfig,
  JobTimeoutError,
  SdkError,
} from "../types";
import { HttpClient } from "../utils/httpClient";
import { ensureValidScrapeOptions } from "../utils/validation";
import { fetchAllPages } from "../utils/pagination";
import { normalizeAxiosError, throwForBadResponse, isRetryableError } from "../utils/errorHandler";

export async function startBatchScrape(
  http: HttpClient,
  urls: string[],
  {
    options,
    webhook,
    appendToId,
    ignoreInvalidURLs,
    maxConcurrency,
    zeroDataRetention,
    idempotencyKey,
    integration,
  }: BatchScrapeOptions = {}
): Promise<BatchScrapeResponse> {
  if (!Array.isArray(urls) || urls.length === 0) throw new Error("URLs list cannot be empty");
  const payload: Record<string, unknown> = { urls };
  if (options) {
    ensureValidScrapeOptions(options);
    Object.assign(payload, options);
  }
  if (webhook != null) payload.webhook = webhook;
  if (appendToId != null) payload.appendToId = appendToId;
  if (ignoreInvalidURLs != null) payload.ignoreInvalidURLs = ignoreInvalidURLs;
  if (maxConcurrency != null) payload.maxConcurrency = maxConcurrency;
  if (zeroDataRetention != null) payload.zeroDataRetention = zeroDataRetention;
  if (integration != null && integration.trim()) payload.integration = integration.trim();
  try {
    const headers = http.prepareHeaders(idempotencyKey);
    const res = await http.post<{
      success: boolean;
      id: string;
      url: string;
      invalidURLs?: string[];
      error?: string;
    }>("/v2/batch/scrape", payload, headers);
    if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "start batch scrape");
    return { id: res.data.id, url: res.data.url, invalidURLs: res.data.invalidURLs || undefined };
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "start batch scrape");
    throw err;
  }
}

export async function getBatchScrapeStatus(
  http: HttpClient,
  jobId: string,
  pagination?: PaginationConfig
): Promise<BatchScrapeJob> {
  try {
    const res = await http.get<{
      success: boolean;
      status: BatchScrapeJob["status"];
      completed?: number;
      total?: number;
      creditsUsed?: number;
      expiresAt?: string;
      next?: string | null;
      data?: Document[];
    }>(`/v2/batch/scrape/${jobId}`);
    if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
    const body = res.data;
    const initialDocs = (body.data || []) as Document[];
    const auto = pagination?.autoPaginate ?? true;
    if (!auto || !body.next) {
      return {
        id: jobId,
        status: body.status,
        completed: body.completed ?? 0,
        total: body.total ?? 0,
        creditsUsed: body.creditsUsed,
        expiresAt: body.expiresAt,
        next: body.next ?? null,
        data: initialDocs,
      };
    }
    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
    return {
      id: jobId,
      status: body.status,
      completed: body.completed ?? 0,
      total: body.total ?? 0,
      creditsUsed: body.creditsUsed,
      expiresAt: body.expiresAt,
      next: null,
      data: aggregated,
    };
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
    throw err;
  }
}
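
// Usage sketch (illustrative, not part of this module): start a batch job and
// check its status once, with auto-pagination disabled so only the first page
// of documents is returned (autoPaginate defaults to true above). The
// HttpClient construction and the exact shape of its config are assumptions;
// see ../utils/httpClient and ../types for the real definitions.
//
//   const http = new HttpClient(/* ...client config, e.g. API key... */);
//   const { id } = await startBatchScrape(http, ["https://example.com"], {
//     maxConcurrency: 5,
//     ignoreInvalidURLs: true,
//   });
//   const job = await getBatchScrapeStatus(http, id, { autoPaginate: false });
//   console.log(job.status, job.completed, "/", job.total);
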
export async function cancelBatchScrape(http: HttpClient, jobId: string): Promise<boolean> {
  try {
    const res = await http.delete<{ status: string }>(`/v2/batch/scrape/${jobId}`);
    if (res.status !== 200) throwForBadResponse(res, "cancel batch scrape");
    return res.data?.status === "cancelled";
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "cancel batch scrape");
    throw err;
  }
}

export async function getBatchScrapeErrors(http: HttpClient, jobId: string): Promise<CrawlErrorsResponse> {
  try {
    const res = await http.get<{
      success?: boolean;
      data?: { errors: Array<Record<string, string>>; robotsBlocked: string[] };
    }>(`/v2/batch/scrape/${jobId}/errors`);
    if (res.status !== 200) throwForBadResponse(res, "get batch scrape errors");
    const payload = res.data?.data ?? (res.data as any);
    return { errors: payload.errors || [], robotsBlocked: payload.robotsBlocked || [] };
  } catch (err: any) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape errors");
    throw err;
  }
}

export async function waitForBatchCompletion(
  http: HttpClient,
  jobId: string,
  pollInterval = 2,
  timeout?: number
): Promise<BatchScrapeJob> {
  const start = Date.now();
  while (true) {
    try {
      const status = await getBatchScrapeStatus(http, jobId);
      if (["completed", "failed", "cancelled"].includes(status.status)) {
        return status;
      }
    } catch (err: any) {
      // Don't retry on permanent errors (4xx) - re-throw immediately with jobId context
      if (!isRetryableError(err)) {
        // Create new error with jobId for better debugging (non-retryable errors like 404)
        if (err instanceof SdkError) {
          const errorWithJobId = new SdkError(err.message, err.status, err.code, err.details, jobId);
          throw errorWithJobId;
        }
        throw err;
      }
      // Otherwise, retry after delay - error might be transient (network issue, timeout, 5xx, etc.)
    }
    if (timeout != null && Date.now() - start > timeout * 1000) {
      throw new JobTimeoutError(jobId, timeout, "batch");
    }
    await new Promise((r) => setTimeout(r, Math.max(1000, pollInterval * 1000)));
  }
}

export async function batchScrape(
  http: HttpClient,
  urls: string[],
  opts: BatchScrapeOptions & { pollInterval?: number; timeout?: number } = {}
): Promise<BatchScrapeJob> {
  const start = await startBatchScrape(http, urls, opts);
  return waitForBatchCompletion(http, start.id, opts.pollInterval ?? 2, opts.timeout);
}

export function chunkUrls(urls: string[], chunkSize = 100): string[][] {
  const chunks: string[][] = [];
  for (let i = 0; i < urls.length; i += chunkSize) chunks.push(urls.slice(i, i + chunkSize));
  return chunks;
}
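
// Usage sketch (illustrative, not part of this module): scrape a large URL
// list in chunks of 100, waiting up to 10 minutes per chunk with a 5-second
// poll interval (pollInterval and timeout are both in seconds, per the
// `* 1000` conversions above). HttpClient construction is again an
// assumption, and `allUrls` is a placeholder for your own URL list.
//
//   const http = new HttpClient(/* ...client config... */);
//   for (const chunk of chunkUrls(allUrls, 100)) {
//     const job = await batchScrape(http, chunk, { pollInterval: 5, timeout: 600 });
//     if (job.status === "completed") {
//       console.log(`Got ${job.data.length} documents`);
//     } else {
//       const { errors, robotsBlocked } = await getBatchScrapeErrors(http, job.id);
//       console.warn(`Chunk ended as ${job.status}:`, errors.length, "errors,",
//         robotsBlocked.length, "blocked by robots.txt");
//     }
//   }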