@mendable/firecrawl-js
Version:
JavaScript SDK for Firecrawl API
232 lines (218 loc) • 6.58 kB
text/typescript
import {
type BatchScrapeJob,
type BatchScrapeResponse,
type CrawlErrorsResponse,
type Document,
type BatchScrapeOptions,
type PaginationConfig,
JobTimeoutError,
SdkError,
} from "../types";
import { HttpClient } from "../utils/httpClient";
import { ensureValidScrapeOptions } from "../utils/validation";
import { fetchAllPages } from "../utils/pagination";
import {
normalizeAxiosError,
throwForBadResponse,
isRetryableError,
} from "../utils/errorHandler";
/**
 * Submits a new batch scrape job for a list of URLs.
 *
 * The request body starts with `urls`; scrape `options` (when given) are
 * checked with `ensureValidScrapeOptions` and merged into the top level of the
 * body, followed by whichever batch-level settings were supplied. The body is
 * POSTed to `/v2/batch/scrape` with headers from `http.prepareHeaders`.
 *
 * @param http - Client used to build headers and send the request.
 * @param urls - URLs to scrape; must be a non-empty array.
 * @param __2 - Batch settings. `idempotencyKey` is only passed to
 *   `http.prepareHeaders`; `integration` is trimmed and dropped when blank;
 *   `origin` is dropped when falsy; the remaining settings are sent whenever
 *   they are not `null`/`undefined`.
 * @returns The job `id` and `url` from the response, plus `invalidURLs`
 *   (normalised to `undefined` when the response value is falsy).
 * @throws Error when `urls` is not an array or is empty. Unsuccessful
 *   responses are handed to `throwForBadResponse` and Axios errors to
 *   `normalizeAxiosError` (both presumably throw — defined elsewhere).
 */
export async function startBatchScrape(
  http: HttpClient,
  urls: string[],
  {
    options,
    webhook,
    appendToId,
    ignoreInvalidURLs,
    maxConcurrency,
    zeroDataRetention,
    idempotencyKey,
    integration,
    origin,
  }: BatchScrapeOptions = {},
): Promise<BatchScrapeResponse> {
  const hasUrls = Array.isArray(urls) && urls.length > 0;
  if (!hasUrls) {
    throw new Error("URLs list cannot be empty");
  }

  const body: Record<string, unknown> = { urls };

  if (options) {
    ensureValidScrapeOptions(options);
    Object.assign(body, options);
  }

  // Copy over only the settings the caller actually provided, in a fixed order.
  const passThrough: Record<string, unknown> = {
    webhook,
    appendToId,
    ignoreInvalidURLs,
    maxConcurrency,
    zeroDataRetention,
  };
  for (const [field, value] of Object.entries(passThrough)) {
    if (value != null) body[field] = value;
  }

  // A whitespace-only integration name is treated as "not set".
  const integrationName = integration != null ? integration.trim() : "";
  if (integrationName) {
    body.integration = integrationName;
  }
  if (origin) {
    body.origin = origin;
  }

  try {
    const headers = http.prepareHeaders(idempotencyKey);
    const res = await http.post<{
      success: boolean;
      id: string;
      url: string;
      invalidURLs?: string[];
      error?: string;
    }>("/v2/batch/scrape", body, { headers });

    if (!(res.status === 200 && res.data?.success)) {
      throwForBadResponse(res, "start batch scrape");
    }

    const created = res.data;
    return {
      id: created.id,
      url: created.url,
      invalidURLs: created.invalidURLs || undefined,
    };
  } catch (err: any) {
    // Only Axios failures are normalised; everything else propagates untouched.
    if (!err?.isAxiosError) throw err;
    return normalizeAxiosError(err, "start batch scrape");
  }
}
/**
 * Retrieves the current state of a batch scrape job.
 *
 * Issues a GET to `/v2/batch/scrape/{jobId}`. When the response carries a
 * `next` link and auto-pagination is enabled (the default — it is only off
 * when `pagination.autoPaginate` is set to a falsy, non-nullish value), the
 * remaining pages are collected through `fetchAllPages` and the result's
 * `next` is `null`. Otherwise only the first page of documents is returned
 * together with the server's `next` value (or `null`).
 *
 * @param http - Client used to send the request.
 * @param jobId - Identifier of the batch scrape job.
 * @param pagination - Optional pagination settings, also forwarded to
 *   `fetchAllPages`.
 * @returns The job snapshot; `completed` and `total` default to 0 when the
 *   response omits them.
 * @throws Unsuccessful responses are handed to `throwForBadResponse` and Axios
 *   errors to `normalizeAxiosError` (both presumably throw — defined
 *   elsewhere); any other error is rethrown unchanged.
 */
export async function getBatchScrapeStatus(
  http: HttpClient,
  jobId: string,
  pagination?: PaginationConfig,
): Promise<BatchScrapeJob> {
  try {
    const res = await http.get<{
      success: boolean;
      status: BatchScrapeJob["status"];
      completed?: number;
      total?: number;
      creditsUsed?: number;
      expiresAt?: string;
      next?: string | null;
      data?: Document[];
    }>(`/v2/batch/scrape/${jobId}`);

    if (!(res.status === 200 && res.data?.success)) {
      throwForBadResponse(res, "get batch scrape status");
    }

    const page = res.data;
    const firstPageDocs = (page.data || []) as Document[];

    // Fields shared by both the single-page and the aggregated result.
    const summary = {
      id: jobId,
      status: page.status,
      completed: page.completed ?? 0,
      total: page.total ?? 0,
      creditsUsed: page.creditsUsed,
      expiresAt: page.expiresAt,
    };

    const followNextLinks = pagination?.autoPaginate ?? true;
    if (followNextLinks && page.next) {
      const allDocs = await fetchAllPages(
        http,
        page.next,
        firstPageDocs,
        pagination,
      );
      // Every page has been consumed, so there is nothing left to follow.
      return { ...summary, next: null, data: allDocs };
    }

    return { ...summary, next: page.next ?? null, data: firstPageDocs };
  } catch (err: any) {
    // Only Axios failures are normalised; everything else propagates untouched.
    if (!err?.isAxiosError) throw err;
    return normalizeAxiosError(err, "get batch scrape status");
  }
}
/**
 * Requests cancellation of a batch scrape job.
 *
 * Sends a DELETE to `/v2/batch/scrape/{jobId}`.
 *
 * @param http - Client used to send the request.
 * @param jobId - Identifier of the batch scrape job to cancel.
 * @returns `true` only when the response body's `status` is exactly
 *   `"cancelled"`; `false` for any other (or missing) status.
 * @throws Non-200 responses are handed to `throwForBadResponse` and Axios
 *   errors to `normalizeAxiosError` (both presumably throw — defined
 *   elsewhere); any other error is rethrown unchanged.
 */
export async function cancelBatchScrape(
  http: HttpClient,
  jobId: string,
): Promise<boolean> {
  try {
    const endpoint = `/v2/batch/scrape/${jobId}`;
    const res = await http.delete<{ status: string }>(endpoint);

    if (res.status !== 200) {
      throwForBadResponse(res, "cancel batch scrape");
    }

    const reportedStatus = res.data?.status;
    return reportedStatus === "cancelled";
  } catch (err: any) {
    // Only Axios failures are normalised; everything else propagates untouched.
    if (!err?.isAxiosError) throw err;
    return normalizeAxiosError(err, "cancel batch scrape");
  }
}
/**
 * Fetches the errors recorded for a batch scrape job.
 *
 * Sends a GET to `/v2/batch/scrape/{jobId}/errors`. The error lists are read
 * from the response's nested `data` object when present, otherwise from the
 * top level of the response body.
 *
 * @param http - Client used to send the request.
 * @param jobId - Identifier of the batch scrape job.
 * @returns The job's `errors` and `robotsBlocked` lists; each falls back to an
 *   empty array when missing, including when the response has no body at all.
 * @throws Non-200 responses are handed to `throwForBadResponse` and Axios
 *   errors to `normalizeAxiosError` (both presumably throw — defined
 *   elsewhere); any other error is rethrown unchanged.
 */
export async function getBatchScrapeErrors(
  http: HttpClient,
  jobId: string,
): Promise<CrawlErrorsResponse> {
  try {
    const res = await http.get<{
      success?: boolean;
      data?: { errors: Array<Record<string, string>>; robotsBlocked: string[] };
    }>(`/v2/batch/scrape/${jobId}/errors`);
    if (res.status !== 200) throwForBadResponse(res, "get batch scrape errors");
    // Accept both the wrapped shape ({ data: {...} }) and the bare shape
    // ({ errors, robotsBlocked }). The cast is needed because the bare shape is
    // not part of the declared response type.
    const payload = res.data?.data ?? (res.data as any);
    // `payload` is undefined/null when a 200 arrives without a body; optional
    // chaining keeps that case from surfacing as a raw TypeError.
    return {
      errors: payload?.errors || [],
      robotsBlocked: payload?.robotsBlocked || [],
    };
  } catch (err: any) {
    if (err?.isAxiosError)
      return normalizeAxiosError(err, "get batch scrape errors");
    throw err;
  }
}
/**
 * Polls a batch scrape job until it reaches a terminal state.
 *
 * Calls `getBatchScrapeStatus` repeatedly and returns the first snapshot
 * whose status is `"completed"`, `"failed"` or `"cancelled"`. Errors that
 * `isRetryableError` accepts are swallowed and the poll is retried after the
 * usual delay; all other errors abort the wait immediately.
 *
 * @param http - Client used for the status requests.
 * @param jobId - Identifier of the batch scrape job.
 * @param pollInterval - Seconds to wait between polls (default 2). The actual
 *   delay is never shorter than one second.
 * @param timeout - Optional overall limit in seconds. It is checked after each
 *   poll, before sleeping; when omitted the wait is unbounded.
 * @returns The job snapshot in its terminal state.
 * @throws JobTimeoutError when more than `timeout` seconds have elapsed after a
 *   poll that did not end the wait.
 * @throws SdkError (rebuilt with `jobId` as its last constructor argument) or
 *   the original error when a status request fails with a non-retryable error.
 */
export async function waitForBatchCompletion(
  http: HttpClient,
  jobId: string,
  pollInterval = 2,
  timeout?: number,
): Promise<BatchScrapeJob> {
  const startedAt = Date.now();
  const terminalStates = new Set<string>(["completed", "failed", "cancelled"]);
  // Enforce a floor of one second between polls, whatever the caller asked for.
  const delayMs = Math.max(1000, pollInterval * 1000);

  for (;;) {
    try {
      const job = await getBatchScrapeStatus(http, jobId);
      if (terminalStates.has(job.status)) {
        return job;
      }
    } catch (err: any) {
      // Permanent failures (e.g. 4xx) are not worth retrying: surface them now.
      if (!isRetryableError(err)) {
        // Rebuild SdkErrors so the jobId travels with them for easier debugging.
        throw err instanceof SdkError
          ? new SdkError(err.message, err.status, err.code, err.details, jobId)
          : err;
      }
      // Retryable (network issue, timeout, 5xx, ...): fall through and poll
      // again after the delay.
    }

    if (timeout != null && Date.now() - startedAt > timeout * 1000) {
      throw new JobTimeoutError(jobId, timeout, "batch");
    }

    await new Promise((resolve) => {
      setTimeout(resolve, delayMs);
    });
  }
}
/**
 * Starts a batch scrape and waits for it to finish.
 *
 * Convenience wrapper that calls `startBatchScrape` with the full `opts`
 * object and then `waitForBatchCompletion` on the returned job id.
 *
 * @param http - Client used for all requests.
 * @param urls - URLs to scrape.
 * @param opts - Batch settings, plus `pollInterval` (seconds between polls,
 *   2 when `null`/`undefined`) and `timeout` (overall limit in seconds,
 *   passed through as-is).
 * @returns The job snapshot returned by `waitForBatchCompletion`.
 */
export async function batchScrape(
  http: HttpClient,
  urls: string[],
  opts: BatchScrapeOptions & { pollInterval?: number; timeout?: number } = {},
): Promise<BatchScrapeJob> {
  const job = await startBatchScrape(http, urls, opts);
  const intervalSeconds = opts.pollInterval ?? 2;
  return waitForBatchCompletion(http, job.id, intervalSeconds, opts.timeout);
}
/**
 * Splits a list of URLs into consecutive chunks of at most `chunkSize` items.
 *
 * Order is preserved and the final chunk may be shorter than the others. An
 * empty `urls` array yields an empty result. The input array is not modified.
 *
 * @param urls - URLs to partition.
 * @param chunkSize - Maximum number of URLs per chunk (default 100). Must be
 *   at least 1; a whole number is expected.
 * @returns A new array of chunks, each a fresh array sliced from `urls`.
 * @throws RangeError when `chunkSize` is not a number >= 1 (0, negative or
 *   NaN). Without this guard a zero or negative size would never advance the
 *   loop and it would run forever.
 */
export function chunkUrls(urls: string[], chunkSize = 100): string[][] {
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(chunkSize >= 1)) {
    throw new RangeError(
      `chunkSize must be a number greater than or equal to 1, got ${chunkSize}`,
    );
  }
  const chunks: string[][] = [];
  for (let i = 0; i < urls.length; i += chunkSize) {
    chunks.push(urls.slice(i, i + chunkSize));
  }
  return chunks;
}