UNPKG

crawl4ai

Version:

TypeScript SDK for Crawl4AI REST API - Bun & Node.js compatible

490 lines (479 loc) 14.6 kB
// src/errors.ts

/**
 * Base class for every error raised by the SDK.
 *
 * Optional HTTP details (`status`, `statusText`, `data`) are only assigned when
 * they are provided; `request` is attached later by the client with a snapshot
 * of the outgoing request.
 */
class Crawl4AIError extends Error {
  status;
  statusText;
  data;
  request;

  /**
   * @param message Human-readable description.
   * @param status HTTP status code, when the error originates from a response.
   * @param statusText HTTP status text matching `status`.
   * @param data Response payload, when one was read.
   */
  constructor(message, status, statusText, data) {
    super(message);
    this.name = "Crawl4AIError";
    if (status !== undefined) {
      this.status = status;
    }
    if (statusText !== undefined) {
      this.statusText = statusText;
    }
    if (data !== undefined) {
      this.data = data;
    }
  }
}

/** Transport-level failure; carries no HTTP status. */
class NetworkError extends Crawl4AIError {
  /**
   * @param message Human-readable description.
   * @param cause Underlying error, stored on `cause` when truthy.
   */
  constructor(message, cause) {
    super(message);
    this.name = "NetworkError";
    if (cause) {
      this.cause = cause;
    }
  }
}

/** Raised when a request is aborted because its time budget elapsed. */
class TimeoutError extends NetworkError {
  timeout;

  /**
   * @param timeout Budget in milliseconds that was exceeded.
   * @param url Request URL, included in the message when given.
   */
  constructor(timeout, url) {
    const message = url
      ? `Request to ${url} timed out after ${timeout}ms`
      : `Request timed out after ${timeout}ms`;
    super(message);
    this.name = "TimeoutError";
    this.timeout = timeout;
  }
}

/** Invalid input detected by the SDK, or an HTTP 400 from the server. */
class RequestValidationError extends Crawl4AIError {
  field;
  value;

  /**
   * @param message Human-readable description.
   * @param field Name of the offending field, if known.
   * @param value Offending value, if known.
   */
  constructor(message, field, value) {
    super(message, 400, "Bad Request");
    this.name = "RequestValidationError";
    if (field !== undefined) {
      this.field = field;
    }
    if (value !== undefined) {
      this.value = value;
    }
  }
}

/** HTTP 429; exposes rate-limit details parsed from the response headers. */
class RateLimitError extends Crawl4AIError {
  retryAfter;
  limit;
  remaining;
  reset;

  /**
   * @param message Human-readable description.
   * @param retryAfter Seconds to wait before retrying, if known.
   * @param headers Response headers keyed by lower-case name; the
   *   `x-ratelimit-limit`, `x-ratelimit-remaining` and `x-ratelimit-reset`
   *   entries are parsed when present.
   */
  constructor(message, retryAfter, headers) {
    super(message, 429, "Too Many Requests");
    this.name = "RateLimitError";
    if (retryAfter !== undefined) {
      this.retryAfter = retryAfter;
    }
    if (headers) {
      if (headers["x-ratelimit-limit"]) {
        this.limit = parseInt(headers["x-ratelimit-limit"], 10);
      }
      if (headers["x-ratelimit-remaining"]) {
        this.remaining = parseInt(headers["x-ratelimit-remaining"], 10);
      }
      if (headers["x-ratelimit-reset"]) {
        // The header value is multiplied by 1000 before building the Date,
        // i.e. it is interpreted as epoch seconds.
        this.reset = new Date(parseInt(headers["x-ratelimit-reset"], 10) * 1000);
      }
    }
  }
}

/** HTTP 401 / 403. Any status other than 401 is labelled "Forbidden". */
class AuthError extends Crawl4AIError {
  constructor(message = "Authentication failed", status = 401) {
    super(message, status, status === 401 ? "Unauthorized" : "Forbidden");
    this.name = "AuthError";
  }
}

/** HTTP 5xx. Falls back to "Internal Server Error" when no status text is given. */
class ServerError extends Crawl4AIError {
  constructor(message = "Internal server error", status = 500, statusText) {
    super(message, status, statusText || "Internal Server Error");
    this.name = "ServerError";
  }
}

/** HTTP 404. */
class NotFoundError extends Crawl4AIError {
  resource;

  /** @param resource Identifier of the missing resource, included in the message when given. */
  constructor(resource) {
    const message = resource ? `Resource not found: ${resource}` : "Resource not found";
    super(message, 404, "Not Found");
    this.name = "NotFoundError";
    if (resource) {
      this.resource = resource;
    }
  }
}

/** Error type for unparsable response bodies; keeps the raw text when provided. */
class ParseError extends Crawl4AIError {
  responseText;

  constructor(message, responseText) {
    super(message);
    this.name = "ParseError";
    if (responseText) {
      this.responseText = responseText;
    }
  }
}

/** @returns `true` when `error` is any SDK error. */
function isCrawl4AIError(error) {
  return error instanceof Crawl4AIError;
}

/** @returns `true` when `error` is a {@link RateLimitError}. */
function isRateLimitError(error) {
  return error instanceof RateLimitError;
}

/** @returns `true` when `error` is an {@link AuthError}. */
function isAuthError(error) {
  return error instanceof AuthError;
}

/** @returns `true` when `error` is a {@link NetworkError} (including timeouts). */
function isNetworkError(error) {
  return error instanceof NetworkError;
}

/**
 * Converts a `Retry-After` header value into a number of seconds.
 *
 * The header may carry either delay-seconds or an HTTP-date. Integer parsing is
 * tried first; if that fails the value is parsed as a date and converted to the
 * remaining whole seconds (never negative).
 *
 * @returns Seconds to wait, or `undefined` when the header is absent or unparsable.
 */
function parseRetryAfter(value) {
  if (!value) {
    return undefined;
  }
  const seconds = parseInt(value, 10);
  if (!Number.isNaN(seconds)) {
    return seconds;
  }
  const retryAt = Date.parse(value);
  if (Number.isNaN(retryAt)) {
    return undefined;
  }
  return Math.max(0, Math.ceil((retryAt - Date.now()) / 1000));
}

/**
 * Maps an HTTP status onto the matching SDK error class.
 *
 * @param status HTTP status code.
 * @param statusText HTTP status text.
 * @param message Optional explicit message; defaults to `HTTP <status>: <statusText>`
 *   (404 keeps the `NotFoundError` default message when no explicit message is given).
 * @param data Response payload; attached to the returned error when defined.
 * @param headers Response headers keyed by lower-case name (used for 429).
 * @returns A `Crawl4AIError` or one of its subclasses.
 */
function createHttpError(status, statusText, message, data, headers) {
  const errorMessage = message || `HTTP ${status}: ${statusText}`;
  let error;
  switch (status) {
    case 400:
      error = new RequestValidationError(errorMessage);
      break;
    case 401:
    case 403:
      error = new AuthError(errorMessage, status);
      break;
    case 404:
      error = new NotFoundError();
      // Do not silently discard a message the caller explicitly supplied.
      if (message) {
        error.message = message;
      }
      break;
    case 429:
      error = new RateLimitError(errorMessage, parseRetryAfter(headers?.["retry-after"]), headers);
      break;
    case 500:
    case 502:
    case 503:
    case 504:
      error = new ServerError(errorMessage, status, statusText);
      break;
    default:
      return new Crawl4AIError(errorMessage, status, statusText, data);
  }
  // The specialised constructors take no payload argument, so attach it here
  // to keep the response body available on every error type.
  if (data !== undefined) {
    error.data = data;
  }
  return error;
}

// src/sdk.ts

// Delays below are passed to setTimeout, i.e. they are milliseconds.
var DEFAULT_TIMEOUT = 300000;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_DELAY = 1000;
var RETRY_BACKOFF_MULTIPLIER = 2;
var HEALTH_CHECK_TIMEOUT = 5000;
// Statuses in [CLIENT_ERROR_MIN, CLIENT_ERROR_MAX) are never retried, except RATE_LIMIT_STATUS.
var CLIENT_ERROR_MIN = 400;
var CLIENT_ERROR_MAX = 500;
var RATE_LIMIT_STATUS = 429;

/**
 * HTTP client for the Crawl4AI REST API.
 *
 * Wraps `fetch` with a per-request timeout, retry with exponential backoff,
 * and translation of failing responses into typed errors.
 */
class Crawl4AI {
  config;

  /**
   * @param config Client options. `baseUrl` is required and must parse as a URL.
   *   Options that are missing or explicitly `undefined` fall back to defaults.
   * @throws {RequestValidationError} When `baseUrl`, `timeout`, `retries` or
   *   `retryDelay` is invalid.
   */
  constructor(config) {
    if (!config.baseUrl) {
      throw new RequestValidationError("baseUrl is required in configuration", "baseUrl");
    }
    try {
      new URL(config.baseUrl);
    } catch {
      throw new RequestValidationError(`Invalid baseUrl: ${config.baseUrl}`, "baseUrl", config.baseUrl);
    }
    const defaults = {
      apiToken: "",
      timeout: DEFAULT_TIMEOUT,
      retries: DEFAULT_RETRIES,
      retryDelay: DEFAULT_RETRY_DELAY,
      defaultHeaders: {
        "Content-Type": "application/json"
      },
      throwOnError: true,
      validateStatus: (status) => status < CLIENT_ERROR_MIN,
      debug: false
    };
    if (config.timeout !== undefined && (config.timeout <= 0 || !Number.isFinite(config.timeout))) {
      throw new RequestValidationError("timeout must be a positive number", "timeout", config.timeout);
    }
    if (config.retries !== undefined && (config.retries < 0 || !Number.isInteger(config.retries))) {
      throw new RequestValidationError("retries must be a non-negative integer", "retries", config.retries);
    }
    if (config.retryDelay !== undefined && (config.retryDelay < 0 || !Number.isFinite(config.retryDelay))) {
      throw new RequestValidationError("retryDelay must be a non-negative number", "retryDelay", config.retryDelay);
    }
    // Drop keys that are explicitly `undefined` so they cannot overwrite a
    // default when spread (e.g. `{ retries: undefined }` must still mean 3).
    const overrides = Object.fromEntries(
      Object.entries(config).filter(([, value]) => value !== undefined)
    );
    this.config = {
      ...defaults,
      ...overrides,
      baseUrl: config.baseUrl.replace(/\/$/, ""),
      defaultHeaders: {
        ...defaults.defaultHeaders,
        ...config.defaultHeaders
      },
      throwOnError: config.throwOnError ?? defaults.throwOnError,
      validateStatus: config.validateStatus || defaults.validateStatus
    };
    if (this.config.apiToken) {
      this.config.defaultHeaders.Authorization = `Bearer ${this.config.apiToken}`;
    }
  }

  /**
   * Ensures `url` can be parsed by the `URL` constructor.
   * @throws {RequestValidationError} When parsing fails.
   */
  validateUrl(url) {
    try {
      new URL(url);
    } catch {
      throw new RequestValidationError(`Invalid URL: ${url}`, "url", url);
    }
  }

  /** Writes a prefixed message to the console when `config.debug` is enabled. */
  log(message, data) {
    if (this.config.debug) {
      console.log(`[Crawl4AI] ${message}`, data || "");
    }
  }

  /**
   * Coerces a response into an array: arrays pass through, objects with an
   * array `results` or `result` property are unwrapped, anything else is
   * wrapped in a single-element array.
   */
  normalizeArrayResponse(response) {
    if (Array.isArray(response)) {
      return response;
    }
    if (typeof response === "object" && response !== null) {
      const apiResponse = response;
      if (apiResponse.results && Array.isArray(apiResponse.results)) {
        return apiResponse.results;
      }
      if (apiResponse.result && Array.isArray(apiResponse.result)) {
        return apiResponse.result;
      }
    }
    return [response];
  }

  /** Serialises `params` into a query string, skipping `undefined` values. */
  buildQueryParams(params) {
    const searchParams = new URLSearchParams();
    for (const [key, value] of Object.entries(params)) {
      if (value !== undefined) {
        searchParams.append(key, String(value));
      }
    }
    return searchParams.toString();
  }

  /**
   * Performs a single HTTP request against `baseUrl + endpoint`.
   *
   * @param endpoint Path (and optional query string) appended to the base URL.
   * @param options `fetch` options plus `timeout` (ms), `signal` and `headers`.
   * @returns Parsed JSON for `application/json`, the raw `Response` for
   *   `text/event-stream`, otherwise the body text.
   * @throws {TimeoutError} When the timeout elapses before the response arrives.
   * @throws {NetworkError} When `fetch` rejects with a `TypeError` mentioning "fetch".
   * @throws {Crawl4AIError} When `validateStatus` rejects the status and
   *   `throwOnError` is enabled.
   * @throws The original abort error when the caller's `signal` aborts the request.
   */
  async request(endpoint, options = {}) {
    const url = `${this.config.baseUrl}${endpoint}`;
    const { timeout = this.config.timeout, signal, headers, ...fetchOptions } = options;
    this.log(`Request: ${fetchOptions.method || "GET"} ${url}`, fetchOptions.body);
    const requestHeaders = {
      ...this.config.defaultHeaders,
      ...headers
    };

    // A single internal controller drives the fetch. Both the timeout and the
    // caller's signal (if any) feed into it, so supplying a signal no longer
    // disables the timeout.
    const controller = new AbortController();
    let timedOut = false;
    const timeoutId = setTimeout(() => {
      timedOut = true;
      controller.abort();
    }, timeout);
    const forwardAbort = () => controller.abort();
    if (signal) {
      if (signal.aborted) {
        controller.abort();
      } else {
        signal.addEventListener("abort", forwardAbort, { once: true });
      }
    }
    // Streaming responses are handed back unread; the caller's signal must
    // stay linked so it can still cancel the stream after we return.
    let keepAbortLink = false;

    try {
      const response = await fetch(url, {
        ...fetchOptions,
        headers: requestHeaders,
        signal: controller.signal
      });
      // The timeout only covers the wait for the response, not reading its body.
      clearTimeout(timeoutId);
      const contentType = response.headers.get("content-type") || "";
      let responseData;
      if (contentType.includes("application/json")) {
        responseData = await response.json();
      } else if (contentType.includes("text/html") || contentType.includes("text/plain")) {
        responseData = await response.text();
      } else if (contentType.includes("text/event-stream")) {
        keepAbortLink = true;
        return response;
      } else {
        responseData = await response.text();
      }
      this.log(`Response: ${response.status}`, responseData);
      if (!this.config.validateStatus(response.status)) {
        const responseHeaders = {};
        response.headers.forEach((value, key) => {
          responseHeaders[key] = value;
        });
        const error = createHttpError(response.status, response.statusText, undefined, responseData, responseHeaders);
        error.request = {
          url,
          method: fetchOptions.method || "GET",
          headers: requestHeaders,
          body: fetchOptions.body
        };
        if (this.config.throwOnError) {
          throw error;
        }
      }
      return responseData;
    } catch (error) {
      clearTimeout(timeoutId);
      if (error instanceof Error && error.name === "AbortError") {
        // Only report a timeout when the timer fired (or nothing else explains
        // the abort); a caller-initiated abort is surfaced unchanged.
        if (timedOut || !signal?.aborted) {
          throw new TimeoutError(timeout, url);
        }
        throw error;
      }
      if (error instanceof TypeError && error.message.includes("fetch")) {
        throw new NetworkError(`Network request failed: ${error.message}`, error);
      }
      throw error;
    } finally {
      if (signal && !keepAbortLink) {
        signal.removeEventListener("abort", forwardAbort);
      }
    }
  }

  /**
   * Calls {@link request}, retrying up to `config.retries` additional times.
   *
   * Client errors (4xx) other than 429 are rethrown immediately, as is any
   * failure once the caller's `signal` has been aborted. Between attempts the
   * delay is `retryDelay * 2 ** attempt`, or the server-provided `retryAfter`
   * (seconds) for rate-limit errors.
   *
   * @throws The last error once all attempts are exhausted.
   */
  async requestWithRetry(endpoint, options = {}) {
    let lastError = new Error("No attempts made");
    for (let attempt = 0; attempt <= this.config.retries; attempt++) {
      try {
        return await this.request(endpoint, options);
      } catch (error) {
        lastError = error;
        // The caller cancelled: every further attempt would abort immediately.
        if (options.signal?.aborted) {
          throw error;
        }
        if (
          error instanceof Crawl4AIError &&
          error.status &&
          error.status >= CLIENT_ERROR_MIN &&
          error.status < CLIENT_ERROR_MAX &&
          error.status !== RATE_LIMIT_STATUS
        ) {
          throw error;
        }
        if (attempt < this.config.retries) {
          let delay = this.config.retryDelay * RETRY_BACKOFF_MULTIPLIER ** attempt;
          if (error instanceof RateLimitError && error.retryAfter) {
            delay = error.retryAfter * 1000;
            this.log(`Rate limited. Waiting ${error.retryAfter}s before retry (attempt ${attempt + 1}/${this.config.retries})`);
          } else {
            this.log(`Retry attempt ${attempt + 1}/${this.config.retries} after ${delay}ms`);
          }
          await new Promise((resolve) => setTimeout(resolve, delay));
        }
      }
    }
    throw lastError;
  }

  /**
   * POSTs to `/crawl`. `request.urls` may be a single URL or an array; every
   * URL is validated first.
   * @returns The response normalised to an array.
   */
  async crawl(request, config) {
    const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
    for (const url of urls) {
      this.validateUrl(url);
    }
    const normalizedRequest = { ...request, urls };
    const response = await this.requestWithRetry("/crawl", {
      method: "POST",
      body: JSON.stringify(normalizedRequest),
      ...config
    });
    return this.normalizeArrayResponse(response);
  }

  /**
   * POSTs to `/md`, sending `filter`, `query` and `cache` under the short keys
   * `f`, `q` and `c` when they are defined.
   * @returns The response itself when it is a string, otherwise its `markdown` property.
   */
  async markdown(request, config) {
    this.validateUrl(request.url);
    const apiRequest = {
      url: request.url,
      ...request.filter !== undefined && { f: request.filter },
      ...request.query !== undefined && { q: request.query },
      ...request.cache !== undefined && { c: request.cache }
    };
    const response = await this.requestWithRetry("/md", {
      method: "POST",
      body: JSON.stringify(apiRequest),
      ...config
    });
    return typeof response === "string" ? response : response.markdown;
  }

  /**
   * POSTs to `/html`.
   * @returns The response itself when it is a string, otherwise its `html` property.
   */
  async html(request, config) {
    this.validateUrl(request.url);
    const response = await this.requestWithRetry("/html", {
      method: "POST",
      body: JSON.stringify(request),
      ...config
    });
    return typeof response === "string" ? response : response.html;
  }

  /** POSTs to `/execute_js` and returns the response unchanged. */
  async executeJs(request, config) {
    this.validateUrl(request.url);
    return this.requestWithRetry("/execute_js", {
      method: "POST",
      body: JSON.stringify(request),
      ...config
    });
  }

  /**
   * GETs `/ask` with the given search parameters.
   * @returns `{ context, type, results_count }` plus `query` when one was supplied.
   */
  async ask(params, config) {
    const queryString = this.buildQueryParams({
      context_type: params?.context_type,
      query: params?.query,
      score_ratio: params?.score_ratio,
      max_results: params?.max_results
    });
    const endpoint = `/ask${queryString ? `?${queryString}` : ""}`;
    const response = await this.requestWithRetry(endpoint, {
      method: "GET",
      ...config
    });
    // The first non-empty result list wins, in this order.
    const results = response.doc_results || response.code_results || response.all_results || [];
    const result = {
      // NOTE(review): the separator is a single space in this copy of the
      // bundle; whitespace may have been collapsed when the file was captured
      // - confirm against the published package before relying on it.
      context: results.map((r) => r.text).join(" "),
      type: params?.context_type || "doc",
      results_count: results.length
    };
    if (params?.query !== undefined) {
      result.query = params.query;
    }
    return result;
  }

  /**
   * GETs `/llm/<encoded url>?q=<query>`.
   * @returns The response itself when it is a string, otherwise its `answer`
   *   property (empty string when missing).
   */
  async llm(url, query, config) {
    this.validateUrl(url);
    const encodedUrl = encodeURIComponent(url);
    const queryParams = new URLSearchParams({ q: query });
    const response = await this.requestWithRetry(`/llm/${encodedUrl}?${queryParams.toString()}`, {
      method: "GET",
      ...config
    });
    return typeof response === "string" ? response : response.answer || "";
  }

  /** GETs `/health` (single attempt, no retry). */
  async health(config) {
    return this.request("/health", { method: "GET", ...config });
  }

  /** GETs `/metrics` (single attempt, no retry). */
  async metrics(config) {
    return this.request("/metrics", { method: "GET", ...config });
  }

  /** GETs `/schema` (single attempt, no retry). */
  async schema(config) {
    return this.request("/schema", { method: "GET", ...config });
  }

  /** GETs `/` (single attempt, no retry). */
  async getRoot(config) {
    return this.request("/", { method: "GET", ...config });
  }

  /**
   * Probes `/health` with a short timeout.
   * @returns `true` on success; `false` on failure unless `options.throwOnError` is set.
   */
  async testConnection(options) {
    try {
      await this.health({ timeout: HEALTH_CHECK_TIMEOUT });
      return true;
    } catch (error) {
      if (options?.throwOnError) {
        throw error;
      }
      return false;
    }
  }

  /**
   * Reads `version` from the `/health` response.
   * @returns The version, or "unknown" when missing or when the request fails
   *   (unless `options.throwOnError` is set).
   */
  async version(options) {
    try {
      const health = await this.health();
      return health.version || "unknown";
    } catch (error) {
      if (options?.throwOnError) {
        throw error;
      }
      return "unknown";
    }
  }

  /** Replaces the API token; a falsy token removes the `Authorization` header. */
  setApiToken(token) {
    this.config.apiToken = token;
    if (token) {
      this.config.defaultHeaders.Authorization = `Bearer ${token}`;
    } else {
      delete this.config.defaultHeaders.Authorization;
    }
  }

  /**
   * Replaces the base URL (one trailing slash is stripped).
   * @throws {RequestValidationError} When `baseUrl` does not parse as a URL,
   *   matching the validation performed by the constructor.
   */
  setBaseUrl(baseUrl) {
    try {
      new URL(baseUrl);
    } catch {
      throw new RequestValidationError(`Invalid baseUrl: ${baseUrl}`, "baseUrl", baseUrl);
    }
    this.config.baseUrl = baseUrl.replace(/\/$/, "");
  }

  /** Enables or disables debug logging. */
  setDebug(debug) {
    this.config.debug = debug;
  }
}

var sdk_default = Crawl4AI;
export {
  isRateLimitError,
  isNetworkError,
  isCrawl4AIError,
  isAuthError,
  sdk_default as default,
  createHttpError,
  TimeoutError,
  ServerError,
  RequestValidationError,
  RateLimitError,
  ParseError,
  NotFoundError,
  NetworkError,
  Crawl4AIError,
  Crawl4AI,
  AuthError
};