UNPKG

webcrawlerapi-js

Version:
309 lines (308 loc) 12.5 kB
"use strict";
// --- TypeScript-emitted re-export helpers (kept functionally as generated) ---
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
};
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named exports so the later __exportStar call (which skips
// names already present on `exports`) cannot replace them.
exports.WebcrawlerClient = exports.ErrorCode = exports.JobStatus = exports.WebcrawlerApiError = void 0;
const constants_1 = require("./constants");
Object.defineProperty(exports, "JobStatus", { enumerable: true, get: function () { return constants_1.JobStatus; } });
Object.defineProperty(exports, "ErrorCode", { enumerable: true, get: function () { return constants_1.ErrorCode; } });
const errors_1 = require("./errors");
Object.defineProperty(exports, "WebcrawlerApiError", { enumerable: true, get: function () { return errors_1.WebcrawlerApiError; } });

// Default API origin used when the constructor receives no basePath.
const BASE_PATH = "https://api.webcrawlerapi.com";
// Delay before the first (and, absent a server recommendation, every) crawl poll.
const initialPullDelayMs = 2000;
// Maximum number of job-status polls performed by crawl() before timing out.
const MaxPullRetries = 100;
// Delay between polls performed by scrape().
const DEFAULT_POLL_DELAY_SECONDS = 2;
__exportStar(require("./model"), exports);
// Scrape endpoints are addressed with this fixed version segment, independent
// of the client's configurable apiVersion.
const SCRAPE_VERSION = "v2";
const USER_AGENT = "WebcrawlerAPI-NodeJS-Client";

/**
 * Builds the header set sent with every API request.
 *
 * @param {string} apiKey - Key placed in the `Authorization: Bearer` header.
 * @param {boolean} noCache - When true, adds `Cache-Control`, `Pragma` and
 *   `Expires` headers that ask for an uncached response.
 * @returns {Object} A fresh headers object.
 */
function buildHeaders(apiKey, noCache) {
    const headers = {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        "User-Agent": USER_AGENT,
    };
    if (noCache) {
        headers['Cache-Control'] = 'no-cache, no-store, must-revalidate';
        headers['Pragma'] = 'no-cache';
        headers['Expires'] = '0';
    }
    return headers;
}

/**
 * Builds the JSON body for a crawl request.
 *
 * `output_formats` defaults to `['markdown']` and may be overridden by
 * `crawlRequest`. The `actions` key is only written when the `actions`
 * argument is supplied, so an `actions` value already present on
 * `crawlRequest` is not overwritten with `undefined`.
 *
 * @param {Object} crawlRequest - Caller-supplied request fields.
 * @param {Object|Object[]} [actions] - A single action or an array of actions.
 * @returns {Object} The request body to serialize.
 */
function buildCrawlBody(crawlRequest, actions) {
    const body = { output_formats: ['markdown'], ...crawlRequest };
    if (actions) {
        body.actions = Array.isArray(actions) ? actions : [actions];
    }
    return body;
}

/**
 * Performs a fetch and converts transport and HTTP failures into
 * WebcrawlerApiError instances.
 *
 * @param {string} url - Request URL.
 * @param {Object} requestOptions - Options passed straight to `fetch`.
 * @returns {Promise<Response>} The response, guaranteed to have `ok === true`.
 * @throws {WebcrawlerApiError} `network_error` when `fetch` itself rejects;
 *   the error built by `createErrorFromResponse` when the error body parses as
 *   JSON; otherwise `unknown_error` carrying the HTTP status.
 */
async function fetchChecked(url, requestOptions) {
    let response;
    try {
        response = await fetch(url, requestOptions);
    }
    catch (e) {
        throw new errors_1.WebcrawlerApiError('network_error', `Failed to send request: ${e}`, 0);
    }
    if (!response.ok) {
        try {
            const errorData = await response.json();
            throw (0, errors_1.createErrorFromResponse)(response, errorData);
        }
        catch (e) {
            if (e instanceof errors_1.WebcrawlerApiError) {
                throw e;
            }
            // If we can't parse the error response, create a generic error
            throw new errors_1.WebcrawlerApiError('unknown_error', `Request failed with status ${response.status} ${response.statusText}`, response.status);
        }
    }
    return response;
}

/**
 * Maps a format name to the matching content URL field of a job item.
 *
 * @param {Object} item - Job item carrying `*_content_url` fields.
 * @param {string} format - One of 'markdown', 'cleaned', 'html'.
 * @returns {string|undefined} The URL, or undefined for an unknown format.
 */
function contentUrlForFormat(item, format) {
    switch (format) {
        case 'markdown':
            return item.markdown_content_url;
        case 'cleaned':
            return item.cleaned_content_url;
        case 'html':
            return item.raw_content_url;
        default:
            return undefined;
    }
}

/**
 * Picks the content URL of `item` that corresponds to the job's output format.
 *
 * When `job.output_formats` is a non-empty array, the first format present in
 * it is used, in priority order markdown > cleaned > html; if none of the three
 * is listed the result is undefined. Otherwise falls back to `job.scrape_type`
 * for backward compatibility.
 *
 * @param {Object} job - Job object (reads `output_formats`, `scrape_type`).
 * @param {Object} item - Job item (reads the `*_content_url` fields).
 * @returns {string|undefined} The selected content URL.
 */
function resolveContentUrl(job, item) {
    if (Array.isArray(job.output_formats) && job.output_formats.length > 0) {
        const priority = ['markdown', 'cleaned', 'html'];
        const chosen = priority.find(fmt => job.output_formats.includes(fmt));
        return contentUrlForFormat(item, chosen);
    }
    return contentUrlForFormat(item, job.scrape_type);
}

/**
 * Downloads the text stored at a content URL.
 *
 * @param {string|undefined|null} url - Content URL; a falsy value short-circuits.
 * @returns {Promise<string|null>} The response body as text, or null when no
 *   URL was given.
 * @throws {Error} When the response status is not OK.
 */
async function fetchContentUrl(url) {
    if (!url) {
        return null;
    }
    const response = await fetch(url, {
        headers: {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': '*/*'
        }
    });
    if (!response.ok) {
        throw new Error(`Failed to fetch content: ${response.statusText}`);
    }
    return response.text();
}

/**
 * Replaces `job.job_items` with copies that carry content-download methods
 * (`getContent`, `getMarkdown`, `getCleaned`, `getHTML`).
 *
 * If `job` is falsy or has no `job_items` array, it is returned untouched
 * instead of throwing.
 *
 * @param {Object} job - Job object as returned by the API; mutated in place.
 * @returns {Object} The same `job` reference.
 */
function addGetContentMethod(job) {
    if (!job || !Array.isArray(job.job_items)) {
        return job;
    }
    job.job_items = job.job_items.map(item => ({
        ...item,
        // Resolves to null unless both the job and this item are DONE.
        // Uses `this` so the status is read from the item the method is called on.
        getContent: async function () {
            if (job.status !== constants_1.JobStatus.DONE || this.status !== constants_1.JobStatus.DONE) {
                return null;
            }
            const contentUrl = resolveContentUrl(job, this);
            return fetchContentUrl(contentUrl);
        },
        getMarkdown: async function () {
            return fetchContentUrl(this.markdown_content_url);
        },
        getCleaned: async function () {
            return fetchContentUrl(this.cleaned_content_url);
        },
        getHTML: async function () {
            return fetchContentUrl(this.raw_content_url);
        }
    }));
    return job;
}

/**
 * HTTP client for the Webcrawler API: scrape, crawl and job-retrieval calls.
 */
class WebcrawlerClient {
    /**
     * @param {string} apiKey - Sent as a Bearer token on every request.
     * @param {string} [basePath] - API origin; defaults to BASE_PATH.
     * @param {string} [apiVersion="v1"] - Version path segment used by the
     *   crawl and job endpoints (scrape endpoints always use SCRAPE_VERSION).
     */
    constructor(apiKey, basePath = BASE_PATH, apiVersion = "v1") {
        this.apiKey = apiKey;
        this.basePath = basePath;
        this.apiVersion = apiVersion;
    }

    /**
     * Starts a scrape without waiting for it to finish.
     *
     * @param {Object} request - Scrape request, serialized as the JSON body.
     * @returns {Promise<{id: *}>} Object holding the `id` field of the response.
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async scrapeAsync(request) {
        const apiUrl = `${this.basePath}/${SCRAPE_VERSION}/scrape?async=true`;
        const requestOptions = {
            'method': 'POST',
            'headers': buildHeaders(this.apiKey, false),
            'body': JSON.stringify(request),
        };
        const response = await this.sendRequest(apiUrl, requestOptions);
        return { id: response.id };
    }

    /**
     * Fetches the current state of a scrape.
     *
     * @param {string} scrapeId - Identifier returned by scrapeAsync.
     * @returns {Promise<Object>} The raw response when its status is "done" or
     *   "error"; otherwise `{ success: false, status, page_status_code: 0 }`.
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async getScrape(scrapeId) {
        const url = `${this.basePath}/${SCRAPE_VERSION}/scrape/${scrapeId}`;
        const requestOptions = {
            'method': 'GET',
            'headers': buildHeaders(this.apiKey, true),
        };
        const responseData = await this.sendRequest(url, requestOptions);
        const status = responseData.status;
        if (status === "done" || status === "error") {
            return responseData;
        }
        // in_progress or any other status
        return {
            success: false,
            status: status,
            page_status_code: 0
        };
    }

    /**
     * Starts a scrape and polls until it reaches a terminal state.
     *
     * Polling stops as soon as the result has status "done", status "error",
     * or an `error_code` field. Between polls the method waits
     * DEFAULT_POLL_DELAY_SECONDS.
     *
     * @param {Object} request - Scrape request.
     * @param {number} [maxPolls=100] - Maximum number of status polls.
     * @returns {Promise<Object|undefined>} The terminal result, or the last
     *   polled state once `maxPolls` is exhausted (undefined if no poll ran).
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async scrape(request, maxPolls = 100) {
        const scrapeIdResponse = await this.scrapeAsync(request);
        const scrapeId = scrapeIdResponse.id;
        let result;
        for (let polls = 0; polls < maxPolls; polls++) {
            result = await this.getScrape(scrapeId);
            const isDone = 'status' in result && result.status === "done";
            // getScrape treats "error" as terminal, so stop polling even when
            // the payload carries no error_code field.
            const isFailed = 'error_code' in result || result.status === "error";
            if (isDone || isFailed) {
                return result;
            }
            // Non-terminal status: wait before the next poll.
            await new Promise(resolve => setTimeout(resolve, DEFAULT_POLL_DELAY_SECONDS * 1000));
        }
        // Return the last known state if maxPolls is reached
        return result;
    }

    /**
     * Starts a crawl job and polls it until it leaves the NEW / IN_PROGRESS
     * states.
     *
     * The poll delay starts at initialPullDelayMs and is replaced by the job's
     * `recommended_pull_delay_ms` whenever that value is positive.
     *
     * @param {Object} crawlRequest - Crawl request fields.
     * @param {Object|Object[]} [actions] - A single action or an array of actions.
     * @returns {Promise<Object>} The finished job, with content methods attached
     *   to its items.
     * @throws {WebcrawlerApiError} `invalid_response` when the start response
     *   has no job id; `timeout` after MaxPullRetries polls; or any error raised
     *   by the underlying requests.
     */
    async crawl(crawlRequest, actions) {
        const url = `${this.basePath}/${this.apiVersion}/crawl`;
        const requestOptions = {
            'method': 'POST',
            'headers': buildHeaders(this.apiKey, true),
            'body': JSON.stringify(buildCrawlBody(crawlRequest, actions)),
        };
        const jobIdResponse = await this.sendRequest(url, requestOptions);
        // Reject a missing id as well as an empty one; otherwise the loop
        // below would poll a nonexistent job until it times out.
        if (!jobIdResponse || !jobIdResponse.id) {
            throw new errors_1.WebcrawlerApiError('invalid_response', 'Failed to fetch job status', 0);
        }
        let delayIntervalMs = initialPullDelayMs;
        for (let i = 0; i < MaxPullRetries; i++) {
            await new Promise(resolve => setTimeout(resolve, delayIntervalMs));
            // A timestamp query parameter is appended to the job id so each
            // poll uses a distinct URL.
            const timestamp = new Date().getTime();
            const job = await this.getJob(`${jobIdResponse.id}?t=${timestamp}`);
            if (job.status !== constants_1.JobStatus.IN_PROGRESS && job.status !== constants_1.JobStatus.NEW) {
                return job;
            }
            if (job.recommended_pull_delay_ms > 0) {
                delayIntervalMs = job.recommended_pull_delay_ms;
            }
        }
        throw new errors_1.WebcrawlerApiError('timeout', 'Crawling took too long, please retry or increase the number of polling retries', 0);
    }

    /**
     * Starts a crawl job without waiting for it to finish.
     *
     * @param {Object} crawlRequest - Crawl request fields.
     * @param {Object|Object[]} [actions] - A single action or an array of actions.
     * @returns {Promise<Object>} The parsed JSON response of the crawl endpoint.
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async crawlAsync(crawlRequest, actions) {
        const url = `${this.basePath}/${this.apiVersion}/crawl`;
        const requestOptions = {
            'method': 'POST',
            'headers': buildHeaders(this.apiKey, false),
            'body': JSON.stringify(buildCrawlBody(crawlRequest, actions)),
        };
        return await this.sendRequest(url, requestOptions);
    }

    /**
     * Fetches a job and attaches content-download methods to its items.
     *
     * @param {string} jobID - Job identifier (inserted into the URL as-is).
     * @returns {Promise<Object>} The job object.
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async getJob(jobID) {
        const url = `${this.basePath}/${this.apiVersion}/job/${jobID}`;
        const requestOptions = {
            'method': 'GET',
            'headers': buildHeaders(this.apiKey, true),
        };
        const job = await this.sendRequest(url, requestOptions);
        return addGetContentMethod(job);
    }

    /**
     * Fetches the JSON response of the job's `/markdown` endpoint.
     *
     * @param {string} jobID - Job identifier.
     * @returns {Promise<Object>} The parsed JSON response.
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async getJobMarkdown(jobID) {
        const url = `${this.basePath}/${this.apiVersion}/job/${jobID}/markdown`;
        const requestOptions = {
            'method': 'GET',
            'headers': buildHeaders(this.apiKey, true),
        };
        return this.sendRequest(url, requestOptions);
    }

    /**
     * Fetches the body of the job's `/markdown/content` endpoint as text.
     *
     * @param {string} jobID - Job identifier.
     * @returns {Promise<string>} The response body.
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async getJobMarkdownContent(jobID) {
        const url = `${this.basePath}/${this.apiVersion}/job/${jobID}/markdown/content`;
        const requestOptions = {
            'method': 'GET',
            'headers': buildHeaders(this.apiKey, true),
        };
        const response = await fetchChecked(url, requestOptions);
        return response.text();
    }

    /**
     * Sends a request and parses the successful response as JSON.
     *
     * @param {string} url - Request URL.
     * @param {Object} requestOptions - Options passed to `fetch`.
     * @returns {Promise<*>} The parsed JSON body.
     * @throws {WebcrawlerApiError} On network or HTTP failure.
     */
    async sendRequest(url, requestOptions) {
        const response = await fetchChecked(url, requestOptions);
        return response.json();
    }
}
exports.WebcrawlerClient = WebcrawlerClient;