webcrawlerapi-js
Version:
JS client for WebcrawlerAPI
309 lines (308 loc) • 12.5 kB
JavaScript
;
/**
 * Re-export helper (TypeScript emit): exposes property `k` of module `m` on
 * object `o` under the name `k2` (defaults to `k`).
 *
 * When property descriptors are available the export is defined with
 * `Object.defineProperty`; the source descriptor is reused as-is only when it
 * is an accessor on an `__esModule` module or a data property that is neither
 * writable nor configurable. In every other case a live getter reading
 * `m[k]` is installed instead. Without `Object.create` the value is copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    var exportName = k2 === undefined ? k : k2;
    var descriptor = Object.getOwnPropertyDescriptor(m, k);
    var reusable;
    if (!descriptor) {
        reusable = false;
    } else if ("get" in descriptor) {
        // Accessor descriptors are only trusted when they come from an ES module.
        reusable = m.__esModule;
    } else {
        // A data property that can still change must be read through a getter.
        reusable = !(descriptor.writable || descriptor.configurable);
    }
    if (!reusable) {
        descriptor = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, exportName, descriptor);
}) : (function(o, m, k, k2) {
    var exportName = k2 === undefined ? k : k2;
    o[exportName] = m[k];
}));
/**
 * Re-export helper (TypeScript emit for `export * from ...`): binds every
 * enumerable property of `m` onto `exports`, skipping "default" and any name
 * `exports` already owns.
 */
var __exportStar = (this && this.__exportStar) || function(m, exports) {
    for (var p in m) {
        if (p === "default") {
            continue;
        }
        if (Object.prototype.hasOwnProperty.call(exports, p)) {
            // Never overwrite an export the target module already declared.
            continue;
        }
        __createBinding(exports, m, p);
    }
};
// Flag this CommonJS module as transpiled from an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named exports as undefined; the real values are attached below.
exports.WebcrawlerClient = exports.ErrorCode = exports.JobStatus = exports.WebcrawlerApiError = void 0;
const constants_1 = require("./constants");
// Re-export JobStatus and ErrorCode through getters so they always read from ./constants.
Object.defineProperty(exports, "JobStatus", { enumerable: true, get: function () { return constants_1.JobStatus; } });
Object.defineProperty(exports, "ErrorCode", { enumerable: true, get: function () { return constants_1.ErrorCode; } });
const errors_1 = require("./errors");
// Re-export the error class through a getter reading from ./errors.
Object.defineProperty(exports, "WebcrawlerApiError", { enumerable: true, get: function () { return errors_1.WebcrawlerApiError; } });
// Default API origin used when the client constructor is not given a base path.
const BASE_PATH = "https://api.webcrawlerapi.com";
// Delay (ms) before each job poll in crawl(), until the job reports a positive recommended_pull_delay_ms.
const initialPullDelayMs = 2000;
// Maximum number of job polls crawl() performs before throwing a 'timeout' error.
const MaxPullRetries = 100;
// Delay (seconds) between scrape status polls in scrape().
const DEFAULT_POLL_DELAY_SECONDS = 2;
// Re-export everything from ./model (except "default" and names already exported here).
__exportStar(require("./model"), exports);
// API version path segment used by the scrape endpoints (independent of the client's apiVersion).
const SCRAPE_VERSION = "v2";
/**
 * Chooses which content URL of a job item represents "the" content of a job.
 *
 * If the job lists any `output_formats`, the first format found in the
 * priority order markdown > cleaned > html decides the URL; when none of the
 * listed formats is known the result is `undefined` (no fallback). Otherwise
 * the legacy `scrape_type` field selects the URL.
 *
 * @param {object} job - Job object; reads `output_formats` and `scrape_type`.
 * @param {object} item - Job item; reads `markdown_content_url`,
 *   `cleaned_content_url` and `raw_content_url`.
 * @returns {*} The selected URL field value, or `undefined` if nothing matches.
 */
function resolveContentUrl(job, item) {
    // Insertion order doubles as the priority order: markdown > cleaned > html.
    const urlFieldByFormat = new Map([
        ['markdown', 'markdown_content_url'],
        ['cleaned', 'cleaned_content_url'],
        ['html', 'raw_content_url'],
    ]);
    const requestedFormats = job.output_formats;
    const hasRequestedFormats = Array.isArray(requestedFormats) && requestedFormats.length > 0;
    if (!hasRequestedFormats) {
        // Backward compatibility: older jobs only carry scrape_type.
        const legacyField = urlFieldByFormat.get(job.scrape_type);
        return legacyField === undefined ? undefined : item[legacyField];
    }
    for (const [format, urlField] of urlFieldByFormat) {
        if (requestedFormats.includes(format)) {
            return item[urlField];
        }
    }
    return undefined;
}
/**
 * Downloads the text stored at a content URL.
 *
 * @param {string|undefined|null} url - Content URL; any falsy value means
 *   "no content available".
 * @returns {Promise<string|null>} The response body as text, or `null` when
 *   no URL was given.
 * @throws {Error} When the response is not OK (message carries the status text).
 */
async function fetchContentUrl(url) {
    if (url) {
        const contentHeaders = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': '*/*'
        };
        const contentResponse = await fetch(url, { headers: contentHeaders });
        if (contentResponse.ok) {
            return contentResponse.text();
        }
        throw new Error(`Failed to fetch content: ${contentResponse.statusText}`);
    }
    return null;
}
/**
 * Decorates every item of a job with content download helpers.
 *
 * Each entry of `job.job_items` is replaced by a shallow copy that adds:
 *  - `getContent()`  - resolves the job's preferred content URL and downloads
 *                      it; returns `null` unless both the job and the item
 *                      have status DONE.
 *  - `getMarkdown()` / `getCleaned()` / `getHTML()` - download the item's
 *                      markdown / cleaned / raw content URL respectively.
 *
 * The job object is mutated in place (its `job_items` array is replaced) and
 * returned. A job without a `job_items` array is returned untouched instead
 * of throwing a TypeError on `.map`.
 *
 * @param {object} job - Job object as returned by the API.
 * @returns {object} The same `job` reference.
 */
function addGetContentMethod(job) {
    // Guard: a missing/null job_items (or a null job) has nothing to decorate.
    if (!job || !Array.isArray(job.job_items)) {
        return job;
    }
    job.job_items = job.job_items.map(item => ({
        ...item,
        // Regular methods (not arrows) so `this` is the decorated item copy.
        async getContent() {
            if (job.status !== constants_1.JobStatus.DONE || this.status !== constants_1.JobStatus.DONE) {
                return null;
            }
            const contentUrl = resolveContentUrl(job, this);
            return fetchContentUrl(contentUrl);
        },
        async getMarkdown() {
            return fetchContentUrl(this.markdown_content_url);
        },
        async getCleaned() {
            return fetchContentUrl(this.cleaned_content_url);
        },
        async getHTML() {
            return fetchContentUrl(this.raw_content_url);
        }
    }));
    return job;
}
/**
 * Builds the HTTP headers sent with API requests.
 *
 * @param {string} apiKey - Key placed in the Bearer authorization header.
 * @param {object} [options]
 * @param {boolean} [options.userAgent=true] - Include the client User-Agent.
 * @param {boolean} [options.noCache=false] - Include cache-busting headers.
 * @returns {object} Plain header map.
 */
function buildClientHeaders(apiKey, { userAgent = true, noCache = false } = {}) {
    const headers = {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
    };
    if (userAgent) {
        headers['User-Agent'] = 'WebcrawlerAPI-NodeJS-Client';
    }
    if (noCache) {
        headers['Cache-Control'] = 'no-cache, no-store, must-revalidate';
        headers['Pragma'] = 'no-cache';
        headers['Expires'] = '0';
    }
    return headers;
}

/**
 * Builds the JSON payload for a crawl request: `output_formats` defaults to
 * `['markdown']`, then the caller's request fields are applied. `actions` is
 * only set when an `actions` argument is given (a single action is wrapped in
 * an array), so an `actions` field already present on `crawlRequest` is no
 * longer overwritten with `undefined`.
 */
function buildCrawlPayload(crawlRequest, actions) {
    const payload = { output_formats: ['markdown'], ...crawlRequest };
    if (actions) {
        payload.actions = Array.isArray(actions) ? actions : [actions];
    }
    return payload;
}

/** Resolves after `ms` milliseconds. */
function waitMs(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Performs a fetch and returns the Response when it is OK.
 *
 * @throws {WebcrawlerApiError} 'network_error' (status 0) when fetch itself
 *   rejects; the error built by `createErrorFromResponse` when a non-OK
 *   response has a JSON body; 'unknown_error' when that body cannot be parsed.
 */
async function fetchOrThrowApiError(url, requestOptions) {
    let response;
    try {
        response = await fetch(url, requestOptions);
    }
    catch (e) {
        throw new errors_1.WebcrawlerApiError('network_error', `Failed to send request: ${e}`, 0);
    }
    if (response.ok) {
        return response;
    }
    try {
        const errorData = await response.json();
        throw (0, errors_1.createErrorFromResponse)(response, errorData);
    }
    catch (e) {
        if (e instanceof errors_1.WebcrawlerApiError) {
            throw e;
        }
        // If we can't parse the error response, create a generic error
        throw new errors_1.WebcrawlerApiError('unknown_error', `Request failed with status ${response.status} ${response.statusText}`, response.status);
    }
}

/**
 * HTTP client for WebcrawlerAPI: starts scrape and crawl jobs, polls them and
 * fetches their results. All requests authenticate with a Bearer API key.
 */
class WebcrawlerClient {
    /**
     * @param {string} apiKey - API key sent as `Authorization: Bearer <key>`.
     * @param {string} [basePath] - API origin; defaults to the public endpoint.
     * @param {string} [apiVersion="v1"] - Version path segment for crawl/job
     *   endpoints (scrape endpoints use their own fixed version).
     */
    constructor(apiKey, basePath = BASE_PATH, apiVersion = "v1") {
        this.apiVersion = apiVersion;
        this.apiKey = apiKey;
        this.basePath = basePath;
    }

    /**
     * Starts a scrape without waiting for it to finish.
     *
     * @param {object} request - Scrape request, sent as the JSON body.
     * @returns {Promise<{id: *}>} The id of the created scrape.
     * @throws {WebcrawlerApiError} On network or API errors.
     */
    async scrapeAsync(request) {
        const apiUrl = `${this.basePath}/${SCRAPE_VERSION}/scrape?async=true`;
        const response = await this.sendRequest(apiUrl, {
            method: 'POST',
            headers: buildClientHeaders(this.apiKey),
            body: JSON.stringify(request),
        });
        return { id: response.id };
    }

    /**
     * Fetches the current state of a scrape.
     *
     * @param {string} scrapeId - Id returned by `scrapeAsync`.
     * @returns {Promise<object>} The API payload when its status is "done" or
     *   "error"; otherwise `{ success: false, status, page_status_code: 0 }`.
     * @throws {WebcrawlerApiError} On network or API errors.
     */
    async getScrape(scrapeId) {
        const url = `${this.basePath}/${SCRAPE_VERSION}/scrape/${scrapeId}`;
        const responseData = await this.sendRequest(url, {
            method: 'GET',
            headers: buildClientHeaders(this.apiKey, { noCache: true }),
        });
        const status = responseData.status;
        if (status === "done" || status === "error") {
            return responseData;
        }
        // in_progress or any other status
        return {
            success: false,
            status: status,
            page_status_code: 0
        };
    }

    /**
     * Starts a scrape and polls it until it reaches a terminal state.
     *
     * A result is terminal when its status is "done" or "error", or when it
     * carries an `error_code` key. Polls are spaced by the default poll delay;
     * no delay is spent after the last poll.
     *
     * @param {object} request - Scrape request, sent as the JSON body.
     * @param {number} [maxPolls=100] - Maximum number of status polls.
     * @returns {Promise<object|undefined>} The terminal result, or the last
     *   polled state when `maxPolls` is exhausted (`undefined` if no poll ran).
     * @throws {WebcrawlerApiError} On network or API errors.
     */
    async scrape(request, maxPolls = 100) {
        const { id: scrapeId } = await this.scrapeAsync(request);
        let result;
        for (let poll = 0; poll < maxPolls; poll++) {
            if (poll > 0) {
                await waitMs(DEFAULT_POLL_DELAY_SECONDS * 1000);
            }
            result = await this.getScrape(scrapeId);
            const isTerminal = result.status === "done"
                || result.status === "error"
                || 'error_code' in result;
            if (isTerminal) {
                return result;
            }
        }
        // Return the last known state if maxPolls is reached
        return result;
    }

    /**
     * Starts a crawl job and polls it until it leaves the NEW / IN_PROGRESS
     * states.
     *
     * The poll delay starts at the initial pull delay and follows the job's
     * `recommended_pull_delay_ms` whenever that is positive.
     *
     * @param {object} crawlRequest - Crawl request fields for the JSON body.
     * @param {object|object[]} [actions] - Action or list of actions.
     * @returns {Promise<object>} The finished job, with content helpers on
     *   each job item.
     * @throws {WebcrawlerApiError} 'invalid_response' when no job id is
     *   returned, 'timeout' when the job is still running after the maximum
     *   number of polls, or any network/API error.
     */
    async crawl(crawlRequest, actions) {
        const url = `${this.basePath}/${this.apiVersion}/crawl`;
        const jobIdResponse = await this.sendRequest(url, {
            method: 'POST',
            headers: buildClientHeaders(this.apiKey, { noCache: true }),
            body: JSON.stringify(buildCrawlPayload(crawlRequest, actions)),
        });
        // Reject a missing id as well as an empty one, rather than polling "/job/undefined".
        if (jobIdResponse == null || jobIdResponse.id == null || jobIdResponse.id === '') {
            throw new errors_1.WebcrawlerApiError('invalid_response', 'Failed to fetch job status', 0);
        }
        let delayIntervalMs = initialPullDelayMs;
        for (let i = 0; i < MaxPullRetries; i++) {
            await waitMs(delayIntervalMs);
            // The timestamp query parameter makes every poll URL unique.
            const timestamp = new Date().getTime();
            const job = await this.getJob(`${jobIdResponse.id}?t=${timestamp}`);
            if (job.status !== constants_1.JobStatus.IN_PROGRESS && job.status !== constants_1.JobStatus.NEW) {
                return job;
            }
            if (job.recommended_pull_delay_ms > 0) {
                delayIntervalMs = job.recommended_pull_delay_ms;
            }
        }
        throw new errors_1.WebcrawlerApiError('timeout', 'Crawling took too long, please retry or increase the number of polling retries', 0);
    }

    /**
     * Starts a crawl job without waiting for it to finish.
     *
     * @param {object} crawlRequest - Crawl request fields for the JSON body.
     * @param {object|object[]} [actions] - Action or list of actions.
     * @returns {Promise<object>} The API response of the job creation call.
     * @throws {WebcrawlerApiError} On network or API errors.
     */
    async crawlAsync(crawlRequest, actions) {
        const url = `${this.basePath}/${this.apiVersion}/crawl`;
        return await this.sendRequest(url, {
            method: 'POST',
            headers: buildClientHeaders(this.apiKey, { userAgent: false }),
            body: JSON.stringify(buildCrawlPayload(crawlRequest, actions)),
        });
    }

    /**
     * Fetches a job and adds content download helpers to its items.
     *
     * @param {string} jobID - Job id (appended to the URL as-is).
     * @returns {Promise<object>} The job object.
     * @throws {WebcrawlerApiError} On network or API errors.
     */
    async getJob(jobID) {
        const url = `${this.basePath}/${this.apiVersion}/job/${jobID}`;
        const job = await this.sendRequest(url, {
            method: 'GET',
            headers: buildClientHeaders(this.apiKey, { noCache: true }),
        });
        return addGetContentMethod(job);
    }

    /**
     * Fetches the JSON response of the job's `/markdown` endpoint.
     *
     * @param {string} jobID - Job id.
     * @returns {Promise<object>} Parsed JSON response.
     * @throws {WebcrawlerApiError} On network or API errors.
     */
    async getJobMarkdown(jobID) {
        const url = `${this.basePath}/${this.apiVersion}/job/${jobID}/markdown`;
        return this.sendRequest(url, {
            method: 'GET',
            headers: buildClientHeaders(this.apiKey, { noCache: true }),
        });
    }

    /**
     * Fetches the body of the job's `/markdown/content` endpoint as text.
     *
     * @param {string} jobID - Job id.
     * @returns {Promise<string>} Response body text.
     * @throws {WebcrawlerApiError} On network or API errors.
     */
    async getJobMarkdownContent(jobID) {
        const url = `${this.basePath}/${this.apiVersion}/job/${jobID}/markdown/content`;
        const response = await fetchOrThrowApiError(url, {
            method: 'GET',
            headers: buildClientHeaders(this.apiKey, { noCache: true }),
        });
        return response.text();
    }

    /**
     * Sends a request and parses a successful response as JSON.
     *
     * @param {string} url - Absolute request URL.
     * @param {object} requestOptions - Options passed straight to `fetch`.
     * @returns {Promise<*>} Parsed JSON body.
     * @throws {WebcrawlerApiError} On network errors or non-OK responses.
     */
    async sendRequest(url, requestOptions) {
        const response = await fetchOrThrowApiError(url, requestOptions);
        return response.json();
    }
}
// Publish the client class as a named CommonJS export.
exports.WebcrawlerClient = WebcrawlerClient;