webcrawlerapi-js
Version:
JS client for WebcrawlerAPI
254 lines (219 loc) • 9.38 kB
text/typescript
import {CrawlRequest, Job, JobId, ScrapeRequest, ScrapeResponse} from "./model";
/** Root URL of the hosted API; used when the client is constructed without an explicit base path. */
const BASE_PATH = "https://api.webcrawlerapi.com";

/** Pause, in milliseconds, before the first status poll of a submitted job. */
const initialPullDelayMs = 2000;

/** Default upper bound on the number of status polls before a job is reported as taking too long. */
const MaxPullRetries = 100;
/**
 * Client for the WebcrawlerAPI HTTP endpoints.
 *
 * Every request goes to `${basePath}/${apiVersion}/...` and carries the API key
 * as an `Authorization: Bearer` header. The `*Async` methods only submit a job
 * and return its id; `scrape`, `scrapeWithMeta` and `crawl` submit a job and
 * then poll until it leaves the `new` / `in_progress` states.
 */
export class WebcrawlerClient {
    protected apiVersion: string = "v1";
    protected apiKey: string;
    protected basePath: string;

    /**
     * @param apiKey     Key sent as the bearer token on every request.
     * @param basePath   Root URL of the API; defaults to `BASE_PATH`.
     * @param apiVersion Path segment placed between the base path and the endpoint.
     */
    constructor(apiKey: string, basePath: string = BASE_PATH, apiVersion: string = "v1") {
        this.apiKey = apiKey;
        this.basePath = basePath;
        this.apiVersion = apiVersion;
    }

    /**
     * Submits a scrape job without waiting for it to finish.
     *
     * @param scrapeRequest Request payload, sent as the JSON body.
     * @returns The parsed JSON response of the submit call.
     * @throws Error carrying the HTTP status and, when the server sent one, the
     *         `error` field (or the raw body) of the failure response.
     */
    public async scrapeAsync(scrapeRequest: ScrapeRequest): Promise<JobId> {
        const url = `${this.basePath}/${this.apiVersion}/scrape`;
        const response = await fetch(url, {
            method: 'POST',
            headers: this.buildHeaders(),
            body: JSON.stringify(scrapeRequest),
        });
        if (response.ok) {
            return response.json();
        }

        const body = await this.readErrorBody(response);
        // Prefer the API's own `error` string; otherwise fall back to whatever
        // the server sent so a non-JSON failure page still yields a useful message.
        const apiError = (typeof body === 'object' && body !== null)
            ? (body as { error?: unknown }).error
            : undefined;
        const detail = typeof apiError === 'string' ? apiError : body;
        throw new Error(
            `failed to scrape, response returned ${response.status} ${response.statusText}${this.formatErrorDetail(detail)}`
        );
    }

    /**
     * Submits a scrape job and polls until it is no longer `new` or `in_progress`.
     *
     * @param scrapeRequest     Request payload; when its `debug` flag is truthy
     *                          every polled result is logged to the console.
     * @param maxPollingRetries Maximum number of polls before giving up.
     * @returns The first polled result whose status is final.
     * @throws Error when the submit call fails, returns no job id, a poll fails,
     *         or the job is still pending after `maxPollingRetries` polls.
     */
    public async scrapeWithMeta(scrapeRequest: ScrapeRequest, maxPollingRetries: number = MaxPullRetries): Promise<ScrapeResponse> {
        const url = `${this.basePath}/${this.apiVersion}/scrape`;
        const jobIdResponse: JobId = await this.sendRequest(url, {
            method: 'POST',
            headers: this.buildHeaders(),
            body: JSON.stringify(scrapeRequest),
        });
        // A missing id is as unusable as an empty one: polling would hit ".../scrape/undefined".
        if (!jobIdResponse?.id) {
            throw new Error("Failed to fetch job status");
        }

        let delayIntervalMs = initialPullDelayMs;
        for (let attempt = 0; attempt < maxPollingRetries; attempt++) {
            await this.sleep(delayIntervalMs);
            const scrapeResult = await this.getScrapeResult(jobIdResponse.id);
            if (scrapeRequest.debug) {
                console.log(`Scrape result: ${JSON.stringify(scrapeResult)}`);
            }
            if (scrapeResult.status !== 'in_progress' && scrapeResult.status !== 'new') {
                return scrapeResult;
            }
            // Let the server tune the polling cadence when it suggests a delay.
            if (scrapeResult.recommended_pull_delay_ms > 0) {
                delayIntervalMs = scrapeResult.recommended_pull_delay_ms;
            }
        }
        throw new Error("Scraping took too long, please retry or increase the number of polling retries");
    }

    /**
     * Convenience wrapper around {@link scrapeWithMeta} that returns only the
     * `structured_data` field of the final result.
     *
     * @param scrapeRequest     Request payload.
     * @param maxPollingRetries Maximum number of polls before giving up.
     */
    public async scrape(scrapeRequest: ScrapeRequest, maxPollingRetries: number = MaxPullRetries): Promise<any> {
        const scrapeResult = await this.scrapeWithMeta(scrapeRequest, maxPollingRetries);
        return scrapeResult.structured_data;
    }

    /**
     * Fetches the current state of a scrape job.
     *
     * @param scrapeID Id of the scrape job; inserted into the URL as-is.
     * @returns The parsed JSON response.
     * @throws Error with the HTTP status and, when available, the response body.
     */
    public async getScrapeResult(scrapeID: string): Promise<ScrapeResponse> {
        const url = `${this.basePath}/${this.apiVersion}/scrape/${scrapeID}`;
        const response = await fetch(url, {
            method: 'GET',
            headers: this.buildHeaders(true),
        });
        if (response.ok) {
            return response.json();
        }
        const body = await this.readErrorBody(response);
        throw new Error(
            `failed to fetch job status ${response.status} ${response.statusText}${this.formatErrorDetail(body)}`
        );
    }

    /**
     * Submits a crawl job and polls until it is no longer `new` or `in_progress`.
     * Each item of the finished job gets a `getContent()` method that downloads
     * the item's content for the job's `scrape_type`.
     *
     * @param crawlRequest      Request payload, sent as the JSON body.
     * @param maxPollingRetries Maximum number of polls before giving up;
     *                          defaults to `MaxPullRetries`.
     * @returns The finished job.
     * @throws Error when the submit call fails, returns no job id, a poll fails,
     *         or the job is still pending after `maxPollingRetries` polls.
     */
    public async crawl(crawlRequest: CrawlRequest, maxPollingRetries: number = MaxPullRetries): Promise<Job> {
        const url = `${this.basePath}/${this.apiVersion}/crawl`;
        const jobIdResponse: JobId = await this.sendRequest(url, {
            method: 'POST',
            headers: this.buildHeaders(true),
            body: JSON.stringify(crawlRequest),
        });
        // A missing id is as unusable as an empty one: polling would hit ".../job/undefined".
        if (!jobIdResponse?.id) {
            throw new Error("Failed to fetch job status");
        }

        let delayIntervalMs = initialPullDelayMs;
        for (let attempt = 0; attempt < maxPollingRetries; attempt++) {
            await this.sleep(delayIntervalMs);
            // The timestamp makes every poll URL unique; this appears intended as a cache-buster.
            const job = await this.getJob(`${jobIdResponse.id}?t=${Date.now()}`);
            if (job.status !== 'in_progress' && job.status !== 'new') {
                this.attachContentGetters(job);
                return job;
            }
            // Let the server tune the polling cadence when it suggests a delay.
            if (job.recommended_pull_delay_ms > 0) {
                delayIntervalMs = job.recommended_pull_delay_ms;
            }
        }
        throw new Error("Crawling took too long, please retry or increase the number of polling retries");
    }

    /**
     * Submits a crawl job without waiting for it to finish.
     *
     * @param crawlRequest Request payload, sent as the JSON body.
     * @returns The parsed JSON response of the submit call.
     * @throws Error when the request cannot be sent or the server answers with a non-2xx status.
     */
    public async crawlAsync(crawlRequest: CrawlRequest): Promise<JobId> {
        const url = `${this.basePath}/${this.apiVersion}/crawl`;
        return await this.sendRequest(url, {
            method: 'POST',
            headers: this.buildHeaders(),
            body: JSON.stringify(crawlRequest),
        });
    }

    /**
     * Fetches the current state of a crawl job.
     *
     * @param jobID Id of the job; inserted into the URL as-is, so it may carry a query string.
     * @returns The parsed JSON response.
     * @throws Error with the HTTP status and, when available, the response body.
     */
    public async getJob(jobID: string): Promise<Job> {
        const url = `${this.basePath}/${this.apiVersion}/job/${jobID}`;
        const response = await fetch(url, {
            method: 'GET',
            headers: this.buildHeaders(true),
        });
        if (response.ok) {
            return response.json();
        }
        const body = await this.readErrorBody(response);
        throw new Error(
            `failed to fetch job status ${response.status} ${response.statusText}${this.formatErrorDetail(body)}`
        );
    }

    /**
     * Sends a request and returns the parsed JSON body of a successful response.
     *
     * @throws Error `Failed to send request: ...` when `fetch` itself rejects;
     *         the serialized JSON payload when the server answers non-2xx with a
     *         JSON object; otherwise a message carrying the HTTP status and raw body.
     */
    private async sendRequest(url: string, requestOptions: any): Promise<any> {
        let response: Response;
        try {
            response = await fetch(url, requestOptions);
        } catch (e) {
            throw new Error(`Failed to send request: ${e}`);
        }
        if (!response.ok) {
            const body = await this.readErrorBody(response);
            if (typeof body === 'object' && body !== null) {
                // Keep the message equal to the serialized payload, as callers have seen so far.
                throw new Error(JSON.stringify(body));
            }
            throw new Error(
                `Request failed with ${response.status} ${response.statusText}${this.formatErrorDetail(body)}`
            );
        }
        return response.json();
    }

    /**
     * Builds the headers shared by every API call.
     *
     * @param noCache When true, also adds the cache-disabling headers.
     */
    private buildHeaders(noCache: boolean = false): Record<string, string> {
        const headers: Record<string, string> = {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${this.apiKey}`,
            'User-Agent': 'WebcrawlerAPI-NodeJS-Client',
        };
        if (noCache) {
            headers['Cache-Control'] = 'no-cache, no-store, must-revalidate';
            headers['Pragma'] = 'no-cache';
            headers['Expires'] = '0';
        }
        return headers;
    }

    /**
     * Reads the body of a failed response exactly once.
     *
     * @returns The parsed value when the body is valid JSON, the raw text when
     *          it is not, or `undefined` when the body is empty or unreadable.
     */
    private async readErrorBody(response: Response): Promise<unknown> {
        let raw: string;
        try {
            raw = await response.text();
        } catch {
            return undefined;
        }
        if (raw === '') {
            return undefined;
        }
        try {
            return JSON.parse(raw);
        } catch {
            return raw;
        }
    }

    /** Renders an error body as a `: detail` suffix, or an empty string when there is nothing to show. */
    private formatErrorDetail(detail: unknown): string {
        if (detail === undefined) {
            return '';
        }
        return `: ${typeof detail === 'string' ? detail : JSON.stringify(detail)}`;
    }

    /** Resolves after `ms` milliseconds. */
    private sleep(ms: number): Promise<void> {
        return new Promise<void>(resolve => setTimeout(resolve, ms));
    }

    /**
     * Replaces `job.job_items` with copies that carry a `getContent()` method.
     * Does nothing when the job has no item array.
     */
    private attachContentGetters(job: Job): void {
        if (!Array.isArray(job.job_items)) {
            return;
        }
        job.job_items = job.job_items.map(item => ({
            ...item,
            /**
             * Downloads this item's content for the job's `scrape_type`.
             * Resolves to `null` when the item is not `done` or has no URL for that type.
             */
            getContent: async function(): Promise<string | null> {
                if (this.status !== 'done') {
                    return null;
                }
                let contentUrl: string | undefined;
                switch (job.scrape_type) {
                    case 'html':
                        contentUrl = this.raw_content_url;
                        break;
                    case 'cleaned':
                        contentUrl = this.cleaned_content_url;
                        break;
                    case 'markdown':
                        contentUrl = this.markdown_content_url;
                        break;
                }
                if (!contentUrl) {
                    return null;
                }
                const response = await fetch(contentUrl, {
                    headers: {
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept': '*/*'
                    }
                });
                if (!response.ok) {
                    // statusText is often empty, so the numeric status is included as well.
                    throw new Error(`Failed to fetch content: ${response.status} ${response.statusText}`);
                }
                return await response.text();
            }
        }));
    }
}