webcrawlerapi-js
Version:
JS client for WebcrawlerAPI
122 lines (121 loc) • 4.1 kB
TypeScript
/**
 * Parameters for a scrape request. Only `url` is required; every other
 * field is an optional tuning knob.
 */
export interface ScrapeRequest {
  /** URL to scrape. */
  url: string;
  /** @deprecated Use output_formats instead */
  output_format?: string;
  /** Output formats requested for the scraped content. */
  output_formats?: Array<'markdown' | 'cleaned' | 'html' | 'links'>;
  /** NOTE(review): presumably a callback URL notified about the result — confirm against the API docs. */
  webhook_url?: string;
  /** CSS selectors to clean from the output. */
  clean_selectors?: string;
  /**
   * Actions attached to the request.
   *
   * The element type names `UploadS3Action` explicitly (in addition to the
   * base `Action`) so that an inline object literal such as
   * `{ type: 'upload_s3', path: '...', ... }` passes TypeScript's
   * excess-property check. With a plain `Action[]` such a literal is rejected
   * because `path`, `bucket`, etc. are not members of `Action`.
   */
  actions?: Array<Action | UploadS3Action>;
  /** A prompt to run on the scraped content, used to extract specific information or format the output (extra cost will be charged). */
  prompt?: string;
  /** JSON schema to enforce structured output when using a prompt. Follows the OpenAI Structured Outputs format. */
  response_schema?: Record<string, any>;
  /** Extract only the main content of the page. Works best for articles, blog posts, news. */
  main_content_only?: boolean;
  /** Maximum age of cached content in seconds. Set to 0 to always fetch fresh content. */
  max_age?: number;
}
/** Object carrying the string identifier of a scrape. */
export interface ScrapeId {
  /** Identifier value. */
  id: string;
}
/**
 * Result payload of a scrape. `success` and `page_status_code` are always
 * present; every content field is optional.
 */
export interface ScrapeResponse {
  success: boolean;
  status?: string;

  /** Numeric status code recorded for the page. */
  page_status_code: number;
  page_title?: string;

  // Content variants — each is optional.
  markdown?: string;
  cleaned_content?: string;
  raw_content?: string;
  links?: string[];

  /** Untyped payload; NOTE(review): presumably the result of a prompt / response schema — confirm. */
  structured_data?: any;
}
/** Error payload of a scrape: a code and a message, plus the `success` flag and an optional status. */
export interface ScrapeResponseError {
  success: boolean;
  status?: string;
  /** Error code, as a string. */
  error_code: string;
  /** Error message text. */
  error_message: string;
}
/** Object carrying the string identifier of a job. */
export interface JobId {
  /** Identifier value. */
  id: string;
}
/**
 * Parameters for a crawl job. Every field is optional at the type level.
 */
export interface CrawlRequest {
  /** The seed URL where the crawler starts. Can be any valid URL. */
  url?: string;
  /** The crawler will stop when it reaches this limit of pages for this job. */
  items_limit?: number;
  /** @deprecated Use output_formats instead */
  scrape_type?: string;
  /** Output formats requested for the crawled pages. */
  output_formats?: Array<'links' | 'markdown' | 'cleaned' | 'html'>;
  /** Regular expression to whitelist URLs. Only URLs that match the pattern will be crawled. */
  whitelist_regexp?: string;
  /** Regular expression to blacklist URLs. URLs that match the pattern will be skipped. */
  blacklist_regexp?: string;
  /** URL where the server will send a POST request once the job is completed. */
  webhook_url?: string;
  /**
   * Actions attached to the job.
   *
   * The element type names `UploadS3Action` explicitly (in addition to the
   * base `Action`) so that an inline object literal such as
   * `{ type: 'upload_s3', path: '...', ... }` passes TypeScript's
   * excess-property check. With a plain `Action[]` such a literal is rejected
   * because `path`, `bucket`, etc. are not members of `Action`.
   */
  actions?: Array<Action | UploadS3Action>;
  /** If true, the crawler will respect the website's robots.txt and skip disallowed pages. Default is false. */
  respect_robots_txt?: boolean;
  /** Extract only the main content of the page. Works best for articles, blog posts, news. */
  main_content_only?: boolean;
  /** Maximum crawl depth from the starting URL. 0 = starting page only, 1 = starting page + directly linked pages, etc. No limit by default. */
  max_depth?: number;
  /** Maximum age of cached content in seconds. Set to 0 to always fetch fresh content. */
  max_age?: number;
}
/**
 * A crawl job record: its identity, the settings it carries, its timestamps
 * and the list of items belonging to it.
 */
export interface Job {
  // Identity and state
  id: string;
  org_id: string;
  url: string;
  status: string;

  // Settings carried on the job
  /** @deprecated Use output_formats instead */
  scrape_type?: string;
  output_formats?: string[];
  whitelist_regexp: string;
  blacklist_regexp: string;
  items_limit: number;
  max_depth?: number;
  webhook_url: string;

  // Timestamps, typed as strings
  created_at: string;
  updated_at: string;
  finished_at: string;

  /** Delay in milliseconds (per the `_ms` suffix); NOTE(review): presumably the suggested wait between status polls — confirm. */
  recommended_pull_delay_ms: number;

  /** Items belonging to this job. */
  job_items: JobItem[];
}
/**
 * One item of a job: identifiers, page metadata, content URLs, and
 * asynchronous accessors that each resolve to a string or `null`.
 */
export interface JobItem {
  // Identity
  id: string;
  job_id: string;

  // Page metadata
  original_url: string;
  referred_url: string;
  title: string;
  page_status_code: number;
  status: string;
  link?: string;
  depth?: number;

  // Content locations
  markdown_content_url: string;
  raw_content_url?: string;
  cleaned_content_url?: string;

  // Error details
  last_error: string;
  error_code?: string;

  // Bookkeeping
  cost: number;
  created_at: string;
  updated_at: string;

  // Accessors. NOTE(review): presumably each one loads the matching content
  // variant, with `null` meaning it is unavailable — confirm against the implementation.
  getContent(): Promise<string | null>;
  getMarkdown(): Promise<string | null>;
  getCleaned(): Promise<string | null>;
  getHTML(): Promise<string | null>;
}
/** Base shape shared by all actions: a string `type` tag. */
export interface Action {
  /** Tag naming the kind of action. */
  type: string;
}
/**
 * Action whose `type` is fixed to the literal `'upload_s3'`. It carries a
 * bucket, a path, an access-key pair and an optional endpoint.
 * NOTE(review): presumably tells the service to upload results to
 * S3-compatible storage — confirm against the API docs.
 */
export interface UploadS3Action extends Action {
  /** Narrows the base `type` tag to this action's literal. */
  type: 'upload_s3';

  // Destination
  bucket: string;
  path: string;
  endpoint?: string;

  // Credentials
  access_key_id: string;
  secret_access_key: string;
}
/** Response object carrying a single content URL. */
export interface JobMarkdownResponse {
  /** URL of the content, as a string. */
  content_url: string;
}