UNPKG

webcrawlerapi-js

Version:
122 lines (121 loc) 4.1 kB
/**
 * Request payload for a scrape. Only `url` is required; every other field is
 * optional.
 */
export interface ScrapeRequest {
  url: string;
  /** @deprecated Use output_formats instead */
  output_format?: string;
  /** Requested output formats; each entry is one of the listed literals. */
  output_formats?: Array<'markdown' | 'cleaned' | 'html' | 'links'>;
  webhook_url?: string;
  /** CSS selectors to clean from the output. */
  clean_selectors?: string;
  actions?: Action[];
  /**
   * A prompt to run on the scraped content, used to extract specific
   * information or format the output (extra cost will be charged).
   */
  prompt?: string;
  /**
   * JSON schema to enforce structured output when using a prompt.
   * Follows the OpenAI Structured Outputs format.
   */
  response_schema?: Record<string, any>;
  /**
   * Extract only the main content of the page.
   * Works best for articles, blog posts, news.
   */
  main_content_only?: boolean;
  /**
   * Maximum age of cached content in seconds.
   * Set to 0 to always fetch fresh content.
   */
  max_age?: number;
}

/** Object carrying a single `id` string for a scrape. */
export interface ScrapeId {
  id: string;
}

/**
 * Result shape of a scrape. `success` and `page_status_code` are always
 * present; all remaining fields are optional.
 */
export interface ScrapeResponse {
  success: boolean;
  status?: string;
  markdown?: string;
  cleaned_content?: string;
  raw_content?: string;
  page_status_code: number;
  page_title?: string;
  /**
   * Untyped payload.
   * NOTE(review): presumably the result of `prompt` / `response_schema` on the
   * request — confirm against the API documentation.
   */
  structured_data?: any;
  links?: string[];
}

/**
 * Error shape of a scrape: `error_code` and `error_message` accompany
 * `success`.
 *
 * NOTE(review): `success` is a plain `boolean` here and in `ScrapeResponse`,
 * so the field cannot be used by the compiler to tell the two shapes apart —
 * confirm whether literal `true` / `false` types were intended.
 */
export interface ScrapeResponseError {
  success: boolean;
  error_code: string;
  error_message: string;
  status?: string;
}

/** Object carrying a single `id` string for a job. */
export interface JobId {
  id: string;
}

/** Request payload for a crawl job. Every field is optional at the type level. */
export interface CrawlRequest {
  /** The seed URL where the crawler starts. Can be any valid URL. */
  url?: string;
  /** The crawler will stop when it reaches this limit of pages for this job. */
  items_limit?: number;
  /** @deprecated Use output_formats instead */
  scrape_type?: string;
  /** Requested output formats; each entry is one of the listed literals. */
  output_formats?: Array<'links' | 'markdown' | 'cleaned' | 'html'>;
  /**
   * Regular expression to whitelist URLs.
   * Only URLs that match the pattern will be crawled.
   */
  whitelist_regexp?: string;
  /**
   * Regular expression to blacklist URLs.
   * URLs that match the pattern will be skipped.
   */
  blacklist_regexp?: string;
  /** URL where the server will send a POST request once the job is completed. */
  webhook_url?: string;
  actions?: Action[];
  /**
   * If true, the crawler will respect the website's robots.txt and skip
   * disallowed pages. Default is false.
   */
  respect_robots_txt?: boolean;
  /**
   * Extract only the main content of the page.
   * Works best for articles, blog posts, news.
   */
  main_content_only?: boolean;
  /**
   * Maximum crawl depth from the starting URL. 0 = starting page only,
   * 1 = starting page + directly linked pages, etc. No limit by default.
   */
  max_depth?: number;
  /**
   * Maximum age of cached content in seconds.
   * Set to 0 to always fetch fresh content.
   */
  max_age?: number;
}

/**
 * A crawl job record, including its items in `job_items`.
 *
 * Timestamps (`created_at`, `finished_at`, `updated_at`) are typed as strings;
 * NOTE(review): their exact format is not visible here — confirm before
 * parsing.
 */
export interface Job {
  id: string;
  org_id: string;
  url: string;
  status: string;
  /** @deprecated Use output_formats instead */
  scrape_type?: string;
  output_formats?: string[];
  whitelist_regexp: string;
  blacklist_regexp: string;
  items_limit: number;
  max_depth?: number;
  created_at: string;
  finished_at: string;
  updated_at: string;
  webhook_url: string;
  /**
   * Delay in milliseconds, per the `_ms` suffix.
   * NOTE(review): presumably the suggested wait between status polls —
   * confirm against the client implementation.
   */
  recommended_pull_delay_ms: number;
  job_items: JobItem[];
}

/**
 * One item belonging to a job (linked through `job_id`), together with four
 * asynchronous content accessors.
 */
export interface JobItem {
  id: string;
  job_id: string;
  original_url: string;
  page_status_code: number;
  markdown_content_url: string;
  status: string;
  title: string;
  last_error: string;
  created_at: string;
  updated_at: string;
  cost: number;
  referred_url: string;
  link?: string;
  depth?: number;
  raw_content_url?: string;
  cleaned_content_url?: string;
  error_code?: string;

  // Each accessor below returns a promise that resolves to a string or null.
  // NOTE(review): presumably they download from the matching `*_content_url`
  // field — confirm against the implementation.
  getContent(): Promise<string | null>;
  getMarkdown(): Promise<string | null>;
  getCleaned(): Promise<string | null>;
  getHTML(): Promise<string | null>;
}

/** Base shape for an action; `type` names the kind of action. */
export interface Action {
  type: string;
}

/**
 * Action whose `type` is narrowed to the literal `'upload_s3'`, with the path,
 * credentials and bucket it needs. `endpoint` is optional.
 */
export interface UploadS3Action extends Action {
  type: 'upload_s3';
  path: string;
  access_key_id: string;
  secret_access_key: string;
  bucket: string;
  endpoint?: string;
}

/** Response carrying a single `content_url` string. */
export interface JobMarkdownResponse {
  content_url: string;
}