/**
 * @spider-cloud/spider-client
 * Isomorphic JavaScript SDK for Spider Cloud services.
 * Type declarations (580 lines, 17.6 kB).
 */
/**
 * Represents browser viewport dimensions, in pixels.
 */
export interface Viewport {
  /** Viewport width in pixels. */
  width: number;
  /** Viewport height in pixels. */
  height: number;
}
/**
 * Represents HTTP headers as a dictionary object.
 * The broad index signature is intentional: header names are arbitrary strings.
 */
export interface Headers {
  [key: string]: string;
}
/**
 * Represents a budget for various resources, keyed by resource name
 * (e.g. a path pattern) with a numeric allowance as the value.
 */
export interface Budget {
  [key: string]: number;
}
/**
 * The chunking algorithm to use when splitting page content.
 */
export type ChunkingAlgType = "ByWords" | "ByLines" | "ByCharacterLength" | "BySentence";
/**
 * The chunking algorithm paired with the value to chunk by
 * (e.g. number of words, lines, characters, or sentences per chunk).
 */
export interface ChunkingAlg {
  /** Which chunking strategy to apply. */
  type: ChunkingAlgType;
  /** The size of each chunk, in units of the chosen strategy. */
  value: number;
}
/**
 * Represents a timeout configuration (mirrors Rust's std::time::Duration JSON shape).
 * @typedef {Object} Timeout
 * @property {number} secs - The number of whole seconds.
 * @property {number} nanos - The additional nanoseconds component.
 */
interface Timeout {
  secs: number;
  nanos: number;
}
/**
 * Represents the webhook configuration: where to deliver event notifications
 * and which events should trigger a delivery.
 * @typedef {Object} WebhookSettings
 */
interface WebhookSettings {
  /**
   * The URL or endpoint where the webhook information will be sent.
   */
  destination: string;
  /**
   * Flag to indicate an action should be taken when all credits are depleted.
   */
  on_credits_depleted: boolean;
  /**
   * Flag to indicate an action should be taken when half of the credits are depleted.
   */
  on_credits_half_depleted: boolean;
  /**
   * Flag to trigger a notification on a website status update event.
   */
  on_website_status: boolean;
  /**
   * Flag to send information about a new page find, such as links and data size.
   */
  on_find: boolean;
  /**
   * Flag to handle the metadata of a new page that has been found.
   */
  on_find_metadata: boolean;
}
/**
 * Represents the idle-network wait configuration.
 * @typedef {Object} IdleNetwork
 * @property {Timeout} timeout - Maximum time to wait for the network to become idle.
 */
interface IdleNetwork {
  timeout: Timeout;
}
/**
 * Represents the selector wait configuration.
 * @typedef {Object} Selector
 * @property {Timeout} timeout - Maximum time to wait for the selector to appear.
 * @property {string} selector - The CSS selector to wait for.
 */
interface Selector {
  timeout: Timeout;
  selector: string;
}
/**
 * Represents a fixed-delay wait configuration.
 * @typedef {Object} Delay
 * @property {Timeout} timeout - How long to pause before continuing.
 */
interface Delay {
  timeout: Timeout;
}
/**
 * Represents the wait_for configuration.
 * @typedef {Object} WaitForConfiguration
 * @property {IdleNetwork} [idle_network] - Configuration to wait for the network to be idle between periods.
 * @property {IdleNetwork} [idle_network0] - Configuration to wait for the network to be idle, with a max timeout.
 * @property {IdleNetwork} [almost_idle_network0] - Configuration to wait for the network to be almost idle, with a max timeout.
 * @property {Selector} [selector] - Configuration to wait for a CSS selector.
 * @property {Selector} [dom] - Configuration to wait for a DOM element matching a selector.
 * @property {Delay} [delay] - Configuration to wait for a fixed delay.
 * @property {boolean} [page_navigations] - Whether to wait for page navigations.
 */
export interface WaitForConfiguration {
  idle_network?: IdleNetwork;
  idle_network0?: IdleNetwork;
  almost_idle_network0?: IdleNetwork;
  selector?: Selector;
  dom?: Selector;
  delay?: Delay;
  page_navigations?: boolean;
}
/**
 * Represents the query API endpoint request to get documents from the global spider collection.
 * Provide either `url`, or `domain` (optionally combined with `pathname`).
 */
export interface QueryRequest {
  /**
   * The exact URL to get.
   */
  url?: string;
  /**
   * The domain to get a document from.
   */
  domain?: string;
  /**
   * The path of the webpage to get the document. This is used with the domain key.
   */
  pathname?: string;
}
/** A named group of CSS selectors used for content extraction. */
type CSSSelector = {
  /** The name under which the extracted values are returned. */
  name: string;
  /** The CSS selectors to apply for this group. */
  selectors: string[];
};
/** Maps a URL path to the CSS selector groups to run on pages under that path. */
type CSSExtractionMap = {
  [path: string]: CSSSelector[];
};
/** Evaluate raw JavaScript on the page. Rust: Evaluate(String). */
export type Evaluate = {
  /** Rust: Evaluate(String) */
  code: string;
};
/** Click the first element matching a selector. Rust: Click(String). */
export type Click = {
  /** Rust: Click(String) */
  selector: string;
};
/** Click every element matching a selector. Rust: ClickAll(String). */
export type ClickAll = {
  /** Rust: ClickAll(String) */
  selector: string;
};
/** Marker variant with no payload — click all clickable elements (Rust unit variant). */
export type ClickAllClickable = {};
/** Click at an exact coordinate on the page. */
export type ClickPoint = {
  x: number;
  y: number;
};
/** Pause for a fixed duration. */
export type Wait = {
  /** Rust: u64 (milliseconds) */
  ms: number;
};
/** Marker variant with no payload — wait for the next page navigation. */
export type WaitForNavigation = {};
/** Wait for the DOM, optionally for a specific selector, up to a timeout. */
export type WaitForDom = {
  /** Rust: Option<String> */
  selector?: string | null;
  /** Rust: u32 (milliseconds) */
  timeout: number;
};
/** Wait until a selector appears. */
export type WaitFor = {
  selector: string;
};
/** Wait until a selector appears, up to a timeout. */
export type WaitForWithTimeout = {
  selector: string;
  /** Rust: u64 (milliseconds) */
  timeout: number;
};
/** Wait until a selector appears, then click it. */
export type WaitForAndClick = {
  selector: string;
};
/** Scroll horizontally by a pixel delta. */
export type ScrollX = {
  /** Rust: i32 (pixels) */
  dx: number;
};
/** Scroll vertically by a pixel delta. */
export type ScrollY = {
  /** Rust: i32 (pixels) */
  dy: number;
};
/** Fill a form field matching a selector with a value. */
export type Fill = {
  selector: string;
  value: string;
};
/** Type a value with a keyboard modifier. */
export type Type = {
  modifier: number;
  value: string;
};
/** Perform an infinite scroll in fixed steps. */
export type InfiniteScroll = {
  /** Rust: u32 (pixels/step or count—match your semantics) */
  step_px: number;
};
/** Take a screenshot of the page. */
export type Screenshot = {
  /** Keep snake_case to match Rust JSON if interop is needed */
  full_page: boolean;
  omit_background: boolean;
  output: string;
};
/** Marker variant with no payload — validate the automation chain (Rust unit variant). */
export type ValidateChain = {};
/** Union of all web-automation steps that can appear in an automation script. */
export type WebAutomation = Evaluate | Click | ClickAll | ClickAllClickable | ClickPoint | Wait | WaitForNavigation | WaitForDom | WaitFor | WaitForWithTimeout | WaitForAndClick | ScrollX | ScrollY | Fill | Type | InfiniteScroll | Screenshot | ValidateChain;
/** Output formats the API can return page content in. */
export type ReturnFormat = "markdown" | "commonmark" | "raw" | "screenshot" | "text" | "html2text" | "bytes" | "xml" | "empty";
/** Maps a URL or URL path to the list of automation steps to run there. */
export type WebAutomationMap = Record<string, WebAutomation[]>;
/** Maps a URL or URL path to a custom JavaScript snippet to execute there. */
export type ExecutionScriptsMap = Record<string, string>;
/** HTTP redirect handling policy. `Loose` allows all redirects; `Strict` restricts them. */
export declare enum RedirectPolicy {
  Loose = "Loose",
  Strict = "Strict"
}
/**
 * Which browser events to track during a `chrome`/`smart` request.
 * Note: `responses` and `requests` are typed as the literal `true` —
 * include the key to enable tracking, omit it to disable.
 */
export type EventTracker = {
  responses?: true;
  requests?: true;
  automation?: boolean;
};
/**
 * Proxy pool selection for outbound request routing.
 * Choose a pool based on your use case (e.g., stealth, speed, or stability).
 *
 * - 'residential' → cost-effective entry-level residential pool
 * - 'residential_fast' → faster residential pool for higher throughput
 * - 'residential_static' → static residential IPs, rotated daily
 * - 'residential_premium' → low-latency premium IPs
 * - 'residential_core' → balanced plan (quality vs. cost)
 * - 'residential_plus' → largest and highest quality core pool
 * - 'mobile' → 4G/5G mobile proxies for maximum evasion
 * - 'isp' → ISP-grade residential (alias: 'datacenter')
 *
 * Previously the union only listed 'residential' | 'mobile' | 'isp'; the
 * remaining pools were documented on `SpiderParams.proxy` but not accepted
 * by the type. Widening the union is backward-compatible.
 */
export type Proxy =
  | "residential"
  | "residential_fast"
  | "residential_static"
  | "residential_premium"
  | "residential_core"
  | "residential_plus"
  | "mobile"
  | "isp";
/** Literal find-and-replace rewrite rule for links found on a page. */
export type LinkRewriteReplace = {
  type: "replace";
  /**
   * Only apply when the link's host matches this value.
   * Optional key; null means "no host filter".
   */
  host?: string | null;
  /** The literal substring to find in the link. */
  find: string;
  /** The replacement text. */
  replace_with: string;
};
/** Regex-based rewrite rule for links found on a page. */
export type LinkRewriteRegex = {
  type: "regex";
  /**
   * Only apply when the link's host matches this value.
   * Optional key; null means "no host filter".
   */
  host?: string | null;
  /** The regex pattern to match against the link. */
  pattern: string;
  /** The replacement text. */
  replace_with: string;
};
/** Discriminated union of link-rewrite rules, tagged by `type`. */
export type LinkRewriteRule = LinkRewriteReplace | LinkRewriteRegex;
/**
 * Represents the options available for making a spider request.
 */
export interface SpiderParams {
  /**
   * The URL to be crawled.
   */
  url: string;
  /**
   * The type of request to be made.
   */
  request?: "http" | "chrome" | "smart";
  /**
   * The maximum number of pages the crawler should visit.
   */
  limit?: number;
  /**
   * The format in which the result should be returned. When the return format is set as an array, an object is returned mapped by the format name.
   */
  return_format?: ReturnFormat | ReturnFormat[];
  /**
   * Specifies whether to only visit the top-level domain.
   */
  tld?: boolean;
  /**
   * The depth of the crawl.
   */
  depth?: number;
  /**
   * Specifies whether the request should be cached.
   */
  cache?: boolean;
  /**
   * The budget for various resources.
   */
  budget?: Budget;
  /**
   * The blacklist routes to ignore. This can be a Regex string pattern.
   */
  blacklist?: string[];
  /**
   * The whitelist routes to only crawl. This can be a Regex string pattern and can be used together with the blacklist.
   */
  whitelist?: string[];
  /**
   * The locale to be used during the crawl.
   */
  locale?: string;
  /**
   * The cookies to be set for the request, formatted as a single string.
   */
  cookies?: string;
  /**
   * Specifies whether to use stealth techniques to avoid detection.
   */
  stealth?: boolean;
  /**
   * The headers to be used for the request.
   */
  headers?: Headers;
  /**
   * Specifies whether to include metadata in the response.
   */
  metadata?: boolean;
  /**
   * Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
   */
  css_extraction_map?: CSSExtractionMap;
  /**
   * The dimensions of the viewport.
   */
  viewport?: Viewport;
  /**
   * The two letter country code for the request geo-location.
   */
  country_code?: string;
  /**
   * The encoding to be used for the request.
   */
  encoding?: "UTF-8" | "SHIFT_JIS" | string;
  /**
   * Specifies whether to include subdomains in the crawl.
   */
  subdomains?: boolean;
  /**
   * The user agent string to be used for the request.
   */
  user_agent?: string;
  /**
   * Use webhooks to send data.
   */
  webhooks?: WebhookSettings;
  /**
   * Rewrite links found on crawled pages using a literal-replace or regex rule.
   */
  link_rewrite?: LinkRewriteRule;
  /**
   * Specifies whether to use fingerprinting protection.
   */
  fingerprint?: boolean;
  /**
   * Specifies whether to perform the request without using storage.
   */
  storageless?: boolean;
  /**
   * Specifies whether readability optimizations should be applied.
   */
  readability?: boolean;
  /**
   * Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
   */
  proxy_enabled?: boolean;
  /**
   * Specifies whether to respect the site's robots.txt file.
   */
  respect_robots?: boolean;
  /**
   * CSS root selector to be used to filter the content.
   */
  root_selector?: string;
  /**
   * Specifies whether to load all resources of the crawl target.
   */
  full_resources?: boolean;
  /**
   * Specifies whether to use the sitemap links.
   */
  sitemap?: boolean;
  /**
   * Specifies whether to only use the sitemap links.
   */
  sitemap_only?: boolean;
  /**
   * External domains to include in the crawl.
   */
  external_domains?: string[];
  /**
   * Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
   */
  return_embeddings?: boolean;
  /**
   * Returns the HTTP response headers used.
   */
  return_headers?: boolean;
  /**
   * Returns the link(s) found on the page that match the crawler query.
   */
  return_page_links?: boolean;
  /**
   * Returns the HTTP response cookies used.
   */
  return_cookies?: boolean;
  /**
   * The timeout for the request, in seconds.
   */
  request_timeout?: number;
  /**
   * Specifies whether to run the request in the background.
   */
  run_in_background?: boolean;
  /**
   * Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
   */
  scroll?: number;
  /**
   * Specifies whether to skip configuration checks.
   */
  skip_config_checks?: boolean;
  /**
   * The chunking algorithm to use.
   */
  chunking_alg?: ChunkingAlg;
  /**
   * The wait for events on the page. You need to make your `request` `chrome` or `smart`.
   */
  wait_for?: WaitForConfiguration;
  /**
   * Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
   */
  disable_intercept?: boolean;
  /**
   * Perform custom web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`.
   */
  automation_scripts?: WebAutomationMap;
  /**
   * Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`.
   */
  execution_scripts?: ExecutionScriptsMap;
  /**
   * The redirect policy for HTTP request. Set the value to Loose to allow all.
   */
  redirect_policy?: RedirectPolicy;
  /**
   * Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
   */
  event_tracker?: EventTracker;
  /**
   * The timeout to stop the crawl.
   */
  crawl_timeout?: Timeout;
  /**
   * Evaluates given script in every frame upon creation (before loading frame's scripts).
   */
  evaluate_on_new_document?: string;
  /**
   * Runs the request using lite_mode. Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
   * geo-targeting, and reliability. It's best suited for non-urgent data collection or when
   * targeting websites with minimal anti-bot protections.
   */
  lite_mode?: boolean;
  /**
   * Set the maximum number of credits to use per page.
   * Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
   * Credit limiting only applies to requests that are JavaScript rendered using smart_mode or chrome for the 'request' type.
   */
  max_credits_per_page?: number;
  /**
   * Proxy pool selection for outbound request routing.
   * Choose a pool based on your use case (e.g., stealth, speed, or stability).
   *
   * - 'residential' → cost-effective entry-level residential pool
   * - 'residential_fast' → faster residential pool for higher throughput
   * - 'residential_static' → static residential IPs, rotated daily
   * - 'mobile' → 4G/5G mobile proxies for maximum evasion
   * - 'isp' → ISP-grade residential (alias: 'datacenter')
   * - 'residential_premium' → low-latency premium IPs
   * - 'residential_core' → balanced plan (quality vs. cost)
   * - 'residential_plus' → largest and highest quality core pool
   */
  proxy?: Proxy;
  /**
   * Use a remote proxy at ~50% reduced cost for file downloads.
   * This requires bringing your own proxy (e.g., static IP tunnel).
   */
  remote_proxy?: string;
}
/**
 * Represents the options available for making a spider search request.
 */
export interface SearchRequestParams {
  /** The base request parameters shared across requests. */
  base?: SpiderParams;
  /** The search query string (merged with other params); required for a search. */
  search?: string;
  /** Optional limit on the number of websites to search. */
  search_limit?: number;
  /** Whether to fetch the actual page content (defaults to true). */
  fetch_page_content?: boolean;
  /** Optional search location (e.g., city or region name). */
  location?: string;
  /** Optional country code (e.g., "US", "DE"). */
  country?: string;
  /** Optional language code (e.g., "en", "fr"). */
  language?: string;
  /** Optional number of search results to retrieve. */
  num?: number;
  /** Optional page number of search results to fetch. */
  page?: number;
  /** Optional cap on websites if a list is provided via text or URL (comma-separated). */
  website_limit?: number;
  /** If true, prioritizes speed over completeness of results. */
  quick_search?: boolean;
}
/** A single document to be transformed by the transform endpoint. */
export interface Resource {
  /** The HTML to transform (Base64 or binary). */
  html?: Uint8Array | string;
  /** The content to transform (Base64 or binary). */
  content?: Uint8Array | string;
  /** The URL of the HTML, useful for readability transformations. */
  url?: string;
  /** The language of the resource. */
  lang?: string;
}
/** Parameters for the transform endpoint: converts raw HTML/content into another format. */
export interface RequestParamsTransform {
  /** The resources (HTML/content) to transform. */
  data: Resource[];
  /** The format to return the content as. */
  return_format?: ReturnFormat | null;
  /** Add readability preprocessing content. */
  readability?: boolean;
  /** Clean the markdown or text for AI. */
  clean?: boolean;
  /** Clean markdown or text, removing footers, navigation, and more. */
  clean_full?: boolean;
}
/** A single response item emitted by the API (one per page or per chunk). */
export type SpiderCoreResponse = {
  /** The page content, in the requested return format. */
  content?: string;
  /** Informational message from the API, if any. */
  message?: string;
  /** Error description, if the request for this item failed. */
  error?: string;
  /** HTTP-style status code for this item. */
  status?: number;
  /** The URL this item corresponds to. */
  url?: string;
};
/** Callback invoked for each streamed response chunk. */
export type ChunkCallbackFunction = (data: SpiderCoreResponse) => void;
/** Named data collections queryable through the data API. */
export declare enum Collection {
  Websites = "websites",
  Pages = "pages",
  PagesMetadata = "pages_metadata",
  Contacts = "contacts",
  CrawlState = "crawl_state",
  CrawlLogs = "crawl_logs",
  Profiles = "profiles",
  Credits = "credits",
  Webhooks = "webhooks",
  APIKeys = "api_keys"
}
/** Supported API versions. */
export declare enum ApiVersion {
  V1 = "v1"
}
/** Route path segments for the API endpoints. */
export declare enum APIRoutes {
  Crawl = "crawl",
  Links = "links",
  Screenshot = "screenshot",
  Search = "search",
  Transform = "transform",
  Data = "data",
  DataCredits = "data/credits"
}
/**
 * Describes the API surface: base URL plus per-version route tables and
 * their end-of-support dates.
 */
export declare const APISchema: {
  /** The API base URL. */
  url: string;
  versions: {
    /** The version currently targeted by the client. */
    current: ApiVersion;
    v1: {
      routes: typeof APIRoutes;
      end_date: string;
    };
    latest: {
      routes: typeof APIRoutes;
      end_date: string;
    };
  };
};
/** Overrides the base URL used by the client (e.g. for self-hosted or staging endpoints). */
export declare const setBaseUrl: (url: string) => void;
export {};