@spider-cloud/spider-client
Isomorphic JavaScript SDK for Spider Cloud services
/**
* Represents viewport dimensions.
*/
export interface Viewport {
width: number;
height: number;
}
/**
* Represents HTTP headers as a dictionary object.
*/
export interface Headers {
[key: string]: string;
}
/**
* Represents a budget for various resources.
*/
export interface Budget {
[key: string]: number;
}
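// Usage sketch — assumption: keys are route patterns and values are page
// limits per pattern; this is a common crawl-budget convention and is not
// spelled out by the typings themselves.
//
//   const budget: Budget = { "*": 100, "/docs": 20 };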
/**
* The chunking algorithm to use.
*/
export type ChunkingAlgType = "ByWords" | "ByLines" | "ByCharacterLength" | "BySentence";
/**
* The chunking algorithm with the value to chunk by.
*/
export interface ChunkingAlg {
type: ChunkingAlgType;
value: number;
}
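// Usage sketch (illustrative; "ByWords" and 512 are example values, not
// defaults): chunk the returned content into ~512-word pieces.
//
//   const chunking: ChunkingAlg = { type: "ByWords", value: 512 };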
/**
* Represents a timeout configuration.
* @typedef {Object} Timeout
* @property {number} secs - The number of seconds.
* @property {number} nanos - The number of nanoseconds.
*/
interface Timeout {
secs: number;
nanos: number;
}
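// Usage sketch (illustrative): a 30-second timeout in the secs/nanos shape
// used throughout these typings.
//
//   const thirtySeconds: Timeout = { secs: 30, nanos: 0 };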
/**
 * Represents the webhook configuration.
 * @typedef {Object} WebhookSettings
 */
interface WebhookSettings {
/**
* The URL or endpoint where the webhook information will be sent.
*/
destination: string;
/**
* Flag to indicate an action should be taken when all credits are depleted.
*/
on_credits_depleted: boolean;
/**
* Flag to indicate an action should be taken when half of the credits are depleted.
*/
on_credits_half_depleted: boolean;
/**
* Flag to trigger a notification on a website status update event.
*/
on_website_status: boolean;
/**
* Flag to send information about a new page find, such as links and data size.
*/
on_find: boolean;
/**
* Flag to handle the metadata of a new page that has been found.
*/
on_find_metadata: boolean;
}
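// Usage sketch (illustrative; the destination URL is a placeholder):
// forward page-find events and credit alerts to your own endpoint.
//
//   const webhooks: WebhookSettings = {
//     destination: "https://example.com/spider-webhook",
//     on_credits_depleted: true,
//     on_credits_half_depleted: true,
//     on_website_status: false,
//     on_find: true,
//     on_find_metadata: false,
//   };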
/**
* Represents the idle network configuration.
* @typedef {Object} IdleNetwork
* @property {Timeout} timeout - The timeout configuration.
*/
interface IdleNetwork {
timeout: Timeout;
}
/**
* Represents the selector configuration.
* @typedef {Object} Selector
* @property {Timeout} timeout - The timeout configuration.
* @property {string} selector - The CSS selector to wait for.
*/
interface Selector {
timeout: Timeout;
selector: string;
}
/**
* Represents the delay configuration.
* @typedef {Object} Delay
* @property {Timeout} timeout - The timeout configuration.
*/
interface Delay {
timeout: Timeout;
}
/**
* Represents the wait_for configuration.
* @typedef {Object} WaitFor
* @property {IdleNetwork} [idle_network] - Configuration to wait for network to be idle.
* @property {Selector} [selector] - Configuration to wait for a CSS selector.
* @property {Delay} [delay] - Configuration to wait for a delay.
* @property {boolean} [page_navigations] - Whether to wait for page navigations.
*/
interface WaitFor {
idle_network?: IdleNetwork;
selector?: Selector;
delay?: Delay;
page_navigations?: boolean;
}
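// Usage sketch (illustrative timeouts and selector): wait up to 10s for the
// network to go idle and up to 5s for a selector before capturing the page.
//
//   const waitFor: WaitFor = {
//     idle_network: { timeout: { secs: 10, nanos: 0 } },
//     selector: { timeout: { secs: 5, nanos: 0 }, selector: "#content" },
//   };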
/**
* Represents a request to the query API endpoint, used to get documents from the global spider collection.
*/
export interface QueryRequest {
/**
* The exact URL to get.
*/
url?: string;
/**
* The domain to get a document from.
*/
domain?: string;
/**
* The pathname of the webpage to get the document for. Used together with the `domain` key.
*/
pathname?: string;
}
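// Usage sketch (illustrative; example.com is a placeholder): fetch a stored
// document by exact URL, or by domain plus pathname.
//
//   const byUrl: QueryRequest = { url: "https://example.com/pricing" };
//   const byPath: QueryRequest = { domain: "example.com", pathname: "/pricing" };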
type CSSSelector = {
name: string;
selectors: string[];
};
type CSSExtractionMap = {
[path: string]: CSSSelector[];
};
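// Usage sketch (illustrative names and selectors): extract named fields per
// path using CSS selectors.
//
//   const extraction: CSSExtractionMap = {
//     "/blog": [{ name: "titles", selectors: ["h1", "article h2"] }],
//   };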
export type WebAutomation = {
Evaluate: string;
} | {
Click: string;
} | {
Wait: number;
} | {
WaitForNavigation: boolean;
} | {
WaitFor: string;
} | {
WaitForAndClick: string;
} | {
ScrollX: number;
} | {
ScrollY: number;
} | {
Fill: {
selector: string;
value?: string;
};
} | {
InfiniteScroll: number;
};
export type ReturnFormat = "markdown" | "commonmark" | "raw" | "text" | "html2text" | "bytes" | "xml" | "empty";
export type WebAutomationMap = Record<string, WebAutomation[]>;
export type ExecutionScriptsMap = Record<string, string>;
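// Usage sketch (illustrative selectors and scripts): map a URL or path to a
// sequence of WebAutomation steps, or to a raw JavaScript snippet.
//
//   const automation: WebAutomationMap = {
//     "/": [
//       { WaitForAndClick: "#accept-cookies" },
//       { ScrollY: 2000 },
//       { Wait: 500 },
//     ],
//   };
//   const scripts: ExecutionScriptsMap = {
//     "/docs": "document.title",
//   };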
export declare enum RedirectPolicy {
Loose = "Loose",
Strict = "Strict"
}
/**
* Represents the options available for making a spider request.
*/
export interface SpiderParams {
/**
* The URL to be crawled.
*/
url: string;
/**
* The type of request to be made.
*/
request?: "http" | "chrome" | "smart";
/**
* The maximum number of pages the crawler should visit.
*/
limit?: number;
/**
* The format in which the result should be returned. When an array of formats is set, an object is returned mapping each format name to its output.
*/
return_format?: ReturnFormat | ReturnFormat[];
/**
* Specifies whether to only visit the top-level domain.
*/
tld?: boolean;
/**
* The depth of the crawl.
*/
depth?: number;
/**
* Specifies whether the request should be cached.
*/
cache?: boolean;
/**
* The budget for various resources.
*/
budget?: Budget;
/**
* The blacklist of routes to ignore. Entries can be regex string patterns.
*/
blacklist?: string[];
/**
* The whitelist of routes to crawl exclusively. Entries can be regex string patterns and can be combined with `blacklist`.
*/
whitelist?: string[];
/**
* The locale to be used during the crawl.
*/
locale?: string;
/**
* The cookies to be set for the request, formatted as a single string.
*/
cookies?: string;
/**
* Specifies whether to use stealth techniques to avoid detection.
*/
stealth?: boolean;
/**
* The headers to be used for the request.
*/
headers?: Headers;
/**
* Specifies whether anti-bot measures should be used.
*/
anti_bot?: boolean;
/**
* Specifies whether to include metadata in the response.
*/
metadata?: boolean;
/**
* Use CSS query selectors to scrape content from the web page. Map paths to CSS selector groups to perform extractions per path or page.
*/
css_extraction_map?: CSSExtractionMap;
/**
* The dimensions of the viewport.
*/
viewport?: Viewport;
/**
* The encoding to be used for the request.
*/
encoding?: "UTF-8" | "SHIFT_JIS" | string;
/**
* Specifies whether to include subdomains in the crawl.
*/
subdomains?: boolean;
/**
* The user agent string to be used for the request.
*/
user_agent?: string;
/**
* Specifies whether the response data should be stored.
*/
store_data?: boolean;
/**
* Use webhooks to send data.
*/
webhooks?: WebhookSettings;
/**
* Configuration settings for GPT (generative pre-trained transformer) processing.
*/
gpt_config?: Record<string, any>;
/**
* Specifies whether to use fingerprinting protection.
*/
fingerprint?: boolean;
/**
* Specifies whether to perform the request without using storage.
*/
storageless?: boolean;
/**
* Specifies whether readability optimizations should be applied.
*/
readability?: boolean;
/**
* Specifies whether to use a proxy for the request.
*/
proxy_enabled?: boolean;
/**
* Specifies whether to respect the site's robots.txt file.
*/
respect_robots?: boolean;
/**
* CSS root selector to be used to filter the content.
*/
root_selector?: string;
/**
* Specifies whether to load all resources of the crawl target.
*/
full_resources?: boolean;
/**
* Specifies whether to use the sitemap links.
*/
sitemap?: boolean;
/**
* Specifies whether to only use the sitemap links.
*/
sitemap_only?: boolean;
/**
* External domains to include in the crawl.
*/
external_domains?: string[];
/**
* Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
*/
return_embeddings?: boolean;
/**
* Returns the HTTP response headers used.
*/
return_headers?: boolean;
/**
* Returns the link(s) found on the page that match the crawler query.
*/
return_page_links?: boolean;
/**
* Returns the HTTP response cookies used.
*/
return_cookies?: boolean;
/**
* The timeout for the request, in milliseconds.
*/
request_timeout?: number;
/**
* Specifies whether to run the request in the background.
*/
run_in_background?: boolean;
/**
* Perform an infinite scroll on the page as new content loads. The `request` param also needs to be set to `chrome` or `smart`.
*/
scroll?: number;
/**
* Specifies whether to skip configuration checks.
*/
skip_config_checks?: boolean;
/**
* The chunking algorithm to use.
*/
chunking_alg?: ChunkingAlg;
/**
* Wait for events on the page. Requires `request` to be set to `chrome` or `smart`.
*/
wait_for?: WaitFor;
/**
* Disable request interception when `request` is `chrome` or `smart`. This can help when the page uses third-party or external scripts to load content.
*/
disable_intercept?: boolean;
/**
* Perform custom web automation tasks on a URL or URL path. Requires `request` to be set to `chrome` or `smart`.
*/
automation_scripts?: WebAutomationMap;
/**
* Perform custom JavaScript tasks on a URL or URL path. Requires `request` to be set to `chrome` or `smart`.
*/
execution_scripts?: ExecutionScriptsMap;
/**
* The redirect policy for HTTP requests. Set to `Loose` to allow all redirects.
*/
redirect_policy?: RedirectPolicy;
/**
* Track the requests sent and responses received when `request` is `chrome` or `smart`. Responses track the bytes used, and requests record the monotonic time sent.
*/
event_tracker?: {
responses?: true;
requests?: true;
};
/**
* The timeout to stop the crawl.
*/
crawl_timeout?: Timeout;
}
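// Usage sketch (illustrative; every value below is an example, not a
// default): a chrome-rendered crawl capped at 50 pages that returns
// markdown and waits for the network to go idle.
//
//   const params: SpiderParams = {
//     url: "https://example.com",
//     request: "chrome",
//     limit: 50,
//     return_format: "markdown",
//     metadata: true,
//     wait_for: {
//       idle_network: { timeout: { secs: 10, nanos: 0 } },
//     },
//   };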
export type SpiderCoreResponse = {
content?: string;
message?: string;
error?: string;
status?: number;
url?: string;
};
export type ChunkCallbackFunction = (data: SpiderCoreResponse) => void;
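// Usage sketch (illustrative): a ChunkCallbackFunction receives each
// streamed SpiderCoreResponse as it arrives.
//
//   const onChunk: ChunkCallbackFunction = (data) => {
//     if (data.error) console.error(data.status, data.error);
//     else if (data.content) console.log(data.url, data.content.length);
//   };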
export declare enum Collection {
Websites = "websites",
Pages = "pages",
PagesMetadata = "pages_metadata",
Contacts = "contacts",
CrawlState = "crawl_state",
CrawlLogs = "crawl_logs",
Profiles = "profiles",
Credits = "credits",
Webhooks = "webhooks",
APIKeys = "api_keys"
}
export declare enum ApiVersion {
V1 = "v1"
}
export declare enum APIRoutes {
Crawl = "crawl",
Links = "links",
Screenshot = "screenshot",
Search = "search",
Transform = "transform",
PiplineExtractLeads = "pipeline/extract-contacts",
PiplineLabel = "pipeline/label",
Data = "data",
DataCrawlState = "data/crawl_state",
DataSignUrl = "data/sign-url",
DataDownload = "data/download",
DataQuery = "data/query",
DataCredits = "data/credits"
}
export declare const APISchema: {
url: string;
versions: {
current: ApiVersion;
v1: {
routes: typeof APIRoutes;
end_date: string;
};
latest: {
routes: typeof APIRoutes;
end_date: string;
};
};
};
export declare const setBaseUrl: (url: string) => void;
export {};
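// Usage sketch (illustrative; the URL is a placeholder): override the
// default API endpoint, e.g. to point at a proxy or self-hosted instance.
//
//   setBaseUrl("https://spider-proxy.example.com");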