UNPKG

@spider-cloud/spider-client

Version:

Isomorphic JavaScript SDK for Spider Cloud services

315 lines (313 loc) 13.7 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Spider = void 0; const config_1 = require("./config"); const package_json_1 = require("../package.json"); const stream_reader_1 = require("./utils/stream-reader"); const exponential_backoff_1 = require("exponential-backoff"); /** * A class to interact with the Spider API. */ class Spider { /** * Create an instance of Spider. * @param {string | null} apiKey - The API key used to authenticate to the Spider API. If null, attempts to source from environment variables. * @throws Will throw an error if the API key is not provided. */ constructor(props) { var _a; this.apiKey = (props === null || props === void 0 ? void 0 : props.apiKey) || ((_a = process === null || process === void 0 ? void 0 : process.env) === null || _a === void 0 ? void 0 : _a.SPIDER_API_KEY); if (!this.apiKey) { throw new Error("No API key provided"); } } /** * Internal method to handle POST requests. * @param {string} endpoint - The API endpoint to which the POST request should be sent. * @param {Record<string, any>} data - The JSON data to be sent in the request body. * @param {boolean} [stream=false] - Whether to stream the response back without parsing. * @returns {Promise<Response | any>} The response in JSON if not streamed, or the Response object if streamed. */ async _apiPost(endpoint, data, stream, jsonl) { const headers = jsonl ? this.prepareHeadersJsonL : this.prepareHeaders; const response = await (0, exponential_backoff_1.backOff)(() => fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, { method: "POST", headers: headers, body: JSON.stringify(data), }), { numOfAttempts: 5, }); if (!stream) { if (response.ok) { return response.json(); } else { this.handleError(response, `post to ${endpoint}`); } } return response; } /** * Internal method to handle GET requests. * @param {string} endpoint - The API endpoint from which data should be retrieved. 
* @returns {Promise<any>} The data returned from the endpoint in JSON format. */ async _apiGet(endpoint) { const headers = this.prepareHeaders; const response = await (0, exponential_backoff_1.backOff)(() => fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, { method: "GET", headers: headers, }), { numOfAttempts: 5, }); if (response.ok) { return response.json(); } else { this.handleError(response, `get from ${endpoint}`); } } /** * Internal method to handle DELETE requests. * @param {string} endpoint - The API endpoint from which data should be retrieved. * @returns {Promise<any>} The data returned from the endpoint in JSON format. */ async _apiDelete(endpoint) { const headers = this.prepareHeaders; const response = await (0, exponential_backoff_1.backOff)(() => fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, { method: "DELETE", headers, }), { numOfAttempts: 5, }); if (response.ok) { return response; } else { this.handleError(response, `get from ${endpoint}`); } } /** * Scrapes data from a specified URL. * @param {string} url - The URL to scrape. * @param {GenericParams} [params={}] - Additional parameters for the scraping request. * @returns {Promise<any>} The scraped data from the URL. */ async scrapeUrl(url, params = {}) { return this._apiPost(config_1.APIRoutes.Crawl, { url: url, limit: 1, ...params }); } /** * Initiates a crawling job starting from the specified URL. * @param {string} url - The URL to start crawling. * @param {GenericParams} [params={}] - Additional parameters for the crawl. * @param {boolean} [stream=false] - Whether to receive the response as a stream. * @param {function} [callback=function] - The callback function when streaming per chunk. If this is set with stream you will not get a end response. * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming. 
*/ async crawlUrl(url, params = {}, stream = false, cb) { const jsonl = stream && cb; const res = await this._apiPost(config_1.APIRoutes.Crawl, { url, ...params }, stream, !!jsonl); if (jsonl) { return await (0, stream_reader_1.streamReader)(res, cb); } return res; } /** * Retrieves all links from the specified URL. * @param {string} url - The URL from which to gather links. * @param {GenericParams} [params={}] - Additional parameters for the crawl. * @param {boolean} [stream=false] - Whether to receive the response as a stream. * @param {function} [callback=function] - The callback function when streaming per chunk. If this is set with stream you will not get a end response. * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming. */ async links(url, params = {}, stream = false, cb) { const jsonl = stream && cb; const res = await this._apiPost(config_1.APIRoutes.Links, { url, ...params }, stream, !!jsonl); if (jsonl) { return await (0, stream_reader_1.streamReader)(res, cb); } return res; } /** * Takes a screenshot of the website starting from this URL. * @param {string} url - The URL to start the screenshot. * @param {GenericParams} [params={}] - Configuration parameters for the screenshot. * @returns {Promise<any>} The screenshot data. */ async screenshot(url, params = {}) { return this._apiPost(config_1.APIRoutes.Screenshot, { url: url, ...params }); } /** * Perform a search and gather a list of websites to start crawling and collect resources. * @param {string} search - The search query. * @param {GenericParams} [params={}] - Configuration parameters for the search. * @returns {Promise<any>} The result of the crawl, either structured data or a Response object if streaming. */ async search(q, params = {}) { return this._apiPost(config_1.APIRoutes.Search, { search: q, ...params }); } /** * Transform HTML to Markdown or text. You can send up to 10MB of data at once. 
* @param {object} data - The data to trasnform, a list of objects with the key 'html' and optional 'url' key for readability. * @param {object} [params={}] - Configuration parameters for the transformation. * @returns {Promise<any>} The transformation result. */ async transform(data, params = {}) { return this._apiPost(config_1.APIRoutes.Transform, { data, ...params }); } /** * Extracts leads from a website. * @param {string} url - The URL from which to extract contacts. * @param {GenericParams} [params={}] - Configuration parameters for the extraction. * @returns {Promise<any>} The contact information extracted. */ async extractContacts(url, params = {}) { return this._apiPost(config_1.APIRoutes.PiplineExtractLeads, { url: url, ...params, }); } /** * Applies labeling to data extracted from a specified URL. * @param {string} url - The URL to label. * @param {GenericParams} [params={}] - Configuration parameters for labeling. * @returns {Promise<any>} The labeled data. */ async label(url, params = {}) { return this._apiPost(config_1.APIRoutes.PiplineLabel, { url: url, ...params }); } /** * Check the crawl state of the website. * @param {string} url - The URL to check. * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query. * @returns {Promise<any>} The crawl state data. */ async getCrawlState(url, params = {}) { return this._apiPost(config_1.APIRoutes.DataCrawlState, { url: url, ...params }); } /** * Create a signed url to download files from the storage. * @param {string} [domain] - The domain for the user's storage. If not provided, downloads all files. * @param {Object} [options] - The download options. * @param {boolean} [raw] - Return the raw response. * @returns {Promise<Response>} The response containing the file stream. */ async createSignedUrl(url, options) { const { page, limit, expiresIn, domain, pathname } = options !== null && options !== void 0 ? 
options : {}; const params = new URLSearchParams({ ...(url && { url }), ...(domain && { domain }), ...(pathname && { pathname }), ...(page && { page: page.toString() }), ...(limit && { limit: limit.toString() }), ...(expiresIn && { expiresIn: expiresIn.toString() }), }); const endpoint = `${config_1.APISchema["url"]}/${config_1.APIRoutes.DataSignUrl}?${params.toString()}`; const headers = this.prepareHeaders; const response = await fetch(endpoint, { method: "GET", headers, }); if (response.ok) { return await response.json(); } else { this.handleError(response, `Failed to sign files`); } } /** * Retrieves the number of credits available on the account. * @returns {Promise<any>} The current credit balance. */ async getCredits() { return this._apiGet(config_1.APIRoutes.DataCredits); } /** * Send a POST request to insert data into a specified table. * @param {string} table - The table name in the database. * @param {object} data - The data to be inserted. * @returns {Promise<any>} The response from the server. */ async postData(collection, data) { return this._apiPost(`${config_1.APIRoutes.Data}/${collection}`, data); } /** * Send a GET request to retrieve data from a specified table. * @param {Collection} table - The table name in the database. * @param {object} params - The query parameters for data retrieval. * @returns {Promise<any>} The response from the server. */ async getData(collections, params) { return this._apiGet(`${config_1.APIRoutes.Data}/${collections}?${new URLSearchParams(params).toString()}`); } /** * Download a record. The url is the path of the storage hash returned and not the exact website url. * @param {QueryRequest} params - The query parameters for data retrieval. * @returns {Promise<any>} The download response from the server. 
*/ async download(query, output) { const headers = this.prepareHeaders; const endpoint = `${config_1.APIRoutes.DataDownload}?${new URLSearchParams(query).toString()}`; const response = await fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, { method: "GET", headers, }); if (response.ok) { if (output === "text") { return await response.text(); } return await response.blob(); } else { this.handleError(response, `get from ${endpoint}`); } } /** * Perform a query to get a document. * @param {QueryRequest} params - The query parameters for data retrieval. * @returns {Promise<any>} The response from the server. */ async query(query) { return this._apiGet(`${config_1.APIRoutes.DataQuery}?${new URLSearchParams(query).toString()}`); } /** * Send a DELETE request to remove data from a specified table. * @param {Collection} table - The table name in the database. * @param {object} params - Parameters to identify records to delete. * @returns {Promise<any>} The response from the server. */ async deleteData(collection, params) { return this._apiDelete(`${config_1.APIRoutes.Data}/${collection}?${new URLSearchParams(params).toString()}`); } /** * Prepares common headers for each API request. * @returns {HeadersInit} A headers object for fetch requests. */ get prepareHeaders() { return { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, "User-Agent": `Spider-Client/${package_json_1.version}`, }; } /** * Prepares common headers for each API request with JSONl content-type suitable for streaming. * @returns {HeadersInit} A headers object for fetch requests. */ get prepareHeadersJsonL() { return { ...this.prepareHeaders, "Content-Type": "application/jsonl", }; } /** * Handles errors from API requests. * @param {Response} response - The fetch response object. * @param {string} action - Description of the attempted action. * @throws Will throw an error with detailed status information. 
*/ handleError(response, action) { throw new Error(`Failed to ${action}. Status code: ${response.status}.`); } } exports.Spider = Spider;