@spider-cloud/spider-client
Version:
Isomorphic JavaScript SDK for Spider Cloud services
315 lines (313 loc) • 13.7 kB
JavaScript
// Flag this CommonJS module as a transpiled ES module so interop helpers in
// importing code can recognise it.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the named export; the real class is assigned after its definition.
exports.Spider = void 0;
const config_1 = require("./config");
const package_json_1 = require("../package.json");
const stream_reader_1 = require("./utils/stream-reader");
const exponential_backoff_1 = require("exponential-backoff");
/**
 * A class to interact with the Spider API.
 *
 * Every request is authenticated with a bearer token built from the API key
 * supplied to the constructor (or read from the environment).
 */
class Spider {
  /**
   * Create an instance of Spider.
   * @param {{ apiKey?: string | null }} [props] - Client options. When `apiKey` is
   * missing or empty, the `SPIDER_API_KEY` environment variable is used instead.
   * @throws {Error} "No API key provided" if no key can be resolved.
   */
  constructor(props) {
    const explicitKey = props == null ? undefined : props.apiKey;
    // `typeof` is the only safe probe here: a bare reference to an undeclared
    // `process` (e.g. in a browser) throws a ReferenceError before we can
    // report the missing key ourselves.
    const envKey =
      typeof process !== "undefined" && process != null && process.env != null
        ? process.env.SPIDER_API_KEY
        : undefined;
    this.apiKey = explicitKey || envKey;
    if (!this.apiKey) {
      throw new Error("No API key provided");
    }
  }

  /**
   * Internal method to handle POST requests.
   *
   * The fetch call is wrapped in `backOff` with `numOfAttempts: 5`, so only a
   * rejected fetch is retried; a response with an error status resolves
   * normally and is handled below.
   * @param {string} endpoint - The API endpoint to which the POST request should be sent.
   * @param {Record<string, any>} data - The JSON data to be sent in the request body.
   * @param {boolean} [stream=false] - Whether to return the Response without parsing or status checking.
   * @param {boolean} [jsonl=false] - Whether to send the JSONL content-type headers instead of the JSON ones.
   * @returns {Promise<Response | any>} The response in JSON if not streamed, or the Response object if streamed.
   * @throws {Error} If not streaming and the response status is not ok.
   */
  async _apiPost(endpoint, data, stream, jsonl) {
    const { backOff } = exponential_backoff_1;
    const headers = jsonl ? this.prepareHeadersJsonL : this.prepareHeaders;
    const sendRequest = () =>
      fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, {
        method: "POST",
        headers,
        body: JSON.stringify(data),
      });
    const response = await backOff(sendRequest, { numOfAttempts: 5 });
    if (stream) {
      return response;
    }
    if (response.ok) {
      return response.json();
    }
    this.handleError(response, `post to ${endpoint}`);
    return response;
  }

  /**
   * Internal method to handle GET requests.
   * @param {string} endpoint - The API endpoint from which data should be retrieved.
   * @returns {Promise<any>} The data returned from the endpoint in JSON format.
   * @throws {Error} If the response status is not ok.
   */
  async _apiGet(endpoint) {
    const { backOff } = exponential_backoff_1;
    const headers = this.prepareHeaders;
    const sendRequest = () =>
      fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, {
        method: "GET",
        headers,
      });
    const response = await backOff(sendRequest, { numOfAttempts: 5 });
    if (response.ok) {
      return response.json();
    }
    this.handleError(response, `get from ${endpoint}`);
  }

  /**
   * Internal method to handle DELETE requests.
   * @param {string} endpoint - The API endpoint identifying the data to delete.
   * @returns {Promise<Response>} The raw Response object (the body is not parsed).
   * @throws {Error} If the response status is not ok.
   */
  async _apiDelete(endpoint) {
    const { backOff } = exponential_backoff_1;
    const headers = this.prepareHeaders;
    const sendRequest = () =>
      fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, {
        method: "DELETE",
        headers,
      });
    const response = await backOff(sendRequest, { numOfAttempts: 5 });
    if (response.ok) {
      return response;
    }
    this.handleError(response, `delete from ${endpoint}`);
  }

  /**
   * Scrapes data from a specified URL.
   *
   * Posts to the crawl route with `limit: 1`; because `params` is spread last,
   * a `limit` (or `url`) key in `params` overrides these values.
   * @param {string} url - The URL to scrape.
   * @param {GenericParams} [params={}] - Additional parameters for the scraping request.
   * @returns {Promise<any>} The scraped data from the URL.
   */
  async scrapeUrl(url, params = {}) {
    return this._apiPost(config_1.APIRoutes.Crawl, { url: url, limit: 1, ...params });
  }

  /**
   * Initiates a crawling job starting from the specified URL.
   * @param {string} url - The URL to start crawling.
   * @param {GenericParams} [params={}] - Additional parameters for the crawl.
   * @param {boolean} [stream=false] - Whether to receive the response as a stream.
   * @param {function} [cb] - Callback handed to the stream reader. Only used when `stream` is
   * true; in that case the stream reader's result is returned instead of the Response.
   * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
   */
  async crawlUrl(url, params = {}, stream = false, cb) {
    const useStreamReader = Boolean(stream && cb);
    const res = await this._apiPost(config_1.APIRoutes.Crawl, { url, ...params }, stream, useStreamReader);
    if (useStreamReader) {
      const { streamReader } = stream_reader_1;
      return await streamReader(res, cb);
    }
    return res;
  }

  /**
   * Retrieves all links from the specified URL.
   * @param {string} url - The URL from which to gather links.
   * @param {GenericParams} [params={}] - Additional parameters for the crawl.
   * @param {boolean} [stream=false] - Whether to receive the response as a stream.
   * @param {function} [cb] - Callback handed to the stream reader. Only used when `stream` is
   * true; in that case the stream reader's result is returned instead of the Response.
   * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
   */
  async links(url, params = {}, stream = false, cb) {
    const useStreamReader = Boolean(stream && cb);
    const res = await this._apiPost(config_1.APIRoutes.Links, { url, ...params }, stream, useStreamReader);
    if (useStreamReader) {
      const { streamReader } = stream_reader_1;
      return await streamReader(res, cb);
    }
    return res;
  }

  /**
   * Takes a screenshot of the website starting from this URL.
   * @param {string} url - The URL to start the screenshot.
   * @param {GenericParams} [params={}] - Configuration parameters for the screenshot.
   * @returns {Promise<any>} The screenshot data.
   */
  async screenshot(url, params = {}) {
    return this._apiPost(config_1.APIRoutes.Screenshot, { url: url, ...params });
  }

  /**
   * Perform a search and gather a list of websites to start crawling and collect resources.
   * @param {string} q - The search query; sent as the `search` field of the request body.
   * @param {GenericParams} [params={}] - Configuration parameters for the search.
   * @returns {Promise<any>} The parsed JSON result of the search.
   */
  async search(q, params = {}) {
    return this._apiPost(config_1.APIRoutes.Search, { search: q, ...params });
  }

  /**
   * Transform HTML to Markdown or text. You can send up to 10MB of data at once.
   * @param {object} data - The data to transform, a list of objects with the key 'html' and optional 'url' key for readability.
   * @param {object} [params={}] - Configuration parameters for the transformation.
   * @returns {Promise<any>} The transformation result.
   */
  async transform(data, params = {}) {
    return this._apiPost(config_1.APIRoutes.Transform, { data, ...params });
  }

  /**
   * Extracts leads from a website.
   * @param {string} url - The URL from which to extract contacts.
   * @param {GenericParams} [params={}] - Configuration parameters for the extraction.
   * @returns {Promise<any>} The contact information extracted.
   */
  async extractContacts(url, params = {}) {
    return this._apiPost(config_1.APIRoutes.PiplineExtractLeads, {
      url: url,
      ...params,
    });
  }

  /**
   * Applies labeling to data extracted from a specified URL.
   * @param {string} url - The URL to label.
   * @param {GenericParams} [params={}] - Configuration parameters for labeling.
   * @returns {Promise<any>} The labeled data.
   */
  async label(url, params = {}) {
    return this._apiPost(config_1.APIRoutes.PiplineLabel, { url: url, ...params });
  }

  /**
   * Check the crawl state of the website.
   * @param {string} url - The URL to check.
   * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
   * @returns {Promise<any>} The crawl state data.
   */
  async getCrawlState(url, params = {}) {
    return this._apiPost(config_1.APIRoutes.DataCrawlState, { url: url, ...params });
  }

  /**
   * Create a signed url to download files from the storage.
   *
   * Only truthy values are added to the query string. The request is a single
   * fetch (it is not wrapped in `backOff`).
   * @param {string} [url] - The `url` query parameter.
   * @param {Object} [options] - Optional query parameters.
   * @param {number} [options.page] - The `page` query parameter.
   * @param {number} [options.limit] - The `limit` query parameter.
   * @param {number} [options.expiresIn] - The `expiresIn` query parameter.
   * @param {string} [options.domain] - The `domain` query parameter.
   * @param {string} [options.pathname] - The `pathname` query parameter.
   * @returns {Promise<any>} The parsed JSON body of the response.
   * @throws {Error} If the response status is not ok.
   */
  async createSignedUrl(url, options) {
    const { page, limit, expiresIn, domain, pathname } = options == null ? {} : options;
    const params = new URLSearchParams({
      ...(url && { url }),
      ...(domain && { domain }),
      ...(pathname && { pathname }),
      ...(page && { page: page.toString() }),
      ...(limit && { limit: limit.toString() }),
      ...(expiresIn && { expiresIn: expiresIn.toString() }),
    });
    // NOTE(review): unlike every other request in this class, this URL has no
    // ApiVersion.V1 segment - confirm against the API that this is intentional.
    const endpoint = `${config_1.APISchema["url"]}/${config_1.APIRoutes.DataSignUrl}?${params.toString()}`;
    const headers = this.prepareHeaders;
    const response = await fetch(endpoint, {
      method: "GET",
      headers,
    });
    if (response.ok) {
      return await response.json();
    }
    this.handleError(response, `Failed to sign files`);
  }

  /**
   * Retrieves the number of credits available on the account.
   * @returns {Promise<any>} The current credit balance.
   */
  async getCredits() {
    return this._apiGet(config_1.APIRoutes.DataCredits);
  }

  /**
   * Send a POST request to insert data into a specified collection.
   * @param {string} collection - The collection (table) name, appended to the data route.
   * @param {object} data - The data to be inserted.
   * @returns {Promise<any>} The response from the server.
   */
  async postData(collection, data) {
    return this._apiPost(`${config_1.APIRoutes.Data}/${collection}`, data);
  }

  /**
   * Send a GET request to retrieve data from a specified collection.
   * @param {Collection} collections - The collection (table) name, appended to the data route.
   * @param {object} params - The query parameters for data retrieval.
   * @returns {Promise<any>} The response from the server.
   */
  async getData(collections, params) {
    const queryString = new URLSearchParams(params).toString();
    return this._apiGet(`${config_1.APIRoutes.Data}/${collections}?${queryString}`);
  }

  /**
   * Download a record. The url is the path of the storage hash returned and not the exact website url.
   *
   * The request is a single fetch (it is not wrapped in `backOff`).
   * @param {QueryRequest} query - The query parameters for data retrieval.
   * @param {string} [output] - Pass "text" to receive the body as a string; any other value yields a Blob.
   * @returns {Promise<string | Blob>} The downloaded body.
   * @throws {Error} If the response status is not ok.
   */
  async download(query, output) {
    const headers = this.prepareHeaders;
    const endpoint = `${config_1.APIRoutes.DataDownload}?${new URLSearchParams(query).toString()}`;
    const response = await fetch(`${config_1.APISchema["url"]}/${config_1.ApiVersion.V1}/${endpoint}`, {
      method: "GET",
      headers,
    });
    if (!response.ok) {
      this.handleError(response, `get from ${endpoint}`);
      return undefined;
    }
    if (output === "text") {
      return await response.text();
    }
    return await response.blob();
  }

  /**
   * Perform a query to get a document.
   * @param {QueryRequest} query - The query parameters for data retrieval.
   * @returns {Promise<any>} The response from the server.
   */
  async query(query) {
    const queryString = new URLSearchParams(query).toString();
    return this._apiGet(`${config_1.APIRoutes.DataQuery}?${queryString}`);
  }

  /**
   * Send a DELETE request to remove data from a specified collection.
   * @param {Collection} collection - The collection (table) name, appended to the data route.
   * @param {object} params - Parameters to identify records to delete.
   * @returns {Promise<Response>} The raw Response from the server.
   */
  async deleteData(collection, params) {
    const queryString = new URLSearchParams(params).toString();
    return this._apiDelete(`${config_1.APIRoutes.Data}/${collection}?${queryString}`);
  }

  /**
   * Prepares common headers for each API request.
   * @returns {HeadersInit} A headers object for fetch requests.
   */
  get prepareHeaders() {
    return {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.apiKey}`,
      "User-Agent": `Spider-Client/${package_json_1.version}`,
    };
  }

  /**
   * Prepares common headers for each API request with JSONl content-type suitable for streaming.
   * @returns {HeadersInit} A headers object for fetch requests.
   */
  get prepareHeadersJsonL() {
    return {
      ...this.prepareHeaders,
      "Content-Type": "application/jsonl",
    };
  }

  /**
   * Handles errors from API requests.
   * @param {Response} response - The fetch response object.
   * @param {string} action - Description of the attempted action.
   * @throws {Error} Always; the message contains the action and the response status code.
   */
  handleError(response, action) {
    throw new Error(`Failed to ${action}. Status code: ${response.status}.`);
  }
}
// Publish the Spider class as this module's named export.
exports.Spider = Spider;
;