scrapegraph-js
Version:
Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs. Supports cookies for authentication, infinite scrolling, and pagination.
133 lines (119 loc) • 4.56 kB
JavaScript
import axios from 'axios';
import handleError from './utils/handleError.js';
import { ZodType } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';
import { isMockEnabled, getMockConfig } from './utils/mockConfig.js';
import { getMockResponse } from './utils/mockResponse.js';
/**
* Start a crawl job using the ScrapeGraphAI API.
*
* @param {string} apiKey - Your ScrapeGraph AI API key
* @param {string} url - The starting URL for the crawl
* @param {string|null} prompt - The prompt to guide the crawl and extraction (null for markdown mode)
* @param {Object|ZodType|null} schema - JSON schema or Zod schema defining the structure of the extracted data (null for markdown mode)
* @param {Object} [options] - Optional crawl parameters
* @param {boolean} [options.extractionMode=true] - true for AI extraction, false for markdown conversion (NO AI/LLM)
* @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
* @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
* @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
* @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
* @param {boolean} [options.sitemap] - Whether to use sitemap for better page discovery
* @param {number} [options.batchSize=1] - Batch size for processing pages (1-10)
* @param {boolean} [options.mock] - Override mock mode for this request
* @param {boolean} [options.renderHeavyJs=false] - Whether to render heavy JavaScript on the page
* @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
* @returns {Promise<Object>} The crawl job response
* @throws {Error} Throws an error if the HTTP request fails
*/
export async function crawl(
apiKey,
url,
prompt,
schema,
options = {}
) {
const { mock = null, renderHeavyJs = false, stealth = false } = options;
// Check if mock mode is enabled
const useMock = mock !== null ? mock : isMockEnabled();
if (useMock) {
console.log('🧪 Mock mode active. Returning stub for crawl request');
const mockConfig = getMockConfig();
const mockData = getMockResponse('POST', 'https://api.scrapegraphai.com/v1/crawl', mockConfig.customResponses, mockConfig.customHandler);
return mockData;
}
const endpoint = 'https://api.scrapegraphai.com/v1/crawl';
const headers = {
'accept': 'application/json',
'SGAI-APIKEY': apiKey,
'Content-Type': 'application/json',
};
let schemaPayload = null;
if (schema !== null && schema !== undefined) {
if (schema instanceof ZodType) {
schemaPayload = zodToJsonSchema(schema);
} else if (typeof schema === 'object') {
schemaPayload = schema;
} else {
throw new Error('The schema must be a Zod schema, a plain object, or null');
}
}
const {
cacheWebsite = true,
depth = 2,
maxPages = 2,
sameDomainOnly = true,
sitemap = false,
batchSize = 1,
} = options;
const payload = {
url,
prompt,
schema: schemaPayload,
cache_website: cacheWebsite,
depth,
max_pages: maxPages,
same_domain_only: sameDomainOnly,
sitemap,
batch_size: batchSize,
render_heavy_js: renderHeavyJs,
};
if (stealth) {
payload.stealth = stealth;
}
try {
const response = await axios.post(endpoint, payload, { headers });
return response.data;
} catch (error) {
handleError(error);
}
}
/**
* Get the result of a crawl job by ID.
*
* @param {string} apiKey - Your ScrapeGraph AI API key
* @param {string} crawlId - The crawl job ID
* @returns {Promise<Object>} The crawl result
* @throws {Error} Throws an error if the HTTP request fails
*/
export async function getCrawlRequest(apiKey, crawlId, options = {}) {
const { mock = null } = options;
// Check if mock mode is enabled
const useMock = mock !== null ? mock : isMockEnabled();
if (useMock) {
console.log('🧪 Mock mode active. Returning stub for getCrawlRequest');
const mockConfig = getMockConfig();
const mockData = getMockResponse('GET', `https://api.scrapegraphai.com/v1/crawl/${crawlId}`, mockConfig.customResponses, mockConfig.customHandler);
return mockData;
}
const endpoint = `https://api.scrapegraphai.com/v1/crawl/${crawlId}`;
const headers = {
'accept': 'application/json',
'SGAI-APIKEY': apiKey,
};
try {
const response = await axios.get(endpoint, { headers });
return response.data;
} catch (error) {
handleError(error);
}
}