scrapegraph-js
Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs. Supports cookies for authentication, infinite scrolling, and pagination.
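Besides crawl, the package exports smartScraper for single-page extraction; the cookies, infinite-scroll, and pagination support mentioned above is surfaced through its optional arguments. Below is a minimal sketch of the basic call: the three-argument form follows the SDK's README, while the optional cookie/scroll/pagination parameters and their order are an assumption here, so check the README before relying on them.

import { smartScraper } from 'scrapegraph-js';
import 'dotenv/config';

const apiKey = process.env.SGAI_APIKEY;

(async () => {
  try {
    // Basic single-page extraction: API key, target URL, extraction prompt.
    const response = await smartScraper(
      apiKey,
      'https://example.com',
      'Summarize what this page is about'
    );
    console.log(JSON.stringify(response, null, 2));
    // For authenticated pages, infinite scrolling, or paginated listings,
    // smartScraper accepts further optional arguments (cookies, number of
    // scrolls, total pages). Their exact order is an assumption here; see
    // the scrapegraph-js README for the authoritative signature.
  } catch (error) {
    console.error('smartScraper failed:', error);
  }
})();

The full crawl example below starts a multi-page crawl job against a JSON Schema and polls for the result: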
import { crawl, getCrawlRequest } from '../index.js';
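// (This example lives inside the SDK repo; in your own project, import from
// the published 'scrapegraph-js' package instead of the relative path.)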
import 'dotenv/config';
// Example .env file:
// SGAI_APIKEY=your_sgai_api_key
const apiKey = process.env.SGAI_APIKEY;
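// JSON Schema (draft-07) describing the structure the extracted data should follow.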
const schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "ScrapeGraphAI Website Content",
"type": "object",
"properties": {
"company": {
"type": "object",
"properties": {
"name": { "type": "string" },
"description": { "type": "string" },
"features": { "type": "array", "items": { "type": "string" } },
"contact_email": { "type": "string", "format": "email" },
"social_links": {
"type": "object",
"properties": {
"github": { "type": "string", "format": "uri" },
"linkedin": { "type": "string", "format": "uri" },
"twitter": { "type": "string", "format": "uri" }
},
"additionalProperties": false
}
},
"required": ["name", "description"]
},
"services": {
"type": "array",
"items": {
"type": "object",
"properties": {
"service_name": { "type": "string" },
"description": { "type": "string" },
"features": { "type": "array", "items": { "type": "string" } }
},
"required": ["service_name", "description"]
}
},
"legal": {
"type": "object",
"properties": {
"privacy_policy": { "type": "string" },
"terms_of_service": { "type": "string" }
},
"required": ["privacy_policy", "terms_of_service"]
}
},
"required": ["company", "services", "legal"]
};
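// Target site and the natural-language extraction prompt.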
const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? I also need the text content from their privacy policy and terms of service.';
(async () => {
if (!apiKey) {
console.error('SGAI_APIKEY not found in environment. Please set it in your .env file.');
process.exit(1);
}
try {
// Start the crawl job
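    // Crawl options: depth limits how many link levels are followed from the
    // start URL, maxPages caps the total pages fetched, sameDomainOnly keeps
    // the crawl on the starting domain, and batchSize controls how many pages
    // are processed per batch.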
console.log(`\nStarting crawl for: ${url}`);
const crawlResponse = await crawl(apiKey, url, prompt, schema, {
cacheWebsite: true,
depth: 2,
maxPages: 2,
sameDomainOnly: true,
batchSize: 1,
});
console.log('\nCrawl job started. Response:');
console.log(JSON.stringify(crawlResponse, null, 2));
// If the crawl is asynchronous and returns an ID, fetch the result
const crawlId = crawlResponse.id || crawlResponse.task_id;
if (crawlId) {
console.log('\nPolling for crawl result...');
      let completed = false;
      for (let i = 0; i < 10; i++) {
        // Wait five seconds between status checks.
        await new Promise((resolve) => setTimeout(resolve, 5000));
        const result = await getCrawlRequest(apiKey, crawlId);
        if (result.status === 'success' && result.result) {
          console.log('\nCrawl completed. Result:');
          console.log(JSON.stringify(result.result.llm_result, null, 2));
          completed = true;
          break;
        } else if (result.status === 'failed') {
          console.log('\nCrawl failed. Result:');
          console.log(JSON.stringify(result, null, 2));
          completed = true;
          break;
        } else {
          console.log(`Status: ${result.status}, waiting...`);
        }
      }
      if (!completed) {
        console.log('\nPolling timed out before the crawl finished.');
      }
} else {
console.log('No crawl ID found in response. Synchronous result:');
console.log(JSON.stringify(crawlResponse, null, 2));
}
} catch (error) {
console.error('Error occurred:', error);
}
})();
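To run the example, place a .env file containing SGAI_APIKEY next to the script (as shown in the comment at the top) and execute it with Node. The script polls up to ten times at five-second intervals, so it gives up after roughly fifty seconds if the crawl has not reached a terminal state.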