@pinkpixel/prysm-mcp
Version:
MCP server for the Prysm web scraper - enabling AI assistants to scrape web content
513 lines (402 loc) • 13.9 kB
Markdown
# Integrating Prysm into Your Project
Prysm is a powerful, structure-aware web scraper that you can integrate into your projects in multiple ways. This guide explains how to use Prysm in your own applications.
## Installation
First, add Prysm to your project:
```bash
# Clone the repository
git clone https://github.com/pinkpixel-dev/prysm.git
# Install dependencies
cd prysm/scraper
npm install
```
## Integration Methods
There are three main ways to use Prysm in your projects (plus an npm-package variant of the first):
### 1. Direct Module Import
Import Prysm directly into your Node.js code:
```javascript
// Load the scraper's main entry point from your local Prysm checkout.
const prysm = require('./path/to/prysm/scraper/main_scraper');

// Scrape a single page and log a short summary of the result.
async function scrapeWebsite() {
  try {
    // Every option below is shown at its default value except where noted.
    const options = {
      maxScrolls: 5,
      bypassCloudflare: true,
      handlePagination: true,
      headless: true,
      // Smart Scan options
      skipAnalysis: false, // Enable Smart Scan (default)
      focused: false, // Use standard mode (default)
      article: false // Use auto-detection (default)
    };
    const result = await prysm('https://example.com', options);

    console.log(`Title: ${result.title}`);
    console.log(`Content items: ${result.content.length}`);
    console.log(`Structure type: ${result.structureType}`);

    // Process the scraped data
    // ...
    return result;
  } catch (error) {
    console.error('Scraping error:', error);
  }
}

scrapeWebsite();
```
### 1a. NPM Package
You can also install Prysm directly from npm:
```bash
npm install @pinkpixel/prysm
```
Then import and use it in your code:
```javascript
const { scrape } = require('@pinkpixel/prysm');

// Scrape a page with the published npm package and print a summary.
async function scrapeWebsite() {
  try {
    const result = await scrape('https://example.com', {
      maxScrolls: 5,
      bypassCloudflare: true,
      handlePagination: true,
      // Smart Scan options
      skipAnalysis: false, // Enable Smart Scan (default)
      focused: true, // Speed-optimized mode
      article: true // Force article mode
    });

    const { title, content, structureType } = result;
    console.log(`Title: ${title}`);
    console.log(`Content items: ${content.length}`);
    console.log(`Structure type: ${structureType}`);
    return result;
  } catch (error) {
    console.error('Scraping error:', error);
  }
}

scrapeWebsite();
```
### 2. CLI Integration (via shell execution)
Execute Prysm's CLI from your application:
```javascript
const { exec } = require('child_process');
const fs = require('fs');
const path = require('path');

/**
 * Run the Prysm CLI against a URL and resolve with the parsed results JSON.
 *
 * @param {string} url - Page to scrape.
 * @param {object} [options] - CLI flags (maxScrolls, noHeadless, output,
 *   and the Smart Scan switches shown below).
 * @returns {Promise<object>} Parsed contents of the results file the CLI wrote.
 */
function scrapeUrl(url, options = {}) {
  return new Promise((resolve, reject) => {
    const prysmPath = path.join(__dirname, 'path/to/prysm/scraper');

    // Build command with options.
    // NOTE(review): values are interpolated into a shell string — only pass
    // trusted input here, or switch to execFile with an argument array.
    let cmd = `npm run start:cli "${url}"`;
    if (options.maxScrolls) cmd += ` --maxScrolls ${options.maxScrolls}`;
    if (options.noHeadless) cmd += ` --noHeadless`;
    if (options.output) cmd += ` --output "${options.output}"`;
    // Smart Scan options
    if (options.skipAnalysis) cmd += ` --skipAnalysis`;
    if (options.analyze) cmd += ` --analyze`;
    if (options.focused) cmd += ` --focused`;
    if (options.deep) cmd += ` --deep`;
    if (options.article) cmd += ` --article`;
    if (options.product) cmd += ` --product`;
    if (options.listing) cmd += ` --listing`;

    // Run inside the Prysm directory via `cwd` rather than `cd … &&`,
    // which breaks on paths containing spaces.
    exec(cmd, { cwd: prysmPath }, (error, stdout) => {
      if (error) {
        reject(error);
        return;
      }
      // Parse the output to get the result file path
      const resultPathMatch = stdout.match(/Full results saved to:\s*([^\s]+)/);
      if (!resultPathMatch || !resultPathMatch[1]) {
        reject(new Error('Could not find result path in output'));
        return;
      }
      // Read and parse the JSON file instead of require()-ing it:
      // require caches the module and expects a resolvable module path.
      fs.readFile(resultPathMatch[1], 'utf8', (readErr, data) => {
        if (readErr) {
          reject(readErr);
          return;
        }
        try {
          resolve(JSON.parse(data));
        } catch (parseErr) {
          reject(parseErr);
        }
      });
    });
  });
}

// Usage
scrapeUrl('https://example.com', {
  maxScrolls: 10,
  focused: true, // Speed-optimized mode
  article: true // Force article extraction
})
  .then(results => console.log(results))
  .catch(err => console.error(err));
```
### 3. API Integration (RESTful)
Start the API server and communicate with it via HTTP:
```javascript
const axios = require('axios');
const { spawn } = require('child_process');
const path = require('path');
class PrysmAPI {
  /**
   * Thin HTTP client for the Prysm REST API, plus lifecycle helpers to
   * spawn/stop the API server process.
   * @param {object} [options]
   * @param {string} [options.baseUrl] - API base URL; replaced automatically
   *   once startServer() detects the real port.
   */
  constructor(options = {}) {
    this.baseUrl = options.baseUrl || 'http://localhost:3001';
    this.apiProcess = null;
  }

  /**
   * Spawn the API server and resolve with its base URL once it reports ready.
   * Rejects if the process fails to spawn or does not start within 10 s.
   * @returns {Promise<string>} The detected base URL.
   */
  async startServer() {
    return new Promise((resolve, reject) => {
      const prysmPath = path.join(__dirname, 'path/to/prysm/scraper');
      this.apiProcess = spawn('npm', ['run', 'start:api'], {
        cwd: prysmPath,
        stdio: ['ignore', 'pipe', 'pipe']
      });

      let settled = false;

      // Timeout if the server doesn't start. The timer is cleared on
      // success so it can't fire (or keep the event loop alive) after
      // the promise has already resolved.
      const timer = setTimeout(() => {
        if (!settled) {
          settled = true;
          reject(new Error('API server failed to start within timeout'));
        }
      }, 10000);

      // Wait for the server's ready banner and pull the port out of it.
      let output = '';
      this.apiProcess.stdout.on('data', (data) => {
        output += data.toString();
        if (output.includes('Prysm API running at')) {
          const match = output.match(/http:\/\/localhost:(\d+)/);
          if (match && !settled) {
            settled = true;
            clearTimeout(timer);
            this.baseUrl = `http://localhost:${match[1]}`;
            resolve(this.baseUrl);
          }
        }
      });

      // npm routinely writes warnings/progress to stderr, so stderr data
      // alone must not reject the startup promise; drain it instead and
      // let the timeout catch a server that never becomes ready.
      this.apiProcess.stderr.on('data', () => {});

      // A genuine spawn failure (e.g. npm missing) is fatal.
      this.apiProcess.on('error', (err) => {
        if (!settled) {
          settled = true;
          clearTimeout(timer);
          reject(new Error(`Server error: ${err.message}`));
        }
      });
    });
  }

  /** Stop the API server process if one is running. */
  async stopServer() {
    if (this.apiProcess) {
      this.apiProcess.kill();
      this.apiProcess = null;
    }
  }

  /** Create a scraping job; resolves with the API's job descriptor. */
  async createJob(url, options = {}) {
    const response = await axios.post(`${this.baseUrl}/api/jobs`, {
      url,
      options
    });
    return response.data;
  }

  /** Fetch the current status object for a job. */
  async getJobStatus(jobId) {
    const response = await axios.get(`${this.baseUrl}/api/jobs/${jobId}`);
    return response.data;
  }

  /** Fetch the final results of a completed job. */
  async getJobResults(jobId) {
    const response = await axios.get(`${this.baseUrl}/api/jobs/${jobId}/results`);
    return response.data.result;
  }

  /**
   * Poll a job until it completes or fails.
   * @param {string} jobId
   * @param {number} [interval=2000] - Milliseconds between status checks.
   * @param {number} [timeout=300000] - Overall deadline in milliseconds.
   * @returns {Promise<object>} The job results.
   * @throws {Error} If the job fails or the deadline passes.
   */
  async pollUntilComplete(jobId, interval = 2000, timeout = 300000) {
    const startTime = Date.now();
    while (Date.now() - startTime < timeout) {
      const status = await this.getJobStatus(jobId);
      if (status.status === 'completed') {
        return await this.getJobResults(jobId);
      } else if (status.status === 'failed') {
        throw new Error(`Job failed: ${status.error}`);
      }
      // Wait before checking again
      await new Promise(resolve => setTimeout(resolve, interval));
    }
    throw new Error('Job timed out');
  }
}
// Usage: start the API server, run a single job, and always shut down.
async function scrapeWithAPI() {
  const prysm = new PrysmAPI();
  try {
    await prysm.startServer();
    console.log('API server started');

    const jobOptions = {
      maxScrolls: 5,
      bypassCloudflare: true,
      // Smart Scan options
      skipAnalysis: false, // Use Smart Scan (default)
      focused: true, // Speed-optimized mode
      product: true // Force product page extraction
    };
    const job = await prysm.createJob('https://example.com', jobOptions);
    console.log(`Job created: ${job.jobId}`);

    // Wait for job to complete
    const results = await prysm.pollUntilComplete(job.jobId);
    console.log('Scraping results:', results);
    return results;
  } catch (error) {
    console.error('API error:', error);
  } finally {
    await prysm.stopServer();
  }
}

scrapeWithAPI();
```
## Practical Examples
### Example 1: Scrape Content and Save to Database
```javascript
const mongoose = require('mongoose');
const prysm = require('./path/to/prysm/scraper/main_scraper');

// Define a model for scraped content. Wrapping the field definitions in an
// explicit Schema is the documented mongoose idiom.
const Article = mongoose.model('Article', new mongoose.Schema({
  title: String,
  content: [String],
  url: String,
  scrapedAt: Date,
  structureType: String
}));

/**
 * Scrape a URL with Prysm and persist the result as an Article document.
 * @param {string} url - Page to scrape.
 * @returns {Promise<object|undefined>} The saved Article, or undefined if
 *   scraping/saving failed (the error is logged).
 */
async function scrapeAndStore(url) {
  // Connect to MongoDB
  await mongoose.connect('mongodb://localhost:27017/scraped_content');
  try {
    // Scrape the URL
    const result = await prysm(url, {
      maxScrolls: 10,
      bypassCloudflare: true
    });

    // Create a new article
    const article = new Article({
      title: result.title,
      content: result.content,
      url: url,
      scrapedAt: new Date(),
      structureType: result.structureType
    });

    // Save to database
    await article.save();
    console.log(`Saved article: ${result.title}`);
    return article;
  } catch (error) {
    console.error('Error:', error);
  } finally {
    // Await the disconnect so the connection is fully closed before the
    // promise returned by scrapeAndStore settles.
    await mongoose.disconnect();
  }
}

// Usage
scrapeAndStore('https://example.com/article');
```
### Example 2: Bulk Scraping with Prysm API
```javascript
const fs = require('fs').promises;
const axios = require('axios');

/**
 * Scrape a list of URLs through the Prysm API and save each result to disk.
 * Relies on the PrysmAPI class from the previous example being in scope.
 *
 * @param {string[]} urlList - URLs to scrape.
 * @param {string} outDir - Directory for the per-site JSON files (created
 *   if it does not exist).
 * @returns {Promise<Array<{url: string, data: object}>>} Successful results;
 *   failed URLs are logged and skipped.
 */
async function bulkScrape(urlList, outDir) {
  // Start API server (using the PrysmAPI class from previous example)
  const prysm = new PrysmAPI();
  await prysm.startServer();
  try {
    await fs.mkdir(outDir, { recursive: true });

    // Create jobs for all URLs
    const jobs = [];
    for (const url of urlList) {
      const job = await prysm.createJob(url, {
        maxScrolls: 5,
        bypassCloudflare: true,
        handlePagination: true
      });
      // Keep the URL alongside the descriptor so it is available for
      // filenames and logging even if the API response omits it.
      jobs.push({ ...job, url });
      console.log(`Created job ${job.jobId} for ${url}`);
    }

    // Wait for all jobs to complete
    const results = [];
    for (const job of jobs) {
      try {
        const result = await prysm.pollUntilComplete(job.jobId);
        results.push({ url: job.url, data: result });

        // Save to a file named after the sanitized hostname.
        const filename = new URL(job.url).hostname.replace(/[^a-z0-9]/g, '_');
        await fs.writeFile(
          `${outDir}/${filename}.json`,
          JSON.stringify(result, null, 2)
        );
        console.log(`Completed: ${job.url}`);
      } catch (err) {
        console.error(`Failed to scrape ${job.url}:`, err.message);
      }
    }
    return results;
  } finally {
    await prysm.stopServer();
  }
}

// Usage
const urlsToScrape = [
  'https://example.com/page1',
  'https://example.com/page2',
  'https://othersite.com/article'
];

bulkScrape(urlsToScrape, './scraped-data')
  .then(results => console.log(`Scraped ${results.length} pages`))
  .catch(err => console.error(err));
```
## Handling Scraped Data
Prysm's output format is consistent regardless of how you use it:
```javascript
{
title: "Page Title",
content: ["Paragraph 1", "Paragraph 2", ...],
metadata: { /* page metadata */ },
structureType: "article", // or "recipe", "product", etc.
paginationType: "infinite", // pagination type detected
extractionMethod: "ai", // extraction method used
url: "https://example.com/page"
}
```
## Tips for Production Use
1. **Error Handling**: Always wrap Prysm calls in try/catch blocks
2. **Resource Management**: Consider adding timeouts and resource limits
3. **Caching**: Implement caching to avoid re-scraping the same URLs
4. **Rate Limiting**: Add delays between requests to avoid overloading target sites
5. **Respect robots.txt**: Check robots.txt before scraping
6. **Proxy Rotation**: For large-scale scraping, rotate proxies to avoid IP blocks
## MCP Integration and Environment Variables
Prysm-LLM is optimized for integration with Large Language Models through the Model Context Protocol (MCP). To configure the output directories when using with MCP or other automated systems, you can use environment variables.
### Environment Variables
Two environment variables can be used to configure output paths:
- `PRYSM_OUTPUT_DIR` - Sets the main output directory for results (default: ~/prysm/output)
- `PRYSM_IMAGE_OUTPUT_DIR` - Sets the output directory for downloaded images (default: ~/prysm/output/images)
These variables allow you to configure paths without command line arguments, which is especially useful for MCP integration where you may not have direct control over command execution.
### Setting Environment Variables
```javascript
// In Node.js, before importing the scraper
process.env.PRYSM_OUTPUT_DIR = '/custom/path/to/results';
process.env.PRYSM_IMAGE_OUTPUT_DIR = '/custom/path/to/images';
// Then import and use the scraper
const { scrape } = require('@pinkpixel/prysm-llm');
```
In a shell environment:
```bash
export PRYSM_OUTPUT_DIR="/custom/path/to/results"
export PRYSM_IMAGE_OUTPUT_DIR="/custom/path/to/images"
```
### MCP Configuration Example
When configuring Prysm-LLM for use with MCP, it's important to set these environment variables in your integration code:
```javascript
// MCP integration example
const { scrape } = require('@pinkpixel/prysm-llm');

// Configure environment variables for MCP
process.env.PRYSM_OUTPUT_DIR = '/tmp/mcp-results';
process.env.PRYSM_IMAGE_OUTPUT_DIR = '/tmp/mcp-images';

// Quick, speed-oriented preset: few scrolls with a short delay.
// Caller-supplied options override the preset values.
async function scrapeFocused(url, options = {}) {
  return scrape(url, { maxScrolls: 5, scrollDelay: 1000, ...options });
}

// Balanced preset: moderate scroll depth and delay.
async function scrapeBalanced(url, options = {}) {
  return scrape(url, { maxScrolls: 10, scrollDelay: 2000, ...options });
}

// Thorough preset: many scrolls with a long delay.
async function scrapeDeep(url, options = {}) {
  return scrape(url, { maxScrolls: 20, scrollDelay: 3000, ...options });
}
```
### Important Notes for MCP Integration
1. **Output Paths**: Ensure that the MCP environment has write permissions to the specified output directories.
2. **Minimal Output**: Prysm-LLM is designed to minimize console output for clean LLM integration.
3. **Error Handling**: In MCP environments, errors are returned as part of the result object rather than being logged.
4. **Headless Mode**: Always use headless mode (default) in MCP environments.
By following this guide, you should be able to integrate Prysm into your projects effectively and leverage its powerful scraping capabilities.
---
✨ Dream it, Pixel it | Made with ❤️ by Pink Pixel