@pinkpixel/prysm-mcp
Version:
MCP server for the Prysm web scraper - enabling AI assistants to scrape web content
274 lines (234 loc) • 7.11 kB
JavaScript
/**
* Simple Job Queue Manager
* Manages the processing of scraping jobs in a queue.
*/
const jobModel = require('./job');
const mainScraper = require('../../main_scraper');
const path = require('path');
const axios = require('axios').default;
// Queue state
// Pending entries of the form { jobId, priority }; addJob re-sorts it by
// ascending priority after every insertion.
const queue = [];
// True only while processQueue is inside its dispatch loop; used as a
// re-entrancy guard by addJob and processQueue.
let isProcessing = false;
const MAX_CONCURRENT_JOBS = 1; // Limit concurrent jobs (can be adjusted)
// IDs of jobs that have been handed to processJob and have not settled yet.
const activeJobs = new Set();
/**
 * Add a job to the processing queue.
 *
 * Looks the job up in the job model, enqueues it with its priority
 * (default 5 when the job has none) and starts the dispatcher if it is
 * not already running.
 *
 * @param {string} jobId - Job identifier
 * @returns {boolean} - `false` when no job with that id exists, `true` once
 *   the job has been queued
 */
function addJob(jobId) {
  const job = jobModel.getJob(jobId);
  if (!job) return false;

  // `??` instead of `||`: lower numbers mean higher priority, so an explicit
  // priority of 0 is the most urgent value and must not be replaced by the
  // default of 5 just because it is falsy.
  queue.push({
    jobId,
    priority: job.priority ?? 5
  });

  // Keep the queue ordered so the dispatcher can simply shift() the next job
  // (lower number = higher priority).
  queue.sort((a, b) => a.priority - b.priority);

  // Start processing if the dispatcher is not already running
  if (!isProcessing) {
    processQueue();
  }
  return true;
}
/**
 * Dispatch queued jobs until the queue is empty or every concurrency slot
 * (MAX_CONCURRENT_JOBS) is taken.
 *
 * Jobs are started in the background and are not awaited here. When a job
 * settles its slot is released and the dispatcher is run again, so jobs
 * that were queued while all slots were busy are picked up without needing
 * another addJob() call.
 *
 * @returns {Promise<void>}
 */
async function processQueue() {
  if (isProcessing || queue.length === 0) return;
  isProcessing = true;

  // Re-run the dispatcher without leaving a floating rejected promise behind.
  const resume = () => {
    processQueue().catch((resumeError) => {
      console.error('Error while resuming job queue:', resumeError);
    });
  };

  try {
    // Process jobs while there are items in the queue and slots available
    while (queue.length > 0 && activeJobs.size < MAX_CONCURRENT_JOBS) {
      const { jobId } = queue.shift();

      // Skip if job no longer exists
      if (!jobModel.getJob(jobId)) continue;

      activeJobs.add(jobId);
      processJob(jobId)
        .catch(() => {
          // processJob has already logged the failure and recorded it on the
          // job before rethrowing. Swallowing it here keeps a failed job from
          // surfacing as an unhandled promise rejection.
        })
        .finally(() => {
          activeJobs.delete(jobId);
          // A slot just became free: pick up anything that was waiting.
          resume();
        });
    }
  } finally {
    isProcessing = false;
    // The loop only exits normally once the queue is empty or all slots are
    // busy, so this is reached with work left only if the loop threw.
    if (queue.length > 0 && activeJobs.size < MAX_CONCURRENT_JOBS) {
      resume();
    }
  }
}
/**
 * Translate the boolean convenience flags stored on a job into the option
 * names handed to the scraper. Works on a shallow copy, so the options object
 * stored on the job is never modified.
 *
 * @param {object} [jobOptions] - Options stored on the job (may be missing)
 * @returns {object} - Options to pass to the scraper
 */
function buildScraperOptions(jobOptions) {
  const options = { ...(jobOptions || {}) };

  // Smart Scan flags
  if (options.analyze) {
    options.analyzeOnly = true;
  }
  if (options.skipAnalysis) {
    options.skipAnalysis = true; // normalise any truthy value to `true`
  }

  // Speed profile flags - first match wins: focused, then deep, then standard
  if (options.focused) {
    options.profile = 'speed';
  } else if (options.deep) {
    options.profile = 'thorough';
  } else if (options.standard) {
    options.profile = 'balanced';
  }

  // Content type flags are applied last, so they override a speed profile.
  // First match wins: article, then product, then listing.
  if (options.article) {
    options.profile = 'article';
  } else if (options.product) {
    options.profile = 'product';
  } else if (options.listing) {
    options.profile = 'listing';
  }

  return options;
}

/**
 * Report whether the stored job record is currently marked as cancelled.
 *
 * @param {string} jobId - Job identifier
 * @returns {boolean}
 */
function isJobCancelled(jobId) {
  const latest = jobModel.getJob(jobId);
  return Boolean(latest) && latest.status === jobModel.JobStatus.CANCELLED;
}

/**
 * Best-effort webhook notification: delivery problems are logged, never thrown.
 *
 * @param {string} jobId - Job identifier
 * @param {object} fallbackJob - Job snapshot to use if the record can no longer be read
 * @param {object|null} results - Scrape results (null if the job failed)
 * @param {*} [error] - Value thrown by the failed job, if any
 */
async function notifyJobWebhook(jobId, fallbackJob, results, error = null) {
  try {
    // Re-read the job so the payload carries the status written just before
    // this call rather than the snapshot taken before processing started.
    const latest = jobModel.getJob(jobId) || fallbackJob;
    if (!latest.webhook) return;
    await sendWebhook(latest, results, error);
  } catch (webhookError) {
    console.error(`Error sending webhook for job ${jobId}:`, webhookError);
  }
}

/**
 * Process a single job: mark it as processing, run the scraper, store the
 * results, mark the job completed and notify its webhook (if configured).
 *
 * On failure the job is marked FAILED, the webhook is notified and the
 * original error is rethrown. If the job was cancelled while the scraper was
 * running, its CANCELLED status is left untouched, nothing is stored and no
 * webhook is sent.
 *
 * @param {string} jobId - Job identifier
 * @returns {Promise<object|undefined>} - Scrape results, or undefined when the job does not exist
 * @throws Rethrows whatever the scraper or the job model threw
 */
async function processJob(jobId) {
  const job = jobModel.getJob(jobId);
  if (!job) return;

  // Update job status to processing
  jobModel.updateJob(jobId, {
    status: jobModel.JobStatus.PROCESSING,
    progress: 5
  });

  try {
    console.log(`🚀 Processing job ${jobId}: ${job.url}`);

    // Execute the scraper
    const results = await mainScraper(job.url, buildScraperOptions(job.options));

    // cancelJob() cannot interrupt a running scrape; it only flags the job.
    // Honour that flag here instead of overwriting it with COMPLETED.
    if (isJobCancelled(jobId)) {
      console.log(`Job ${jobId} was cancelled while running; discarding results`);
      return results;
    }

    // Store results before publishing COMPLETED so that nobody can observe a
    // completed job whose results are not available yet.
    await jobModel.storeResults(jobId, results);
    jobModel.updateJob(jobId, {
      status: jobModel.JobStatus.COMPLETED,
      progress: 100
    });

    // Send webhook notification if configured
    await notifyJobWebhook(jobId, job, results);

    console.log(`✅ Completed job ${jobId}`);
    return results;
  } catch (error) {
    console.error(`❌ Error processing job ${jobId}:`, error);

    if (!isJobCancelled(jobId)) {
      // Update job status to failed. `?.` because anything can be thrown,
      // including null or undefined.
      jobModel.updateJob(jobId, {
        status: jobModel.JobStatus.FAILED,
        error: error?.message || 'Unknown error',
        progress: 0
      });

      // Send webhook notification for failure if configured
      await notifyJobWebhook(jobId, job, null, error);
    }

    throw error;
  }
}
/**
 * POST a completion or failure notification to the job's webhook URL.
 * Does nothing when the job has no webhook configured.
 *
 * @param {object} job - Job object
 * @param {object|null} results - Job results (null if job failed)
 * @param {Error|null} error - Error object if job failed
 * @returns {Promise<void>}
 * @throws Rethrows the request error when delivery fails
 */
async function sendWebhook(job, results, error = null) {
  const target = job.webhook;
  if (!target) return;

  const body = {
    jobId: job.jobId,
    status: job.status,
    url: job.url,
    createdAt: job.createdAt,
    completedAt: job.completedAt,
    error: error ? error.message : job.error
  };

  // Attach a compact summary instead of the full results
  if (results) {
    let contentLength = 0;
    if (results.content) {
      contentLength = results.content.length;
    }

    // Smart Scan information
    const smartScan = {
      profileUsed: job.options?.profile || 'default',
      optimizedStrategy: results.optimizedStrategy || results.recommendedStrategy,
      analysisPerformed: !job.options?.skipAnalysis
    };

    body.resultSummary = {
      title: results.title,
      contentLength,
      structureType: results.structureType,
      paginationType: results.paginationType,
      contentType: results.contentType || results.analysisResult?.contentType,
      smartScan
    };
  }

  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      'User-Agent': 'Prysm-Scraper-API/1.0'
    },
    timeout: 10000 // 10 second timeout
  };

  try {
    await axios.post(target, body, requestConfig);
    console.log(`📨 Webhook sent for job ${job.jobId}`);
  } catch (deliveryError) {
    console.error(`⚠️ Webhook delivery failed for job ${job.jobId}:`, deliveryError.message);
    throw deliveryError;
  }
}
/**
 * Cancel a job that is waiting in the queue or currently being processed.
 *
 * A waiting job is removed from the queue. A job that is already running
 * cannot be interrupted from here, so it is only marked as cancelled.
 *
 * @param {string} jobId - Job identifier
 * @returns {boolean} - `true` if the job was found (queued or active) and
 *   marked cancelled, `false` otherwise
 */
function cancelJob(jobId) {
  const position = queue.findIndex((entry) => entry.jobId === jobId);
  const isQueued = position !== -1;

  // Neither waiting nor running: nothing to cancel
  if (!isQueued && !activeJobs.has(jobId)) {
    return false;
  }

  if (isQueued) {
    queue.splice(position, 1);
  }

  const { CANCELLED } = jobModel.JobStatus;
  jobModel.updateJob(jobId, { status: CANCELLED });
  return true;
}
/**
 * Snapshot of the queue's current state.
 *
 * @returns {object} - `waiting`: number of queued jobs, `active`: number of
 *   jobs currently being processed, `isProcessing`: whether the dispatcher
 *   loop is running right now
 */
function getQueueStatus() {
  const waiting = queue.length;
  const active = activeJobs.size;
  return { waiting, active, isProcessing };
}
// Public API of the queue manager. processQueue, processJob and sendWebhook
// are not exported.
module.exports = {
addJob,
cancelJob,
getQueueStatus
};