uns-mcp-server
Pure JavaScript MCP server for Unstructured.io - No Python required!
/**
 * Web Content Processing Example
 * Extract and process content from websites using the Firecrawl integration.
 * The invoke_firecrawl_* and check_*_status functions called below are the
 * MCP tools exposed by uns-mcp-server, assumed to be bound in scope when
 * this example runs.
 */
// Example: Process documentation website
async function processDocumentationSite() {
// Step 1: Start web crawling job
const crawlJob = await invoke_firecrawl_crawlhtml({
url: "https://docs.example.com",
max_depth: 3,
max_pages: 100,
include_paths: ["/api/*", "/guides/*"],
exclude_paths: ["/archive/*", "/legacy/*"],
wait_for_selector: ".content-loaded", // Wait for dynamic content
headers: {
"User-Agent": "Documentation Crawler 1.0"
}
});
console.log(`Started crawl job: ${crawlJob.id}`);
// Step 2: Monitor crawl progress
let crawlStatus = await check_crawlhtml_status(crawlJob.id);
while (crawlStatus.state === "running") {
console.log(`Crawling progress: ${crawlStatus.pages_crawled}/${crawlStatus.total_pages}`);
console.log(`Current URL: ${crawlStatus.current_url}`);
await new Promise(resolve => setTimeout(resolve, 5000));
crawlStatus = await check_crawlhtml_status(crawlJob.id);
}
if (crawlStatus.state !== "completed") {
throw new Error(`Crawl failed: ${crawlStatus.error}`);
}
console.log(`✅ Crawled ${crawlStatus.pages_crawled} pages`);
// Step 3: Generate LLM-optimized text
const textJob = await invoke_firecrawl_llmtxt({
crawl_job_id: crawlJob.id,
output_format: "markdown",
clean_html: true,
remove_scripts: true,
remove_styles: true,
extract_main_content: true,
// LLM optimization settings
llm_config: {
chunk_size: 2000,
overlap: 200,
preserve_structure: true,
extract_metadata: true,
summarize_sections: false
},
// S3 output configuration
s3_output: {
bucket: "processed-content",
prefix: `docs/${new Date().toISOString().split('T')[0]}/`,
format: "jsonl"
}
});
console.log(`Started text generation job: ${textJob.id}`);
// Step 4: Wait for text generation
let textStatus = await check_llmtxt_status(textJob.id);
while (textStatus.state === "processing") {
console.log(`Processing: ${textStatus.progress}%`);
await new Promise(resolve => setTimeout(resolve, 3000));
textStatus = await check_llmtxt_status(textJob.id);
}
if (textStatus.state === "completed") {
console.log("✅ Text generation completed!");
console.log(`Output location: ${textStatus.output_url}`);
console.log(`Total chunks: ${textStatus.chunks_generated}`);
return textStatus.output_url;
} else {
throw new Error(`Text generation failed: ${textStatus.error}`);
}
}
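// Consumption sketch: if output_url is a presigned HTTP(S) URL (an
// assumption; it may instead be an s3:// URI that needs an S3 SDK), the
// JSONL chunks can be pulled back and parsed like this.
async function loadChunks(outputUrl) {
  const response = await fetch(outputUrl);
  if (!response.ok) {
    throw new Error(`Failed to fetch chunks: HTTP ${response.status}`);
  }
  const text = await response.text();
  return text
    .split("\n")
    .filter(line => line.trim().length > 0)
    .map(line => JSON.parse(line)); // one chunk object per JSONL line
}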
// Example: Process product catalog from e-commerce site
async function processProductCatalog() {
// Crawl product pages
const crawlJob = await invoke_firecrawl_crawlhtml({
url: "https://shop.example.com",
max_depth: 2,
include_paths: ["/products/*", "/category/*"],
// Custom selectors for product data
custom_selectors: {
product_name: "h1.product-title",
price: ".price-tag",
description: ".product-description",
specifications: "table.specs",
reviews: ".review-content"
},
// JavaScript execution for dynamic content
javascript_enabled: true,
wait_time: 2000,
// Pagination handling
pagination: {
selector: "a.next-page",
max_pages: 50
}
});
// Wait for completion
await waitForCrawlCompletion(crawlJob.id);
// Process into structured data
const processedData = await createStructuredProductData(crawlJob.id);
// Send to vector database for search
await indexProductsToVectorDB(processedData);
return processedData;
}
// Example: Monitor competitor websites for changes
async function monitorCompetitorSites() {
// last_crawl_id would normally be loaded from persistent storage so that
// each run can diff against the previous crawl; null means "first run".
const competitors = [
  { name: "Competitor A", url: "https://competitor-a.com", selector: ".pricing", last_crawl_id: null },
  { name: "Competitor B", url: "https://competitor-b.com", selector: ".products", last_crawl_id: null },
  { name: "Competitor C", url: "https://competitor-c.com", selector: ".features", last_crawl_id: null }
];
const results = [];
for (const competitor of competitors) {
try {
// Crawl specific sections
const crawlJob = await invoke_firecrawl_crawlhtml({
url: competitor.url,
max_depth: 1,
include_paths: ["/pricing", "/products", "/features"],
custom_selectors: {
monitor_content: competitor.selector
},
// Comparison with previous version
compare_with_previous: true,
previous_crawl_id: competitor.last_crawl_id
});
const result = await waitForCrawlCompletion(crawlJob.id);
// Check for changes
if (result.has_changes) {
console.log(`⚠️ Changes detected on ${competitor.name}`);
results.push({
competitor: competitor.name,
changes: result.changes,
timestamp: new Date().toISOString()
});
}
} catch (error) {
console.error(`Failed to monitor ${competitor.name}:`, error);
}
}
return results;
}
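// Scheduling sketch: run the competitor check on a fixed interval. A real
// deployment would more likely use cron or a job queue; setInterval is just
// the simplest self-contained illustration.
function scheduleCompetitorMonitoring(intervalMs = 24 * 60 * 60 * 1000) {
  return setInterval(async () => {
    try {
      const changes = await monitorCompetitorSites();
      if (changes.length > 0) {
        console.log(`Detected changes on ${changes.length} competitor site(s)`);
        // e.g. forward the change records to Slack, email, or a dashboard
      }
    } catch (error) {
      console.error("Competitor monitoring run failed:", error);
    }
  }, intervalMs);
}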
// Example: Create knowledge base from multiple sources
async function createKnowledgeBase() {
const sources = [
{ type: "docs", url: "https://docs.example.com" },
{ type: "blog", url: "https://blog.example.com" },
{ type: "support", url: "https://support.example.com" },
{ type: "wiki", url: "https://wiki.example.com" }
];
const knowledgeBase = [];
for (const source of sources) {
// Crawl each source
const crawlJob = await invoke_firecrawl_crawlhtml({
url: source.url,
max_depth: 4,
max_pages: 200,
// Content extraction rules
extraction_rules: {
title: "h1, h2",
content: "main, article, .content",
metadata: "meta",
links: "a[href]"
}
});
await waitForCrawlCompletion(crawlJob.id);
// Generate LLM-ready text
const textJob = await invoke_firecrawl_llmtxt({
crawl_job_id: crawlJob.id,
output_format: "markdown",
// Organize by sections
structure_config: {
group_by: "url_path",
create_toc: true,
add_metadata: true,
source_type: source.type
}
});
const result = await waitForTextGeneration(textJob.id);
knowledgeBase.push({
source: source.type,
url: source.url,
content: result.content,
metadata: result.metadata,
processed_at: new Date().toISOString()
});
}
// Store in vector database for RAG
await storeInVectorDB(knowledgeBase);
return knowledgeBase;
}
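// Usage sketch: build the knowledge base and log a summary of what was
// collected from each source.
async function runKnowledgeBaseBuild() {
  const knowledgeBase = await createKnowledgeBase();
  for (const entry of knowledgeBase) {
    console.log(`[${entry.source}] ${entry.url} processed at ${entry.processed_at}`);
  }
  return knowledgeBase;
}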
// Helper functions
async function waitForCrawlCompletion(jobId) {
let status;
do {
await new Promise(resolve => setTimeout(resolve, 5000));
status = await check_crawlhtml_status(jobId);
console.log(`Crawl progress: ${status.pages_crawled} pages`);
} while (status.state === "running");
if (status.state !== "completed") {
throw new Error(`Crawl failed: ${status.error}`);
}
return status;
}
async function waitForTextGeneration(jobId) {
let status;
do {
await new Promise(resolve => setTimeout(resolve, 3000));
status = await check_llmtxt_status(jobId);
console.log(`Text generation: ${status.progress}%`);
} while (status.state === "processing");
if (status.state !== "completed") {
throw new Error(`Text generation failed: ${status.error}`);
}
return status;
}
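// Optional hardening sketch: the wait helpers above poll forever if a job
// hangs. A bounded variant like this gives up after a timeout. The terminal
// state names other than "completed" are assumptions about the job API.
async function pollUntil(checkStatus, { intervalMs = 5000, timeoutMs = 10 * 60 * 1000 } = {}) {
  const terminalStates = new Set(["completed", "failed", "cancelled"]);
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const status = await checkStatus();
    if (terminalStates.has(status.state)) {
      return status;
    }
    await new Promise(resolve => setTimeout(resolve, intervalMs));
  }
  throw new Error(`Polling timed out after ${timeoutMs} ms`);
}
// e.g. const status = await pollUntil(() => check_crawlhtml_status(jobId));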
async function createStructuredProductData(crawlJobId) {
// Implementation would extract and structure product data
console.log("Creating structured product data...");
return [];
}
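// A sketch of what the structuring step might look like, assuming each
// crawled page object carries the custom_selectors fields requested in
// processProductCatalog (product_name, price, etc.); that page shape is an
// assumption for illustration, not a documented result format.
function structureProductPages(pages) {
  return pages.map(page => ({
    url: page.url,
    name: page.product_name?.trim() ?? "",
    price: parsePrice(page.price),
    description: page.description?.trim() ?? "",
    specifications: page.specifications ?? {},
    review_count: Array.isArray(page.reviews) ? page.reviews.length : 0
  }));
}
function parsePrice(raw) {
  // Strip currency symbols and thousands separators: "$1,299.00" -> 1299
  const numeric = Number(String(raw ?? "").replace(/[^0-9.]/g, ""));
  return Number.isFinite(numeric) ? numeric : null;
}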
async function indexProductsToVectorDB(products) {
// Implementation would index products to vector database
console.log(`Indexing ${products.length} products to vector DB...`);
}
async function storeInVectorDB(knowledgeBase) {
// Implementation would store in vector database
console.log(`Storing ${knowledgeBase.length} documents in vector DB...`);
}
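// A sketch of what the two vector DB stubs above might do, assuming a
// hypothetical HTTP upsert endpoint at VECTOR_DB_URL; the endpoint and the
// request shape are illustrative, not a real client API.
async function upsertToVectorDB(documents) {
  const response = await fetch(`${process.env.VECTOR_DB_URL}/upsert`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      documents: documents.map((doc, index) => ({
        id: doc.id ?? `doc-${index}`,
        text: doc.content ?? JSON.stringify(doc),
        metadata: doc.metadata ?? {}
      }))
    })
  });
  if (!response.ok) {
    throw new Error(`Vector DB upsert failed: HTTP ${response.status}`);
  }
  return response.json();
}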
module.exports = {
processDocumentationSite,
processProductCatalog,
monitorCompetitorSites,
createKnowledgeBase
};
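// Run directly with `node web-content-processing.js` (filename illustrative)
// to exercise the documentation pipeline end to end.
if (require.main === module) {
  processDocumentationSite()
    .then(url => console.log(`Done. Output: ${url}`))
    .catch(error => {
      console.error(error);
      process.exitCode = 1;
    });
}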