mcp-smart-crawler
Version:
A command-line tool acting as an MCP (ModelContextProtocol) server, using Playwright to crawl web content for AI models.
327 lines (326 loc) • 15.4 kB
JavaScript
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { z } from 'zod';
import { parseArgs, downloadFile, extractXhsUrl, ensureBrowserInstalled } from './helper.js';
import { defaultUserAgent } from './constants.js';
import * as fs from 'fs';
import * as path from 'path';
import { chromium } from 'playwright';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// 1. Ensure the browser dependency is met *before* running the main logic
ensureBrowserInstalled();
// Create server instance
const server = new McpServer({
name: 'mcp-smart-crawler',
version: '1.0.0',
capabilities: {
resources: {},
tools: {},
},
});
// 解析命令行参数
const args = process.argv.slice(2); // 跳过 node 和脚本路径
let saveFolder = './downloads'; // 默认路径
// 使用 __dirname 获取当前文件所在目录
const currentFileDirectory = __dirname;
const parsedArgs = parseArgs(args);
if (parsedArgs['download-folder']) {
saveFolder = parsedArgs['download-folder'];
console.error(`Save folder set to: ${saveFolder}`);
}
else {
console.error('No --download-folder specified, using default:', saveFolder);
saveFolder = path.join(currentFileDirectory, saveFolder);
}
// 确保保存目录存在
if (!fs.existsSync(saveFolder)) {
fs.mkdirSync(saveFolder, { recursive: true });
}
const absolutePath = path.resolve(saveFolder);
console.error(`Created download folder: ${absolutePath}`);
const XhsPostSchema = z.object({
success: z.boolean(),
url: z.string().optional(),
title: z.string().optional(),
description: z.string().optional(),
images: z.array(z.string()).optional(),
downloadedVideoPaths: z.array(z.string()).optional(), // Paths to downloaded files
error: z.string().optional(),
debugInfo: z.object({
videoSourcesFound: z.array(z.string()).optional(),
interceptedVideoRequests: z.array(z.string()).optional(),
}).optional(),
});
server.tool('get-xhs-post', 'Get post content, images, and download videos from Xiaohongshu (Xiaohongshu/小红书) by shark link', {
shareLink: z.string().describe('The share link copied from Xiaohongshu (小红书) app or web'),
}, async ({ shareLink }) => {
const downloadDir = saveFolder; // Use the directory specified in command line args
console.error(`[get-xhs-post] Received share link text: "${shareLink}"`);
console.error(`[get-xhs-post] Download directory: "${downloadDir}"`);
// 1. Ensure download directory exists
try {
if (!fs.existsSync(downloadDir)) {
fs.mkdirSync(downloadDir, { recursive: true });
}
}
catch (err) {
return {
content: [{
type: 'text',
// Error creating directory
text: `Error creating download directory`
}]
};
}
// 2. Extract URL
const postUrl = extractXhsUrl(shareLink);
if (!postUrl) {
return {
content: [{
type: 'text',
// Invalid URL
text: 'Invalid Xiaohongshu share link'
}]
};
}
let browser = null;
const downloadedVideoPaths = [];
const interceptedVideoRequests = []; // For debugging routed URLs
const videoSourcesFound = []; // For debugging <video> src attributes
const ua = process?.env?.ua || defaultUserAgent;
try {
// 3. Launch Playwright
browser = await chromium.launch({ headless: true });
const context = await browser.newContext({
userAgent: ua,
viewport: { width: 1280, height: 800 },
locale: 'zh-CN',
});
const page = await context.newPage();
// **** SETUP page.route INTERCEPTION ****
// Define a pattern to match your video URLs. Be specific if possible.
// Example: Match URLs ending in .mp4 from the specific XHS CDN host
const videoUrlPattern = /https:\/\/sns-bak-v\d+\.xhscdn\.com\/.*\.mp4/;
// Or a more general pattern (might intercept too much, filter inside handler)
// const videoUrlPattern = /.*\.mp4/;
console.error(`[get-xhs-post] Setting up route intercept for pattern: ${videoUrlPattern}`);
await page.route(videoUrlPattern, async (route, request) => {
const reqUrl = request.url();
console.error(`[route handler] Intercepted request: ${reqUrl}`);
interceptedVideoRequests.push(reqUrl); // Log the URL we are handling
try {
// 1. Fetch the original response *through the route*
// This allows Playwright to capture the response body more reliably
console.error(`[route handler] Fetching original response for: ${reqUrl}`);
const response = await route.fetch(); // Use route.fetch() not context.request.fetch()
console.error(`[route handler] Original response status: ${response.status()} for ${reqUrl}`);
// Check if response is OK and likely video content before processing
const contentType = response.headers()['content-type']?.toLowerCase();
if (response.ok() && contentType?.startsWith('video/')) {
// 2. Get the response body buffer
console.error(`[route handler] Getting response buffer for: ${reqUrl}`);
const bodyBuffer = await response.body(); // Try getting buffer from the fetched response
console.error(`[route handler] Got buffer (size: ${bodyBuffer.length} bytes) for: ${reqUrl}`);
if (bodyBuffer.length > 0) {
// 3. Save the buffer to file
const video_pathname = path.basename(new URL(reqUrl).pathname);
// if (video_pathname.endsWith('.mp4')) {
const filename = video_pathname.endsWith('.mp4') ? video_pathname : `${video_pathname}_${Date.now()}.mp4`; // Basic filename
const filePath = path.join(downloadDir, filename);
console.error(`[route handler] Saving video to: ${filePath}`);
fs.writeFileSync(filePath, bodyBuffer); // Consider async or streams for large files
downloadedVideoPaths.push(filePath);
console.error(`[route handler] Successfully saved: ${filePath}`);
}
else {
console.warn(`[route handler] Response body buffer is empty for: ${reqUrl}`);
}
}
else {
console.error(`[route handler] Skipping body processing for ${reqUrl} (Status: ${response.status()}, Content-Type: ${contentType})`);
}
// 4. CRITICAL: Fulfill the route with the original response
// This sends the data back to the browser page so the video player works
console.error(`[route handler] Fulfilling route for: ${reqUrl}`);
await route.fulfill({ response }); // Pass the APIResponse object back
}
catch (error) {
console.error(`[route handler] Error processing route for ${reqUrl}: ${error}`);
// Abort the route on error to avoid hanging the page request indefinitely
// Alternatively, you could try fulfilling with an error status if appropriate
await route.abort();
}
});
// **** END page.route SETUP ****
// Remove the page.on('response') handler for video saving as it's now handled by page.route
// page.on('response', ...) // REMOVE or keep only for logging non-video responses
// 4. Navigate
console.error(`[get-xhs-post] Navigating to ${postUrl}...`);
await page.goto(postUrl, { waitUntil: 'networkidle', timeout: 90000 }); // networkidle might be okay now, or 'load'
console.error(`[get-xhs-post] Navigation complete. Current URL: ${page.url()}`);
// 5. Wait a bit more if needed, although route should capture requests during load
// await page.waitForTimeout(5000); // Maybe less critical now
// --- Extract metadata (Title, Description, Images) ---
const noteContainerSelector = '#noteContainer'; // *** REPLACE ***
const titleSelector = '#detail-title'; // *** REPLACE ***
const descSelector = '#detail-desc'; // *** REPLACE ***
const imageSelector = '.swiper-slide img'; // *** REPLACE ***
// ... (metadata extraction logic remains the same)
let title = '';
let description = '';
const images = [];
try {
await page.waitForSelector(noteContainerSelector, { timeout: 5000 }).catch(() => console.warn('Note container not found'));
title = await page.locator(titleSelector).first().textContent({ timeout: 3000 }) ?? '';
description = await page.locator(descSelector).first().innerText({ timeout: 3000 }) ?? '';
const imageElements = await page.locator(imageSelector).all();
for (const img of imageElements) {
let src = await img.getAttribute('src') || await img.getAttribute('data-src');
if (src && src.startsWith('http'))
images.push(src);
}
// Get video element sources for debugging
const videoElements = await page.locator('video').all();
for (const videoEl of videoElements) {
const src = await videoEl.getAttribute('src');
if (src)
videoSourcesFound.push(src);
}
}
catch (metaError) {
console.warn(`[get-xhs-post] Error extracting metadata: ${metaError}`);
}
// --- Construct result ---
// The downloadedVideoPaths array is populated by the route handler now
console.error(`[get-xhs-post] Finalizing result. Videos downloaded: ${downloadedVideoPaths.length}`);
const result = {
success: true,
url: page.url(),
title: title.trim(),
description: description.trim(),
images: images,
downloadedVideoPaths: downloadedVideoPaths,
debugInfo: {
videoSourcesFound: videoSourcesFound,
interceptedVideoRequests: interceptedVideoRequests,
}
};
if (videoSourcesFound.some(src => src?.includes('.mp4')) && downloadedVideoPaths.length === 0) {
console.warn("[get-xhs-post] MP4 source detected in <video> tag, but no MP4 file was downloaded via routing. Check route pattern and network logs.");
// Consider adding a note to the error or result indicating potential download failure
}
XhsPostSchema.parse(result);
// Cleanup
if (browser) {
console.error('[get-xhs-post] Closing browser.');
await browser.close();
}
const contents = [];
if (title) {
contents.push({
type: 'text',
text: `Post Title: ${title}`
});
}
if (description) {
contents.push({
type: 'text',
text: `Post Description: ${description.trim()}`
});
}
if (downloadedVideoPaths?.length > 0) {
contents.push({
type: 'text',
text: `Downloaded Videos: ${downloadedVideoPaths.map((videoPath) => {
return `\n- ${videoPath}`;
}).join('')}`
});
}
if (images?.length > 0) {
for (const image of images) {
let imageBase64 = undefined;
// try{
// const imageName = Date.now() + '.' + 'jpeg'; // path.basename(image);
// const imagePath = path.join(downloadDir, imageName);
// // Download the image
// imageBase64 = await downloadFile(image, imagePath)
// }catch(error: any){
// console.error(`Error downloading image: ${error?.message}`);
// }
const imageName = path.basename(image);
const imagePath = path.join(downloadDir, imageName);
let imageContentInfo = `image URL: ${image};`;
const imageSaveInfo = await downloadFile(image, imagePath);
imageBase64 = imageSaveInfo?.dataUri || undefined;
const contentType = imageSaveInfo?.contentType;
const errorInfo = imageSaveInfo?.error;
if (contentType) {
imageContentInfo += `Content-Type: ${contentType};`;
}
if (errorInfo) {
// imageContentInfo += `Error: ${errorInfo};`
}
else if (imageBase64) {
imageContentInfo += `Image downloaded successfully. total size: ${imageBase64.length}; it has been saved to: ${imagePath};`;
}
contents.push({
type: 'text',
text: imageContentInfo
});
// contents.push({
// type: "text",
// text: `Image downloaded successfully. total size: ${imageBase64 && imageBase64.length || 0}; contentType: ${contentType || ''}; some base64 data: ${imageBase64?.slice(0, 50)}...; ${errorInfo ? `Error: ${errorInfo}` : ''}`
// })
// if(imageBase64){
// contents.push({
// type: 'image',
// data: imageBase64,
// mimeType: contentType || 'image/webp'
// // type: 'resource',
// // resource: {
// // blob: imageBase64,
// // uri: image,
// // mimeType: contentType || 'image/webp'
// // }
// })
// }
}
}
if (contents.length === 0) {
contents.push({
type: 'text',
text: `No content found in the post.`
});
}
return {
content: contents
};
}
catch (error) {
console.error(`[get-xhs-post] General error during scraping: ${error}`);
// Cleanup
if (browser) {
console.error('[get-xhs-post] Closing browser.');
await browser.close();
}
return {
content: [{
type: 'text',
// Error message
text: `Get Post Error: ${error}`
}]
};
}
});
async function main() {
const transport = new StdioServerTransport();
await server.connect(transport);
console.error('MCP Server running on stdio');
}
main().catch(error => {
console.error('Fatal error in main():', error);
process.exit(1);
});