/*
 * @supadata/mcp
 * Version: (not listed)
 * MCP server for Supadata video & web scraping integration. Features include YouTube, TikTok, Instagram, Twitter, and file video transcription, web scraping, batch processing, and structured data extraction.
 * 433 lines (409 loc) • 15.7 kB
 * JavaScript
 */
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { Supadata } from '@supadata/js';
import { z } from 'zod';
import dotenv from 'dotenv';
dotenv.config();
// Configuration schema
/**
 * Exported zod object schema for server configuration.
 *
 * Fields:
 * - `supadataApiKey`: required string.
 * - `debug`: boolean, defaults to `false` when omitted.
 *
 * NOTE(review): nothing in the visible code reads this schema; presumably it
 * is consumed by whatever imports this module — confirm.
 */
export const configSchema = z.object({
supadataApiKey: z.string().describe('Supadata API key for authentication'),
debug: z.boolean().default(false).describe('Enable debug logging'),
});
// Input schemas for tools
/**
 * Argument shape for the `supadata_scrape` tool: a plain object mapping each
 * argument name to its zod type (passed as-is to `server.tool`).
 *
 * NOTE(review): the `supadata_scrape` handler in this file logs `noLinks` and
 * `lang` but only passes `url` to `client.web.scrape` — confirm whether these
 * two options are meant to reach the API.
 */
const scrapeInputSchema = {
// Required string; no URL-format validation is applied here.
url: z.string().describe('Web page URL to scrape'),
// Optional boolean, defaults to false.
noLinks: z
.boolean()
.default(false)
.describe('When true, removes markdown links from the content'),
// Optional string, defaults to 'en'.
lang: z
.string()
.default('en')
.describe('Preferred language for the scraped content (ISO 639-1 code)'),
};
/**
 * Argument shape for the `supadata_map` tool: a single required string `url`.
 */
const mapInputSchema = {
url: z.string().describe('URL of the website to map'),
};
/**
 * Argument shape for the `supadata_crawl` tool.
 *
 * - `url`: required string.
 * - `limit`: number constrained to the inclusive range 1..5000, defaulting to
 *   100 when omitted. No integer constraint is applied.
 */
const crawlInputSchema = {
url: z.string().describe('URL of the webpage to crawl'),
limit: z
.number()
.min(1)
.max(5000)
.default(100)
.describe('Maximum number of pages to crawl (1-5000, default: 100)'),
};
/**
 * Argument shape for the `supadata_transcript` tool.
 *
 * Only `url` is required. `text` defaults to `false`; `lang`, `chunkSize` and
 * `mode` are optional and stay `undefined` when the caller omits them.
 */
const transcriptInputSchema = {
// Required string; no URL-format validation is applied here.
url: z
.string()
.describe('Video or file URL to get transcript from (YouTube, TikTok, Instagram, Twitter, file)'),
lang: z.string().optional().describe('Preferred language code (ISO 639-1)'),
// Boolean with a default, so the handler always receives true or false.
text: z
.boolean()
.default(false)
.describe('Return plain text instead of formatted output'),
// Optional number; no minimum or integer constraint is declared.
chunkSize: z
.number()
.optional()
.describe('Maximum characters per transcript chunk'),
// Optional; restricted to exactly these three string values.
mode: z
.enum(['native', 'auto', 'generate'])
.optional()
.describe('Transcript generation mode'),
};
/**
 * Argument shape for the `supadata_check_transcript_status` tool: a single
 * required string `id`, forwarded by the handler to
 * `client.transcript.getJobStatus`.
 */
const checkTranscriptStatusInputSchema = {
id: z
.string()
.describe('Transcript job ID returned from supadata_transcript'),
};
/**
 * Argument shape for the `supadata_check_crawl_status` tool: a single required
 * string `id`, forwarded by the handler to `client.web.getCrawlResults`.
 */
const checkCrawlStatusInputSchema = {
id: z.string().describe('Crawl job ID returned from supadata_crawl'),
};
// Configuration for retries and monitoring
/**
 * Retry/backoff settings, read once from the environment when the module is
 * evaluated.
 *
 * Each value comes from a SUPADATA_RETRY_* environment variable coerced with
 * `Number(...)`. Whenever the coerced result is falsy (variable unset,
 * non-numeric text, or 0) the hard-coded default is used instead:
 * maxAttempts 3, initialDelay 1000, maxDelay 10000, backoffFactor 2.
 */
const CONFIG = (() => {
  const { env } = process;
  // Coerce a raw environment value; fall back when the number is falsy.
  const numberOr = (rawValue, fallback) => Number(rawValue) || fallback;
  return {
    retry: {
      maxAttempts: numberOr(env.SUPADATA_RETRY_MAX_ATTEMPTS, 3),
      initialDelay: numberOr(env.SUPADATA_RETRY_INITIAL_DELAY, 1000),
      maxDelay: numberOr(env.SUPADATA_RETRY_MAX_DELAY, 10000),
      backoffFactor: numberOr(env.SUPADATA_RETRY_BACKOFF_FACTOR, 2),
    },
  };
})();
// Utility functions
/**
 * Wait asynchronously for a fixed amount of time.
 *
 * @param {number} ms - Timeout handed to `setTimeout`, in milliseconds.
 * @returns {Promise<undefined>} Promise that resolves (with no value) when the
 * timer fires.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Strip leading and trailing whitespace from tool output text.
 *
 * @param {string} text - Text to clean; its own `trim` method is invoked, so a
 * value without one (e.g. `undefined`) throws a TypeError.
 * @returns {string} The trimmed text.
 */
function trimResponseText(text) {
  const withoutEdgeWhitespace = text.trim();
  return withoutEdgeWhitespace;
}
// Add retry logic with exponential backoff
/**
 * Run an async operation, retrying with exponential backoff when it fails with
 * a rate-limit error.
 *
 * A failure counts as a rate limit only when it is an `Error` whose message
 * contains 'rate limit' or '429'. Every other failure, and a rate-limit
 * failure once the attempt number reaches `CONFIG.retry.maxAttempts`, is
 * rethrown unchanged.
 *
 * @param {() => Promise<any>} operation - Work to execute; called again on each retry.
 * @param {string} context - Label used in the retry log line.
 * @param {number} [attempt=1] - Attempt number to start counting from.
 * @returns {Promise<any>} Whatever `operation` resolves to.
 * @throws The last failure raised by `operation`.
 */
async function withRetry(operation, context, attempt = 1) {
  let tryNumber = attempt;
  for (;;) {
    try {
      return await operation();
    }
    catch (failure) {
      const rateLimited = failure instanceof Error &&
        (failure.message.includes('rate limit') || failure.message.includes('429'));
      if (!rateLimited) {
        throw failure;
      }
      // Retry settings are consulted only on the rate-limit path.
      const { maxAttempts, initialDelay, maxDelay, backoffFactor } = CONFIG.retry;
      if (!(tryNumber < maxAttempts)) {
        throw failure;
      }
      // Wait grows geometrically with the attempt number, capped at maxDelay.
      const waitMs = Math.min(initialDelay * backoffFactor ** (tryNumber - 1), maxDelay);
      console.error(`Rate limit hit for ${context}. Attempt ${tryNumber}/${maxAttempts}. Retrying in ${waitMs}ms`);
      await delay(waitMs);
      tryNumber += 1;
    }
  }
}
/**
 * Create the Supadata MCP server and register every Supadata tool on it.
 *
 * Registered tools: supadata_transcript, supadata_check_transcript_status,
 * supadata_scrape, supadata_map, supadata_crawl, supadata_check_crawl_status.
 * Every tool answers with a single text content item whose text is trimmed.
 *
 * Side effect: when SUPADATA_API_KEY is unset and CLOUD_SERVICE is not exactly
 * the string 'true', an error is logged and the process exits with code 1.
 *
 * @returns {object} The `server` property of the constructed McpServer (the
 * object `runServer` calls `connect` on).
 */
export default function createServer() {
  const server = new McpServer({
    name: '@supadata/mcp',
    version: '1.0.0',
  });
  // API key captured once at creation time; used whenever CLOUD_SERVICE is unset.
  const SUPADATA_API_KEY = process.env.SUPADATA_API_KEY;
  // Outside cloud-service mode the key is mandatory, so fail fast at startup.
  if (!SUPADATA_API_KEY && process.env.CLOUD_SERVICE !== 'true') {
    console.error('Error: SUPADATA_API_KEY environment variable is required');
    process.exit(1);
  }
  /**
   * Build a Supadata client for one tool invocation.
   * With CLOUD_SERVICE set (any non-empty value) the key is re-read from the
   * environment on every call; otherwise the key captured above is reused.
   * Throws 'No API key provided' when cloud mode has no key.
   */
  const createClient = () => {
    const apiKey = process.env.CLOUD_SERVICE
      ? process.env.SUPADATA_API_KEY
      : SUPADATA_API_KEY;
    if (process.env.CLOUD_SERVICE && !apiKey) {
      throw new Error('No API key provided');
    }
    return new Supadata({
      apiKey: apiKey,
    });
  };
  /** Wrap text in the single-item text result every tool returns. */
  const textResult = (text) => ({
    content: [
      {
        type: 'text',
        text: trimResponseText(text),
      },
    ],
  });
  /** Pass strings through untouched; pretty-print anything else as JSON. */
  const stringifyResponse = (response) => typeof response === 'string'
    ? response
    : JSON.stringify(response, null, 2);
  /** Normalise any thrown value to an Error, keeping the original as `cause`. */
  const toError = (error) => {
    const errorMessage = error instanceof Error ? error.message : String(error);
    return new Error(errorMessage, { cause: error });
  };
  // Register transcript tool
  server.tool('supadata_transcript', `Extract transcript from supported video platforms (YouTube, TikTok, Instagram, Twitter) or file URLs using Supadata's transcript API.
**Purpose:** Get transcripts from video content across multiple platforms.
**Best for:** Video content analysis, subtitle extraction, content indexing.
**Usage Example:**
\`\`\`json
{
"name": "supadata_transcript",
"arguments": {
"url": "https://youtube.com/watch?v=example",
"lang": "en",
"text": false,
"mode": "auto"
}
}
\`\`\`
**Returns:**
- Either immediate transcript content
- Or job ID for asynchronous processing (use supadata_check_transcript_status)
**Supported Platforms:** YouTube, TikTok, Instagram, Twitter, and file URLs`, transcriptInputSchema, async ({ url, lang, text, chunkSize, mode }) => {
    const client = createClient();
    try {
      const transcriptStartTime = Date.now();
      console.error(`Starting transcript for URL: ${url} with options: ${JSON.stringify({ lang, text, chunkSize, mode })}`);
      // Only forward options the caller actually supplied.
      const options = { url };
      if (lang) {
        options.lang = lang;
      }
      if (text !== undefined) {
        options.text = text;
      }
      if (chunkSize) {
        options.chunkSize = chunkSize;
      }
      if (mode) {
        options.mode = mode;
      }
      const response = await client.transcript(options);
      console.error(`Transcript completed in ${Date.now() - transcriptStartTime}ms`);
      // Check if response contains a job ID (async processing)
      if (typeof response === 'object' &&
        response !== null &&
        'jobId' in response) {
        const jobId = response.jobId;
        return textResult(`Started transcript job for ${url} with job ID: ${jobId}. Use supadata_check_transcript_status to check progress.`);
      }
      return textResult(stringifyResponse(response));
    }
    catch (error) {
      throw toError(error);
    }
  });
  // Register check transcript status tool
  server.tool('supadata_check_transcript_status', `Check the status and retrieve results of a transcript job created with supadata_transcript.
**Purpose:** Monitor transcript job progress and retrieve completed results.
**Workflow:** Use the job ID returned from supadata_transcript to check status and get results.
**Usage Example:**
\`\`\`json
{
"name": "supadata_check_transcript_status",
"arguments": {
"id": "550e8400-e29b-41d4-a716-446655440000"
}
}
\`\`\`
**Returns:**
- Job status: 'queued', 'active', 'completed', 'failed'
- For completed jobs: Full transcript content
- Error details if job failed
**Tip:** Poll this endpoint periodically until status is 'completed' or 'failed'.`, checkTranscriptStatusInputSchema, async ({ id }) => {
    const client = createClient();
    const response = await client.transcript.getJobStatus(id);
    return textResult(JSON.stringify(response, null, 2));
  });
  // Register scrape tool
  server.tool('supadata_scrape', `Extract content from any web page to Markdown format using Supadata's powerful scraping API.
**Purpose:** Single page content extraction with automatic formatting to Markdown.
**Best for:** When you know exactly which page contains the information you need.
**Usage Example:**
\`\`\`json
{
"name": "supadata_scrape",
"arguments": {
"url": "https://example.com",
"noLinks": false,
"lang": "en"
}
}
\`\`\`
**Returns:**
- URL of the scraped page
- Extracted content in Markdown format
- Page name and description
- Character count
- List of URLs found on the page`, scrapeInputSchema, async ({ url, noLinks, lang }) => {
    const client = createClient();
    try {
      const scrapeStartTime = Date.now();
      console.error(`Starting scrape for URL: ${url} with options: ${JSON.stringify({ noLinks, lang })}`);
      // NOTE(review): noLinks and lang are logged but not sent to the SDK; only
      // the URL is passed. Confirm whether client.web.scrape accepts options
      // before forwarding them.
      const response = await client.web.scrape(url);
      console.error(`Scrape completed in ${Date.now() - scrapeStartTime}ms`);
      return textResult(stringifyResponse(response));
    }
    catch (error) {
      throw toError(error);
    }
  });
  // Register map tool
  server.tool('supadata_map', `Crawl a whole website and get all URLs on it using Supadata's mapping API.
**Purpose:** Extract all links found on a website for content discovery and sitemap creation.
**Best for:** Website content discovery, SEO analysis, content aggregation, automated web scraping and indexing.
**Use cases:** Creating a sitemap, running a crawler to fetch content from all pages of a website.
**Usage Example:**
\`\`\`json
{
"name": "supadata_map",
"arguments": {
"url": "https://example.com"
}
}
\`\`\`
**Returns:** Array of URLs found on the website.`, mapInputSchema, async ({ url }) => {
    const client = createClient();
    const response = await client.web.map(url);
    // Accept either a bare array of URLs or an object wrapping them in a
    // `urls` array; joining the wrapper object itself would print
    // "[object Object]". NOTE(review): presumably the SDK returns the
    // `{ urls: [...] }` form — confirm against the installed @supadata/js.
    let urls;
    if (Array.isArray(response)) {
      urls = response;
    }
    else if (response !== null &&
      typeof response === 'object' &&
      Array.isArray(response.urls)) {
      urls = response.urls;
    }
    else {
      urls = [response];
    }
    return textResult(urls.join('\n'));
  });
  // Register crawl tool
  server.tool('supadata_crawl', `Create a crawl job to extract content from all pages on a website using Supadata's crawling API.
**Purpose:** Crawl a whole website and get content of all pages on it.
**Best for:** Extracting content from multiple related pages when you need comprehensive coverage.
**Workflow:** 1) Create crawl job → 2) Receive job ID → 3) Check job status and retrieve results
**Crawling Behavior:**
- Follows only child links within the specified domain
- Example: For https://supadata.ai/blog, crawls https://supadata.ai/blog/article-1 but not https://supadata.ai/about
- To crawl entire website, use top-level URL like https://supadata.ai
**Usage Example:**
\`\`\`json
{
"name": "supadata_crawl",
"arguments": {
"url": "https://example.com",
"limit": 100
}
}
\`\`\`
**Returns:** Job ID for status checking. Use supadata_check_crawl_status to check progress.
**Job Status:** Possible statuses are 'scraping', 'completed', 'failed', or 'cancelled'
**Important:** Respect robots.txt and website terms of service when crawling web content.`, crawlInputSchema, async ({ url, limit }) => {
    const client = createClient();
    // Crawl creation is the only call wrapped in rate-limit retries.
    const response = await withRetry(async () => client.web.crawl({
      url,
      limit: limit || 100,
    }), 'crawl operation');
    // Take whichever identifier field is present, else the raw response.
    const jobId = response.jobId || response.id || response;
    return textResult(`Started crawl for ${url} with job ID: ${jobId}. Use supadata_check_crawl_status to check progress.`);
  });
  // Register check crawl status tool
  server.tool('supadata_check_crawl_status', `Check the status and retrieve results of a crawl job created with supadata_crawl.
**Purpose:** Monitor crawl job progress and retrieve completed results.
**Workflow:** Use the job ID returned from supadata_crawl to check status and get results.
**Usage Example:**
\`\`\`json
{
"name": "supadata_check_crawl_status",
"arguments": {
"id": "550e8400-e29b-41d4-a716-446655440000"
}
}
\`\`\`
**Returns:**
- Job status: 'scraping', 'completed', 'failed', or 'cancelled'
- For completed jobs: URL, Markdown content, page title, and description for each crawled page
- Progress information and any error details if applicable
**Tip:** Poll this endpoint periodically until status is 'completed' or 'failed'.`, checkCrawlStatusInputSchema, async ({ id }) => {
    const client = createClient();
    const response = await client.web.getCrawlResults(id);
    return textResult(JSON.stringify(response, null, 2));
  });
  return server.server;
}
// Server startup
/**
 * Start the MCP server on a stdio transport.
 *
 * Progress messages go to stderr via `console.error`. Any failure while
 * creating or connecting the server is logged and ends the process with exit
 * code 1, so the returned promise does not reject for those failures.
 *
 * @returns {Promise<void>} Settles once the server is connected.
 */
async function runServer() {
  try {
    console.error('Initializing Supadata MCP Server...');
    const mcpServer = createServer();
    const stdioTransport = new StdioServerTransport();
    console.error('Running in stdio mode, logging will be directed to stderr');
    await mcpServer.connect(stdioTransport);
    const readyMessages = [
      'Supadata MCP Server initialized successfully',
      'Supadata MCP Server running on stdio',
    ];
    for (const message of readyMessages) {
      console.error(message);
    }
  }
  catch (startupError) {
    console.error('Fatal error running server:', startupError);
    process.exit(1);
  }
}
// Start the server as soon as this module is evaluated.
// NOTE(review): this call is unconditional — nothing here checks whether the
// file is the program entry point, so importing the module also starts the
// stdio server. Confirm that is intended.
runServer().then(undefined, (error) => {
  // Last-resort handler for a rejection escaping runServer's own try/catch.
  console.error('Fatal error running server:', error);
  process.exit(1);
});