UNPKG

canvas-mcp-tool

Version:

canvas-mcp-tool - A MCP server for students

109 lines 4.49 kB
import { createLogger } from './logger.js'; const logger = createLogger('FileExtractor'); // Maximum file size: 5GB const MAX_FILE_SIZE = 5 * 1024 * 1024 * 1024; export class FileExtractor { async extractFromUrl(url, headers) { logger.info(`Extracting content from: ${url}`); // This is a placeholder - we'll implement the actual extraction // after deciding on the best libraries throw new Error('File extraction not yet implemented. Please download files directly from Canvas.'); } async extractFromBuffer(buffer, filename, mimeType) { if (buffer.length > MAX_FILE_SIZE) { throw new Error(`File too large: ${buffer.length} bytes (max: ${MAX_FILE_SIZE} bytes)`); } const metadata = { filename, size: buffer.length, type: mimeType, extractedAt: new Date().toISOString() }; // Placeholder for actual extraction logic switch (mimeType) { case 'application/pdf': return this.extractPDF(buffer, metadata); case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': return this.extractDOCX(buffer, metadata); case 'text/plain': return { content: buffer.toString('utf-8'), metadata: { ...metadata, extraction_method: 'basic' } }; default: throw new Error(`Unsupported file type: ${mimeType}`); } } async extractPDF(buffer, metadata) { // For now, return a placeholder indicating PDF support is coming return { content: '[PDF content extraction not yet implemented. The file has been identified as a PDF document.]', metadata: { ...metadata, extraction_method: 'basic', error: 'PDF extraction requires pdf-parse library' } }; } async extractDOCX(buffer, metadata) { try { // Basic DOCX text extraction using unzipper const unzipper = (await import('unzipper')).default; const directory = await unzipper.Open.buffer(buffer); // Find the main document content const documentFile = directory.files.find(f => f.path === 'word/document.xml'); if (!documentFile) { throw new Error('Invalid DOCX file: document.xml not found'); } const documentXml = await documentFile.buffer(); const xmlContent = documentXml.toString('utf-8'); // Extract text content from XML (basic extraction) // Remove XML tags and extract text const textContent = xmlContent .replace(/<w:p[^>]*>/g, '\n') // Paragraphs to newlines .replace(/<[^>]+>/g, '') // Remove all XML tags .replace(/&lt;/g, '<') .replace(/&gt;/g, '>') .replace(/&amp;/g, '&') .replace(/&quot;/g, '"') .replace(/&apos;/g, "'") .replace(/\s+/g, ' ') // Normalize whitespace .trim(); return { content: textContent, metadata: { ...metadata, extraction_method: 'basic', has_images: directory.files.some(f => f.path.startsWith('word/media/')), has_tables: xmlContent.includes('<w:tbl') } }; } catch (error) { // Fallback for when unzipper is not available return { content: `[DOCX file detected but cannot be extracted. Error: ${error.message}]`, metadata: { ...metadata, extraction_method: 'basic', error: 'DOCX extraction requires unzipper library' } }; } } // Helper to determine if a file type is supported static isSupported(mimeType) { const supportedTypes = [ 'application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'text/plain', 'text/markdown', 'text/csv' ]; return supportedTypes.includes(mimeType); } } //# sourceMappingURL=file-extractor.js.map