canvas-mcp-tool
Version:
canvas-mcp-tool - A MCP server for students
109 lines • 4.49 kB
JavaScript
import { createLogger } from './logger.js';
const logger = createLogger('FileExtractor');
// Maximum file size: 5GB
const MAX_FILE_SIZE = 5 * 1024 * 1024 * 1024;
export class FileExtractor {
async extractFromUrl(url, headers) {
logger.info(`Extracting content from: ${url}`);
// This is a placeholder - we'll implement the actual extraction
// after deciding on the best libraries
throw new Error('File extraction not yet implemented. Please download files directly from Canvas.');
}
async extractFromBuffer(buffer, filename, mimeType) {
if (buffer.length > MAX_FILE_SIZE) {
throw new Error(`File too large: ${buffer.length} bytes (max: ${MAX_FILE_SIZE} bytes)`);
}
const metadata = {
filename,
size: buffer.length,
type: mimeType,
extractedAt: new Date().toISOString()
};
// Placeholder for actual extraction logic
switch (mimeType) {
case 'application/pdf':
return this.extractPDF(buffer, metadata);
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
return this.extractDOCX(buffer, metadata);
case 'text/plain':
return {
content: buffer.toString('utf-8'),
metadata: {
...metadata,
extraction_method: 'basic'
}
};
default:
throw new Error(`Unsupported file type: ${mimeType}`);
}
}
async extractPDF(buffer, metadata) {
// For now, return a placeholder indicating PDF support is coming
return {
content: '[PDF content extraction not yet implemented. The file has been identified as a PDF document.]',
metadata: {
...metadata,
extraction_method: 'basic',
error: 'PDF extraction requires pdf-parse library'
}
};
}
async extractDOCX(buffer, metadata) {
try {
// Basic DOCX text extraction using unzipper
const unzipper = (await import('unzipper')).default;
const directory = await unzipper.Open.buffer(buffer);
// Find the main document content
const documentFile = directory.files.find(f => f.path === 'word/document.xml');
if (!documentFile) {
throw new Error('Invalid DOCX file: document.xml not found');
}
const documentXml = await documentFile.buffer();
const xmlContent = documentXml.toString('utf-8');
// Extract text content from XML (basic extraction)
// Remove XML tags and extract text
const textContent = xmlContent
.replace(/<w:p[^>]*>/g, '\n') // Paragraphs to newlines
.replace(/<[^>]+>/g, '') // Remove all XML tags
.replace(/</g, '<')
.replace(/>/g, '>')
.replace(/&/g, '&')
.replace(/"/g, '"')
.replace(/'/g, "'")
.replace(/\s+/g, ' ') // Normalize whitespace
.trim();
return {
content: textContent,
metadata: {
...metadata,
extraction_method: 'basic',
has_images: directory.files.some(f => f.path.startsWith('word/media/')),
has_tables: xmlContent.includes('<w:tbl')
}
};
}
catch (error) {
// Fallback for when unzipper is not available
return {
content: `[DOCX file detected but cannot be extracted. Error: ${error.message}]`,
metadata: {
...metadata,
extraction_method: 'basic',
error: 'DOCX extraction requires unzipper library'
}
};
}
}
// Helper to determine if a file type is supported
static isSupported(mimeType) {
const supportedTypes = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/plain',
'text/markdown',
'text/csv'
];
return supportedTypes.includes(mimeType);
}
}
//# sourceMappingURL=file-extractor.js.map