@r-huijts/opentk-mcp

Version:

MCP server for Dutch parliamentary data access via OpenTK

117 lines (98 loc) • 4.12 kB

text/typescript

/**
 * Utility for extracting text from various document formats.
 * Uses established libraries (pdf-parse, mammoth) for better reliability.
 */

// Using require for pdf-parse due to CommonJS module compatibility
const pdfParse = require('pdf-parse');
import * as mammoth from 'mammoth';

/** Extracted text shorter than this (after whitespace cleanup) is treated as "nothing readable". */
const MIN_READABLE_LENGTH = 50;

/** Default page size, in UTF-16 code units, used by summarizeText. */
const DEFAULT_SUMMARY_LENGTH = 8000;

/** Collapses every run of whitespace into a single space and trims both ends. */
function collapseWhitespace(raw: string): string {
  return raw.replace(/\s+/g, ' ').trim();
}

/** Produces a loggable message from whatever value was thrown (not necessarily an Error). */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Moves a slice end so that it does not fall between the two halves of a
 * UTF-16 surrogate pair. Prefers to back up by one code unit; only when that
 * would leave an empty slice does it extend by one instead, so the slice
 * always makes progress.
 */
function keepSurrogatePairsIntact(text: string, start: number, end: number): number {
  if (end <= start || end >= text.length) {
    return end;
  }
  const before = text.charCodeAt(end - 1);
  const after = text.charCodeAt(end);
  const splitsPair =
    before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  if (!splitsPair) {
    return end;
  }
  return end - 1 > start ? end - 1 : end + 1;
}

/**
 * Extracts text from a PDF document using the pdf-parse library.
 *
 * Never rejects: on failure, or when fewer than 50 characters of text are
 * recovered, a human-readable explanation is returned in place of the content.
 *
 * @param data The PDF document as an ArrayBuffer
 * @returns The extracted text with whitespace collapsed, or an explanatory message
 */
export async function extractTextFromPdf(data: ArrayBuffer): Promise<string> {
  try {
    // Convert ArrayBuffer to Buffer for pdf-parse
    const buffer = Buffer.from(data);

    // Parse the PDF
    const result = await pdfParse(buffer);

    // Get the text content and clean it up
    const extractedText = collapseWhitespace(result.text ?? '');

    if (extractedText.length < MIN_READABLE_LENGTH) {
      return 'The document appears to be a PDF file, but no readable text content could be extracted. This might be due to the document structure, content format, or encryption. Please download the original document for full content.';
    }

    return extractedText;
  } catch (error) {
    // The thrown value may not be an Error instance, so narrow before reading .message
    console.error(`Error extracting text from PDF: ${describeError(error)}`);
    return 'Failed to extract text from the PDF document. This might be due to the document structure, content format, or encryption. Please download the original document for full content.';
  }
}

/**
 * Extracts text from a DOCX document using the mammoth library.
 *
 * Never rejects: on failure, or when fewer than 50 characters of text are
 * recovered, a human-readable explanation is returned in place of the content.
 *
 * @param data The DOCX document as an ArrayBuffer
 * @returns The extracted text with whitespace collapsed, or an explanatory message
 */
export async function extractTextFromDocx(data: ArrayBuffer): Promise<string> {
  try {
    // Convert ArrayBuffer to Buffer for mammoth
    const buffer = Buffer.from(data);

    // Extract text from the DOCX
    const result = await mammoth.extractRawText({ buffer });

    // Get the text content and clean it up
    const extractedText = collapseWhitespace(result.value ?? '');

    if (extractedText.length < MIN_READABLE_LENGTH) {
      return 'The document appears to be a Word file, but no readable text content could be extracted. This might be due to the document structure or content format. Please download the original document for full content.';
    }

    return extractedText;
  } catch (error) {
    // The thrown value may not be an Error instance, so narrow before reading .message
    console.error(`Error extracting text from DOCX: ${describeError(error)}`);
    return 'Failed to extract text from the DOCX document. This might be due to the document structure or content format. Please download the original document for full content.';
  }
}

/**
 * Returns one "page" of the extracted text together with pagination info.
 *
 * Invalid arguments are sanitised rather than propagated: a maxLength that is
 * NaN or below 1 falls back to the default (a zero-length page would make
 * nextOffset equal to the current offset, so a caller following nextOffset
 * would never advance), and an offset that is NaN or negative is treated as 0.
 * Fractional values are rounded down. A page boundary is shifted by one code
 * unit when it would otherwise cut a surrogate pair in half, so a page may be
 * one code unit shorter or longer than maxLength.
 *
 * @param text The full extracted text
 * @param maxLength Maximum length of the page (default: 8000 characters)
 * @param offset Starting position for extraction (default: 0)
 * @returns Object containing the page text and pagination info; when the page
 *          is truncated, a "... [Text truncated due to length]" marker is
 *          appended to the text and nextOffset holds the start of the next page
 */
export function summarizeText(
  text: string,
  maxLength: number = DEFAULT_SUMMARY_LENGTH,
  offset: number = 0
): {
  text: string;
  isTruncated: boolean;
  totalLength: number;
  currentOffset: number;
  nextOffset: number | null;
  remainingLength: number;
} {
  const totalLength = text.length;

  // Comparisons with NaN are false, so NaN falls through to the safe default in both cases.
  const pageSize = maxLength >= 1 ? Math.floor(maxLength) : DEFAULT_SUMMARY_LENGTH;
  const startPosition = offset > 0 ? Math.floor(offset) : 0;

  // Nothing left to return (this also covers an empty document)
  if (startPosition >= totalLength) {
    return {
      text: 'No more content available. You have reached the end of the document.',
      isTruncated: false,
      totalLength,
      currentOffset: startPosition,
      nextOffset: null,
      remainingLength: 0
    };
  }

  // Extract the portion of text from the offset up to one page further on
  const endPosition = keepSurrogatePairsIntact(
    text,
    startPosition,
    Math.min(startPosition + pageSize, totalLength)
  );
  const isTruncated = endPosition < totalLength;

  return {
    text:
      text.substring(startPosition, endPosition) +
      (isTruncated ? '... [Text truncated due to length]' : ''),
    isTruncated,
    totalLength,
    currentOffset: startPosition,
    nextOffset: isTruncated ? endPosition : null,
    remainingLength: totalLength - endPosition
  };
}