// @r-huijts/opentk-mcp
// Version:
// MCP server for Dutch parliamentary data access via OpenTK
// 132 lines • 5.69 kB
// JavaScript
;
/**
* Utility for extracting text from various document formats
* Using established libraries for better reliability
*/
// Module-interop helper (appears to be compiler-generated; left byte-identical).
// Exposes property `k` of module `m` on object `o` under the name `k2` (which defaults to `k`).
// An already-installed `this.__createBinding` is reused when one exists.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Use a live enumerable getter when: there is no own descriptor, or it is an accessor on a
// module without `__esModule`, or it is a data property that is writable or configurable.
// Otherwise the original descriptor is reused unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when `Object.create` is unavailable: a one-time value copy instead of a live binding.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: installs `v` as the `default` property of `o`.
// An already-installed `this.__setModuleDefault` is reused when one exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
// Defined via defineProperty with only `enumerable` and `value` given, so the property
// is enumerable but neither writable nor configurable.
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
// Fallback when `Object.create` is unavailable: plain assignment.
o["default"] = v;
});
// Module-interop helper: wraps a `require()`d module so it can be used as a namespace object.
// An already-installed `this.__importStar` is reused when one exists.
var __importStar = (this && this.__importStar) || (function () {
// Lazily initialised: on first call `ownKeys` replaces itself with
// `Object.getOwnPropertyNames` (or a for-in/hasOwnProperty fallback) and then delegates to it.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged with `__esModule` are returned untouched.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own property except "default" onto the result, then expose the module
// itself as `result.default`.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Flag this CommonJS module with `__esModule` for interop-aware importers.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API. These assignments may precede the definitions because function
// declarations are hoisted.
exports.extractTextFromPdf = extractTextFromPdf;
exports.extractTextFromDocx = extractTextFromDocx;
exports.summarizeText = summarizeText;
// Using require for pdf-parse due to CommonJS module compatibility
const pdfParse = require('pdf-parse');
// mammoth goes through __importStar so it is usable as a namespace object
// whether or not the package sets `__esModule`.
const mammoth = __importStar(require("mammoth"));
/**
 * Extracts text from a PDF document using the pdf-parse library.
 *
 * Failures are logged with console.error and reported to the caller as a
 * human-readable fallback message instead of being thrown.
 *
 * @param data The PDF document as an ArrayBuffer, Buffer or typed array
 * @returns The extracted text with all whitespace runs collapsed to single
 *          spaces, or a fallback message when fewer than 50 characters could
 *          be extracted or parsing failed
 */
async function extractTextFromPdf(data) {
    try {
        // Convert ArrayBuffer to Buffer for pdf-parse
        const buffer = Buffer.from(data);
        const result = await pdfParse(buffer);
        // Collapse every whitespace run (including line breaks) into one space
        const extractedText = (result.text || '').replace(/\s+/g, ' ').trim();
        // Very short output is treated as "nothing readable was extracted"
        if (extractedText.length < 50) {
            return 'The document appears to be a PDF file, but no readable text content could be extracted. This might be due to the document structure, content format, or encryption. Please download the original document for full content.';
        }
        return extractedText;
    }
    catch (error) {
        // A rejection value is not guaranteed to be an Error (it may even be
        // null/undefined); reading `.message` unguarded would throw from inside
        // this handler and break the "always resolves to a string" contract.
        const reason = (error && error.message) || String(error);
        console.error(`Error extracting text from PDF: ${reason}`);
        return 'Failed to extract text from the PDF document. This might be due to the document structure, content format, or encryption. Please download the original document for full content.';
    }
}
/**
 * Extracts text from a DOCX document using the mammoth library.
 *
 * Failures are logged with console.error and reported to the caller as a
 * human-readable fallback message instead of being thrown.
 *
 * @param data The DOCX document as an ArrayBuffer, Buffer or typed array
 * @returns The extracted text with all whitespace runs collapsed to single
 *          spaces, or a fallback message when fewer than 50 characters could
 *          be extracted or parsing failed
 */
async function extractTextFromDocx(data) {
    try {
        // Convert ArrayBuffer to Buffer for mammoth
        const buffer = Buffer.from(data);
        const result = await mammoth.extractRawText({ buffer });
        // Collapse every whitespace run (including line breaks) into one space
        const extractedText = (result.value || '').replace(/\s+/g, ' ').trim();
        // Very short output is treated as "nothing readable was extracted"
        if (extractedText.length < 50) {
            return 'The document appears to be a Word file, but no readable text content could be extracted. This might be due to the document structure or content format. Please download the original document for full content.';
        }
        return extractedText;
    }
    catch (error) {
        // A rejection value is not guaranteed to be an Error (it may even be
        // null/undefined); reading `.message` unguarded would throw from inside
        // this handler and break the "always resolves to a string" contract.
        const reason = (error && error.message) || String(error);
        console.error(`Error extracting text from DOCX: ${reason}`);
        return 'Failed to extract text from the DOCX document. This might be due to the document structure or content format. Please download the original document for full content.';
    }
}
/**
 * Returns one page of the extracted text together with pagination info.
 *
 * Despite the name, no summarisation happens: the function slices
 * `maxLength` UTF-16 code units starting at `offset` and, when more text
 * follows, appends a truncation marker.
 *
 * @param text The full extracted text
 * @param maxLength Maximum length of the page (default: 8000 characters).
 *                  Values below 1 or NaN fall back to 8000, because a
 *                  non-positive page size would make pagination stall.
 * @param offset Starting position for extraction (default: 0). Negative or
 *               NaN values are treated as 0; fractions are rounded down.
 * @returns Object containing the page text and pagination info:
 *          `text`, `isTruncated`, `totalLength`, `currentOffset`,
 *          `nextOffset` (null on the last page) and `remainingLength`
 */
function summarizeText(text, maxLength = 8000, offset = 0) {
    const totalLength = text.length;
    // Normalise the paging arguments. `x >= 1` is false for NaN, 0 and
    // negatives, while still letting Infinity through ("everything that is left").
    const pageSize = maxLength >= 1 ? Math.floor(maxLength) : 8000;
    const start = offset >= 1 ? Math.floor(offset) : 0;
    // Past the end (this also covers an empty document)
    if (start >= totalLength) {
        return {
            text: 'No more content available. You have reached the end of the document.',
            isTruncated: false,
            totalLength,
            currentOffset: start,
            nextOffset: null,
            remainingLength: 0
        };
    }
    let endPosition = Math.min(start + pageSize, totalLength);
    // Never cut between the two halves of a surrogate pair: that would leave
    // a lone surrogate at the end of this page and at the start of the next.
    if (endPosition < totalLength) {
        const before = text.charCodeAt(endPosition - 1);
        const after = text.charCodeAt(endPosition);
        const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
        if (splitsPair) {
            // Prefer ending one unit earlier to respect maxLength; if the page
            // would become empty, include the whole pair so progress is made.
            endPosition += endPosition - start > 1 ? -1 : 1;
        }
    }
    const extractedText = text.substring(start, endPosition);
    const isTruncated = endPosition < totalLength;
    return {
        text: extractedText + (isTruncated ? '... [Text truncated due to length]' : ''),
        isTruncated,
        totalLength,
        currentOffset: start,
        nextOffset: isTruncated ? endPosition : null,
        remainingLength: totalLength - endPosition
    };
}
//# sourceMappingURL=document-extractor.js.map