@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…
214 lines • 7.76 kB
JavaScript
/**
* OpenDocument Processor
*
* Processes OpenDocument format files (.odt, .ods, .odp) by extracting
* text content from the internal XML structure.
*
* @module processors/document/OpenDocumentProcessor
*/
import { createRequire } from "node:module";
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import { SIZE_LIMITS } from "../config/index.js";
const require = createRequire(import.meta.url);
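// Re-import for local use within this file — nothing follows in this compiled output (presumably a type-only import that was erased at build time).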
/**
 * OpenDocument Processor - handles .odt, .ods, .odp files
 *
 * OpenDocument files are ZIP archives containing XML content.
 * The main content is in content.xml within the archive; this class reads
 * that entry and reduces it to plain text with a regex-based extractor
 * (no XML parser is involved).
 *
 * Priority: ~105 (between Word and Text)
 */
export class OpenDocumentProcessor extends BaseFileProcessor {
    /**
     * Configure the base processor with the OpenDocument MIME types,
     * extensions, size limit and timeout.
     */
    constructor() {
        super({
            maxSizeMB: SIZE_LIMITS.DOCUMENT_MAX_MB,
            timeoutMs: 60000,
            supportedMimeTypes: [
                "application/vnd.oasis.opendocument.text",
                "application/vnd.oasis.opendocument.spreadsheet",
                "application/vnd.oasis.opendocument.presentation",
            ],
            supportedExtensions: [".odt", ".ods", ".odp"],
            fileTypeName: "OpenDocument",
            defaultFilename: "document.odt",
        });
    }
    /**
     * Validate that the file is a valid ZIP archive (OpenDocument format).
     *
     * Only the leading "PK" magic bytes are inspected; the archive itself is
     * not opened here.
     *
     * @param {Buffer} buffer - Raw file bytes.
     * @param {object} _fileInfo - Unused.
     * @returns {Promise<string|null>} An error message, or null when the
     *   buffer looks like a ZIP archive.
     */
    async validateDownloadedFile(buffer, _fileInfo) {
        if (buffer.length < 4) {
            return "File too small to be a valid OpenDocument file";
        }
        // Check for PK (ZIP) signature
        const signature = buffer.subarray(0, 2).toString("ascii");
        if (signature !== "PK") {
            return "Invalid OpenDocument format - not a valid ZIP archive";
        }
        return null;
    }
    /**
     * Build the processed result by extracting content from the OpenDocument.
     *
     * @param {Buffer} buffer - Raw bytes of the ZIP archive.
     * @param {object} fileInfo - File descriptor; `name`, `mimetype` and
     *   `size` are read here.
     * @returns {object} Extracted text plus `format`, `paragraphCount`,
     *   `truncated`, the original `buffer`, `mimetype`, `size` and `filename`.
     * @throws {Error} "Failed to extract OpenDocument content: ..." (with the
     *   original error as `cause`) when adm-zip cannot be loaded, the archive
     *   cannot be read, or content.xml is missing.
     */
    buildProcessedResult(buffer, fileInfo) {
        const ext = this.getExtension(fileInfo.name);
        const format = this.detectFormat(ext);
        let textContent;
        let paragraphCount;
        let truncated = false;
        try {
            // Loaded lazily so a missing adm-zip is reported through the
            // wrapped error below instead of failing at module import.
            const AdmZip = require("adm-zip");
            const zip = new AdmZip(buffer);
            const contentEntry = zip.getEntry("content.xml");
            if (!contentEntry) {
                throw new Error("content.xml not found in OpenDocument archive");
            }
            const xmlContent = contentEntry.getData().toString("utf-8");
            const extracted = this.extractTextFromXml(xmlContent);
            textContent = extracted.text;
            paragraphCount = extracted.paragraphCount;
            // Cap the extracted text at 500,000 UTF-16 code units (~500KB).
            const maxChars = 500000;
            if (textContent.length > maxChars) {
                let cut = maxChars;
                // Do not cut between the two halves of a surrogate pair,
                // which would leave a lone high surrogate in the output.
                const lastUnit = textContent.charCodeAt(cut - 1);
                if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
                    cut -= 1;
                }
                textContent =
                    textContent.substring(0, cut) + "\n\n... [Content truncated]";
                truncated = true;
            }
        }
        catch (error) {
            const message = error instanceof Error ? error.message : "Unknown error";
            throw new Error(`Failed to extract OpenDocument content: ${message}`, {
                cause: error,
            });
        }
        return {
            textContent,
            format,
            paragraphCount,
            truncated,
            buffer,
            mimetype: fileInfo.mimetype || "application/vnd.oasis.opendocument.text",
            size: fileInfo.size,
            filename: this.getFilename(fileInfo),
        };
    }
    /**
     * Decode XML character references in a single pass to prevent
     * double-unescaping.
     * Sequential replacement is vulnerable: "&amp;lt;" → "&lt;" → "<".
     * Single-pass avoids this by replacing each reference exactly once, so
     * "&amp;lt;" correctly yields the literal text "&lt;".
     *
     * Handles the five predefined XML entities plus decimal (`&#65;`) and
     * hexadecimal (`&#x41;`) character references. Unknown entities and
     * references to invalid code points are left untouched.
     *
     * @param {string} text - Text with markup already removed.
     * @returns {string} The decoded text.
     */
    decodeHtmlEntities(text) {
        const entityMap = {
            "&amp;": "&",
            "&lt;": "<",
            "&gt;": ">",
            "&quot;": '"',
            "&apos;": "'",
        };
        return text.replace(/&(?:amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);/g, (match) => {
            const named = entityMap[match];
            if (named !== undefined) {
                return named;
            }
            // Numeric reference: "&#NNN;" (decimal) or "&#xHHH;" (hex).
            const isHex = match[2] === "x";
            const digits = match.slice(isHex ? 3 : 2, -1);
            const codePoint = Number.parseInt(digits, isHex ? 16 : 10);
            const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
            if (!Number.isInteger(codePoint) ||
                codePoint === 0 ||
                codePoint > 0x10ffff ||
                isSurrogate) {
                // Not a valid character; keep the source text rather than
                // throwing from String.fromCodePoint or emitting garbage.
                return match;
            }
            return String.fromCodePoint(codePoint);
        });
    }
    /**
     * Extract text content from OpenDocument XML.
     *
     * Each `<text:p>` (paragraph) and `<text:h>` (heading) element becomes one
     * entry; inline markup such as `<text:span>` is stripped while its text is
     * kept. If no such element yields text, all markup is stripped from the
     * whole document and the remainder is returned as a single entry.
     *
     * Limitation: this is regex-based, so a paragraph nested inside another
     * paragraph (e.g. via a text box) ends the outer match early.
     *
     * @param {string} xml - Contents of content.xml.
     * @returns {{text: string, paragraphCount: number}} Entries joined by a
     *   blank line, and the number of entries.
     */
    extractTextFromXml(xml) {
        // Remove tags repeatedly until nothing changes, to handle fragments
        // like "<scr<script>ipt>" that re-form a tag after one pass.
        const stripTags = (fragment, replacement) => {
            let current = fragment;
            let previous;
            do {
                previous = current;
                current = current.replace(/<[^>]+>/g, replacement);
            } while (current !== previous);
            return current;
        };
        const paragraphs = [];
        // - `(p|h)` followed by whitespace or ">" so that other elements whose
        //   name merely starts with "p"/"h" (e.g. text:page-number) are not
        //   mistaken for paragraphs.
        // - `(?<!\/)>` skips self-closing empty paragraphs (<text:p ... />).
        // - `\1` pairs the closing tag with the opening one, so a nested
        //   </text:span> no longer terminates the paragraph and drops the
        //   text that follows it.
        const blockPattern = /<text:(p|h)(?:\s[^>]*)?(?<!\/)>([\s\S]*?)<\/text:\1>/gi;
        for (const match of xml.matchAll(blockPattern)) {
            // Whitespace is encoded as empty elements in OpenDocument; turn
            // them into real characters before the tags are discarded.
            const inner = match[2]
                .replace(/<text:line-break(?:\s[^>]*)?\/>/g, "\n")
                .replace(/<text:tab(?:\s[^>]*)?\/>/g, "\t")
                .replace(/<text:s(?:\s[^>]*)?\/>/g, " ");
            // Strip first, decode second: decoded "<" must never be
            // re-interpreted as markup.
            const text = this.decodeHtmlEntities(stripTags(inner, "")).trim();
            if (text) {
                paragraphs.push(text);
            }
        }
        // Fallback for documents where content is in different structures:
        // take whatever text remains once every tag is replaced by a space.
        if (paragraphs.length === 0) {
            const collapsed = stripTags(xml, " ").replace(/\s+/g, " ");
            const simpleText = this.decodeHtmlEntities(collapsed).trim();
            if (simpleText) {
                paragraphs.push(simpleText);
            }
        }
        return {
            text: paragraphs.join("\n\n"),
            paragraphCount: paragraphs.length,
        };
    }
    /**
     * Detect the OpenDocument format from file extension.
     *
     * @param {string|null|undefined} ext - Extension including the leading dot.
     * @returns {"odt"|"ods"|"odp"|"unknown"} The format identifier.
     */
    detectFormat(ext) {
        switch (ext?.toLowerCase()) {
            case ".odt":
                return "odt";
            case ".ods":
                return "ods";
            case ".odp":
                return "odp";
            default:
                return "unknown";
        }
    }
    /**
     * Get the lower-cased file extension (with leading dot) from a filename.
     *
     * @param {string|null|undefined} filename - File name or path.
     * @returns {string|null} The extension, or null when the name is missing
     *   or has no extension.
     */
    getExtension(filename) {
        // The name may be absent (the processor is configured with a default
        // filename), so do not call string methods on it blindly.
        if (typeof filename !== "string") {
            return null;
        }
        // Excluding path separators keeps a dotted directory name
        // ("dir.v1/file") from being reported as an extension.
        const match = filename.toLowerCase().match(/\.[^./\\]+$/);
        return match ? match[0] : null;
    }
}
// =============================================================================
// SINGLETON INSTANCE
// =============================================================================
/**
 * Singleton instance of OpenDocumentProcessor.
 *
 * Constructed once, when this module is evaluated. The exported helper
 * functions in this file (isOpenDocumentFile, processOpenDocument) delegate
 * to this instance.
 */
export const openDocumentProcessor = new OpenDocumentProcessor();
// =============================================================================
// HELPER FUNCTIONS
// =============================================================================
/**
 * Ask the shared OpenDocument processor whether it supports a file.
 *
 * Thin wrapper: forwards both arguments to
 * `openDocumentProcessor.isFileSupported` and returns its answer unchanged.
 *
 * @param {string} mimetype - MIME type reported for the file.
 * @param {string} filename - Name of the file.
 * @returns {*} Whatever `isFileSupported` returns (presumably a boolean).
 */
export function isOpenDocumentFile(mimetype, filename) {
    const supported = openDocumentProcessor.isFileSupported(mimetype, filename);
    return supported;
}
/**
 * Check an OpenDocument file size against the configured document limit.
 *
 * @param {number} sizeBytes - File size in bytes.
 * @returns {boolean} True when `sizeBytes` does not exceed
 *   `SIZE_LIMITS.DOCUMENT_MAX_MB` converted to bytes (the limit is inclusive).
 */
export function validateOpenDocumentSize(sizeBytes) {
    const bytesPerMB = 1024 * 1024;
    const limitBytes = SIZE_LIMITS.DOCUMENT_MAX_MB * bytesPerMB;
    return sizeBytes <= limitBytes;
}
/**
 * Process an OpenDocument file with the shared processor instance.
 *
 * Thin wrapper: forwards both arguments to
 * `openDocumentProcessor.processFile` and resolves with its result.
 *
 * @param {object} fileInfo - Descriptor of the file to process.
 * @param {object} [options] - Options passed through unchanged.
 * @returns {Promise<*>} The outcome of `processFile`.
 */
export async function processOpenDocument(fileInfo, options) {
    const pendingResult = openDocumentProcessor.processFile(fileInfo, options);
    return pendingResult;
}
/**
 * Report the maximum OpenDocument file size, in megabytes.
 *
 * @returns {number} The value of `SIZE_LIMITS.DOCUMENT_MAX_MB`.
 */
export function getOpenDocumentMaxSizeMB() {
    const { DOCUMENT_MAX_MB: maxSizeMB } = SIZE_LIMITS;
    return maxSizeMB;
}
//# sourceMappingURL=OpenDocumentProcessor.js.map