UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

252 lines (251 loc) 10 kB
/**
 * LaTeX-aware Chunker
 *
 * Splits LaTeX documents based on structure (sections, environments, math).
 * Best for academic papers, scientific documents, and mathematical content.
 */
import { randomUUID } from "crypto";

/**
 * Escape every regular-expression metacharacter in `literal` so it can be
 * embedded verbatim inside a dynamically built RegExp.
 *
 * Needed because environment names such as "equation*" contain `*`, which
 * would otherwise act as a quantifier instead of matching a literal asterisk.
 *
 * @param {string} literal - Text to match literally.
 * @returns {string} The escaped text.
 */
function escapeRegExp(literal) {
  return String(literal).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * LaTeX-aware chunker implementation
 * Splits based on LaTeX structure (sections, environments)
 */
export class LaTeXChunker {
  strategy = "latex";

  /** Sectioning commands used as split points when the caller supplies none. */
  defaultSplitEnvironments = [
    "section",
    "subsection",
    "subsubsection",
    "chapter",
    "part",
  ];

  /** Display-math environments that are shielded from section splitting. */
  mathEnvironments = [
    "equation",
    "equation*",
    "align",
    "align*",
    "gather",
    "gather*",
    "multline",
    "multline*",
    "displaymath",
  ];

  /**
   * Split a LaTeX document into chunks.
   *
   * Steps: (1) peel off the preamble when a `\begin{document}…\end{document}`
   * pair is present, (2) temporarily replace math with placeholders so that
   * sectioning commands are not detected inside it, (3) split on sectioning
   * commands, (4) put the math back and cut each section down to `maxSize`.
   *
   * `startPosition` / `endPosition` in the emitted metadata are running
   * character counts over the produced chunk texts (before trimming and
   * including the re-inserted section command); they are not exact offsets
   * into the original source.
   *
   * @param {string} text - LaTeX source.
   * @param {object} [config]
   * @param {number} [config.maxSize=1000] - Maximum characters per content piece.
   * @param {number} [config.overlap=0] - Characters shared by consecutive pieces.
   * @param {string[]} [config.splitEnvironments] - Sectioning command names;
   *   the defaults are used when omitted or empty.
   * @param {boolean} [config.preserveMath=true] - Shield math from section splitting.
   * @param {boolean} [config.includePreamble=true] - Emit the preamble as the first chunk.
   * @param {boolean} [config.trimWhitespace=true] - Trim each chunk's text.
   * @param {object} [config.metadata={}] - Stored under `metadata.custom` on every chunk.
   * @returns {Promise<object[]>} Chunks of shape `{ id, text, metadata }`.
   */
  async chunk(text, config) {
    const {
      maxSize = 1000,
      overlap = 0,
      splitEnvironments = this.defaultSplitEnvironments,
      preserveMath = true,
      includePreamble = true,
      trimWhitespace = true,
      metadata = {},
    } = config || {};
    const documentId = randomUUID();
    const chunks = [];
    if (!text || text.length === 0) {
      return chunks;
    }

    // Extract preamble if present
    const preambleMatch = text.match(
      /^([\s\S]*?)\\begin\{document\}([\s\S]*?)\\end\{document\}/,
    );
    let preamble = "";
    let documentContent = text;
    if (preambleMatch) {
      preamble = preambleMatch[1].trim();
      documentContent = preambleMatch[2];
      // Add preamble as first chunk if requested
      if (includePreamble && preamble.length > 0) {
        chunks.push({
          id: randomUUID(),
          text: preamble,
          metadata: {
            documentId,
            chunkIndex: 0,
            startPosition: 0,
            endPosition: preamble.length,
            documentType: "latex",
            latexEnvironment: "preamble",
            custom: metadata,
          },
        });
      }
    }

    // Protect math so sectioning commands inside it are not treated as splits
    const { protectedContent, mathBlocks } = preserveMath
      ? this.protectMath(documentContent)
      : { protectedContent: documentContent, mathBlocks: [] };

    // Split by sectioning commands
    const sections = this.splitBySections(protectedContent, splitEnvironments);
    let chunkIndex = chunks.length;
    let currentPosition =
      includePreamble && preamble.length > 0 ? preamble.length : 0;

    for (const section of sections) {
      const { environment, starred } = section;
      // Titles were captured from the protected text, so they may hold
      // placeholders too and need the same restoration as the body.
      const title =
        section.title == null
          ? section.title
          : this.restoreMath(section.title, mathBlocks);
      const restoredContent = this.restoreMath(section.content, mathBlocks);

      // Split if content is too large
      const contentChunks = this.splitContent(restoredContent, maxSize, overlap);
      for (const [i, piece] of contentChunks.entries()) {
        let chunkText = piece;
        // Include section command (with its star, if any) in first chunk
        if (i === 0 && title && environment) {
          chunkText = `\\${environment}${starred ? "*" : ""}{${title}}\n${chunkText}`;
        }
        const finalText = trimWhitespace ? chunkText.trim() : chunkText;
        if (finalText.length > 0) {
          chunks.push({
            id: randomUUID(),
            text: finalText,
            metadata: {
              documentId,
              chunkIndex,
              startPosition: currentPosition,
              endPosition: currentPosition + chunkText.length,
              documentType: "latex",
              latexEnvironment: environment ?? undefined,
              header: title ?? undefined,
              custom: metadata,
            },
          });
          chunkIndex++;
        }
        currentPosition += chunkText.length;
      }
    }

    // Update total chunks count
    for (const chunk of chunks) {
      chunk.metadata.totalChunks = chunks.length;
    }
    return chunks;
  }

  /**
   * Replace math with `__MATH_n__` placeholders.
   *
   * Order matters and mirrors LaTeX precedence: named display environments,
   * then `$$…$$`, then `$…$`, then `\[…\]`. A later pattern can therefore wrap
   * a placeholder created by an earlier one (e.g. `$x$` inside `\[…\]`).
   *
   * @param {string} content - Document body.
   * @returns {{ protectedContent: string, mathBlocks: { placeholder: string, content: string }[] }}
   */
  protectMath(content) {
    const mathBlocks = [];
    const stash = (match) => {
      const placeholder = `__MATH_${mathBlocks.length}__`;
      mathBlocks.push({ placeholder, content: match });
      return placeholder;
    };

    let protectedContent = content;
    // Protect display math environments
    for (const env of this.mathEnvironments) {
      // Escape the name: the `*` in "equation*" must match a literal asterisk.
      const name = escapeRegExp(env);
      const envPattern = new RegExp(
        `\\\\begin\\{${name}\\}[\\s\\S]*?\\\\end\\{${name}\\}`,
        "g",
      );
      protectedContent = protectedContent.replace(envPattern, stash);
    }
    // Protect $$ … $$ display math, then $ … $ inline math
    protectedContent = protectedContent.replace(/\$\$[\s\S]*?\$\$/g, stash);
    protectedContent = protectedContent.replace(/\$[^$]+\$/g, stash);
    // Protect \[ \] math
    protectedContent = protectedContent.replace(/\\\[[\s\S]*?\\\]/g, stash);

    return { protectedContent, mathBlocks };
  }

  /**
   * Put the original math back in place of its placeholders.
   *
   * Blocks are restored newest-first because a newer block may contain the
   * placeholder of an older one. A replacer function is used instead of a
   * replacement string so that `$` sequences inside the math (`$$`, `$&`, …)
   * are inserted verbatim rather than interpreted as replacement patterns.
   *
   * @param {string} content - Text possibly containing placeholders.
   * @param {{ placeholder: string, content: string }[]} mathBlocks
   * @returns {string} Text with the math restored.
   */
  restoreMath(content, mathBlocks) {
    let restored = content;
    for (let i = mathBlocks.length - 1; i >= 0; i--) {
      const { placeholder, content: mathContent } = mathBlocks[i];
      restored = restored.replace(placeholder, () => mathContent);
    }
    return restored;
  }

  /**
   * Split LaTeX by sectioning commands
   *
   * Text before the first command becomes a section with `title` and
   * `environment` set to `null`. Titles stop at the first `}`.
   *
   * @param {string} content - Text to split.
   * @param {string[]} splitEnvironments - Command names without the backslash;
   *   the defaults are used when this is empty or not an array.
   * @returns {{ title: string|null, content: string, environment: string|null, starred: boolean }[]}
   */
  splitBySections(content, splitEnvironments) {
    const sections = [];
    const names =
      Array.isArray(splitEnvironments) && splitEnvironments.length > 0
        ? splitEnvironments
        : this.defaultSplitEnvironments;

    // Build pattern for sectioning commands: \name, optional *, {title}
    const envPattern = names.map((name) => escapeRegExp(name)).join("|");
    const sectionPattern = new RegExp(
      `\\\\(${envPattern})(\\*?)\\{([^}]*)\\}`,
      "g",
    );

    let lastIndex = 0;
    let lastTitle = null;
    let lastEnvironment = null;
    let lastStarred = false;

    const pushSection = (raw) => {
      const trimmed = raw.trim();
      if (trimmed) {
        sections.push({
          title: lastTitle,
          content: trimmed,
          environment: lastEnvironment,
          starred: lastStarred,
        });
      }
    };

    for (const match of content.matchAll(sectionPattern)) {
      // Content before this command belongs to the previous heading
      if (match.index > lastIndex) {
        pushSection(content.slice(lastIndex, match.index));
      }
      lastEnvironment = match[1];
      lastStarred = match[2] === "*";
      lastTitle = match[3];
      lastIndex = match.index + match[0].length;
    }

    // Don't forget content after the last section
    if (lastIndex < content.length) {
      pushSection(content.slice(lastIndex));
    }

    // If no sections found, return entire content
    if (sections.length === 0 && content.trim()) {
      sections.push({
        title: null,
        content: content.trim(),
        environment: null,
        starred: false,
      });
    }
    return sections;
  }

  /**
   * Split content that exceeds max size
   *
   * Each piece is at most `maxSize` characters. Within the last 200
   * characters of a piece the cut is moved back to the last paragraph break,
   * or failing that to the first sentence break found in that window.
   *
   * @param {string} content - Text to split.
   * @param {number} maxSize - Maximum piece length; clamped to at least 1.
   * @param {number} overlap - Characters repeated at the start of the next
   *   piece; clamped to `[0, maxSize - 1]`.
   * @returns {string[]} The pieces, in order.
   */
  splitContent(content, maxSize, overlap) {
    const effectiveMaxSize = Math.max(maxSize, 1);
    const effectiveOverlap = Math.min(
      Math.max(overlap, 0),
      effectiveMaxSize - 1,
    );
    if (content.length <= effectiveMaxSize) {
      return [content];
    }

    const chunks = [];
    let start = 0;
    while (start < content.length) {
      let end = Math.min(start + effectiveMaxSize, content.length);

      // Try to break at a natural boundary
      if (end < content.length) {
        const searchStart = Math.max(start, end - 200);
        const searchText = content.slice(searchStart, end);
        let candidate = -1;

        // Look for paragraph break
        const paragraphBreak = searchText.lastIndexOf("\n\n");
        if (paragraphBreak > 0) {
          candidate = searchStart + paragraphBreak;
        } else {
          // Look for sentence break (first one inside the window)
          const sentenceBreak = searchText.search(/[.!?]\s+[A-Z\\]/);
          if (sentenceBreak > 0) {
            candidate = searchStart + sentenceBreak + 1;
          }
        }

        // Accept the boundary only if the next piece would still start past
        // the current one; otherwise a large overlap would make the loop
        // crawl forward one character at a time.
        if (candidate - effectiveOverlap > start) {
          end = candidate;
        }
      }

      chunks.push(content.slice(start, end));

      // Once the end of the content has been emitted we are done; stepping
      // back by the overlap here would only emit redundant tail fragments.
      if (end >= content.length) {
        break;
      }
      start = Math.max(start + 1, end - effectiveOverlap);
    }
    return chunks;
  }

  /**
   * Check a chunker configuration.
   *
   * @param {object} [config] - Same options as accepted by `chunk`.
   * @returns {{ valid: boolean, errors: string[], warnings: string[] }}
   *   `valid` is true when there are no errors; warnings do not affect it.
   */
  validateConfig(config) {
    const errors = [];
    const warnings = [];
    // `chunk` tolerates a missing config, so validation does as well.
    const latexConfig = config ?? {};

    if (latexConfig.maxSize !== undefined && latexConfig.maxSize <= 0) {
      errors.push("maxSize must be greater than 0");
    }
    if (latexConfig.overlap !== undefined && latexConfig.overlap < 0) {
      errors.push("overlap must be non-negative");
    }
    if (
      latexConfig.overlap !== undefined &&
      latexConfig.maxSize !== undefined &&
      latexConfig.overlap >= latexConfig.maxSize
    ) {
      errors.push("overlap must be less than maxSize");
    }
    if (
      latexConfig.splitEnvironments !== undefined &&
      latexConfig.splitEnvironments.length === 0
    ) {
      warnings.push("No split environments specified, using defaults");
    }

    return {
      valid: errors.length === 0,
      errors,
      warnings,
    };
  }
}