UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

422 lines (421 loc) 13.8 kB
/**
 * Chunker Registry
 *
 * Centralized registry for all chunking strategies with metadata
 * and discovery capabilities. Follows the BaseRegistry pattern.
 */
import { BaseRegistry } from "../core/infrastructure/index.js";
import { logger } from "../utils/logger.js";
import { ChunkingError, RAGErrorCodes } from "./errors/RAGError.js";

/**
 * Default chunker metadata entries, keyed by canonical strategy name.
 *
 * Each entry carries a human-readable description, the default
 * configuration, the option names the strategy understands, example use
 * cases and the aliases under which the strategy can also be requested.
 */
const DEFAULT_CHUNKER_METADATA = {
  character: {
    description:
      "Splits text into fixed-size character chunks with optional overlap",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
    },
    supportedOptions: ["maxSize", "overlap", "minSize"],
    useCases: [
      "Simple text processing",
      "Fixed-size chunks needed",
      "Language-agnostic splitting",
    ],
    aliases: ["char", "fixed-size", "fixed"],
  },
  recursive: {
    description:
      "Recursively splits text using ordered separators (paragraphs, sentences, etc.)",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
      separators: ["\n\n", "\n", ". ", " ", ""],
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "separators",
      "keepSeparators",
      "minSize",
    ],
    useCases: [
      "General text documents",
      "Preserving semantic boundaries",
      "Default choice for most use cases",
    ],
    aliases: ["recursive-character", "langchain-default"],
  },
  sentence: {
    description:
      "Splits text by sentence boundaries for semantically meaningful chunks",
    defaultConfig: {
      maxSize: 1000,
      overlap: 1,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "boundaryDetection",
      "maxSentences",
    ],
    useCases: [
      "Q&A applications",
      "Sentence-level analysis",
      "Preserving complete thoughts",
    ],
    aliases: ["sent", "sentence-based"],
  },
  token: {
    description:
      "Splits text by token count using a specific tokenizer (GPT, Claude, etc.)",
    defaultConfig: {
      maxSize: 512,
      overlap: 50,
    },
    supportedOptions: ["maxSize", "overlap", "tokenizer", "maxTokens"],
    useCases: [
      "Token-aware splitting",
      "Optimal for specific models",
      "Precise token budget management",
    ],
    aliases: ["tok", "tokenized"],
  },
  markdown: {
    description: "Splits markdown content by headers and structural elements",
    defaultConfig: {
      maxSize: 1000,
      overlap: 0,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "headerLevels",
      "splitCodeBlocks",
      "preserveMetadata",
    ],
    useCases: [
      "Documentation processing",
      "README files",
      "Technical documentation",
    ],
    aliases: ["md", "markdown-header"],
  },
  html: {
    description:
      "Splits HTML content by semantic tags while optionally stripping markup",
    defaultConfig: {
      maxSize: 1000,
      overlap: 0,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "splitTags",
      "stripTags",
      "preserveAttributes",
    ],
    useCases: ["Web content processing", "HTML documents", "Web scraping"],
    aliases: ["html-tag", "web"],
  },
  json: {
    description:
      "Splits JSON documents by object boundaries and nested structures",
    defaultConfig: {
      maxSize: 1000,
      overlap: 0,
    },
    supportedOptions: ["maxSize", "overlap", "maxDepth", "chunkKeys"],
    useCases: [
      "API response processing",
      "Structured data",
      "Configuration files",
    ],
    aliases: ["json-object", "structured"],
  },
  latex: {
    description:
      "Splits LaTeX documents by sections, environments, and math blocks",
    defaultConfig: {
      maxSize: 1000,
      overlap: 0,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "environments",
      "splitMathBlocks",
      "preserveMetadata",
    ],
    useCases: [
      "Academic papers",
      "Scientific documents",
      "Mathematical content",
    ],
    aliases: ["tex", "latex-section"],
  },
  semantic: {
    description: "Uses LLM to identify semantically meaningful split points",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "modelName",
      "provider",
      "similarityThreshold",
    ],
    useCases: [
      "Advanced semantic understanding",
      "Context-aware splitting",
      "AI-enhanced chunking",
    ],
    aliases: ["llm", "ai-semantic"],
  },
  "semantic-markdown": {
    description:
      "Combines markdown splitting with semantic similarity for intelligent merging",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "similarityThreshold",
      "maxMergeSize",
      "preserveMetadata",
    ],
    useCases: [
      "Context-aware documentation",
      "Knowledge base creation",
      "Semantic search preparation",
    ],
    aliases: ["semantic-md", "smart-markdown"],
  },
};

/**
 * Lazy factories for the built-in chunkers, keyed by canonical strategy name.
 *
 * Every factory dynamically imports its implementation module on first call,
 * so a chunker's code is only loaded when that chunker is requested. The
 * import specifiers are kept as string literals so static tooling can still
 * see them. Key order matches DEFAULT_CHUNKER_METADATA and is the
 * registration order used by registerAll().
 */
const DEFAULT_CHUNKER_LOADERS = {
  character: async () => {
    const { CharacterChunker } = await import("./chunkers/CharacterChunker.js");
    return new CharacterChunker();
  },
  recursive: async () => {
    const { RecursiveChunker } = await import("./chunkers/RecursiveChunker.js");
    return new RecursiveChunker();
  },
  sentence: async () => {
    const { SentenceChunker } = await import("./chunkers/SentenceChunker.js");
    return new SentenceChunker();
  },
  token: async () => {
    const { TokenChunker } = await import("./chunkers/TokenChunker.js");
    return new TokenChunker();
  },
  markdown: async () => {
    const { MarkdownChunker } = await import("./chunkers/MarkdownChunker.js");
    return new MarkdownChunker();
  },
  html: async () => {
    const { HTMLChunker } = await import("./chunkers/HTMLChunker.js");
    return new HTMLChunker();
  },
  json: async () => {
    const { JSONChunker } = await import("./chunkers/JSONChunker.js");
    return new JSONChunker();
  },
  latex: async () => {
    const { LaTeXChunker } = await import("./chunkers/LaTeXChunker.js");
    return new LaTeXChunker();
  },
  // NOTE(review): this module lives under ./chunking/ while every other
  // chunker is under ./chunkers/ — path kept as-is; confirm it is intentional.
  semantic: async () => {
    const { SemanticChunker } = await import("./chunking/semanticChunker.js");
    return new SemanticChunker();
  },
  "semantic-markdown": async () => {
    const { SemanticMarkdownChunker } = await import(
      "./chunkers/SemanticMarkdownChunker.js"
    );
    return new SemanticMarkdownChunker();
  },
};

/**
 * Chunker Registry
 *
 * Manages registration and discovery of all chunking strategies.
 * Extends BaseRegistry for consistent lifecycle management.
 *
 * On top of what BaseRegistry provides, this class keeps its own
 * alias -> strategy map so a chunker can be requested by any of its aliases.
 */
export class ChunkerRegistry extends BaseRegistry {
  /** Lazily created singleton instance, or null when none exists. */
  static instance = null;

  /** Maps lower-cased alias -> canonical strategy name. */
  aliasMap = new Map();

  constructor() {
    super();
  }

  /**
   * Get singleton instance, creating it on first access.
   *
   * @returns {ChunkerRegistry} The shared registry instance.
   */
  static getInstance() {
    if (!ChunkerRegistry.instance) {
      ChunkerRegistry.instance = new ChunkerRegistry();
    }
    return ChunkerRegistry.instance;
  }

  /**
   * Reset singleton (for testing).
   *
   * Clears the current instance and drops the static reference so the next
   * getInstance() call builds a fresh registry. Note that the module-level
   * `chunkerRegistry` export keeps pointing at the old (now cleared) instance.
   */
  static resetInstance() {
    if (ChunkerRegistry.instance) {
      ChunkerRegistry.instance.clear();
      ChunkerRegistry.instance = null;
    }
  }

  /**
   * Register all default chunkers.
   *
   * Registers one lazy factory per built-in strategy together with its
   * default metadata, in the order the strategies are declared.
   *
   * @returns {Promise<void>}
   */
  async registerAll() {
    for (const [strategy, loadChunker] of Object.entries(
      DEFAULT_CHUNKER_LOADERS,
    )) {
      this.registerChunker(
        strategy,
        loadChunker,
        DEFAULT_CHUNKER_METADATA[strategy],
      );
    }
    logger.debug(
      `[ChunkerRegistry] Registered ${this.items.size} chunking strategies`,
    );
  }

  /**
   * Register a chunker with aliases.
   *
   * @param {string} strategy - Canonical strategy name.
   * @param {Function} factory - Async factory that resolves to a chunker instance.
   * @param {object} [metadata={}] - Strategy metadata; `metadata.aliases`
   *   (optional array of strings) lists alternative names for the strategy.
   */
  registerChunker(strategy, factory, metadata = {}) {
    const aliases = metadata.aliases ?? [];
    this.register(strategy, factory, aliases, { metadata });
    // Register aliases in local alias map for strategy resolution.
    // Keys are lower-cased so alias lookup is case-insensitive.
    for (const alias of aliases) {
      this.aliasMap.set(alias.toLowerCase(), strategy);
      logger.debug(
        `[ChunkerRegistry] Registered alias '${alias}' -> '${strategy}'`,
      );
    }
  }

  /**
   * Resolve strategy name from a strategy name or alias.
   *
   * Lookup order: exact registered name, lower-cased registered name,
   * lower-cased alias. The exact-name check lets strategies registered with a
   * mixed-case id be resolved by their own name.
   *
   * @param {string} nameOrAlias - Strategy name or alias to resolve.
   * @returns {string} The canonical strategy name.
   * @throws {ChunkingError} With code CHUNKING_STRATEGY_NOT_FOUND when nothing
   *   matches (including when `nameOrAlias` is not a string).
   */
  resolveStrategy(nameOrAlias) {
    if (typeof nameOrAlias === "string") {
      // Check if it's a direct strategy name (exactly as registered)
      if (this.items.has(nameOrAlias)) {
        return nameOrAlias;
      }
      const lower = nameOrAlias.toLowerCase();
      if (this.items.has(lower)) {
        return lower;
      }
      // Check aliases
      const resolved = this.aliasMap.get(lower);
      if (resolved) {
        return resolved;
      }
    }
    const availableStrategies = this.list().map((item) => item.id);
    throw new ChunkingError(
      `Unknown chunking strategy: '${String(nameOrAlias)}'. Available strategies: ${availableStrategies.join(", ")}`,
      {
        code: RAGErrorCodes.CHUNKING_STRATEGY_NOT_FOUND,
        details: {
          requestedStrategy: nameOrAlias,
          availableStrategies,
        },
      },
    );
  }

  /**
   * Get a chunker by strategy name or alias.
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @returns {Promise<object>} The chunker returned by the registry.
   * @throws {ChunkingError} When the name cannot be resolved or the registry
   *   yields no chunker for the resolved strategy.
   */
  async getChunker(strategyOrAlias) {
    await this.ensureInitialized();
    const strategy = this.resolveStrategy(strategyOrAlias);
    const chunker = await this.get(strategy);
    if (!chunker) {
      throw new ChunkingError(`Chunker not found: ${strategy}`, {
        code: RAGErrorCodes.CHUNKING_STRATEGY_NOT_FOUND,
        details: { strategy },
      });
    }
    return chunker;
  }

  /**
   * Get list of available chunker strategies.
   *
   * @returns {Promise<string[]>} Ids of all registered strategies.
   */
  async getAvailableChunkers() {
    await this.ensureInitialized();
    return this.list().map((item) => item.id);
  }

  /**
   * Get metadata for a specific chunker.
   *
   * Synchronous: it does not call ensureInitialized(), so it only sees
   * strategies that are already registered.
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @returns {object|undefined} The `metadata` of the matching list() entry.
   * @throws {ChunkingError} When the name cannot be resolved.
   */
  getChunkerMetadata(strategyOrAlias) {
    const strategy = this.resolveStrategy(strategyOrAlias);
    const entry = this.list().find((item) => item.id === strategy);
    return entry?.metadata;
  }

  /**
   * Get all aliases for a strategy.
   *
   * Aliases currently registered in the alias map win, so custom chunkers
   * are covered; these are reported in the lower-cased form used for lookup.
   * If none are registered for the strategy, the built-in default aliases
   * are returned. The result is always a fresh array.
   *
   * @param {string} strategy - Canonical strategy name (aliases are not resolved).
   * @returns {string[]} Aliases for the strategy, or an empty array.
   */
  getAliasesForStrategy(strategy) {
    const registered = [];
    for (const [alias, target] of this.aliasMap) {
      if (target === strategy) {
        registered.push(alias);
      }
    }
    if (registered.length > 0) {
      return registered;
    }
    const defaults = DEFAULT_CHUNKER_METADATA[strategy]?.aliases ?? [];
    return [...defaults];
  }

  /**
   * Get all registered aliases.
   *
   * @returns {Map<string, string>} A copy of the alias -> strategy map.
   */
  getAllAliases() {
    return new Map(this.aliasMap);
  }

  /**
   * Check if a strategy or alias exists.
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @returns {boolean} True when resolveStrategy() succeeds, false when it throws.
   */
  hasChunker(strategyOrAlias) {
    try {
      this.resolveStrategy(strategyOrAlias);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Get chunkers by use case.
   *
   * Case-insensitive substring match against the use cases of the built-in
   * default metadata (custom-registered chunkers are not searched).
   *
   * @param {string} useCase - Text to search for.
   * @returns {string[]} Names of the built-in strategies with a matching use case.
   */
  getChunkersByUseCase(useCase) {
    const needle = useCase.toLowerCase();
    return Object.entries(DEFAULT_CHUNKER_METADATA)
      .filter(
        ([, metadata]) =>
          metadata.useCases?.some((uc) =>
            uc.toLowerCase().includes(needle),
          ) ?? false,
      )
      .map(([strategy]) => strategy);
  }

  /**
   * Get default configuration for a chunker.
   *
   * Returns a copy (top-level properties, with array values cloned) so that
   * callers adjusting the result cannot alter the shared defaults.
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @returns {object|undefined} Copy of the default config, if any.
   * @throws {ChunkingError} When the name cannot be resolved.
   */
  getDefaultConfig(strategyOrAlias) {
    const defaults = this.getChunkerMetadata(strategyOrAlias)?.defaultConfig;
    if (defaults === null || typeof defaults !== "object") {
      return defaults;
    }
    return Object.fromEntries(
      Object.entries(defaults).map(([key, value]) => [
        key,
        Array.isArray(value) ? [...value] : value,
      ]),
    );
  }

  /**
   * Clear the registry (also clears aliases)
   */
  clear() {
    super.clear();
    this.aliasMap.clear();
  }
}

/**
 * Global chunker registry singleton
 */
export const chunkerRegistry = ChunkerRegistry.getInstance();

/**
 * Convenience function to get available chunkers
 *
 * @returns {Promise<string[]>} Ids of all registered strategies.
 */
export async function getAvailableChunkers() {
  return chunkerRegistry.getAvailableChunkers();
}

/**
 * Convenience function to get chunker by strategy
 *
 * @param {string} strategyOrAlias - Strategy name or alias.
 * @returns {Promise<object>} The chunker from the global registry.
 */
export async function getChunker(strategyOrAlias) {
  return chunkerRegistry.getChunker(strategyOrAlias);
}

/**
 * Convenience function to get chunker metadata
 *
 * @param {string} strategyOrAlias - Strategy name or alias.
 * @returns {object|undefined} Metadata from the global registry.
 */
export function getChunkerMetadata(strategyOrAlias) {
  return chunkerRegistry.getChunkerMetadata(strategyOrAlias);
}