@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
195 lines (194 loc) • 6.85 kB
JavaScript
/**
* Chunker Registry
*
* Central registry for all chunking strategies following NeuroLink's registry pattern.
* Provides factory methods for creating chunker instances.
*/
import { SemanticMarkdownChunker } from "../chunkers/SemanticMarkdownChunker.js";
import { CharacterChunker } from "./characterChunker.js";
import { HTMLChunker } from "./htmlChunker.js";
import { JSONChunker } from "./jsonChunker.js";
import { LaTeXChunker } from "./latexChunker.js";
import { MarkdownChunker } from "./markdownChunker.js";
import { RecursiveChunker } from "./recursiveChunker.js";
import { SemanticChunker } from "./semanticChunker.js";
import { SentenceChunker } from "./sentenceChunker.js";
import { TokenChunker } from "./tokenChunker.js";
/**
 * Registry for chunking strategies.
 *
 * Maps a strategy name to a zero-argument factory that builds a new chunker.
 * Built-in strategies are registered lazily, the first time the registry is
 * used, so merely importing this module constructs no chunkers.
 */
export class ChunkerRegistry {
    /** Strategy name -> factory returning a new chunker instance. */
    static chunkers = new Map();
    /** True once the built-in strategies have been registered. */
    static initialized = false;

    /**
     * Register all built-in chunkers. Calling it again is a no-op until
     * `reset()` is invoked.
     */
    static initialize() {
        if (ChunkerRegistry.initialized) {
            return;
        }
        // Flip the flag before registering: register() calls initialize(),
        // and this guard is what stops that from recursing.
        ChunkerRegistry.initialized = true;
        const builtIns = [
            ["character", () => new CharacterChunker()],
            ["recursive", () => new RecursiveChunker()],
            ["sentence", () => new SentenceChunker()],
            ["token", () => new TokenChunker()],
            ["markdown", () => new MarkdownChunker()],
            ["html", () => new HTMLChunker()],
            ["json", () => new JSONChunker()],
            ["latex", () => new LaTeXChunker()],
            ["semantic", () => new SemanticChunker()],
            ["semantic-markdown", () => new SemanticMarkdownChunker()],
        ];
        for (const [strategy, factory] of builtIns) {
            ChunkerRegistry.chunkers.set(strategy, factory);
        }
    }

    /**
     * Register a custom chunker, replacing any existing entry of the same name.
     * @param strategy - Strategy name
     * @param factory - Factory function that creates a chunker instance
     * @throws TypeError if factory is not a function
     */
    static register(strategy, factory) {
        if (typeof factory !== "function") {
            throw new TypeError(`Chunker factory for strategy "${String(strategy)}" must be a function`);
        }
        // Load the built-ins first so that a custom registration made before
        // the first lookup overrides a built-in instead of being overwritten
        // by the lazy initialization later on.
        ChunkerRegistry.initialize();
        ChunkerRegistry.chunkers.set(strategy, factory);
    }

    /**
     * Get a chunker by strategy name. The factory runs on every call, so each
     * call returns whatever the factory produces at that moment.
     * @param strategy - Chunking strategy name
     * @returns Chunker instance
     * @throws Error if strategy is not registered
     */
    static get(strategy) {
        ChunkerRegistry.initialize();
        const factory = ChunkerRegistry.chunkers.get(strategy);
        if (!factory) {
            throw new Error(`Unknown chunking strategy: ${strategy}. Available strategies: ${ChunkerRegistry.getAvailableStrategies().join(", ")}`);
        }
        return factory();
    }

    /**
     * Get all available chunking strategies
     * @returns Array of strategy names, in registration order
     */
    static getAvailableStrategies() {
        ChunkerRegistry.initialize();
        return Array.from(ChunkerRegistry.chunkers.keys());
    }

    /**
     * Check if a strategy is registered
     * @param strategy - Strategy name to check
     * @returns True if strategy is registered
     */
    static has(strategy) {
        ChunkerRegistry.initialize();
        return ChunkerRegistry.chunkers.has(strategy);
    }

    /**
     * Get strategy recommendation based on content type.
     * Checks run from most to least specific; the first match wins.
     * @param contentType - Document type or MIME type
     * @returns Recommended chunking strategy ("recursive" when nothing matches
     * or when contentType is not a string)
     */
    static getRecommendedStrategy(contentType) {
        if (typeof contentType !== "string") {
            // No usable hint: fall back to the general-purpose default.
            return "recursive";
        }
        const normalized = contentType.toLowerCase();
        if (normalized.includes("markdown") || normalized === "md") {
            return "markdown";
        }
        // "htm" also covers "html".
        if (normalized.includes("htm")) {
            return "html";
        }
        if (normalized.includes("json")) {
            return "json";
        }
        // "tex" must be a whole token (bare "tex", "application/tex",
        // "application/x-tex", "text/x-tex", "paper.tex", optionally followed
        // by MIME parameters) so that "text" and words like "vortex" are not
        // mistaken for LaTeX.
        if (normalized.includes("latex") ||
            /(?:^|[/.-])tex(?:$|[;\s])/.test(normalized)) {
            return "latex";
        }
        if (normalized.includes("code") || normalized.includes("programming")) {
            return "recursive";
        }
        if (normalized.includes("document") || normalized.includes("text")) {
            return "sentence";
        }
        // Default to recursive for general text
        return "recursive";
    }

    /**
     * Get default configuration for a strategy.
     * The table is rebuilt on every call, so the returned object (including its
     * nested arrays) is private to the caller and safe to mutate.
     * @param strategy - Chunking strategy
     * @returns Default configuration object; `{ maxSize: 1000 }` for a
     * strategy without specific defaults
     */
    static getDefaultConfig(strategy) {
        const defaults = {
            character: {
                maxSize: 1000,
                overlap: 0,
                separator: "",
                keepSeparator: false,
            },
            recursive: {
                maxSize: 1000,
                overlap: 200,
                separators: ["\n\n", "\n", ". ", " ", ""],
            },
            sentence: {
                maxSize: 1000,
                overlap: 0,
                minSentences: 1,
                sentenceEnders: [".", "!", "?"],
            },
            token: {
                maxTokens: 512,
                tokenOverlap: 50,
                tokenizer: "cl100k_base",
            },
            markdown: {
                maxSize: 1000,
                headerLevels: [1, 2, 3],
                preserveCodeBlocks: true,
                includeHeader: true,
            },
            html: {
                maxSize: 1000,
                splitTags: ["div", "p", "section", "article"],
                extractTextOnly: false,
            },
            json: {
                maxSize: 1000,
                maxDepth: 10,
                includeJsonPath: true,
            },
            latex: {
                maxSize: 1000,
                splitEnvironments: ["section", "subsection", "chapter"],
                preserveMath: true,
            },
            semantic: {
                maxSize: 1000,
                similarityThreshold: 0.7,
                joinThreshold: 100,
            },
            "semantic-markdown": {
                maxSize: 1000,
                overlap: 100,
                similarityThreshold: 0.7,
            },
        };
        // Own-property check: a plain `defaults[strategy]` would return
        // inherited members for names such as "constructor" or "toString".
        return Object.hasOwn(defaults, strategy)
            ? defaults[strategy]
            : { maxSize: 1000 };
    }

    /**
     * Reset the registry (useful for testing). Drops every registration,
     * custom ones included; built-ins are registered again on next use.
     */
    static reset() {
        ChunkerRegistry.chunkers.clear();
        ChunkerRegistry.initialized = false;
    }
}
/**
 * Chunk a piece of text in one call, without handling the registry directly.
 *
 * Looks up the chunker registered for `strategy` and hands it the text.
 * @param text - Text to chunk
 * @param strategy - Chunking strategy (default: "recursive")
 * @param config - Strategy-specific configuration
 * @returns Promise resolving to whatever the chunker's `chunk` call yields;
 * rejects if the strategy is not registered
 */
export async function chunkText(text, strategy = "recursive", config) {
    return ChunkerRegistry.get(strategy).chunk(text, config);
}