@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications...
321 lines • 12.3 kB
JavaScript
/**
* Chunker Factory
*
* Factory for creating chunker instances with configuration.
* Follows the BaseFactory pattern for consistent lifecycle management.
*/
import { BaseFactory } from "../core/infrastructure/index.js";
import { logger } from "../utils/logger.js";
import { ChunkingError, RAGErrorCodes } from "./errors/RAGError.js";
/**
 * Built-in chunker metadata, keyed by strategy name.
 *
 * Every entry has the same five fields, in this order:
 *   - description:      human-readable summary of the strategy
 *   - defaultConfig:    default configuration values for the strategy
 *   - supportedOptions: option names the strategy lists as supported
 *   - useCases:         example scenarios (matched by use-case lookups)
 *   - aliases:          alternative names registered for the strategy
 */
const DEFAULT_CHUNKER_METADATA = {
  // Fixed-size character windows.
  character: {
    description:
      "Splits text into fixed-size character chunks with optional overlap",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "minSize",
    ],
    useCases: [
      "Simple text processing",
      "Fixed-size chunks needed",
    ],
    aliases: [
      "char",
      "fixed-size",
      "fixed",
    ],
  },

  // Separator-driven recursive splitting; separators are listed in order.
  recursive: {
    description:
      "Recursively splits text using ordered separators",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
      separators: [
        "\n\n",
        "\n",
        ". ",
        " ",
        "",
      ],
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "separators",
      "keepSeparators",
    ],
    useCases: [
      "General text documents",
      "Default choice",
    ],
    aliases: [
      "recursive-character",
      "langchain-default",
    ],
  },

  // Sentence-boundary splitting.
  sentence: {
    description:
      "Splits text by sentence boundaries",
    defaultConfig: {
      maxSize: 1000,
      overlap: 1,
    },
    supportedOptions: ["maxSize", "overlap", "boundaryDetection", "maxSentences"],
    useCases: [
      "Q&A applications",
      "Sentence-level analysis",
    ],
    aliases: [
      "sent",
      "sentence-based",
    ],
  },

  // Token-count based splitting.
  token: {
    description:
      "Splits text by token count using a specific tokenizer",
    defaultConfig: {
      maxSize: 512,
      overlap: 50,
    },
    supportedOptions: [
      "maxSize",
      "overlap",
      "tokenizer",
      "maxTokens",
    ],
    useCases: [
      "Token-aware splitting",
      "Model-specific chunks",
    ],
    aliases: [
      "tok",
      "tokenized",
    ],
  },

  // Markdown structure-aware splitting.
  markdown: {
    description:
      "Splits markdown content by headers and structural elements",
    defaultConfig: {
      maxSize: 1000,
      overlap: 50,
    },
    supportedOptions: [
      "maxSize",
      "headerLevels",
      "splitCodeBlocks",
    ],
    useCases: [
      "Documentation processing",
      "README files",
    ],
    aliases: [
      "md",
      "markdown-header",
    ],
  },

  // HTML tag-aware splitting.
  html: {
    description:
      "Splits HTML content by semantic tags",
    defaultConfig: {
      maxSize: 1000,
      overlap: 0,
    },
    supportedOptions: ["maxSize", "splitTags", "stripTags", "preserveAttributes"],
    useCases: [
      "Web content processing",
      "HTML documents",
    ],
    aliases: [
      "html-tag",
      "web",
    ],
  },

  // JSON object-boundary splitting.
  json: {
    description:
      "Splits JSON documents by object boundaries",
    defaultConfig: {
      maxSize: 1000,
      overlap: 0,
    },
    supportedOptions: [
      "maxSize",
      "maxDepth",
      "chunkKeys",
    ],
    useCases: [
      "API response processing",
      "Structured data",
    ],
    aliases: [
      "json-object",
      "structured",
    ],
  },

  // LaTeX section/environment splitting.
  latex: {
    description:
      "Splits LaTeX documents by sections and environments",
    defaultConfig: {
      maxSize: 1000,
      overlap: 0,
    },
    supportedOptions: [
      "maxSize",
      "environments",
      "splitMathBlocks",
    ],
    useCases: [
      "Academic papers",
      "Scientific documents",
    ],
    aliases: [
      "tex",
      "latex-section",
    ],
  },

  // LLM-assisted semantic splitting.
  semantic: {
    description:
      "Uses LLM to identify semantically meaningful split points",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
    },
    supportedOptions: ["maxSize", "modelName", "provider", "similarityThreshold"],
    useCases: [
      "Advanced semantic understanding",
      "AI-enhanced chunking",
    ],
    aliases: [
      "llm",
      "ai-semantic",
    ],
  },

  // Markdown splitting combined with semantic similarity.
  "semantic-markdown": {
    description:
      "Combines markdown splitting with semantic similarity",
    defaultConfig: {
      maxSize: 1000,
      overlap: 100,
    },
    supportedOptions: [
      "maxSize",
      "similarityThreshold",
      "maxMergeSize",
    ],
    useCases: [
      "Context-aware documentation",
      "Knowledge bases",
    ],
    aliases: [
      "semantic-md",
      "smart-markdown",
    ],
  },
};
/**
 * Chunker Factory
 *
 * Registry of chunking strategies. Each strategy is registered under a
 * canonical name plus optional aliases, together with descriptive metadata.
 * Chunker modules are loaded with dynamic `import()` only when a chunker is
 * actually created, which avoids circular dependencies at module load time.
 *
 * Name/alias storage and lookup (`register`, `resolveName`, `has`, `create`,
 * `getAvailable`, `getAliases`, `ensureInitialized`, `clear`, `items`) are
 * inherited from BaseFactory.
 */
export class ChunkerFactory extends BaseFactory {
  /** Lazily created singleton; managed by getInstance() / resetInstance(). */
  static instance = null;

  /** Canonical strategy name -> metadata object given at registration time. */
  metadataMap = new Map();

  constructor() {
    super();
  }

  /**
   * Get the singleton instance, creating it on first use.
   *
   * @returns {ChunkerFactory} The shared factory instance.
   */
  static getInstance() {
    if (!ChunkerFactory.instance) {
      ChunkerFactory.instance = new ChunkerFactory();
    }
    return ChunkerFactory.instance;
  }

  /**
   * Reset the singleton (for testing).
   *
   * Clears the current instance's registrations and metadata, then drops the
   * static reference so the next getInstance() call builds a fresh factory.
   * Does nothing when no instance exists yet.
   */
  static resetInstance() {
    const current = ChunkerFactory.instance;
    if (!current) {
      return;
    }
    current.clear();
    ChunkerFactory.instance = null;
  }

  /**
   * Register all default chunkers.
   *
   * Each built-in strategy is registered with a factory that imports its
   * module on demand and instantiates the named export with the caller's
   * config. The `import()` specifiers stay literal so bundlers can resolve them.
   *
   * NOTE(review): presumably invoked by BaseFactory.ensureInitialized() —
   * confirm against BaseFactory.
   *
   * @returns {Promise<void>}
   */
  async registerAll() {
    // [strategy name, exported class name, lazy module loader]
    const builtIns = [
      ["character", "CharacterChunker", () => import("./chunkers/CharacterChunker.js")],
      ["recursive", "RecursiveChunker", () => import("./chunkers/RecursiveChunker.js")],
      ["sentence", "SentenceChunker", () => import("./chunkers/SentenceChunker.js")],
      ["token", "TokenChunker", () => import("./chunkers/TokenChunker.js")],
      ["markdown", "MarkdownChunker", () => import("./chunkers/MarkdownChunker.js")],
      ["html", "HTMLChunker", () => import("./chunkers/HTMLChunker.js")],
      ["json", "JSONChunker", () => import("./chunkers/JSONChunker.js")],
      ["latex", "LaTeXChunker", () => import("./chunkers/LaTeXChunker.js")],
      // TODO: Implement dedicated SemanticChunker with LLM support.
      // Placeholder: "semantic" currently falls back to RecursiveChunker.
      ["semantic", "RecursiveChunker", () => import("./chunkers/RecursiveChunker.js")],
      ["semantic-markdown", "SemanticMarkdownChunker", () => import("./chunkers/SemanticMarkdownChunker.js")],
    ];
    for (const [strategy, exportName, loadModule] of builtIns) {
      this.registerChunker(strategy, async (config) => {
        const { [exportName]: Chunker } = await loadModule();
        return new Chunker(config);
      }, DEFAULT_CHUNKER_METADATA[strategy]);
    }
    logger.debug(`[ChunkerFactory] Registered ${this.items.size} chunking strategies`);
  }

  /**
   * Register a chunker with metadata and aliases.
   *
   * @param {string} strategy - Canonical strategy name.
   * @param {Function} factory - Async function `(config) => chunker`.
   * @param {object} [metadata={}] - Descriptive metadata; `metadata.aliases`
   *   (optional) lists alternative names for the strategy. Omitting metadata
   *   registers the strategy with no aliases instead of throwing a TypeError.
   */
  registerChunker(strategy, factory, metadata = {}) {
    // Keep a local copy of the mapping so metadata lookups stay in this class.
    this.metadataMap.set(strategy, metadata);
    this.register(strategy, factory, metadata.aliases, { metadata });
    logger.debug(`[ChunkerFactory] Registered chunker '${strategy}' with aliases: ${metadata.aliases?.join(", ") ?? "none"}`);
  }

  /**
   * Create a chunker by strategy name or alias.
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @param {object} config - Configuration passed unchanged to the chunker factory.
   * @returns {Promise<*>} The chunker produced by the registered factory.
   * @throws {ChunkingError} With code CHUNKING_STRATEGY_NOT_FOUND when the
   *   name does not resolve to a registered strategy, or CHUNKING_ERROR when
   *   the registered factory fails (the original error is attached as `cause`
   *   when it is an Error instance).
   */
  async createChunker(strategyOrAlias, config) {
    await this.ensureInitialized();
    const resolvedName = this.resolveName(strategyOrAlias);
    if (!this.has(resolvedName)) {
      const available = this.getAvailable();
      throw new ChunkingError(`Unknown chunking strategy: '${strategyOrAlias}'. Available strategies: ${available.join(", ")}`, {
        code: RAGErrorCodes.CHUNKING_STRATEGY_NOT_FOUND,
        details: {
          requestedStrategy: strategyOrAlias,
          availableStrategies: available,
        },
      });
    }
    try {
      const chunker = await this.create(resolvedName, config);
      logger.debug(`[ChunkerFactory] Created chunker '${resolvedName}' with config:`, config);
      return chunker;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new ChunkingError(`Failed to create chunker '${resolvedName}': ${reason}`, {
        code: RAGErrorCodes.CHUNKING_ERROR,
        cause: error instanceof Error ? error : undefined,
        details: { strategy: resolvedName, config },
      });
    }
  }

  /**
   * Get metadata for a chunker.
   *
   * Synchronous: unlike createChunker() this does not call
   * ensureInitialized(), so only strategies registered so far are visible.
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @returns {object|undefined} The metadata object supplied at registration,
   *   or `undefined` when the name resolves to nothing registered.
   */
  getChunkerMetadata(strategyOrAlias) {
    return this.metadataMap.get(this.resolveName(strategyOrAlias));
  }

  /**
   * Get default configuration for a chunker.
   *
   * Returns a copy (array values are copied too) so callers can adjust the
   * result without altering the registered defaults. For the built-in
   * strategies those defaults are members of a module-level constant, so a
   * mutation would otherwise leak to every consumer and outlive
   * resetInstance().
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @returns {object|undefined} Copy of the strategy's `defaultConfig`, or
   *   `undefined` when no metadata or no default config is registered.
   */
  getDefaultConfig(strategyOrAlias) {
    const defaults = this.getChunkerMetadata(strategyOrAlias)?.defaultConfig;
    if (defaults === null || typeof defaults !== "object") {
      // Nothing to copy (missing or primitive): hand back the value as stored.
      return defaults;
    }
    return Object.fromEntries(
      Object.entries(defaults).map(([key, value]) => [
        key,
        Array.isArray(value) ? [...value] : value,
      ]),
    );
  }

  /**
   * Get available chunking strategies (not including aliases).
   *
   * @returns {Promise<string[]>} Result of getAvailable() after initialization.
   */
  async getAvailableStrategies() {
    await this.ensureInitialized();
    return this.getAvailable();
  }

  /**
   * Get all aliases mapped to their strategies.
   *
   * @returns {*} Whatever BaseFactory's getAliases() returns.
   */
  getStrategyAliases() {
    return this.getAliases();
  }

  /**
   * Check if a strategy exists.
   *
   * Synchronous; does not call ensureInitialized().
   *
   * @param {string} strategyOrAlias - Strategy name or alias.
   * @returns {boolean} Result of has() for the resolved name.
   */
  hasStrategy(strategyOrAlias) {
    return this.has(this.resolveName(strategyOrAlias));
  }

  /**
   * Get chunkers suitable for a use case.
   *
   * A strategy matches when any of its metadata `useCases` contains the given
   * text, compared case-insensitively. Strategies without `useCases` never match.
   *
   * @param {string} useCase - Text to search for.
   * @returns {string[]} Matching strategy names in registration order.
   */
  getChunkersForUseCase(useCase) {
    const needle = useCase.toLowerCase();
    const matches = [];
    for (const [strategy, metadata] of this.metadataMap) {
      if (metadata.useCases?.some((candidate) => candidate.toLowerCase().includes(needle))) {
        matches.push(strategy);
      }
    }
    return matches;
  }

  /**
   * Get all chunker metadata.
   *
   * @returns {Map<string, object>} A new Map with the same entries as the
   *   internal one; the metadata objects themselves are not copied.
   */
  getAllMetadata() {
    return new Map(this.metadataMap);
  }

  /**
   * Clear factory and metadata.
   */
  clear() {
    super.clear();
    this.metadataMap.clear();
  }
}
/**
* Global chunker factory singleton.
*
* Obtained once, when this module is evaluated, via
* `ChunkerFactory.getInstance()`. Because this is a `const` binding, a later
* `ChunkerFactory.resetInstance()` does not rebind it: this export keeps
* referring to the instance that was cleared, while `getInstance()` will hand
* out a new one.
*/
export const chunkerFactory = ChunkerFactory.getInstance();
/**
* Convenience function to create a chunker.
*
* Delegates to `chunkerFactory.createChunker()` on the module-level factory.
*
* @param {string} strategyOrAlias - Strategy name or alias identifying the chunker.
* @param {object} config - Configuration forwarded unchanged to the factory.
* @returns {Promise<*>} Resolves to the chunker returned by the factory.
* @throws {ChunkingError} Propagated from the factory when the strategy is
*   unknown or chunker creation fails.
*/
export async function createChunker(strategyOrAlias, config) {
return chunkerFactory.createChunker(strategyOrAlias, config);
}
/**
* Convenience function to get available strategies.
*
* Delegates to `chunkerFactory.getAvailableStrategies()`, which waits for the
* factory's `ensureInitialized()` before listing strategies.
*
* @returns {Promise<*>} Resolves to the factory's list of available strategy
*   names (documented there as not including aliases).
*/
export async function getAvailableStrategies() {
return chunkerFactory.getAvailableStrategies();
}
/**
* Convenience function to get chunker metadata.
*
* Delegates to `chunkerFactory.getChunkerMetadata()`. This is synchronous and
* the factory method does not call `ensureInitialized()`, so only strategies
* that have already been registered are visible.
*
* @param {string} strategyOrAlias - Strategy name or alias.
* @returns {object|undefined} The metadata registered for the resolved
*   strategy, or `undefined` when none is registered under that name.
*/
export function getChunkerMetadata(strategyOrAlias) {
return chunkerFactory.getChunkerMetadata(strategyOrAlias);
}
/**
* Convenience function to get default config.
*
* Delegates to `chunkerFactory.getDefaultConfig()`. This is synchronous and
* the factory method does not call `ensureInitialized()`, so only strategies
* that have already been registered are visible.
*
* @param {string} strategyOrAlias - Strategy name or alias.
* @returns {object|undefined} The `defaultConfig` from the strategy's
*   metadata, or `undefined` when no metadata is registered under that name.
*/
export function getDefaultConfig(strategyOrAlias) {
return chunkerFactory.getDefaultConfig(strategyOrAlias);
}
//# sourceMappingURL=ChunkerFactory.js.map