@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
363 lines (362 loc) • 11.3 kB
JavaScript
/**
* Metadata Extractor Registry
*
* Centralized registry for all metadata extractor implementations with metadata
* and discovery capabilities. Follows the BaseRegistry pattern.
*/
import { BaseRegistry } from "../../core/infrastructure/index.js";
import { logger } from "../../utils/logger.js";
import { MetadataExtractionError, RAGErrorCodes } from "../errors/RAGError.js";
/**
 * Built-in extractor descriptors, keyed by extractor type.
 *
 * Each descriptor carries:
 *  - description:      human-readable summary of the extractor
 *  - defaultConfig:    configuration values used when none are supplied
 *  - supportedOptions: option names the extractor accepts
 *  - useCases:         free-text scenarios, searched by substring
 *  - aliases:          alternative names that resolve to the type
 *  - requiresModel:    whether the extractor needs a model
 *  - extractionTypes:  kinds of metadata the extractor can produce
 *
 * Key order matters: entries are registered in the order listed here.
 */
const DEFAULT_EXTRACTOR_METADATA = {
  // General-purpose extractor covering every extraction type
  llm: {
    description: "Full LLM-powered metadata extraction supporting all extraction types",
    defaultConfig: { provider: "openai", modelName: "gpt-4o-mini", temperature: 0.3 },
    supportedOptions: ["provider", "modelName", "promptTemplate", "maxTokens", "temperature"],
    useCases: [
      "Comprehensive metadata extraction",
      "Multi-type extraction in single pass",
      "Custom schema extraction",
    ],
    aliases: ["full", "comprehensive", "all"],
    requiresModel: true,
    extractionTypes: ["title", "summary", "keywords", "questions", "custom"],
  },

  // Title generation
  title: {
    description: "Extracts concise, descriptive titles from document content",
    defaultConfig: { provider: "openai", modelName: "gpt-4o-mini", maxTokens: 100 },
    supportedOptions: ["provider", "modelName", "promptTemplate", "maxTokens"],
    useCases: ["Document indexing", "Content organization", "Navigation systems"],
    aliases: ["header", "heading"],
    requiresModel: true,
    extractionTypes: ["title"],
  },

  // Chunk summarisation
  summary: {
    description: "Generates concise summaries of document chunks",
    defaultConfig: { provider: "openai", modelName: "gpt-4o-mini", maxTokens: 200 },
    supportedOptions: ["provider", "modelName", "promptTemplate", "maxTokens", "maxWords"],
    useCases: ["Document previews", "Search result snippets", "Content condensation"],
    aliases: ["summarize", "abstract"],
    requiresModel: true,
    extractionTypes: ["summary"],
  },

  // Keyword / key-phrase extraction
  keywords: {
    description: "Extracts key terms and phrases from content",
    defaultConfig: { provider: "openai", modelName: "gpt-4o-mini", maxTokens: 100 },
    supportedOptions: ["provider", "modelName", "promptTemplate", "maxKeywords"],
    useCases: ["Tag generation", "Topic modeling", "Search optimization"],
    aliases: ["tags", "terms", "keyphrase"],
    requiresModel: true,
    extractionTypes: ["keywords"],
  },

  // Question (and optional answer) generation
  questions: {
    description: "Generates Q&A pairs from content for training or FAQs",
    defaultConfig: { provider: "openai", modelName: "gpt-4o-mini", maxTokens: 500 },
    supportedOptions: [
      "provider",
      "modelName",
      "promptTemplate",
      "numQuestions",
      "includeAnswers",
    ],
    useCases: ["FAQ generation", "Training data creation", "Knowledge base building"],
    aliases: ["qa", "faq", "questions-answers"],
    requiresModel: true,
    extractionTypes: ["questions"],
  },

  // Schema-driven structured extraction
  custom: {
    description: "Extracts structured data according to custom schema",
    defaultConfig: { provider: "openai", modelName: "gpt-4o-mini", maxTokens: 500 },
    supportedOptions: ["provider", "modelName", "promptTemplate", "schema", "description"],
    useCases: ["Structured data extraction", "Entity extraction", "Custom field extraction"],
    aliases: ["schema", "structured", "entity"],
    requiresModel: true,
    extractionTypes: ["custom"],
  },

  // Several extraction types combined
  composite: {
    description: "Combines multiple extraction types in a single pass",
    defaultConfig: { provider: "openai", modelName: "gpt-4o-mini" },
    supportedOptions: ["provider", "modelName", "extractors"],
    useCases: ["Multi-field extraction", "Complete document processing", "Pipeline integration"],
    aliases: ["multi", "combined", "batch"],
    requiresModel: true,
    extractionTypes: ["title", "summary", "keywords", "questions", "custom"],
  },
};
/**
 * Metadata Extractor Registry
 *
 * Singleton registry mapping extractor type ids (and their aliases) to
 * factories that build extractor wrappers. Storage and lifecycle come from
 * BaseRegistry; this class relies on its `items`, `register`, `get`, `list`,
 * `clear` and `ensureInitialized` members.
 */
export class MetadataExtractorRegistry extends BaseRegistry {
  /** Shared instance; stays null until getInstance() is first called. */
  static instance = null;
  /** Lower-cased alias -> extractor type id it resolves to. */
  aliasMap = new Map();
  constructor() {
    super();
  }

  /**
   * Get the singleton instance, creating it on first use.
   *
   * @returns {MetadataExtractorRegistry} The shared registry.
   */
  static getInstance() {
    if (!MetadataExtractorRegistry.instance) {
      MetadataExtractorRegistry.instance = new MetadataExtractorRegistry();
    }
    return MetadataExtractorRegistry.instance;
  }

  /**
   * Reset the singleton (for testing).
   *
   * Clears the current instance, if any, and drops the class-level reference
   * so the next getInstance() call builds a fresh registry.
   */
  static resetInstance() {
    if (MetadataExtractorRegistry.instance) {
      MetadataExtractorRegistry.instance.clear();
      MetadataExtractorRegistry.instance = null;
    }
  }

  /**
   * Register all built-in extractors.
   *
   * Loads `./metadataExtractor.js` on demand and registers one entry per key
   * of DEFAULT_EXTRACTOR_METADATA. Every built-in type is backed by the same
   * LLMMetadataExtractor class.
   *
   * @returns {Promise<void>}
   */
  async registerAll() {
    const { LLMMetadataExtractor } = await import("./metadataExtractor.js");
    for (const [type, metadata] of Object.entries(DEFAULT_EXTRACTOR_METADATA)) {
      this.registerExtractor(type, async () => this.createExtractorInstance(LLMMetadataExtractor, type), metadata);
    }
    logger.debug(`[MetadataExtractorRegistry] Registered ${this.items.size} extractor types`);
  }

  /**
   * Create an extractor wrapper around a new instance of `ExtractorClass`.
   *
   * The wrapper records `type` and forwards `extract` calls; `type` itself is
   * not passed on to the wrapped extractor.
   *
   * @param {Function} ExtractorClass - Constructor invoked with no arguments.
   * @param {string} type - Extractor type id exposed on the wrapper.
   * @returns {{type: string, extract: Function}} Wrapper object.
   */
  createExtractorInstance(ExtractorClass, type) {
    const extractor = new ExtractorClass();
    return {
      type,
      async extract(chunks, params) {
        // Always hand the extractor an object, even when params is omitted
        return extractor.extract(chunks, params ?? {});
      },
    };
  }

  /**
   * Register an extractor together with its aliases.
   *
   * @param {string} type - Extractor type id (stored exactly as given).
   * @param {Function} factory - Factory producing the extractor.
   * @param {object} metadata - Descriptor; `aliases` is optional.
   */
  registerExtractor(type, factory, metadata) {
    // Missing aliases means "none"; previously this threw after register()
    // had already run, leaving the entry half-registered.
    const aliases = metadata?.aliases ?? [];
    this.register(type, factory, aliases, { metadata });
    // Aliases are stored lower-cased so lookups are case-insensitive
    for (const alias of aliases) {
      this.aliasMap.set(alias.toLowerCase(), type);
      logger.debug(`[MetadataExtractorRegistry] Registered alias '${alias}' -> '${type}'`);
    }
  }

  /**
   * Resolve a type id or alias to the registered type id.
   *
   * Lookup order: exact id, lower-cased id, lower-cased alias.
   *
   * @param {string} nameOrAlias - Type id or alias.
   * @returns {string} The registered type id.
   * @throws {MetadataExtractionError} With code METADATA_EXTRACTOR_NOT_FOUND
   *   when nothing matches, including when the input is not a string.
   */
  resolveType(nameOrAlias) {
    if (typeof nameOrAlias === "string") {
      // Exact match first: type ids are stored verbatim, so an id registered
      // with upper-case letters is only reachable this way.
      if (this.items.has(nameOrAlias)) {
        return nameOrAlias;
      }
      const lower = nameOrAlias.toLowerCase();
      if (this.items.has(lower)) {
        return lower;
      }
      const resolved = this.aliasMap.get(lower);
      if (resolved !== undefined) {
        return resolved;
      }
    }
    const availableTypes = this.getAvailableExtractors();
    // String() keeps the message buildable for any input, Symbols included
    throw new MetadataExtractionError(`Unknown metadata extractor type: '${String(nameOrAlias)}'. Available types: ${availableTypes.join(", ")}`, {
      code: RAGErrorCodes.METADATA_EXTRACTOR_NOT_FOUND,
      extractorType: nameOrAlias,
      details: {
        requestedType: nameOrAlias,
        availableTypes,
      },
    });
  }

  /**
   * Get an extractor by type or alias.
   *
   * Awaits `ensureInitialized()` before resolving, so built-ins are
   * presumably registered by then (confirm against BaseRegistry).
   *
   * @param {string} typeOrAlias - Type id or alias.
   * @returns {Promise<object>} The extractor produced by the registry.
   * @throws {MetadataExtractionError} When the name cannot be resolved or the
   *   registry yields no extractor for it.
   */
  async getExtractor(typeOrAlias) {
    await this.ensureInitialized();
    const type = this.resolveType(typeOrAlias);
    const extractor = await this.get(type);
    if (!extractor) {
      throw new MetadataExtractionError(`Metadata extractor not found: ${type}`, {
        code: RAGErrorCodes.METADATA_EXTRACTOR_NOT_FOUND,
        extractorType: type,
        details: { type },
      });
    }
    return extractor;
  }

  /**
   * Get the ids of all currently registered extractors.
   *
   * @returns {Array} Ids taken from `list()`.
   */
  getAvailableExtractors() {
    return this.list().map((item) => item.id);
  }

  /**
   * Get metadata for a specific extractor.
   *
   * @param {string} typeOrAlias - Type id or alias.
   * @returns {object|undefined} The `metadata` of the matching list entry.
   * @throws {MetadataExtractionError} When the name cannot be resolved.
   */
  getExtractorMetadata(typeOrAlias) {
    const type = this.resolveType(typeOrAlias);
    const entry = this.list().find((item) => item.id === type);
    return entry?.metadata;
  }

  /**
   * Get all aliases for a built-in type.
   *
   * Reads DEFAULT_EXTRACTOR_METADATA directly, so it only knows built-in
   * types and does not resolve aliases.
   *
   * @param {string} type - Built-in type id.
   * @returns {string[]} Aliases, or an empty array for unknown types.
   */
  getAliasesForType(type) {
    const metadata = DEFAULT_EXTRACTOR_METADATA[type];
    return metadata?.aliases ?? [];
  }

  /**
   * Get all registered aliases.
   *
   * @returns {Map<string, string>} Copy of the alias map (alias -> type).
   */
  getAllAliases() {
    return new Map(this.aliasMap);
  }

  /**
   * Check whether a type or alias can be resolved.
   *
   * @param {string} typeOrAlias - Type id or alias.
   * @returns {boolean} True when resolveType succeeds, false otherwise.
   */
  hasExtractor(typeOrAlias) {
    try {
      this.resolveType(typeOrAlias);
      return true;
    }
    catch {
      return false;
    }
  }

  /**
   * Get built-in extractors whose use cases mention the given text.
   *
   * Case-insensitive substring match over DEFAULT_EXTRACTOR_METADATA.
   *
   * @param {string} useCase - Text to search for.
   * @returns {string[]} Matching built-in type ids, in definition order.
   */
  getExtractorsByUseCase(useCase) {
    const needle = useCase.toLowerCase();
    return Object.entries(DEFAULT_EXTRACTOR_METADATA)
      .filter(([, metadata]) => metadata.useCases.some((uc) => uc.toLowerCase().includes(needle)))
      .map(([type]) => type);
  }

  /**
   * Get built-in extractors that can produce a specific extraction type.
   *
   * @param {string} extractionType - Extraction type to look for (exact match).
   * @returns {string[]} Matching built-in type ids, in definition order.
   */
  getExtractorsByExtractionType(extractionType) {
    return Object.entries(DEFAULT_EXTRACTOR_METADATA)
      .filter(([, metadata]) => metadata.extractionTypes.includes(extractionType))
      .map(([type]) => type);
  }

  /**
   * Get the default configuration for an extractor.
   *
   * @param {string} typeOrAlias - Type id or alias.
   * @returns {object|undefined} `defaultConfig` from the extractor metadata.
   * @throws {MetadataExtractionError} When the name cannot be resolved.
   */
  getDefaultConfig(typeOrAlias) {
    const metadata = this.getExtractorMetadata(typeOrAlias);
    return metadata?.defaultConfig;
  }

  /**
   * Clear the registry, including the alias map.
   */
  clear() {
    super.clear();
    this.aliasMap.clear();
  }
}
/**
 * Global metadata extractor registry singleton.
 *
 * Bound once, when this module is evaluated, to the object returned by
 * MetadataExtractorRegistry.getInstance(). Note that
 * MetadataExtractorRegistry.resetInstance() clears that object and nulls the
 * class-level reference but cannot rebind this constant, so after a reset
 * this export and a later getInstance() call refer to different objects.
 */
export const metadataExtractorRegistry = MetadataExtractorRegistry.getInstance();
/**
 * List the extractor ids known to the global registry.
 *
 * Shorthand for `metadataExtractorRegistry.getAvailableExtractors()`.
 *
 * @returns {Array} Whatever the global registry reports as available.
 */
export function getAvailableExtractors() {
  const availableIds = metadataExtractorRegistry.getAvailableExtractors();
  return availableIds;
}
/**
 * Fetch an extractor from the global registry by type id or alias.
 *
 * Shorthand for `metadataExtractorRegistry.getExtractor(typeOrAlias)`; any
 * rejection from the registry propagates to the caller unchanged.
 *
 * @param {string} typeOrAlias - Extractor type id or alias.
 * @returns {Promise<object>} The extractor supplied by the registry.
 */
export async function getExtractor(typeOrAlias) {
  const extractor = await metadataExtractorRegistry.getExtractor(typeOrAlias);
  return extractor;
}
/**
 * Look up extractor metadata in the global registry by type id or alias.
 *
 * Shorthand for `metadataExtractorRegistry.getExtractorMetadata(typeOrAlias)`;
 * errors thrown by the registry propagate to the caller.
 *
 * @param {string} typeOrAlias - Extractor type id or alias.
 * @returns {object|undefined} The metadata reported by the registry.
 */
export function getRegisteredExtractorMetadata(typeOrAlias) {
  const extractorMetadata = metadataExtractorRegistry.getExtractorMetadata(typeOrAlias);
  return extractorMetadata;
}