@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications...
419 lines (418 loc) • 14.5 kB
JavaScript
/**
* Metadata Extractor Factory
*
* Factory for creating metadata extractor instances with configuration.
* Follows the BaseFactory pattern for consistent lifecycle management.
*/
import { BaseFactory } from "../../core/infrastructure/index.js";
import { logger } from "../../utils/logger.js";
import { MetadataExtractionError, RAGErrorCodes } from "../errors/RAGError.js";
/**
 * Built-in descriptors for the default metadata extractor types.
 *
 * Each key is an extractor type name. Every descriptor carries:
 *  - description:     human-readable summary of the extractor
 *  - defaultConfig:   default provider/model settings listed for the type
 *  - supportedOptions: option names the type advertises
 *  - useCases:        free-text use cases (searched by the factory's use-case lookup)
 *  - aliases:         alternative names registered alongside the type
 *  - requiresModel:   whether the extractor needs a model
 *  - extractionTypes: kinds of metadata the extractor can produce
 *
 * The table is assembled inside an IIFE so the small builders below stay out
 * of module scope. Every call returns fresh objects/arrays, so no two
 * descriptors share a reference.
 */
const DEFAULT_EXTRACTOR_METADATA = (() => {
    // Provider/model pair common to every descriptor, plus per-type extras.
    const modelConfig = (extras = {}) => ({
        provider: "openai",
        modelName: "gpt-4o-mini",
        ...extras,
    });
    // Option names shared by all prompt-driven extractors, plus per-type extras.
    const promptOptions = (...extras) => [
        "provider",
        "modelName",
        "promptTemplate",
        ...extras,
    ];
    // Full set of extraction kinds, used by the multi-purpose extractors.
    const everyExtractionType = () => [
        "title",
        "summary",
        "keywords",
        "questions",
        "custom",
    ];
    return {
        llm: {
            description: "Full LLM-powered metadata extraction supporting all extraction types",
            defaultConfig: modelConfig({ temperature: 0.3 }),
            supportedOptions: promptOptions("maxTokens", "temperature"),
            useCases: [
                "Comprehensive metadata extraction",
                "Multi-type extraction in single pass",
                "Custom schema extraction",
            ],
            aliases: ["full", "comprehensive", "all"],
            requiresModel: true,
            extractionTypes: everyExtractionType(),
        },
        title: {
            description: "Extracts concise, descriptive titles from document content",
            defaultConfig: modelConfig({ maxTokens: 100 }),
            supportedOptions: promptOptions("maxTokens"),
            useCases: [
                "Document indexing",
                "Content organization",
                "Navigation systems",
            ],
            aliases: ["header", "heading"],
            requiresModel: true,
            extractionTypes: ["title"],
        },
        summary: {
            description: "Generates concise summaries of document chunks",
            defaultConfig: modelConfig({ maxTokens: 200 }),
            supportedOptions: promptOptions("maxTokens", "maxWords"),
            useCases: [
                "Document previews",
                "Search result snippets",
                "Content condensation",
            ],
            aliases: ["summarize", "abstract"],
            requiresModel: true,
            extractionTypes: ["summary"],
        },
        keywords: {
            description: "Extracts key terms and phrases from content",
            defaultConfig: modelConfig({ maxTokens: 100 }),
            supportedOptions: promptOptions("maxKeywords"),
            useCases: ["Tag generation", "Topic modeling", "Search optimization"],
            aliases: ["tags", "terms", "keyphrase"],
            requiresModel: true,
            extractionTypes: ["keywords"],
        },
        questions: {
            description: "Generates Q&A pairs from content for training or FAQs",
            defaultConfig: modelConfig({ maxTokens: 500 }),
            supportedOptions: promptOptions("numQuestions", "includeAnswers"),
            useCases: [
                "FAQ generation",
                "Training data creation",
                "Knowledge base building",
            ],
            aliases: ["qa", "faq", "questions-answers"],
            requiresModel: true,
            extractionTypes: ["questions"],
        },
        custom: {
            description: "Extracts structured data according to custom schema",
            defaultConfig: modelConfig({ maxTokens: 500 }),
            supportedOptions: promptOptions("schema", "description"),
            useCases: [
                "Structured data extraction",
                "Entity extraction",
                "Custom field extraction",
            ],
            aliases: ["schema", "structured", "entity"],
            requiresModel: true,
            extractionTypes: ["custom"],
        },
        composite: {
            description: "Combines multiple extraction types in a single pass",
            defaultConfig: modelConfig(),
            // No prompt template here, so the list is spelled out in full.
            supportedOptions: ["provider", "modelName", "extractors"],
            useCases: [
                "Multi-field extraction",
                "Complete document processing",
                "Pipeline integration",
            ],
            aliases: ["multi", "combined", "batch"],
            requiresModel: true,
            extractionTypes: everyExtractionType(),
        },
    };
})();
/**
 * Metadata Extractor Factory
 *
 * Creates metadata extractor instances based on type with configuration support.
 * Uses lazy loading via dynamic imports to avoid circular dependencies.
 *
 * Registration, alias resolution and instance creation are delegated to the
 * inherited BaseFactory members (`register`, `resolveName`, `has`, `create`,
 * `getAvailable`, `getAliases`, `ensureInitialized`, `clear`, `items`); this
 * class adds a per-type metadata table and metadata-based lookups on top.
 */
export class MetadataExtractorFactory extends BaseFactory {
    /** Lazily created shared instance; `null` until `getInstance()` is first called. */
    static instance = null;
    /** Extractor type name -> metadata descriptor supplied at registration. */
    metadataMap = new Map();
    constructor() {
        super();
    }
    /**
     * Get singleton instance, creating it on first use.
     *
     * @returns {MetadataExtractorFactory} The shared factory instance.
     */
    static getInstance() {
        if (!MetadataExtractorFactory.instance) {
            MetadataExtractorFactory.instance = new MetadataExtractorFactory();
        }
        return MetadataExtractorFactory.instance;
    }
    /**
     * Reset singleton (for testing).
     *
     * Clears the current instance (if any) and drops the static reference so
     * the next `getInstance()` call builds a fresh factory. References to the
     * old instance held elsewhere are not updated.
     */
    static resetInstance() {
        if (MetadataExtractorFactory.instance) {
            MetadataExtractorFactory.instance.clear();
            MetadataExtractorFactory.instance = null;
        }
    }
    /**
     * Register all default extractors.
     *
     * Every built-in type is backed by `LLMMetadataExtractor`, constructed
     * with only `provider` and `modelName` taken from the supplied config.
     * The `title`, `summary`, `keywords` and `questions` types pre-select
     * their own extraction flag; `llm`, `custom` and `composite` are
     * pass-through wrappers with no default parameters.
     *
     * @returns {Promise<void>}
     */
    async registerAll() {
        // Import the implementation only when an extractor is actually
        // created, so loading this module does not pull it in eagerly.
        const loadLLMExtractor = async (config) => {
            const { LLMMetadataExtractor } = await import("./metadataExtractor.js");
            return new LLMMetadataExtractor({
                provider: config?.provider,
                modelName: config?.modelName,
            });
        };
        // [type, default extract params]; `null` means plain pass-through.
        // Order matters only in that it is the registration order.
        const registrations = [
            ["llm", null],
            ["title", { title: true }],
            ["summary", { summary: true }],
            ["keywords", { keywords: true }],
            ["questions", { questions: true }],
            ["custom", null],
            ["composite", null],
        ];
        for (const [type, defaultParams] of registrations) {
            this.registerExtractor(type, async (config) => {
                const extractor = await loadLLMExtractor(config);
                return defaultParams === null
                    ? this.wrapExtractor(extractor, type)
                    : this.createSpecializedExtractor(extractor, type, defaultParams);
            }, DEFAULT_EXTRACTOR_METADATA[type]);
        }
        logger.debug(`[MetadataExtractorFactory] Registered ${this.items.size} extractor types`);
    }
    /**
     * Wrap LLMMetadataExtractor to conform to MetadataExtractor interface.
     *
     * @param {object} extractor - Object exposing `extract(chunks, params)`.
     * @param {string} type - Type label exposed on the wrapper.
     * @returns {{type: string, extract: Function}} Wrapper that forwards to
     *   `extractor.extract`, substituting `{}` when `params` is null/undefined.
     */
    wrapExtractor(extractor, type) {
        return {
            type,
            async extract(chunks, params) {
                return extractor.extract(chunks, params ?? {});
            },
        };
    }
    /**
     * Create specialized extractor that only extracts specific types.
     *
     * @param {object} extractor - Object exposing `extract(chunks, params)`.
     * @param {string} type - Type label exposed on the wrapper.
     * @param {object} defaultParams - Parameters applied to every call.
     * @returns {{type: string, extract: Function}} Wrapper whose `extract`
     *   merges `defaultParams` with the caller's params before delegating.
     */
    createSpecializedExtractor(extractor, type, defaultParams) {
        return {
            type,
            async extract(chunks, params) {
                // Caller-supplied params win over the defaults on key conflicts.
                const mergedParams = { ...defaultParams, ...params };
                return extractor.extract(chunks, mergedParams);
            },
        };
    }
    /**
     * Register an extractor with metadata and aliases.
     *
     * Missing or non-array `metadata.aliases` is treated as "no aliases"
     * instead of raising a TypeError after the extractor was already
     * registered. Metadata is recorded only once `register()` has returned,
     * so a failing registration cannot leave a metadata entry behind for a
     * type that was never registered.
     *
     * @param {string} type - Canonical extractor type name.
     * @param {Function} factory - Async factory `(config) => extractor`.
     * @param {object} [metadata] - Descriptor for the type (description,
     *   defaultConfig, aliases, useCases, extractionTypes, ...).
     */
    registerExtractor(type, factory, metadata) {
        const aliases = Array.isArray(metadata?.aliases) ? metadata.aliases : [];
        // Register with aliases
        this.register(type, factory, aliases, { metadata });
        // Store metadata (or drop a stale entry when none was supplied)
        if (metadata == null) {
            this.metadataMap.delete(type);
        }
        else {
            this.metadataMap.set(type, metadata);
        }
        logger.debug(`[MetadataExtractorFactory] Registered extractor '${type}' with aliases: ${aliases.join(", ")}`);
    }
    /**
     * Create an extractor by type or alias.
     *
     * @param {string} typeOrAlias - Extractor type name or one of its aliases.
     * @param {object} [config] - Configuration handed to the registered factory.
     * @returns {Promise<object>} The created extractor.
     * @throws {MetadataExtractionError} When the type is unknown (code
     *   `METADATA_EXTRACTOR_NOT_FOUND`) or when creation fails; an error that
     *   is already a MetadataExtractionError is re-thrown unchanged.
     */
    async createExtractor(typeOrAlias, config) {
        await this.ensureInitialized();
        const resolvedName = this.resolveName(typeOrAlias);
        if (!this.has(resolvedName)) {
            const available = this.getAvailable();
            throw new MetadataExtractionError(`Unknown metadata extractor type: '${typeOrAlias}'. Available types: ${available.join(", ")}`, {
                code: RAGErrorCodes.METADATA_EXTRACTOR_NOT_FOUND,
                extractorType: typeOrAlias,
                details: {
                    requestedType: typeOrAlias,
                    availableTypes: available,
                },
            });
        }
        try {
            const extractor = await this.create(resolvedName, config);
            logger.debug(`[MetadataExtractorFactory] Created extractor '${resolvedName}' with config:`, config);
            return extractor;
        }
        catch (error) {
            // Re-throw if already a MetadataExtractionError
            if (error instanceof MetadataExtractionError) {
                throw error;
            }
            // Wrap anything else so callers only deal with one error type;
            // the original is kept as `cause` when it is a real Error.
            throw new MetadataExtractionError(`Failed to create extractor '${resolvedName}': ${error instanceof Error ? error.message : String(error)}`, {
                extractorType: resolvedName,
                cause: error instanceof Error ? error : undefined,
                details: { type: resolvedName, config },
            });
        }
    }
    /**
     * Get metadata for an extractor.
     *
     * Does not trigger initialization itself; it reads whatever has been
     * registered so far.
     *
     * @param {string} typeOrAlias - Extractor type name or alias.
     * @returns {object|undefined} The stored descriptor (not a copy), or
     *   `undefined` when nothing is stored for the resolved name.
     */
    getExtractorMetadata(typeOrAlias) {
        const resolvedName = this.resolveName(typeOrAlias);
        return this.metadataMap.get(resolvedName);
    }
    /**
     * Get default configuration for an extractor.
     *
     * @param {string} typeOrAlias - Extractor type name or alias.
     * @returns {object|undefined} The descriptor's `defaultConfig`, or
     *   `undefined` when the type has no stored metadata.
     */
    getDefaultConfig(typeOrAlias) {
        const metadata = this.getExtractorMetadata(typeOrAlias);
        return metadata?.defaultConfig;
    }
    /**
     * Get available extractor types (not including aliases).
     *
     * @returns {string[]} Result of the inherited `getAvailable()`.
     */
    getAvailableTypes() {
        return this.getAvailable();
    }
    /**
     * Get all aliases mapped to their types.
     *
     * @returns {*} Result of the inherited `getAliases()`.
     */
    getTypeAliases() {
        return this.getAliases();
    }
    /**
     * Check if a type exists.
     *
     * @param {string} typeOrAlias - Extractor type name or alias.
     * @returns {boolean} Whether the resolved name is registered.
     */
    hasType(typeOrAlias) {
        const resolved = this.resolveName(typeOrAlias);
        return this.has(resolved);
    }
    /**
     * Get extractors suitable for a use case.
     *
     * Matching is a case-insensitive substring test against each descriptor's
     * `useCases` entries. An empty string therefore matches every extractor
     * that lists at least one use case. A non-string `useCase` yields an
     * empty result, and descriptors without a `useCases` array are skipped.
     *
     * @param {string} useCase - Text to search for.
     * @returns {string[]} Matching extractor type names, in registration order.
     */
    getExtractorsForUseCase(useCase) {
        if (typeof useCase !== "string") {
            return [];
        }
        const needle = useCase.toLowerCase();
        const matches = [];
        for (const [type, metadata] of this.metadataMap) {
            const useCases = Array.isArray(metadata?.useCases) ? metadata.useCases : [];
            const hasMatch = useCases.some((uc) => typeof uc === "string" && uc.toLowerCase().includes(needle));
            if (hasMatch) {
                matches.push(type);
            }
        }
        return matches;
    }
    /**
     * Get extractors that can produce a specific extraction type.
     *
     * Descriptors without an `extractionTypes` array are skipped.
     *
     * @param {string} extractionType - Extraction kind, e.g. "title".
     * @returns {string[]} Extractor type names whose descriptor lists it.
     */
    getExtractorsForExtractionType(extractionType) {
        const matches = [];
        for (const [type, metadata] of this.metadataMap) {
            const extractionTypes = Array.isArray(metadata?.extractionTypes) ? metadata.extractionTypes : [];
            if (extractionTypes.includes(extractionType)) {
                matches.push(type);
            }
        }
        return matches;
    }
    /**
     * Get all extractor metadata.
     *
     * @returns {Map<string, object>} A new Map with the same entries; the
     *   descriptor objects themselves are shared, not cloned.
     */
    getAllMetadata() {
        return new Map(this.metadataMap);
    }
    /**
     * Clear factory and metadata.
     */
    clear() {
        super.clear();
        this.metadataMap.clear();
    }
}
/**
 * Global metadata extractor factory singleton.
 *
 * Obtained once, when this module is evaluated, via
 * `MetadataExtractorFactory.getInstance()`. Because this is a `const`
 * binding, a later `MetadataExtractorFactory.resetInstance()` does not
 * rebind it: this export keeps pointing at the instance that was cleared,
 * while subsequent `getInstance()` calls return a newly created one.
 */
export const metadataExtractorFactory = MetadataExtractorFactory.getInstance();
/**
 * Shortcut for `metadataExtractorFactory.createExtractor(...)`.
 *
 * @param {string} typeOrAlias - Extractor type name or alias to create.
 * @param {object} [config] - Optional configuration, forwarded unchanged.
 * @returns {Promise<object>} Settles exactly as the factory call does.
 */
export async function createMetadataExtractor(typeOrAlias, config) {
    const sharedFactory = metadataExtractorFactory;
    return sharedFactory.createExtractor(typeOrAlias, config);
}
/**
 * Shortcut for `metadataExtractorFactory.getAvailableTypes()`.
 *
 * @returns {string[]} Whatever the shared factory reports as available types.
 */
export function getAvailableExtractorTypes() {
    const sharedFactory = metadataExtractorFactory;
    return sharedFactory.getAvailableTypes();
}
/**
 * Shortcut for `metadataExtractorFactory.getExtractorMetadata(...)`.
 *
 * @param {string} typeOrAlias - Extractor type name or alias to look up.
 * @returns {object|undefined} The factory's answer, passed through as-is.
 */
export function getExtractorMetadata(typeOrAlias) {
    const sharedFactory = metadataExtractorFactory;
    return sharedFactory.getExtractorMetadata(typeOrAlias);
}
/**
 * Shortcut for `metadataExtractorFactory.getDefaultConfig(...)`.
 *
 * @param {string} typeOrAlias - Extractor type name or alias to look up.
 * @returns {object|undefined} The factory's answer, passed through as-is.
 */
export function getExtractorDefaultConfig(typeOrAlias) {
    const sharedFactory = metadataExtractorFactory;
    return sharedFactory.getDefaultConfig(typeOrAlias);
}