UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications...

281 lines (270 loc) 10.3 kB
/**
 * LLM-powered Metadata Extractor
 *
 * Extracts structured metadata from document chunks using language models.
 * Supports title, summary, keywords, Q&A pairs, and custom schema extraction.
 */
import { ProviderFactory } from "../../factories/providerFactory.js";
import { logger } from "../../utils/logger.js";

/**
 * Default prompts for metadata extraction.
 * `{name}` tokens are placeholders substituted by `fillTemplate`.
 */
const DEFAULT_PROMPTS = {
    title: `Extract a concise, descriptive title for the following content. Return only the title, nothing else. Content: {context} Title:`,
    summary: `Summarize the following content in {maxWords} words or less. Focus on the key points and main ideas. Content: {context} Summary:`,
    keywords: `Extract the {maxKeywords} most important keywords or key phrases from the following content. Return them as a comma-separated list. Content: {context} Keywords:`,
    questions: `Generate {numQuestions} questions that can be answered using the following content. {answerInstruction} Content: {context} Questions:`,
};

/** Leading list numbering on a question line: "1.", "2)", "3:". */
const QUESTION_NUMBER_PREFIX = /^\d+[.):]\s*/;
/**
 * Leading question label: "Q:", "Q.", "Q1:", "Question 2:".
 * A delimiter is required so ordinary words starting with "Q" are not treated
 * as labels (and do not lose their first letter).
 */
const QUESTION_LABEL_PREFIX = /^Q(?:uestion)?\s*\d*\s*[.:)]\s*/i;
/**
 * Leading answer label: "A:", "A.", "A1:", "Answer:".
 * A delimiter is required so ordinary words starting with "A" are not treated
 * as labels.
 */
const ANSWER_LABEL_PREFIX = /^A(?:nswer)?\s*\d*\s*[.:)]\s*/i;

/**
 * Substitute `{name}` placeholders in a prompt template.
 *
 * The template is scanned once, so text inserted for one placeholder is never
 * re-scanned for other placeholders. A function replacer is used so that `$`
 * sequences in the inserted text (`$&`, `$$`, ...) are copied literally instead
 * of being interpreted as replacement patterns.
 *
 * @param template - Template containing `{name}` placeholders
 * @param values - Map of placeholder name to replacement value
 * @returns The template with every known placeholder replaced; unknown
 *          placeholders are left untouched, null/undefined values become ""
 */
function fillTemplate(template, values) {
    return template.replace(/\{(\w+)\}/g, (placeholder, key) => {
        if (!Object.prototype.hasOwnProperty.call(values, key)) {
            return placeholder;
        }
        return String(values[key] ?? "");
    });
}

/**
 * Normalize an extraction option: `true` means "use defaults" (empty config),
 * anything else is taken as the config object itself.
 */
function toConfig(option) {
    return typeof option === "boolean" ? {} : option;
}

/**
 * LLM-powered metadata extractor
 * Extracts title, summary, keywords, Q&A pairs, and custom schema data
 */
export class LLMMetadataExtractor {
    provider;
    modelName;

    /**
     * @param options - Optional settings
     * @param options.provider - Default provider name (falls back to "openai")
     * @param options.modelName - Default model name (falls back to "gpt-4o-mini")
     */
    constructor(options) {
        this.provider = options?.provider || "openai";
        this.modelName = options?.modelName || "gpt-4o-mini";
    }

    /**
     * Extract metadata from chunks based on configuration.
     *
     * Chunks are processed one at a time. If any extraction step throws for a
     * chunk, the error is logged and whatever was extracted for that chunk
     * before the failure is still returned, so the result array always has one
     * entry per chunk.
     *
     * @param chunks - Array of chunks to extract metadata from
     * @param params - Extraction parameters (`title`, `summary`, `keywords`,
     *                 `questions` accept `true` or a config object; `custom`
     *                 accepts a config object)
     * @returns Array of extraction results, one per chunk
     */
    async extract(chunks, params) {
        const results = [];
        // Group chunks by documentId for title extraction
        const chunksByDocument = this.groupByDocument(chunks);
        // Cache titles by document to avoid re-extraction
        const titleCache = new Map();
        for (const chunk of chunks) {
            const result = {};
            try {
                // Extract title (shared across chunks with same documentId)
                if (params.title) {
                    const docId = chunk.metadata?.documentId;
                    if (!titleCache.has(docId)) {
                        const title = await this.extractTitle(chunksByDocument.get(docId) || [chunk], toConfig(params.title));
                        titleCache.set(docId, title);
                    }
                    result.title = titleCache.get(docId);
                }
                // Extract summary
                if (params.summary) {
                    result.summary = await this.extractSummary(chunk, toConfig(params.summary));
                }
                // Extract keywords
                if (params.keywords) {
                    result.keywords = await this.extractKeywords(chunk, toConfig(params.keywords));
                }
                // Generate Q&A pairs
                if (params.questions) {
                    result.questions = await this.extractQuestions(chunk, toConfig(params.questions));
                }
                // Custom schema extraction
                if (params.custom) {
                    result.custom = await this.extractCustom(chunk, params.custom);
                }
            }
            catch (error) {
                logger.error("[MetadataExtractor] Extraction failed for chunk", {
                    chunkId: chunk.id,
                    error: error instanceof Error ? error.message : String(error),
                });
            }
            // Best effort: keep the (possibly partial) result either way.
            results.push(result);
        }
        return results;
    }

    /**
     * Group chunks by document ID.
     *
     * Chunks without `metadata` or without a `documentId` are grouped together
     * under the `undefined` key instead of throwing.
     *
     * @param chunks - Chunks to group
     * @returns Map from documentId to the chunks carrying it, in input order
     */
    groupByDocument(chunks) {
        const groups = new Map();
        for (const chunk of chunks) {
            const docId = chunk.metadata?.documentId;
            const group = groups.get(docId);
            if (group) {
                group.push(chunk);
            }
            else {
                groups.set(docId, [chunk]);
            }
        }
        return groups;
    }

    /**
     * Extract title from document chunks.
     *
     * @param chunks - Chunks belonging to one document
     * @param config - Title config; `nodes` (default 3) is how many leading
     *                 chunks are used as context, `promptTemplate` overrides
     *                 the default prompt (placeholder: `{context}`)
     * @returns The trimmed LLM response
     */
    async extractTitle(chunks, config) {
        const { nodes = 3, promptTemplate = DEFAULT_PROMPTS.title } = config;
        // Use first N chunks for title extraction
        const context = chunks
            .slice(0, nodes)
            .map((c) => c.text)
            .join("\n\n");
        const prompt = fillTemplate(promptTemplate, { context });
        const response = await this.callLLM(prompt, config);
        return response.trim();
    }

    /**
     * Extract summary from a chunk.
     *
     * @param chunk - Chunk whose `text` is summarized
     * @param config - Summary config; `maxWords` (default 100),
     *                 `promptTemplate` (placeholders: `{context}`, `{maxWords}`)
     * @returns The trimmed LLM response
     */
    async extractSummary(chunk, config) {
        const { maxWords = 100, promptTemplate = DEFAULT_PROMPTS.summary } = config;
        const prompt = fillTemplate(promptTemplate, {
            context: chunk.text,
            maxWords,
        });
        const response = await this.callLLM(prompt, config);
        return response.trim();
    }

    /**
     * Extract keywords from a chunk.
     *
     * @param chunk - Chunk whose `text` is analyzed
     * @param config - Keyword config; `maxKeywords` (default 10),
     *                 `promptTemplate` (placeholders: `{context}`, `{maxKeywords}`)
     * @returns Up to `maxKeywords` non-empty, trimmed keywords
     */
    async extractKeywords(chunk, config) {
        const { maxKeywords = 10, promptTemplate = DEFAULT_PROMPTS.keywords } = config;
        const prompt = fillTemplate(promptTemplate, {
            context: chunk.text,
            maxKeywords,
        });
        const response = await this.callLLM(prompt, config);
        // Parse comma-separated keywords
        return response
            .split(",")
            .map((k) => k.trim())
            .filter((k) => k.length > 0)
            .slice(0, maxKeywords);
    }

    /**
     * Extract Q&A pairs from a chunk.
     *
     * @param chunk - Chunk whose `text` is used as source material
     * @param config - Question config; `numQuestions` (default 3),
     *                 `includeAnswers` (default true), `promptTemplate`
     *                 (placeholders: `{context}`, `{numQuestions}`,
     *                 `{answerInstruction}`)
     * @returns Parsed pairs, see `parseQAPairs`
     */
    async extractQuestions(chunk, config) {
        const { numQuestions = 3, includeAnswers = true, promptTemplate = DEFAULT_PROMPTS.questions, } = config;
        const answerInstruction = includeAnswers
            ? "For each question, also provide a brief answer based on the content."
            : "Return only the questions.";
        const prompt = fillTemplate(promptTemplate, {
            context: chunk.text,
            numQuestions,
            answerInstruction,
        });
        const response = await this.callLLM(prompt, config);
        // Parse Q&A pairs from response
        return this.parseQAPairs(response, includeAnswers);
    }

    /**
     * Extract custom schema data from a chunk.
     *
     * When `config.promptTemplate` is given, its `{context}` and
     * `{description}` placeholders are filled with the chunk text and the
     * description; a template without placeholders is sent unchanged.
     *
     * @param chunk - Chunk whose `text` is used as source material
     * @param config - Custom config; `description`, `promptTemplate`
     * @returns The parsed JSON value, or `{ raw: response }` when the response
     *          cannot be parsed as JSON
     */
    async extractCustom(chunk, config) {
        const { description, promptTemplate } = config;
        const instructions = description || "Extract structured data according to the schema.";
        // Build extraction prompt
        const prompt = promptTemplate
            ? fillTemplate(promptTemplate, {
                context: chunk.text,
                description: instructions,
            })
            : `Extract the following information from the content: ${instructions} Content: ${chunk.text} Return the extracted data as JSON.`;
        const response = await this.callLLM(prompt, config);
        // Prefer the outermost {...} span (tolerates prose or code fences around
        // the JSON); fall back to the whole response for arrays and primitives.
        const candidates = [];
        const jsonMatch = response.match(/\{[\s\S]*\}/);
        if (jsonMatch) {
            candidates.push(jsonMatch[0]);
        }
        candidates.push(response);
        for (const candidate of candidates) {
            try {
                return JSON.parse(candidate);
            }
            catch {
                // Not valid JSON; try the next candidate.
            }
        }
        logger.warn("[MetadataExtractor] Failed to parse custom extraction as JSON");
        return { raw: response };
    }

    /**
     * Parse Q&A pairs from LLM response.
     *
     * A line starting with list numbering ("1.", "2)") or a question label
     * ("Q:", "Q1:", "Question 2:") starts a new question. A line starting with
     * an answer label ("A:", "Answer:") sets the answer of the current
     * question. Any other non-blank line continues the answer if one has been
     * started, otherwise the question.
     *
     * @param response - Raw LLM response text
     * @param includeAnswers - Whether to include parsed answers in the output
     * @returns Array of `{ question }` or `{ question, answer }` objects;
     *          entries with an empty question are dropped and empty answers
     *          are omitted
     */
    parseQAPairs(response, includeAnswers) {
        const pairs = [];
        // `null` means "not started"; an empty string means "started, no text
        // yet", so a bare "A:" line still routes the following lines to the answer.
        let currentQuestion = null;
        let currentAnswer = null;
        const append = (base, extra) => (base ? `${base} ${extra}` : extra);
        const flush = () => {
            if (currentQuestion) {
                pairs.push({
                    question: currentQuestion,
                    ...(includeAnswers && currentAnswer ? { answer: currentAnswer } : {}),
                });
            }
        };
        for (const line of response.split("\n")) {
            const trimmed = line.trim();
            if (!trimmed) {
                continue;
            }
            if (QUESTION_NUMBER_PREFIX.test(trimmed) || QUESTION_LABEL_PREFIX.test(trimmed)) {
                // Save previous Q&A pair before starting a new one
                flush();
                currentQuestion = trimmed
                    .replace(QUESTION_NUMBER_PREFIX, "")
                    .replace(QUESTION_LABEL_PREFIX, "");
                currentAnswer = null;
            }
            else if (currentQuestion !== null && ANSWER_LABEL_PREFIX.test(trimmed)) {
                currentAnswer = trimmed.replace(ANSWER_LABEL_PREFIX, "");
            }
            else if (currentAnswer !== null) {
                // Continuation of answer
                currentAnswer = append(currentAnswer, trimmed);
            }
            else if (currentQuestion !== null) {
                // Continuation of question
                currentQuestion = append(currentQuestion, trimmed);
            }
        }
        // Don't forget the last pair
        flush();
        return pairs;
    }

    /**
     * Call the LLM with a prompt.
     *
     * Creates a provider through `ProviderFactory.createProvider` using the
     * per-call `config.provider` / `config.modelName` when set, otherwise the
     * instance defaults, and calls its `generate` method.
     *
     * @param prompt - Fully built prompt text
     * @param config - Per-call overrides: `provider`, `modelName`,
     *                 `maxTokens` (default 500), `temperature` (default 0.3)
     * @returns `result.content`, or "" when the result or its content is empty
     */
    async callLLM(prompt, config) {
        const provider = await ProviderFactory.createProvider(config.provider || this.provider, config.modelName || this.modelName);
        const result = await provider.generate({
            prompt,
            // `||` on purpose: a zero-token budget is never meaningful.
            maxTokens: config.maxTokens || 500,
            // `??` so an explicit temperature of 0 is honored.
            temperature: config.temperature ?? 0.3,
        });
        return result?.content || "";
    }
}

/**
 * Convenience function to extract metadata from chunks
 * @param chunks - Chunks to process
 * @param params - Extraction parameters
 * @param options - Extractor options
 * @returns Extraction results
 */
export async function extractMetadata(chunks, params, options) {
    const extractor = new LLMMetadataExtractor(options);
    return extractor.extract(chunks, params);
}