UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

351 lines 12.4 kB
/** * @file Content Similarity Scorer * Evaluates text similarity using various metrics (Jaccard, cosine, Levenshtein) */ import { BaseScorer } from "../baseScorer.js"; import { DEFAULT_RULE_SCORER_CONFIG } from "./baseRuleScorer.js"; /** * Scorer metadata for content similarity */ const CONTENT_SIMILARITY_METADATA = { id: "content-similarity", name: "Content Similarity", description: "Evaluates text similarity using various metrics like Jaccard, cosine, Levenshtein", type: "rule", category: "accuracy", version: "1.0.0", defaultConfig: { ...DEFAULT_RULE_SCORER_CONFIG, threshold: 0.5, }, requiredInputs: ["response"], optionalInputs: ["groundTruth", "context", "custom"], }; /** * ContentSimilarityScorer evaluates how similar the response is to a reference text */ export class ContentSimilarityScorer extends BaseScorer { _similarityConfig; constructor(config) { super(CONTENT_SIMILARITY_METADATA, config); this._similarityConfig = { metric: "jaccard", normalizeText: true, tokenLevel: "word", ngramSize: 2, compareWith: "groundTruth", metricCombination: "average", ...config, }; } /** * Get similarity-specific configuration */ get similarityConfig() { return this._similarityConfig; } /** * Get reference text based on configuration */ _getReferenceText(input) { switch (this._similarityConfig.compareWith) { case "groundTruth": return input.groundTruth ?? null; case "context": if (input.context && input.context.length > 0) { return input.context.join(" "); } return null; case "custom": return (this._similarityConfig.referenceText ?? input.custom?.referenceText ?? null); default: return input.groundTruth ?? null; } } /** * Calculate similarity between two texts */ _calculateSimilarity(text1, text2, metric) { const normalizedText1 = this._similarityConfig.normalizeText ? this._normalizeText(text1) : text1; const normalizedText2 = this._similarityConfig.normalizeText ? this._normalizeText(text2) : text2; const tokens1 = this._tokenize(normalizedText1); const tokens2 = this._tokenize(normalizedText2); switch (metric) { case "jaccard": return this._jaccardSimilarity(tokens1, tokens2); case "cosine": return this._cosineSimilarity(tokens1, tokens2); case "levenshtein": return this._levenshteinSimilarity(normalizedText1, normalizedText2); case "dice": return this._diceSimilarity(tokens1, tokens2); case "overlap": return this._overlapCoefficient(tokens1, tokens2); default: return this._jaccardSimilarity(tokens1, tokens2); } } /** * Normalize text for comparison */ _normalizeText(text) { return text .toLowerCase() .replace(/[^\w\s]/g, " ") .replace(/\s+/g, " ") .trim(); } /** * Tokenize text based on configuration */ _tokenize(text) { switch (this._similarityConfig.tokenLevel) { case "character": return text.split(""); case "ngram": { const n = this._similarityConfig.ngramSize ?? 2; const ngrams = []; for (let i = 0; i <= text.length - n; i++) { ngrams.push(text.slice(i, i + n)); } return ngrams; } case "word": default: return text.split(/\s+/).filter((word) => word.length > 0); } } /** * Calculate Jaccard similarity coefficient * J(A,B) = |A ∩ B| / |A ∪ B| */ _jaccardSimilarity(tokens1, tokens2) { const set1 = new Set(tokens1); const set2 = new Set(tokens2); const intersection = Array.from(set1).filter((x) => set2.has(x)); const unionArr = [...Array.from(set1), ...Array.from(set2)]; const union = new Set(unionArr); if (union.size === 0) { return 1.0; } return intersection.length / union.size; } /** * Calculate cosine similarity using term frequency vectors */ _cosineSimilarity(tokens1, tokens2) { const freq1 = this._getTermFrequency(tokens1); const freq2 = this._getTermFrequency(tokens2); const allTermsArr = [ ...Array.from(freq1.keys()), ...Array.from(freq2.keys()), ]; const allTerms = new Set(allTermsArr); let dotProduct = 0; let magnitude1 = 0; let magnitude2 = 0; for (const term of Array.from(allTerms)) { const f1 = freq1.get(term) ?? 0; const f2 = freq2.get(term) ?? 0; dotProduct += f1 * f2; magnitude1 += f1 * f1; magnitude2 += f2 * f2; } const magnitude = Math.sqrt(magnitude1) * Math.sqrt(magnitude2); if (magnitude === 0) { return 1.0; } return dotProduct / magnitude; } /** * Get term frequency map */ _getTermFrequency(tokens) { const freq = new Map(); for (const token of tokens) { freq.set(token, (freq.get(token) ?? 0) + 1); } return freq; } /** * Calculate normalized Levenshtein similarity * Returns 1 - (edit_distance / max_length) */ _levenshteinSimilarity(text1, text2) { // Guard against excessive memory usage for large texts const MAX_LEVENSHTEIN_LENGTH = 5000; if (text1.length > MAX_LEVENSHTEIN_LENGTH || text2.length > MAX_LEVENSHTEIN_LENGTH) { // Fall back to a faster approximation for large texts return this._jaccardSimilarity(text1.split(""), text2.split("")); } const distance = this._levenshteinDistance(text1, text2); const maxLength = Math.max(text1.length, text2.length); if (maxLength === 0) { return 1.0; } return 1 - distance / maxLength; } /** * Calculate Levenshtein edit distance using space-optimized two-row DP */ _levenshteinDistance(str1, str2) { const m = str1.length; const n = str2.length; // Use shorter string for row storage if (m < n) { return this._levenshteinDistance(str2, str1); } // Space-optimized: only keep previous and current row let prevRow = new Array(n + 1); let currRow = new Array(n + 1); // Initialize base case for (let j = 0; j <= n; j++) { prevRow[j] = j; } // Fill rows for (let i = 1; i <= m; i++) { currRow[0] = i; for (let j = 1; j <= n; j++) { if (str1[i - 1] === str2[j - 1]) { currRow[j] = prevRow[j - 1]; } else { currRow[j] = 1 + Math.min(prevRow[j], currRow[j - 1], prevRow[j - 1]); } } // Swap rows [prevRow, currRow] = [currRow, prevRow]; } return prevRow[n]; } /** * Calculate Dice coefficient (Sorensen-Dice) * DSC(A,B) = 2|A ∩ B| / (|A| + |B|) */ _diceSimilarity(tokens1, tokens2) { const set1 = new Set(tokens1); const set2 = new Set(tokens2); const intersection = Array.from(set1).filter((x) => set2.has(x)); const totalSize = set1.size + set2.size; if (totalSize === 0) { return 1.0; } return (2 * intersection.length) / totalSize; } /** * Calculate overlap coefficient * O(A,B) = |A ∩ B| / min(|A|, |B|) */ _overlapCoefficient(tokens1, tokens2) { const set1 = new Set(tokens1); const set2 = new Set(tokens2); const intersection = Array.from(set1).filter((x) => set2.has(x)); const minSize = Math.min(set1.size, set2.size); if (minSize === 0) { return 1.0; } return intersection.length / minSize; } /** * Override score to add detailed similarity metrics */ async score(input) { const reference = this._getReferenceText(input); if (!reference) { return this.createScoreResult(10, "No reference text available for comparison - passing by default", { metadata: { noReferenceText: true, compareWith: this._similarityConfig.compareWith ?? "groundTruth", }, }); } // Calculate all metrics for detailed reporting const metrics = this._similarityConfig.metrics ?? [ this._similarityConfig.metric ?? "jaccard", ]; const details = []; for (const metric of metrics) { const score = this._calculateSimilarity(input.response, reference, metric); const responseTokens = this._tokenize(this._similarityConfig.normalizeText ? this._normalizeText(input.response) : input.response); const referenceTokens = this._tokenize(this._similarityConfig.normalizeText ? this._normalizeText(reference) : reference); details.push({ metric, score, responseTokens: responseTokens.length, referenceTokens: referenceTokens.length, }); } // Calculate combined score const combinedScore = this._combineMetricScores(details); const normalizedScore = combinedScore * 10; // Scale to 0-10 return this.createScoreResult(normalizedScore, this._generateSimilarityReasoning(details, combinedScore), { metadata: { similarityDetails: details, combinedScore, compareWith: this._similarityConfig.compareWith ?? "groundTruth", tokenLevel: this._similarityConfig.tokenLevel ?? "word", }, }); } /** * Combine multiple metric scores */ _combineMetricScores(details) { if (details.length === 0) { return 1.0; } if (details.length === 1) { return details[0].score; } const combination = this._similarityConfig.metricCombination ?? "average"; const weights = this._similarityConfig.metricWeights ?? {}; switch (combination) { case "min": return Math.min(...details.map((d) => d.score)); case "max": return Math.max(...details.map((d) => d.score)); case "weighted": { let totalWeight = 0; let weightedSum = 0; for (const detail of details) { const weight = weights[detail.metric] ?? 1.0; totalWeight += weight; weightedSum += detail.score * weight; } return totalWeight > 0 ? weightedSum / totalWeight : 0; } case "average": default: return details.reduce((sum, d) => sum + d.score, 0) / details.length; } } /** * Generate reasoning from similarity details */ _generateSimilarityReasoning(details, combinedScore) { const parts = []; for (const detail of details) { parts.push(`${detail.metric}: ${(detail.score * 100).toFixed(1)}%`); } const overallPct = (combinedScore * 100).toFixed(1); return `Similarity scores - ${parts.join(", ")}. Overall: ${overallPct}%`; } } /** * Factory function for creating ContentSimilarityScorer instances */ export async function createContentSimilarityScorer(config) { return new ContentSimilarityScorer(config); } //# sourceMappingURL=contentSimilarityScorer.js.map