UNPKG

@orama/orama

Version:

A complete search engine and RAG pipeline in your browser, server, or edge network with support for full-text, vector, and hybrid search in less than 2kb.

95 lines 4.42 kB
import { createError } from '../errors.js';

/**
 * Orders ranked entries by matched-keyword count (descending), then by score
 * (descending). Entries are `[token, score, keywordsMatched]` triples.
 * Uses explicit comparisons rather than subtraction so that incomparable
 * values (e.g. NaN scores) are treated as equal and keep their input order.
 */
function compareByKeywordsThenScore(a, b) {
  if (a[2] > b[2]) return -1;
  if (a[2] < b[2]) return 1;
  if (a[1] > b[1]) return -1;
  if (a[1] < b[1]) return 1;
  return 0;
}

/**
 * Merges several lists of `[token, score]` pairs into a single ranked list of
 * `[token, score]` pairs, then trims that list according to `threshold`.
 *
 * Merging: every incoming score is multiplied by `boost`. The first time a
 * token is seen its score is the boosted score; each further time, the score
 * accumulated so far is multiplied by 1.5 and the new boosted score is added.
 * The number of times each token was seen is tracked alongside the score and
 * is what gets compared against `keywordsCount`.
 *
 * Trimming:
 * - `threshold === 1`: every merged token is returned, sorted by score
 *   (descending).
 * - `threshold === 0` and `keywordsCount === 1`: same as above, since every
 *   merged token was seen at least once.
 * - `threshold === 0` otherwise: only tokens seen exactly `keywordsCount`
 *   times are returned, sorted by score (descending). Empty if there are none.
 * - any other `threshold`: tokens are sorted by times-seen (descending), then
 *   score (descending), and the first
 *   `last + ceil(threshold * (total - last))` of them are returned, where
 *   `last` is the index of the last token seen `keywordsCount` times (0 if
 *   there is none) and `total` is the number of merged tokens.
 *
 * @param {Array<Array<[*, number]>>} arrays - Lists of `[token, score]` pairs,
 *   presumably one list per search keyword (verify against the caller).
 * @param {number} boost - Multiplier applied to every incoming score.
 * @param {number} [threshold=0] - Trimming mode, see above.
 * @param {number} keywordsCount - Times-seen value a token must have to count
 *   as containing all keywords.
 * @returns {Array<[*, number]>} Ranked `[token, score]` pairs.
 * @throws The error produced by `createError('INVALID_BOOST_VALUE')` when
 *   `boost` is exactly 0.
 */
export function prioritizeTokenScores(arrays, boost, threshold = 0, keywordsCount) {
  if (boost === 0) {
    throw createError('INVALID_BOOST_VALUE');
  }

  // token -> [accumulated score, number of times the token was seen]
  const tokenScoresMap = new Map();
  const mapsLength = arrays.length;
  for (let i = 0; i < mapsLength; i++) {
    const arr = arrays[i];
    const entriesLength = arr.length;
    for (let j = 0; j < entriesLength; j++) {
      const [token, score] = arr[j];
      const boostScore = score * boost;
      const existing = tokenScoresMap.get(token);
      if (existing === undefined) {
        tokenScoresMap.set(token, [boostScore, 1]);
      } else {
        tokenScoresMap.set(token, [existing[0] * 1.5 + boostScore, existing[1] + 1]);
      }
    }
  }

  // If threshold is 1, return all the results with at least one search term.
  // If threshold is 0 and there is a single keyword, every match trivially
  // contains 100% of the keywords, so all matches are returned as well.
  // Both cases rank purely by score.
  if (threshold === 1 || (threshold === 0 && keywordsCount === 1)) {
    return Array.from(tokenScoresMap, ([token, [score]]) => [token, score]).sort(
      (a, b) => b[1] - a[1],
    );
  }

  // Order the tokens by the number of keywords they contain, then by score.
  // Built from the Map's insertion order so ties resolve the same way on
  // every call (Array.prototype.sort is stable).
  const keywordsPerToken = Array.from(tokenScoresMap, ([token, [score, matched]]) => [
    token,
    score,
    matched,
  ]).sort(compareByKeywordsThenScore);
  const allResults = keywordsPerToken.length;

  // Count the leading run of results that contain all the keywords.
  let resultsWithAllKeywords = 0;
  while (
    resultsWithAllKeywords < allResults &&
    keywordsPerToken[resultsWithAllKeywords][2] === keywordsCount
  ) {
    resultsWithAllKeywords++;
  }

  const toPair = ([token, score]) => [token, score];

  // If threshold is 0, only return the results that contain ALL the search
  // terms (exact match). This is an empty list when no result qualifies.
  if (threshold === 0) {
    return keywordsPerToken.slice(0, resultsWithAllKeywords).map(toPair);
  }

  // Otherwise extend the cut-off past the last all-keyword result by a
  // `threshold` fraction of the results from that index onward. When no
  // result has all the keywords the index is normalized to 0.
  const lastTokenWithAllKeywords = Math.max(resultsWithAllKeywords - 1, 0);
  const thresholdLength =
    lastTokenWithAllKeywords +
    Math.ceil((threshold * 100 * (allResults - lastTokenWithAllKeywords)) / 100);

  return keywordsPerToken.slice(0, Math.min(allResults, thresholdLength)).map(toPair);
}

/**
 * Computes a BM25-style relevance score:
 *
 *   idf   = ln(1 + (docsCount - matchingCount + 0.5) / (matchingCount + 0.5))
 *   score = idf * (d + tf * (k + 1))
 *           / (tf + k * (1 - b + b * fieldLength / averageFieldLength))
 *
 * @param {number} tf - Presumably the term frequency in the field (verify
 *   against the caller).
 * @param {number} matchingCount - Presumably the number of documents that
 *   contain the term.
 * @param {number} docsCount - Presumably the total number of documents.
 * @param {number} fieldLength - Length of the field being scored.
 * @param {number} averageFieldLength - Average length of that field; used as
 *   a divisor and not checked for zero here.
 * @param {{ k: number, b: number, d: number }} params - Tuning constants used
 *   as shown in the formula above.
 * @returns {number} The computed score.
 */
export function BM25(tf, matchingCount, docsCount, fieldLength, averageFieldLength, { k, b, d }) {
  const idf = Math.log(1 + (docsCount - matchingCount + 0.5) / (matchingCount + 0.5));
  const lengthNormalization = 1 - b + (b * fieldLength) / averageFieldLength;
  return (idf * (d + tf * (k + 1))) / (tf + k * lengthNormalization);
}
//# sourceMappingURL=algorithms.js.map