@orama/orama
Version:
A complete search engine and RAG pipeline in your browser, server, or edge network with support for full-text, vector, and hybrid search in less than 2kb.
95 lines • 4.42 kB
JavaScript
import { createError } from '../errors.js';
/**
 * Merges several lists of `[id, score]` pairs into a single ranked list of
 * `[id, score]` pairs.
 *
 * Each incoming score is multiplied by `boost`. When an id is seen again, its
 * running total is multiplied by 1.5 before the new boosted score is added,
 * and a per-id occurrence counter is incremented.
 *
 * How much of the merged list is returned depends on `threshold`:
 * - `1`: every id, ordered by score (highest first).
 * - `0`: only the ids whose occurrence counter equals `keywordsCount`; when
 *   `keywordsCount` is 1 every id is returned, ordered by score.
 * - anything else: ids ordered by occurrence counter, then by score, cut off
 *   at a length derived from `threshold`.
 *
 * @param {Array<Array<[*, number]>>} arrays - Lists of `[id, score]` pairs.
 * @param {number} boost - Multiplier applied to every score; must not be 0.
 * @param {number} [threshold=0] - Selects how many results are kept (see above).
 * @param {number} keywordsCount - Occurrence count an id needs to be treated as a full match.
 * @returns {Array<[*, number]>} Ranked `[id, score]` pairs.
 * @throws The error built by `createError('INVALID_BOOST_VALUE')` when `boost` is 0.
 */
export function prioritizeTokenScores(arrays, boost, threshold = 0, keywordsCount) {
    if (boost === 0) {
        throw createError('INVALID_BOOST_VALUE');
    }

    // id -> [accumulated score, number of times the id was seen]
    const aggregated = new Map();
    for (const scoredList of arrays) {
        for (const [id, rawScore] of scoredList) {
            const boosted = rawScore * boost;
            const previous = aggregated.get(id);
            const next = previous === undefined
                ? [boosted, 1]
                : [previous[0] * 1.5 + boosted, previous[1] + 1];
            aggregated.set(id, next);
        }
    }

    // threshold 1 keeps everything; threshold 0 with a single keyword does too,
    // because every match trivially contains that one keyword.
    const keepsEveryMatch = threshold === 1 || (threshold === 0 && keywordsCount === 1);
    if (keepsEveryMatch) {
        return Array.from(aggregated, ([id, [score]]) => [id, score])
            .sort((left, right) => right[1] - left[1]);
    }

    // Rank by occurrence counter first, then by score, both descending.
    const ranked = Array.from(aggregated, ([id, [score, hits]]) => [id, score, hits])
        .sort((left, right) => {
            if (left[2] !== right[2]) {
                return left[2] > right[2] ? -1 : 1;
            }
            return left[1] > right[1] ? -1 : left[1] < right[1] ? 1 : 0;
        });
    const total = ranked.length;
    const toIdAndScore = ([id, score]) => [id, score];

    // Full matches sit at the front of `ranked`; count how many there are.
    let fullMatchCount = 0;
    while (fullMatchCount < total && ranked[fullMatchCount][2] === keywordsCount) {
        fullMatchCount++;
    }

    // threshold 0: only the entries that matched every keyword (possibly none).
    if (threshold === 0) {
        return ranked.slice(0, fullMatchCount).map(toIdAndScore);
    }

    // Any other threshold: start from the index of the last full match (0 when
    // there is none) and extend by the requested share of what remains.
    const anchor = Math.max(fullMatchCount - 1, 0);
    const cutoff = anchor + Math.ceil((threshold * 100 * (total - anchor)) / 100);
    return ranked.slice(0, Math.min(total, cutoff)).map(toIdAndScore);
}
/**
 * Evaluates a BM25-style scoring formula.
 *
 * The result is `idf * (d + tf * (k + 1)) / (tf + k * (1 - b + b * fieldLength / averageFieldLength))`,
 * where `idf = ln(1 + (docsCount - matchingCount + 0.5) / (matchingCount + 0.5))`.
 *
 * No input validation is performed; in particular `averageFieldLength` is
 * used as a divisor as-is.
 *
 * @param {number} tf - Presumably the term frequency in the field being scored.
 * @param {number} matchingCount - Presumably the number of documents containing the term.
 * @param {number} docsCount - Presumably the total number of documents.
 * @param {number} fieldLength - Length value for the field being scored.
 * @param {number} averageFieldLength - Average length value the field length is normalized against.
 * @param {{k: number, b: number, d: number}} params - Tuning constants used by the formula.
 * @returns {number} The computed score.
 */
export function BM25(tf, matchingCount, docsCount, fieldLength, averageFieldLength, { k, b, d }) {
    const oddsRatio = (docsCount - matchingCount + 0.5) / (matchingCount + 0.5);
    const idf = Math.log(1 + oddsRatio);

    // Operation order is kept as-is so floating-point results stay identical.
    const lengthFactor = 1 - b + (b * fieldLength) / averageFieldLength;
    const numerator = idf * (d + tf * (k + 1));
    const denominator = tf + k * lengthFactor;

    return numerator / denominator;
}
//# sourceMappingURL=algorithms.js.map