@orama/orama
Version:
A complete search engine and RAG pipeline in your browser, server, or edge network with support for full-text, vector, and hybrid search in less than 2kb.
688 lines • 27.9 kB
JavaScript
import { createError } from '../errors.js';
import { AVLTree } from '../trees/avl.js';
import { FlatTree } from '../trees/flat.js';
import { RadixTree } from '../trees/radix.js';
import { BKDTree } from '../trees/bkd.js';
import { BoolNode } from '../trees/bool.js';
import { convertDistanceToMeters, setIntersection, setUnion, setDifference } from '../utils.js';
import { BM25 } from './algorithms.js';
import { getInnerType, getVectorSize, isArrayType, isVectorType } from './defaults.js';
import { getInternalDocumentId } from './internal-document-id-store.js';
import { VectorIndex } from '../trees/vector.js';
/**
 * Registers the per-document scoring data of one property for a new document.
 *
 * Updates the running average field length of `prop`, stores the token count
 * of this document and resets its token-frequency table to an empty object.
 *
 * @param index Index data holding `avgFieldLength`, `fieldLengths` and `frequencies` keyed by property.
 * @param prop Property path being indexed.
 * @param id Document id; resolved through `getInternalDocumentId`.
 * @param tokens Tokens produced for the field value.
 * @param docsCount Document count used as the divisor of the running average
 *   (presumably it already includes the document being inserted — confirm with caller).
 */
export function insertDocumentScoreParameters(index, prop, id, tokens, docsCount) {
    const docId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
    const tokenCount = tokens.length;
    // A property without a previous average starts from 0
    const previousAverage = index.avgFieldLength[prop] ?? 0;
    index.avgFieldLength[prop] = (previousAverage * (docsCount - 1) + tokenCount) / docsCount;
    index.fieldLengths[prop][docId] = tokenCount;
    index.frequencies[prop][docId] = {};
}
/**
 * Records the term frequency of `token` inside one document field and
 * increments the per-property occurrence counter of that token.
 *
 * @param index Index data holding `frequencies` and `tokenOccurrences` keyed by property.
 * @param prop Property path whose statistics are updated.
 * @param id Document id; resolved through `getInternalDocumentId`.
 * @param tokens All tokens of the field value (must contain at least one entry).
 * @param token The token being recorded.
 */
export function insertTokenScoreParameters(index, prop, id, tokens, token) {
    let tokenFrequency = 0;
    for (const candidate of tokens) {
        if (candidate === token) {
            tokenFrequency++;
        }
    }
    const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
    index.frequencies[prop][internalId][token] = tokenFrequency / tokens.length;
    const occurrences = index.tokenOccurrences[prop];
    // Only trust a counter stored directly on this object: tokens such as
    // "constructor" or "toString" also exist on Object.prototype, and adding 1
    // to an inherited function would store a string instead of a count.
    const stored = Object.prototype.hasOwnProperty.call(occurrences, token) ? occurrences[token] : undefined;
    occurrences[token] = (Number.isFinite(stored) ? stored : 0) + 1;
}
/**
 * Drops the per-document scoring data of one property for a removed document.
 *
 * The average field length is recomputed without the document's contribution;
 * when `docsCount` is 1 or less the average is reset to `undefined`. The
 * document's field length and frequency table are set to `undefined`
 * (the keys themselves are kept).
 *
 * @param index Index data holding `avgFieldLength`, `fieldLengths` and `frequencies` keyed by property.
 * @param prop Property path being cleaned up.
 * @param id Document id; resolved through `getInternalDocumentId`.
 * @param docsCount Document count used to undo the running average
 *   (presumably the count before removal — confirm with caller).
 */
export function removeDocumentScoreParameters(index, prop, id, docsCount) {
    const docId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
    index.avgFieldLength[prop] = docsCount > 1
        ? (index.avgFieldLength[prop] * docsCount - index.fieldLengths[prop][docId]) / (docsCount - 1)
        : undefined;
    index.fieldLengths[prop][docId] = undefined;
    index.frequencies[prop][docId] = undefined;
}
/**
 * Decrements the occurrence counter of `token` for the given property.
 *
 * @param index Index data holding `tokenOccurrences` keyed by property.
 * @param prop Property path whose counter is updated.
 * @param token Token whose counter is decreased by one.
 */
export function removeTokenScoreParameters(index, prop, token) {
    const occurrencesForProp = index.tokenOccurrences[prop];
    --occurrencesForProp[token];
}
/**
 * Builds (or extends) the index structure for a schema.
 *
 * Nested schema objects are walked recursively and their properties are
 * registered under dot-separated paths. Every leaf property gets a tree that
 * matches its declared type; string properties additionally get empty
 * scoring tables.
 *
 * @param orama Passed through to recursive calls; not read by this function.
 * @param sharedInternalDocumentStore Stored on a newly created index.
 * @param schema Object mapping property names to a type string or a nested schema.
 * @param index Existing index to extend; a fresh one is created when falsy.
 * @param prefix Path of the parent property, empty at the top level.
 * @returns The populated index.
 * @throws The error built by `createError('INVALID_SCHEMA_TYPE', ...)` for unsupported types.
 */
export function create(orama, sharedInternalDocumentStore, schema, index, prefix = '') {
    const target = index || {
        sharedInternalDocumentStore,
        indexes: {},
        vectorIndexes: {},
        searchableProperties: [],
        searchablePropertiesWithTypes: {},
        frequencies: {},
        tokenOccurrences: {},
        avgFieldLength: {},
        fieldLengths: {}
    };
    Object.entries(schema).forEach(([prop, type]) => {
        const path = prefix ? `${prefix}.${prop}` : `${prefix}${prop}`;
        // A plain object is a nested schema: register its leaves under `path`
        if (typeof type === 'object' && !Array.isArray(type)) {
            create(orama, sharedInternalDocumentStore, type, target, path);
            return;
        }
        if (isVectorType(type)) {
            target.searchableProperties.push(path);
            target.searchablePropertiesWithTypes[path] = type;
            target.vectorIndexes[path] = {
                type: 'Vector',
                node: new VectorIndex(getVectorSize(type)),
                isArray: false
            };
            return;
        }
        const isArray = /\[/.test(type);
        if (type === 'boolean' || type === 'boolean[]') {
            target.indexes[path] = { type: 'Bool', node: new BoolNode(), isArray };
        }
        else if (type === 'number' || type === 'number[]') {
            target.indexes[path] = { type: 'AVL', node: new AVLTree(0, []), isArray };
        }
        else if (type === 'string' || type === 'string[]') {
            target.indexes[path] = { type: 'Radix', node: new RadixTree(), isArray };
            // Text properties also carry the statistics used for scoring
            target.avgFieldLength[path] = 0;
            target.frequencies[path] = {};
            target.tokenOccurrences[path] = {};
            target.fieldLengths[path] = {};
        }
        else if (type === 'enum' || type === 'enum[]') {
            target.indexes[path] = { type: 'Flat', node: new FlatTree(), isArray };
        }
        else if (type === 'geopoint') {
            target.indexes[path] = { type: 'BKD', node: new BKDTree(), isArray };
        }
        else {
            throw createError('INVALID_SCHEMA_TYPE', Array.isArray(type) ? 'array' : type, path);
        }
        target.searchableProperties.push(path);
        target.searchablePropertiesWithTypes[path] = type;
    });
    return target;
}
/**
 * Creates a function that inserts one scalar value of a document into the
 * tree registered for `prop`.
 *
 * The tree entry is looked up on every call, so the returned function always
 * works against the current content of `index.indexes[prop]`. Unknown tree
 * types are ignored.
 *
 * @param implementation Object providing `insertDocumentScoreParameters` and `insertTokenScoreParameters`.
 * @param index Index data holding the `indexes` map.
 * @param prop Property path being indexed.
 * @param internalId Internal document id stored in the tree.
 * @param language Language forwarded to the tokenizer.
 * @param tokenizer Object providing `tokenize(value, language, prop, false)`.
 * @param docsCount Document count forwarded to the score bookkeeping.
 * @param options Optional settings; `avlRebalanceThreshold` defaults to 1.
 * @returns A function taking the value to insert.
 */
function insertScalarBuilder(implementation, index, prop, internalId, language, tokenizer, docsCount, options) {
    return (value) => {
        const entry = index.indexes[prop];
        const kind = entry.type;
        const tree = entry.node;
        if (kind === 'Bool') {
            const bucket = value ? tree.true : tree.false;
            bucket.add(internalId);
        }
        else if (kind === 'AVL') {
            tree.insert(value, internalId, options?.avlRebalanceThreshold ?? 1);
        }
        else if (kind === 'Radix') {
            const tokens = tokenizer.tokenize(value, language, prop, false);
            implementation.insertDocumentScoreParameters(index, prop, internalId, tokens, docsCount);
            // Score bookkeeping and tree insertion happen once per token occurrence
            tokens.forEach((token) => {
                implementation.insertTokenScoreParameters(index, prop, internalId, tokens, token);
                tree.insert(token, internalId);
            });
        }
        else if (kind === 'Flat') {
            tree.insert(value, internalId);
        }
        else if (kind === 'BKD') {
            // Geo trees receive the document ids as a list
            tree.insert(value, [internalId]);
        }
    };
}
/**
 * Inserts the value of one document property into the index.
 *
 * Vector values are delegated to `insertVector`. Array schema types insert
 * each element on its own; every other type inserts the value as is.
 *
 * @param implementation Index implementation forwarded to the scalar inserter.
 * @param index Index data.
 * @param prop Property path being indexed.
 * @param id Document id (only forwarded to `insertVector`).
 * @param internalId Internal document id.
 * @param value Value to index; must expose `length` and indexed access for array types.
 * @param schemaType Declared schema type of the property.
 * @param language Language forwarded to the tokenizer.
 * @param tokenizer Tokenizer used for string properties.
 * @param docsCount Document count forwarded to the score bookkeeping.
 * @param options Optional insert options.
 * @returns Whatever the vector/scalar insert returns; `undefined` for array types.
 */
export function insert(implementation, index, prop, id, internalId, value, schemaType, language, tokenizer, docsCount, options) {
    if (isVectorType(schemaType)) {
        return insertVector(index, prop, value, id, internalId);
    }
    const insertOne = insertScalarBuilder(implementation, index, prop, internalId, language, tokenizer, docsCount, options);
    if (isArrayType(schemaType)) {
        for (let position = 0, total = value.length; position < total; position += 1) {
            insertOne(value[position]);
        }
        return;
    }
    return insertOne(value);
}
/**
 * Adds a vector to the vector index registered for `prop`.
 *
 * @param index Index data holding the `vectorIndexes` map.
 * @param prop Property path of the vector field.
 * @param value Vector to store.
 * @param id Document id (unused here).
 * @param internalDocumentId Internal document id the vector is stored under.
 */
export function insertVector(index, prop, value, id, internalDocumentId) {
    const { node: vectorStore } = index.vectorIndexes[prop];
    vectorStore.add(internalDocumentId, value);
}
/**
 * Removes one scalar value of a document from the index of `prop`.
 *
 * @param implementation Object providing `removeDocumentScoreParameters` and `removeTokenScoreParameters`.
 * @param index Index data holding `indexes` and `vectorIndexes`.
 * @param prop Property path being cleaned up.
 * @param id Document id, forwarded to the document score bookkeeping.
 * @param internalId Internal document id removed from the tree.
 * @param value Value that was indexed for the document.
 * @param schemaType Declared schema type of the value.
 * @param language Language forwarded to the tokenizer.
 * @param tokenizer Object providing `tokenize(value, language, prop)`.
 * @param docsCount Document count forwarded to the score bookkeeping.
 * @returns `true` for vector, AVL, Bool, Radix and Flat entries, `false` for
 *   BKD entries and `undefined` for any other tree type.
 */
function removeScalar(implementation, index, prop, id, internalId, value, schemaType, language, tokenizer, docsCount) {
    if (isVectorType(schemaType)) {
        const { node: vectorStore } = index.vectorIndexes[prop];
        vectorStore.remove(internalId);
        return true;
    }
    const entry = index.indexes[prop];
    const kind = entry.type;
    const tree = entry.node;
    if (kind === 'AVL') {
        tree.removeDocument(value, internalId);
        return true;
    }
    if (kind === 'Bool') {
        const bucket = value ? tree.true : tree.false;
        bucket.delete(internalId);
        return true;
    }
    if (kind === 'Radix') {
        const tokens = tokenizer.tokenize(value, language, prop);
        implementation.removeDocumentScoreParameters(index, prop, id, docsCount);
        tokens.forEach((token) => {
            implementation.removeTokenScoreParameters(index, prop, token);
            tree.removeDocumentByWord(token, internalId);
        });
        return true;
    }
    if (kind === 'Flat') {
        tree.removeDocument(internalId, value);
        return true;
    }
    if (kind === 'BKD') {
        tree.removeDocByID(value, internalId);
        // NOTE(review): every other branch reports true; confirm that false is intended here
        return false;
    }
    return undefined;
}
/**
 * Removes the value of one document property from the index.
 *
 * Non-array types are removed directly and the result of the scalar removal
 * is returned. For array types each element is removed using the inner type
 * of the array and the function always returns `true`.
 *
 * @param implementation Index implementation forwarded to the scalar removal.
 * @param index Index data.
 * @param prop Property path being cleaned up.
 * @param id Document id.
 * @param internalId Internal document id.
 * @param value Indexed value; must expose `length` and indexed access for array types.
 * @param schemaType Declared schema type of the property.
 * @param language Language forwarded to the tokenizer.
 * @param tokenizer Tokenizer used for string properties.
 * @param docsCount Document count forwarded to the score bookkeeping.
 */
export function remove(implementation, index, prop, id, internalId, value, schemaType, language, tokenizer, docsCount) {
    const removeOne = (item, itemType) => removeScalar(implementation, index, prop, id, internalId, item, itemType, language, tokenizer, docsCount);
    if (!isArrayType(schemaType)) {
        return removeOne(value, schemaType);
    }
    const elementType = getInnerType(schemaType);
    for (let position = 0, total = value.length; position < total; position += 1) {
        removeOne(value[position], elementType);
    }
    return true;
}
/**
 * Adds the BM25 score of `term` in `prop` to every matching document.
 *
 * For each id in `ids` that passes the optional where-filter, the function
 * bumps the per-property match counter in `keywordMatchesMap` and adds the
 * boosted BM25 score to the document's entry in `resultsMap`.
 *
 * @param index Index data holding the scoring tables keyed by property.
 * @param prop Property path the term was found in.
 * @param term Indexed word that matched.
 * @param ids Iterable of internal document ids containing the term.
 * @param docsCount Document count forwarded to `BM25`.
 * @param bm25Relevance Relevance parameters forwarded to `BM25`.
 * @param resultsMap Map of internal id to accumulated score; mutated.
 * @param boostPerProperty Multiplier applied to the BM25 score.
 * @param whereFiltersIDs Optional set of allowed ids; other ids are skipped.
 * @param keywordMatchesMap Map of internal id to a Map of property to match count; mutated.
 */
export function calculateResultScores(index, prop, term, ids, docsCount, bm25Relevance, resultsMap, boostPerProperty, whereFiltersIDs, keywordMatchesMap) {
    const matchingDocs = Array.from(ids);
    const averageLength = index.avgFieldLength[prop];
    const lengthsByDoc = index.fieldLengths[prop];
    const frequenciesByDoc = index.frequencies[prop];
    // The stored counter is only used when it is a number; anything else counts as 0
    const storedOccurrences = index.tokenOccurrences[prop][term];
    const termOccurrences = typeof storedOccurrences === 'number' ? storedOccurrences : 0;
    for (const docId of matchingDocs) {
        if (whereFiltersIDs && !whereFiltersIDs.has(docId)) {
            continue;
        }
        // Count one more keyword match for this document in this property
        if (!keywordMatchesMap.has(docId)) {
            keywordMatchesMap.set(docId, new Map());
        }
        const matchesByProp = keywordMatchesMap.get(docId);
        matchesByProp.set(prop, (matchesByProp.get(prop) || 0) + 1);
        const termFrequency = frequenciesByDoc?.[docId]?.[term] ?? 0;
        const boostedScore = BM25(termFrequency, termOccurrences, docsCount, lengthsByDoc[docId], averageLength, bm25Relevance) * boostPerProperty;
        resultsMap.set(docId, resultsMap.has(docId) ? resultsMap.get(docId) + boostedScore : boostedScore);
    }
}
/**
 * Runs a full-text search for `term` over the given properties.
 *
 * Every query token is looked up in the Radix tree of each property and the
 * matching documents are scored through `calculateResultScores`. The result
 * is then trimmed according to `threshold`:
 * - `1`: every scored document is returned;
 * - `0`: only documents where a single property matched every query token
 *   (a single-token query returns every scored document);
 * - otherwise: those full matches followed by the best
 *   `ceil(partialCount * threshold)` partial matches, or every scored
 *   document when there is no full match.
 *
 * Keyword matches are counted once per query token, so a token that expands
 * to several indexed words in the same document (prefix or fuzzy search)
 * still counts as one matched keyword.
 *
 * @param index Index data holding the `indexes` map.
 * @param term Search string.
 * @param tokenizer Object providing `tokenize(term, language)`.
 * @param language Language forwarded to the tokenizer.
 * @param propertiesToSearch Property paths to search; unknown paths are skipped.
 * @param exact Forwarded to the tree lookup.
 * @param tolerance Forwarded to the tree lookup.
 * @param boost Object mapping property paths to a positive multiplier (default 1).
 * @param relevance BM25 parameters forwarded to the scoring.
 * @param docsCount Document count forwarded to the scoring.
 * @param whereFiltersIDs Optional set of ids the results are restricted to.
 * @param threshold Value between 0 and 1 controlling partial matches.
 * @returns Array of `[internalId, score]` pairs sorted by descending score.
 * @throws The error built by `createError` when a property is not a text
 *   property or its boost is not greater than 0.
 */
export function search(index, term, tokenizer, language, propertiesToSearch, exact, tolerance, boost, relevance, docsCount, whereFiltersIDs, threshold = 0) {
    const tokens = tokenizer.tokenize(term, language);
    const keywordsCount = tokens.length || 1;
    // An empty term without tokens is searched as the empty string, which
    // matches every document. The tokenizer's own array is left untouched.
    const searchTokens = tokens.length === 0 && !term ? [''] : tokens;
    // Internal id -> Map(property -> number of query tokens matched)
    const keywordMatchesMap = new Map();
    // Query tokens that matched at least one indexed word
    const tokenFoundMap = new Map();
    const resultsMap = new Map();
    for (const prop of propertiesToSearch) {
        if (!(prop in index.indexes)) {
            continue;
        }
        const tree = index.indexes[prop];
        if (tree.type !== 'Radix') {
            throw createError('WRONG_SEARCH_PROPERTY_TYPE', prop);
        }
        const boostPerProperty = boost[prop] ?? 1;
        if (boostPerProperty <= 0) {
            throw createError('INVALID_BOOST_VALUE', boostPerProperty);
        }
        for (const token of searchTokens) {
            const searchResult = tree.node.find({ term: token, exact, tolerance });
            const termsFound = Object.keys(searchResult);
            if (termsFound.length > 0) {
                tokenFoundMap.set(token, true);
            }
            // Collect the matches of this token separately: one token may
            // expand to several words found in the same document.
            const tokenMatches = new Map();
            for (const word of termsFound) {
                calculateResultScores(index, prop, word, searchResult[word], docsCount, relevance, resultsMap, boostPerProperty, whereFiltersIDs, tokenMatches);
            }
            for (const id of tokenMatches.keys()) {
                let propertyMatches = keywordMatchesMap.get(id);
                if (!propertyMatches) {
                    propertyMatches = new Map();
                    keywordMatchesMap.set(id, propertyMatches);
                }
                propertyMatches.set(prop, (propertyMatches.get(prop) ?? 0) + 1);
            }
        }
    }
    // Convert to array and sort by score
    const results = Array.from(resultsMap.entries()).sort((a, b) => b[1] - a[1]);
    if (results.length === 0) {
        return [];
    }
    // If threshold is 1, return all results
    if (threshold === 1) {
        return results;
    }
    // True when at least one property of the document matched every keyword
    const matchesAllKeywords = ([id]) => {
        const propertyMatches = keywordMatchesMap.get(id);
        if (!propertyMatches) {
            return false;
        }
        return Array.from(propertyMatches.values()).some((matches) => matches === keywordsCount);
    };
    if (threshold === 0) {
        // A single keyword cannot be partially matched
        if (keywordsCount === 1) {
            return results;
        }
        // If any token was not found at all, no document can match all of them
        for (const token of searchTokens) {
            if (!tokenFoundMap.get(token)) {
                return [];
            }
        }
        return results.filter(matchesAllKeywords);
    }
    const fullMatches = results.filter(matchesAllKeywords);
    // Full matches first, then a share of the partial matches
    if (fullMatches.length > 0) {
        const fullMatchIds = new Set(fullMatches.map(([id]) => id));
        const remainingResults = results.filter(([id]) => !fullMatchIds.has(id));
        const additionalResults = Math.ceil(remainingResults.length * threshold);
        return [...fullMatches, ...remainingResults.slice(0, additionalResults)];
    }
    // If no full matches, return all results
    return results;
}
/**
 * Resolves a where-clause to the set of internal document ids it selects.
 *
 * `and`, `or` and `not` clauses are evaluated recursively. Any other object
 * is treated as a map of property path to filter operation; the per-property
 * results are intersected.
 *
 * @param index Index data holding `indexes` and `sharedInternalDocumentStore`.
 * @param tokenizer Object providing `tokenize(raw, language, prop)` for text filters.
 * @param filters Where-clause to evaluate.
 * @param language Language forwarded to the tokenizer.
 * @returns The matching internal ids, as produced by the set helpers.
 * @throws The error built by `createError` for unknown properties or invalid
 *   operations, and a plain `Error` for an unsupported geo operation.
 */
export function searchByWhereClause(index, tokenizer, filters, language) {
    const evaluate = (clause) => searchByWhereClause(index, tokenizer, clause, language);
    // Logical operators
    if ('and' in filters && filters.and && Array.isArray(filters.and)) {
        const clauses = filters.and;
        return clauses.length === 0 ? new Set() : setIntersection(...clauses.map(evaluate));
    }
    if ('or' in filters && filters.or && Array.isArray(filters.or)) {
        const clauses = filters.or;
        if (clauses.length === 0) {
            return new Set();
        }
        let merged = new Set();
        for (const partial of clauses.map(evaluate)) {
            merged = setUnion(merged, partial);
        }
        return merged;
    }
    if ('not' in filters && filters.not) {
        // Start from the ids 1..N, N being the size of the internal id list
        const everyDocument = new Set();
        const store = index.sharedInternalDocumentStore;
        for (let internalId = 1; internalId <= store.internalIdToId.length; internalId += 1) {
            everyDocument.add(internalId);
        }
        return setDifference(everyDocument, evaluate(filters.not));
    }
    // Property filters
    const filterKeys = Object.keys(filters);
    // One empty set per property; built in reverse so the key order of the
    // map (and therefore of the final intersection arguments) stays last-to-first
    const filtersMap = Object.fromEntries([...filterKeys].reverse().map((key) => [key, new Set()]));
    for (const param of filterKeys) {
        const operation = filters[param];
        const indexEntry = index.indexes[param];
        if (typeof indexEntry === 'undefined') {
            throw createError('UNKNOWN_FILTER_PROPERTY', param);
        }
        const { node, type, isArray } = indexEntry;
        if (type === 'Bool') {
            filtersMap[param] = setUnion(filtersMap[param], operation ? node.true : node.false);
            continue;
        }
        if (type === 'BKD') {
            let geoMatches;
            if ('radius' in operation) {
                const { value, coordinates, unit = 'm', inside = true, highPrecision = false } = operation.radius;
                const distanceInMeters = convertDistanceToMeters(value, unit);
                geoMatches = node.searchByRadius(coordinates, distanceInMeters, inside, undefined, highPrecision);
            }
            else if ('polygon' in operation) {
                const { coordinates, inside = true, highPrecision = false } = operation.polygon;
                geoMatches = node.searchByPolygon(coordinates, inside, undefined, highPrecision);
            }
            else {
                throw new Error(`Invalid operation ${operation}`);
            }
            filtersMap[param] = addGeoResult(filtersMap[param], geoMatches);
            continue;
        }
        if (type === 'Radix' && (typeof operation === 'string' || Array.isArray(operation))) {
            // A single string and a list of strings are handled alike
            for (const raw of [operation].flat()) {
                for (const token of tokenizer.tokenize(raw, language, param)) {
                    const found = node.find({ term: token, exact: true });
                    filtersMap[param] = addFindResult(filtersMap[param], found);
                }
            }
            continue;
        }
        const operationKeys = Object.keys(operation);
        if (operationKeys.length > 1) {
            throw createError('INVALID_FILTER_OPERATION', operationKeys.length);
        }
        if (type === 'Flat') {
            const matches = isArray ? node.filterArr(operation) : node.filter(operation);
            filtersMap[param] = setUnion(filtersMap[param], new Set(matches));
            continue;
        }
        if (type === 'AVL') {
            const [operator] = operationKeys;
            const operand = operation[operator];
            let matches;
            if (operator === 'gt') {
                matches = node.greaterThan(operand, false);
            }
            else if (operator === 'gte') {
                matches = node.greaterThan(operand, true);
            }
            else if (operator === 'lt') {
                matches = node.lessThan(operand, false);
            }
            else if (operator === 'lte') {
                matches = node.lessThan(operand, true);
            }
            else if (operator === 'eq') {
                matches = node.find(operand) ?? new Set();
            }
            else if (operator === 'between') {
                const [min, max] = operand;
                matches = node.rangeSearch(min, max);
            }
            else {
                throw createError('INVALID_FILTER_OPERATION', operator);
            }
            filtersMap[param] = setUnion(filtersMap[param], matches);
        }
    }
    // AND operation: intersect the ids selected by every property
    return setIntersection(...Object.values(filtersMap));
}
/**
 * Returns the list of searchable property paths stored on the index.
 *
 * @param index Index data.
 * @returns The `searchableProperties` value of the index (the same reference, not a copy).
 */
export function getSearchableProperties(index) {
    const { searchableProperties } = index;
    return searchableProperties;
}
/**
 * Returns the map of searchable property path to schema type stored on the index.
 *
 * @param index Index data.
 * @returns The `searchablePropertiesWithTypes` value of the index (the same reference, not a copy).
 */
export function getSearchablePropertiesWithTypes(index) {
    const { searchablePropertiesWithTypes } = index;
    return searchablePropertiesWithTypes;
}
/**
 * Rebuilds an index from its serialized form.
 *
 * Entries of the known tree types are revived through the `fromJSON` method
 * of the matching tree class; entries of any other type are taken over
 * unchanged. Vector indexes are revived through `VectorIndex.fromJSON`. The
 * scoring tables and property lists are reused as they are.
 *
 * @param sharedInternalDocumentStore Document store attached to the loaded index.
 * @param raw Serialized index, shaped like the return value of `save`.
 * @returns The index data ready for use.
 */
export function load(sharedInternalDocumentStore, raw) {
    const treeClassByType = new Map([
        ['Radix', RadixTree],
        ['Flat', FlatTree],
        ['AVL', AVLTree],
        ['BKD', BKDTree],
        ['Bool', BoolNode]
    ]);
    const indexes = {};
    for (const [prop, rawEntry] of Object.entries(raw.indexes)) {
        const { node, type, isArray } = rawEntry;
        const TreeClass = treeClassByType.get(type);
        indexes[prop] = TreeClass
            ? { type, node: TreeClass.fromJSON(node), isArray }
            : rawEntry;
    }
    const vectorIndexes = {};
    for (const [prop, rawVector] of Object.entries(raw.vectorIndexes)) {
        vectorIndexes[prop] = {
            type: 'Vector',
            isArray: false,
            node: VectorIndex.fromJSON(rawVector)
        };
    }
    return {
        sharedInternalDocumentStore,
        indexes,
        vectorIndexes,
        searchableProperties: raw.searchableProperties,
        searchablePropertiesWithTypes: raw.searchablePropertiesWithTypes,
        frequencies: raw.frequencies,
        tokenOccurrences: raw.tokenOccurrences,
        avgFieldLength: raw.avgFieldLength,
        fieldLengths: raw.fieldLengths
    };
}
/**
 * Produces the serializable form of an index.
 *
 * Every tree and vector index is converted through its `toJSON` method. The
 * scoring tables and property lists are returned by reference. The index
 * itself is never modified, so it stays usable (and can be saved again)
 * afterwards.
 *
 * @param index Index data to serialize.
 * @returns An object with the same fields as the index, minus the document
 *   store, where every `node` is replaced by its JSON form.
 */
export function save(index) {
    const { indexes, vectorIndexes, searchableProperties, searchablePropertiesWithTypes, frequencies, tokenOccurrences, avgFieldLength, fieldLengths } = index;
    const dumpVectorIndexes = {};
    for (const idx of Object.keys(vectorIndexes)) {
        dumpVectorIndexes[idx] = vectorIndexes[idx].node.toJSON();
    }
    const savedIndexes = {};
    for (const name of Object.keys(indexes)) {
        const entry = indexes[name];
        const { type, node, isArray } = entry;
        if (type === 'Flat' || type === 'Radix' || type === 'AVL' || type === 'BKD' || type === 'Bool') {
            savedIndexes[name] = {
                type,
                node: node.toJSON(),
                isArray
            };
        }
        else {
            // Unknown tree type: keep every field of the entry, but work on a
            // copy so the live index keeps its real node instead of the dump.
            savedIndexes[name] = { ...entry, node: node.toJSON() };
        }
    }
    return {
        indexes: savedIndexes,
        vectorIndexes: dumpVectorIndexes,
        searchableProperties,
        searchablePropertiesWithTypes,
        frequencies,
        tokenOccurrences,
        avgFieldLength,
        fieldLengths
    };
}
/**
 * Bundles the index functions of this module into one component object.
 *
 * @returns An object exposing `create`, `insert`, `remove`, the score
 *   bookkeeping helpers, `calculateResultScores`, `search`,
 *   `searchByWhereClause`, the property getters, `load` and `save`.
 */
export function createIndex() {
    const indexComponent = {
        create,
        insert,
        remove,
        insertDocumentScoreParameters,
        insertTokenScoreParameters,
        removeDocumentScoreParameters,
        removeTokenScoreParameters,
        calculateResultScores,
        search,
        searchByWhereClause,
        getSearchableProperties,
        getSearchablePropertiesWithTypes,
        load,
        save
    };
    return indexComponent;
}
/**
 * Adds the document ids of a geo search result to a set.
 *
 * @param set Set to extend; a new one is created when falsy.
 * @param ids Indexable list of entries, each with an indexable `docIDs` list.
 * @returns The set that received the ids.
 */
function addGeoResult(set, ids) {
    const collected = set || new Set();
    for (let entryIndex = 0, entryCount = ids.length; entryIndex < entryCount; entryIndex += 1) {
        const docIDs = ids[entryIndex].docIDs;
        for (let docIndex = 0, docCount = docIDs.length; docIndex < docCount; docIndex += 1) {
            collected.add(docIDs[docIndex]);
        }
    }
    return collected;
}
/**
 * Turns geo search results into `[docID, score]` pairs ranked by proximity.
 *
 * The score of a result is `maxDistance - distance + 1`, where the distances
 * are measured from `centerPoint` and `maxDistance` is the largest of them:
 * the closer the point, the higher the score.
 *
 * @param ids Iterable of results, each with a `point` and an iterable `docIDs`.
 * @param centerPoint Reference point the distances are measured from.
 * @param highPrecision Uses `BKDTree.vincentyDistance` when true, `BKDTree.haversineDistance` otherwise.
 * @returns Array of `[docID, score]` pairs sorted by descending score.
 */
function createGeoTokenScores(ids, centerPoint, highPrecision = false) {
    const distanceFn = highPrecision ? BKDTree.vincentyDistance : BKDTree.haversineDistance;
    const distances = [];
    // Track the maximum while measuring; spreading a large list into
    // Math.max would exceed the engine's argument limit.
    let maxDistance = -Infinity;
    for (const { point } of ids) {
        const distance = distanceFn(centerPoint, point);
        distances.push(distance);
        maxDistance = Math.max(maxDistance, distance);
    }
    const results = [];
    let position = 0;
    for (const { docIDs } of ids) {
        // Inverse score: the +1 keeps the farthest point at a score of 1 instead of 0
        const score = maxDistance - distances[position] + 1;
        for (const docID of docIDs) {
            results.push([docID, score]);
        }
        position++;
    }
    // Sort by score (higher first - closer points)
    results.sort((a, b) => b[1] - a[1]);
    return results;
}
/**
 * Tells whether a where-clause consists of exactly one geo filter.
 *
 * @param filters Where-clause keyed by property path.
 * @param index Index data holding the `indexes` map.
 * @returns `{ isGeoOnly: true, geoProperty, geoOperation }` when the only
 *   filter targets a BKD property with a `radius` or `polygon` operation,
 *   `{ isGeoOnly: false }` otherwise.
 */
function isGeosearchOnlyQuery(filters, index) {
    const filterKeys = Object.keys(filters);
    if (filterKeys.length !== 1) {
        return { isGeoOnly: false };
    }
    const [param] = filterKeys;
    const operation = filters[param];
    const indexEntry = index.indexes[param];
    if (typeof indexEntry === 'undefined') {
        return { isGeoOnly: false };
    }
    const isGeoOperation = indexEntry.type === 'BKD' && operation && ('radius' in operation || 'polygon' in operation);
    return isGeoOperation
        ? { isGeoOnly: true, geoProperty: param, geoOperation: operation }
        : { isGeoOnly: false };
}
/**
 * Evaluates a where-clause that consists of a single geo filter and ranks
 * the matches by distance.
 *
 * Radius searches are scored relative to the given coordinates, polygon
 * searches relative to the centroid computed by
 * `BKDTree.calculatePolygonCentroid`.
 *
 * @param index Index data holding the `indexes` map.
 * @param filters Where-clause to evaluate.
 * @returns The scored matches produced by `createGeoTokenScores`, or `null`
 *   when the clause is not a geo-only query.
 */
export function searchByGeoWhereClause(index, filters) {
    const { isGeoOnly, geoProperty, geoOperation } = isGeosearchOnlyQuery(filters, index);
    if (!(isGeoOnly && geoProperty && geoOperation)) {
        return null;
    }
    // The entry was already checked to be of type 'BKD'
    const bkdTree = index.indexes[geoProperty].node;
    if ('radius' in geoOperation) {
        const { value, coordinates: centerPoint, unit = 'm', inside = true, highPrecision = false } = geoOperation.radius;
        const distanceInMeters = convertDistanceToMeters(value, unit);
        const matches = bkdTree.searchByRadius(centerPoint, distanceInMeters, inside, 'asc', highPrecision);
        return createGeoTokenScores(matches, centerPoint, highPrecision);
    }
    if ('polygon' in geoOperation) {
        const { coordinates, inside = true, highPrecision = false } = geoOperation.polygon;
        const matches = bkdTree.searchByPolygon(coordinates, inside, 'asc', highPrecision);
        const centroid = BKDTree.calculatePolygonCentroid(coordinates);
        return createGeoTokenScores(matches, centroid, highPrecision);
    }
    return null;
}
/**
 * Adds every document id of a tree lookup result to a set.
 *
 * @param set Set to extend; a new one is created when falsy.
 * @param filteredIDsResults Object mapping each found word to an indexable list of ids.
 * @returns The set that received the ids.
 */
function addFindResult(set, filteredIDsResults) {
    const collected = set || new Set();
    const words = Object.keys(filteredIDsResults);
    for (let wordIndex = 0, wordCount = words.length; wordIndex < wordCount; wordIndex += 1) {
        const docIds = filteredIDsResults[words[wordIndex]];
        for (let docIndex = 0, docCount = docIds.length; docIndex < docCount; docIndex += 1) {
            collected.add(docIds[docIndex]);
        }
    }
    return collected;
}
//# sourceMappingURL=index.js.map