UNPKG

@orama/orama

Version:

A complete search engine and RAG pipeline in your browser, server, or edge network with support for full-text, vector, and hybrid search in less than 2kb.

235 lines 11.1 kB
import { isArrayType, isGeoPointType, isVectorType } from '../components.js';
import { isAsyncFunction, sleep } from '../utils.js';
import { runMultipleHook, runSingleHook } from '../components/hooks.js';
import { createError } from '../errors.js';
import { getInternalDocumentId } from '../components/internal-document-id-store.js';

/**
 * Reports whether inserting a single document must go through the async path,
 * i.e. whether any per-document hook or index callback is an async function.
 *
 * @param {object} orama - The database instance.
 * @returns {boolean} `true` when at least one relevant callback is async.
 */
function isSingleInsertAsync(orama) {
  return (
    isAsyncFunction(orama.beforeInsert) ||
    isAsyncFunction(orama.afterInsert) ||
    isAsyncFunction(orama.index.beforeInsert) ||
    isAsyncFunction(orama.index.insert) ||
    isAsyncFunction(orama.index.afterInsert)
  );
}

/**
 * Reports whether a batch insert must go through the async path.
 *
 * A batch insert calls `insert` for every document, so it is async whenever a
 * single insert is async, and additionally when one of the batch-level hooks
 * is async.
 *
 * @param {object} orama - The database instance.
 * @returns {boolean} `true` when at least one relevant callback is async.
 */
function isBatchInsertAsync(orama) {
  return (
    isAsyncFunction(orama.afterInsertMultiple) ||
    isAsyncFunction(orama.beforeInsertMultiple) ||
    isSingleInsertAsync(orama)
  );
}

/**
 * Validates a document against the schema and inserts it into the database.
 *
 * @param {object} orama - The database instance.
 * @param {object} doc - The document to insert.
 * @param {string} [language] - Language forwarded to the index and sorter.
 * @param {boolean} [skipHooks] - When truthy, `beforeInsert`/`afterInsert` are not run.
 * @param {object} [options] - Extra options forwarded to `orama.index.insert`.
 * @returns {string|Promise<string>} The document id; a Promise when any
 *   per-document hook or index callback is async.
 * @throws SCHEMA_VALIDATION_FAILURE when `orama.validateSchema` reports a property.
 */
export function insert(orama, doc, language, skipHooks, options) {
  const errorProperty = orama.validateSchema(doc, orama.schema);
  if (errorProperty) {
    throw createError('SCHEMA_VALIDATION_FAILURE', errorProperty);
  }

  if (isSingleInsertAsync(orama)) {
    return innerInsertAsync(orama, doc, language, skipHooks, options);
  }
  return innerInsertSync(orama, doc, language, skipHooks, options);
}

const ENUM_TYPE = new Set(['enum', 'enum[]']);
const STRING_NUMBER_TYPE = new Set(['string', 'number']);

/**
 * Checks every defined indexable value against the type the index expects.
 *
 * @param {object} indexableValues - Property path -> value extracted from the document.
 * @param {object} indexablePropertiesWithTypes - Property path -> expected type.
 * @throws INVALID_DOCUMENT_PROPERTY on the first mismatching value.
 */
function validateIndexableValues(indexableValues, indexablePropertiesWithTypes) {
  for (const [key, value] of Object.entries(indexableValues)) {
    if (typeof value === 'undefined') continue;
    const actualType = typeof value;
    const expectedType = indexablePropertiesWithTypes[key];
    validateDocumentProperty(actualType, expectedType, key, value);
  }
}

/**
 * Async variant of the single-document insert: awaits hooks and index callbacks.
 *
 * @returns {Promise<string>} The id of the inserted document.
 * @throws DOCUMENT_ID_MUST_BE_STRING when the computed id is not a string.
 * @throws DOCUMENT_ALREADY_EXISTS when the documents store refuses the document.
 */
async function innerInsertAsync(orama, doc, language, skipHooks, options) {
  const { index, docs } = orama.data;
  const id = orama.getDocumentIndexId(doc);
  if (typeof id !== 'string') {
    throw createError('DOCUMENT_ID_MUST_BE_STRING', typeof id);
  }

  const internalId = getInternalDocumentId(orama.internalDocumentIDStore, id);
  if (!skipHooks) {
    await runSingleHook(orama.beforeInsert, orama, id, doc);
  }

  if (!orama.documentsStore.store(docs, id, internalId, doc)) {
    throw createError('DOCUMENT_ALREADY_EXISTS', id);
  }

  const docsCount = orama.documentsStore.count(docs);
  const indexableProperties = orama.index.getSearchableProperties(index);
  const indexablePropertiesWithTypes = orama.index.getSearchablePropertiesWithTypes(index);
  const indexableValues = orama.getDocumentProperties(doc, indexableProperties);
  validateIndexableValues(indexableValues, indexablePropertiesWithTypes);

  await indexAndSortDocument(orama, id, indexableProperties, indexableValues, docsCount, language, doc, options);

  if (!skipHooks) {
    await runSingleHook(orama.afterInsert, orama, id, doc);
  }
  return id;
}

/**
 * Sync variant of the single-document insert; same steps as the async one
 * but nothing is awaited.
 *
 * @returns {string} The id of the inserted document.
 * @throws DOCUMENT_ID_MUST_BE_STRING when the computed id is not a string.
 * @throws DOCUMENT_ALREADY_EXISTS when the documents store refuses the document.
 */
function innerInsertSync(orama, doc, language, skipHooks, options) {
  const { index, docs } = orama.data;
  const id = orama.getDocumentIndexId(doc);
  if (typeof id !== 'string') {
    throw createError('DOCUMENT_ID_MUST_BE_STRING', typeof id);
  }

  const internalId = getInternalDocumentId(orama.internalDocumentIDStore, id);
  if (!skipHooks) {
    runSingleHook(orama.beforeInsert, orama, id, doc);
  }

  if (!orama.documentsStore.store(docs, id, internalId, doc)) {
    throw createError('DOCUMENT_ALREADY_EXISTS', id);
  }

  const docsCount = orama.documentsStore.count(docs);
  const indexableProperties = orama.index.getSearchableProperties(index);
  const indexablePropertiesWithTypes = orama.index.getSearchablePropertiesWithTypes(index);
  const indexableValues = orama.getDocumentProperties(doc, indexableProperties);
  validateIndexableValues(indexableValues, indexablePropertiesWithTypes);

  indexAndSortDocumentSync(orama, id, indexableProperties, indexableValues, docsCount, language, doc, options);

  if (!skipHooks) {
    runSingleHook(orama.afterInsert, orama, id, doc);
  }
  return id;
}

/**
 * Verifies that one document value is acceptable for the expected index type.
 *
 * @param {string} actualType - `typeof value`.
 * @param {string} expectedType - Type declared for the property in the index.
 * @param {string} key - Property path, used in the error.
 * @param {*} value - The value taken from the document.
 * @throws INVALID_DOCUMENT_PROPERTY when the value does not fit the expected type.
 */
function validateDocumentProperty(actualType, expectedType, key, value) {
  // `typeof null` is 'object': exclude null explicitly so a null geopoint falls
  // through to the INVALID_DOCUMENT_PROPERTY error instead of a raw TypeError
  // on `value.lon`.
  if (
    isGeoPointType(expectedType) &&
    value !== null &&
    typeof value === 'object' &&
    typeof value.lon === 'number' &&
    typeof value.lat === 'number'
  ) {
    return;
  }
  if (isVectorType(expectedType) && Array.isArray(value)) return;
  if (isArrayType(expectedType) && Array.isArray(value)) return;
  // Enum properties accept both strings and numbers.
  if (ENUM_TYPE.has(expectedType) && STRING_NUMBER_TYPE.has(actualType)) return;
  if (actualType !== expectedType) {
    throw createError('INVALID_DOCUMENT_PROPERTY', key, expectedType, actualType);
  }
}

/**
 * Feeds every defined sortable value of the document to the sorter.
 * Shared by the sync and async indexing paths; the sorter is always called
 * without awaiting.
 */
function sortDocument(orama, id, language, doc) {
  const sortableProperties = orama.sorter.getSortableProperties(orama.data.sorting);
  const sortableValues = orama.getDocumentProperties(doc, sortableProperties);
  for (const prop of sortableProperties) {
    const value = sortableValues[prop];
    if (typeof value === 'undefined') continue;
    const expectedType = orama.sorter.getSortablePropertiesWithTypes(orama.data.sorting)[prop];
    orama.sorter.insert(orama.data.sorting, prop, id, value, expectedType, language);
  }
}

/**
 * Async indexing: for each defined indexable value runs the index's
 * `beforeInsert` (if any), `insert`, and `afterInsert` (if any), awaiting each,
 * then updates the sorter.
 */
async function indexAndSortDocument(orama, id, indexableProperties, indexableValues, docsCount, language, doc, options) {
  for (const prop of indexableProperties) {
    const value = indexableValues[prop];
    if (typeof value === 'undefined') continue;
    const expectedType = orama.index.getSearchablePropertiesWithTypes(orama.data.index)[prop];
    await orama.index.beforeInsert?.(orama.data.index, prop, id, value, expectedType, language, orama.tokenizer, docsCount);
    const internalId = orama.internalDocumentIDStore.idToInternalId.get(id);
    await orama.index.insert(orama.index, orama.data.index, prop, id, internalId, value, expectedType, language, orama.tokenizer, docsCount, options);
    await orama.index.afterInsert?.(orama.data.index, prop, id, value, expectedType, language, orama.tokenizer, docsCount);
  }
  sortDocument(orama, id, language, doc);
}

/**
 * Sync indexing: same steps as `indexAndSortDocument` without awaiting.
 * The internal id is resolved through `getInternalDocumentId` here.
 */
function indexAndSortDocumentSync(orama, id, indexableProperties, indexableValues, docsCount, language, doc, options) {
  for (const prop of indexableProperties) {
    const value = indexableValues[prop];
    if (typeof value === 'undefined') continue;
    const expectedType = orama.index.getSearchablePropertiesWithTypes(orama.data.index)[prop];
    const internalDocumentId = getInternalDocumentId(orama.internalDocumentIDStore, id);
    orama.index.beforeInsert?.(orama.data.index, prop, id, value, expectedType, language, orama.tokenizer, docsCount);
    orama.index.insert(orama.index, orama.data.index, prop, id, internalDocumentId, value, expectedType, language, orama.tokenizer, docsCount, options);
    orama.index.afterInsert?.(orama.data.index, prop, id, value, expectedType, language, orama.tokenizer, docsCount);
  }
  sortDocument(orama, id, language, doc);
}

/**
 * Inserts many documents in batches.
 *
 * @param {object} orama - The database instance.
 * @param {object[]} docs - Documents to insert.
 * @param {number} [batchSize=1000] - Number of documents per batch.
 * @param {string} [language] - Language forwarded to each `insert`.
 * @param {boolean} [skipHooks] - When truthy, hooks are not run.
 * @param {number} [timeout=0] - Pacing between batches, passed to `sleep`; 0 disables it.
 * @returns {string[]|Promise<string[]>} Ids of the inserted documents; a
 *   Promise when any per-document or batch-level callback is async.
 */
export function insertMultiple(orama, docs, batchSize, language, skipHooks, timeout) {
  // Per-document hooks are part of the check: if `insert` returns a Promise,
  // the sync batch path would collect unawaited Promises instead of ids.
  if (isBatchInsertAsync(orama)) {
    return innerInsertMultipleAsync(orama, docs, batchSize, language, skipHooks, timeout);
  }
  return innerInsertMultipleSync(orama, docs, batchSize, language, skipHooks, timeout);
}

/**
 * Async batch insert: awaits each `insert`, then runs `afterInsertMultiple`
 * unless `skipHooks` is set.
 *
 * @returns {Promise<string[]>} Ids in insertion order.
 */
async function innerInsertMultipleAsync(orama, docs, batchSize = 1000, language, skipHooks, timeout = 0) {
  const ids = [];

  const processNextBatch = async (startIndex) => {
    const endIndex = Math.min(startIndex + batchSize, docs.length);
    const batch = docs.slice(startIndex, endIndex);
    for (const doc of batch) {
      const options = { avlRebalanceThreshold: batch.length };
      const id = await insert(orama, doc, language, skipHooks, options);
      ids.push(id);
    }
    return endIndex;
  };

  const processAllBatches = async () => {
    let currentIndex = 0;
    while (currentIndex < docs.length) {
      const startTime = Date.now();
      currentIndex = await processNextBatch(currentIndex);
      if (timeout > 0) {
        const elapsedTime = Date.now() - startTime;
        const waitTime = timeout - elapsedTime;
        if (waitTime > 0) {
          // NOTE(review): not awaited — presumably `sleep` blocks synchronously,
          // as the sync path relies on; confirm against ../utils.js.
          sleep(waitTime);
        }
      }
    }
  };

  await processAllBatches();

  if (!skipHooks) {
    await runMultipleHook(orama.afterInsertMultiple, orama, docs);
  }
  return ids;
}

/**
 * Sync batch insert: inserts every document, then runs `afterInsertMultiple`
 * unless `skipHooks` is set.
 *
 * @returns {string[]} Ids in insertion order.
 */
function innerInsertMultipleSync(orama, docs, batchSize = 1000, language, skipHooks, timeout = 0) {
  const ids = [];
  let i = 0;

  function processNextBatch() {
    const batch = docs.slice(i * batchSize, (i + 1) * batchSize);
    if (batch.length === 0) return false;
    for (const doc of batch) {
      const options = { avlRebalanceThreshold: batch.length };
      const id = insert(orama, doc, language, skipHooks, options);
      ids.push(id);
    }
    i++;
    return true;
  }

  function processAllBatches() {
    // Measured once for the whole run, not per batch.
    const startTime = Date.now();
    // eslint-disable-next-line no-constant-condition
    while (true) {
      const hasMoreBatches = processNextBatch();
      if (!hasMoreBatches) break;
      if (timeout > 0) {
        const elapsedTime = Date.now() - startTime;
        if (elapsedTime >= timeout) {
          const remainingTime = timeout - (elapsedTime % timeout);
          if (remainingTime > 0) {
            sleep(remainingTime);
          }
        }
      }
    }
  }

  processAllBatches();

  if (!skipHooks) {
    runMultipleHook(orama.afterInsertMultiple, orama, docs);
  }
  return ids;
}

/**
 * Batch insert entry point with the same dispatch as `insertMultiple`.
 *
 * @returns {string[]|Promise<string[]>} Ids of the inserted documents; a
 *   Promise when any per-document or batch-level callback is async.
 */
export function innerInsertMultiple(orama, docs, batchSize, language, skipHooks, timeout) {
  // Batch-level hooks are part of the check so an async `afterInsertMultiple`
  // is awaited rather than left as a floating Promise on the sync path.
  if (isBatchInsertAsync(orama)) {
    return innerInsertMultipleAsync(orama, docs, batchSize, language, skipHooks, timeout);
  }
  return innerInsertMultipleSync(orama, docs, batchSize, language, skipHooks, timeout);
}
//# sourceMappingURL=insert.js.map