find-voyage
Version:
A better way to search!!
682 lines (618 loc) • 22.8 kB
JavaScript
import { pipeline, env } from "@xenova/transformers";
// The model is downloaded from the Hugging Face Hub, so the local model check is skipped.
env.allowLocalModels = false;
// NOTE(review): the single-thread workaround below is commented out, so it is NOT applied.
// It was written for a multithreading bug in onnxruntime-web; re-enable it if that bug shows up.
// See https://github.com/microsoft/onnxruntime/issues/14445 for more information.
// env.backends.onnx.wasm.numThreads = 1;
/**
 * Lazily creates and caches the one feature-extraction pipeline shared by this module.
 */
class PipelineSingleton {
  static task = "feature-extraction";
  static model = "Supabase/gte-small";
  static instance = null;

  /**
   * Returns the shared pipeline promise, starting pipeline creation on the first call.
   *
   * Only the callback passed to the call that actually starts the creation is
   * forwarded to `pipeline`; callbacks passed while an instance is already
   * cached are ignored.
   *
   * @param {?function} progress_callback - Forwarded to `pipeline` as the `progress_callback` option.
   * @returns {Promise<*>} Resolves to whatever `pipeline` resolves to.
   */
  static async getInstance(progress_callback = null) {
    if (this.instance === null) {
      const pending = Promise.resolve(
        pipeline(this.task, this.model, { progress_callback }),
      );
      this.instance = pending;
      // A failed creation must not be cached, otherwise every later call
      // would get the same rejected promise and a retry would be impossible.
      pending.catch(() => {
        if (this.instance === pending) {
          this.instance = null;
        }
      });
    }
    return this.instance;
  }
}
// Loaded pipeline instance; stays null until loadModel() assigns it.
let model = null;
// Latest value handed to the progress callback registered in loadModel(); read through getProgress().
let progress = null;
/**
 * Asynchronously loads the model.
 *
 * Awaits the shared pipeline instance (which loads and builds the model the
 * first time it is requested) and stores it in the module-level `model`.
 * While the pipeline is being created, every progress payload is stored in
 * the module-level `progress`, which can be read via vagueFinder.getProgress().
 *
 * @async
 * @function
 * @returns {Promise<void>}
 * @throws {Error} If the model cannot be loaded. The message describes the
 * reason and the original error is available as `error.cause`.
 *
 * @example
 * try {
 *   await loadModel();
 * } catch (error) {
 *   console.error(error, error.cause);
 * }
 */
async function loadModel() {
  try {
    model = await PipelineSingleton.getInstance((data) => {
      // Keep only the latest payload; the UI can poll it to drive a progress bar.
      progress = data;
    });
  } catch (error) {
    // Keep the original error (and its stack) reachable instead of flattening it into a string.
    throw new Error(`Unable to load Model due to ${error}`, { cause: error });
  }
}
/**
 * Signals that an operation was attempted before the model was loaded.
 *
 * This helper never returns normally.
 *
 * @function
 * @throws {Error} Always, with a message pointing the caller to vagueFinder.loadModel().
 */
function modelNotLoadedErrorMessage() {
  const reason = "Model has not been loaded, use vagueFinder.loadModel()";
  throw new Error(reason);
}
/**
 * Asynchronously scores the similarity between two sentences.
 *
 * For each sentence, the cached embedding is used when its cache flag is set;
 * otherwise the sentence is embedded with the loaded model (mean pooling,
 * normalized). The score is the cosine similarity of the two embeddings.
 *
 * @async
 * @function
 * @param {string} sentenceOne - The first sentence to be compared.
 * @param {string} sentenceTwo - The second sentence to be compared.
 * @param {Array<number>} embedding1Cache - The cached embedding for the first sentence.
 * @param {boolean} doesCache1Exist - Whether `embedding1Cache` should be used for the first sentence.
 * @param {Array<number>} embedding2Cache - The cached embedding for the second sentence.
 * @param {boolean} doesCache2Exist - Whether `embedding2Cache` should be used for the second sentence.
 * @throws {Error} If an embedding has to be computed and the model is not loaded.
 * @returns {Promise<{sentenceOne: string, sentenceTwo: string, alike: number, embedding1Cache: Array<number>}>}
 * The two sentences, their similarity score, and the embedding used for the
 * first sentence (so the caller can reuse it).
 *
 * @example
 * try {
 *   const result = await classify(sentence1, sentence2, embedding1Cache, true, embedding2Cache, false);
 *   console.log(result);
 * } catch (error) {
 *   console.error(error);
 * }
 */
const classify = async (
  sentenceOne,
  sentenceTwo,
  embedding1Cache,
  doesCache1Exist,
  embedding2Cache,
  doesCache2Exist,
) => {
  // The model is required as soon as EITHER sentence lacks a cached embedding.
  const needsModel = !doesCache1Exist || !doesCache2Exist;
  if (needsModel && !model) {
    modelNotLoadedErrorMessage();
    return;
  }

  // Embeds one sentence and converts the model output into a plain array.
  const embed = async (sentence) => {
    const output = await model(sentence, {
      pooling: "mean",
      normalize: true,
    });
    return Array.from(output.data);
  };

  const embedding1 = doesCache1Exist
    ? embedding1Cache
    : await embed(sentenceOne);
  const embedding2 = doesCache2Exist
    ? embedding2Cache
    : await embed(sentenceTwo);

  // Cosine similarity: dot product divided by the product of the magnitudes.
  let dotProduct = 0;
  let squaredNorm1 = 0;
  let squaredNorm2 = 0;
  for (let i = 0; i < embedding1.length; i++) {
    dotProduct += embedding1[i] * embedding2[i];
    squaredNorm1 += embedding1[i] * embedding1[i];
    squaredNorm2 += embedding2[i] * embedding2[i];
  }
  const alike = dotProduct / (Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2));

  return {
    sentenceOne,
    sentenceTwo,
    alike,
    embedding1Cache: embedding1,
  };
};
/**
 * Asynchronously compares a sentence to every entry of an array.
 *
 * Each entry is scored against `sentence` with `classify`. The embedding of
 * `sentence` is computed during the first comparison and reused for the rest.
 * The input array is copied, so the caller's array is not altered.
 *
 * @async
 * @function
 * @param {string} sentence - The sentence to be compared.
 * @param {Array<string|{sentenceTwo: string, embedding: Array<number>}>} array - The entries to compare against. Each element can be a string or an object with `sentenceTwo` and `embedding` properties.
 * @param {boolean} [doesCache2Exist=false] - Whether the entries' `embedding` values should be used instead of being recomputed.
 * @throws {Error} If the model is not loaded and an embedding has to be computed
 * (which includes the embedding of `sentence` whenever `array` is not empty).
 * @returns {Promise<{sentenceOne: string, array: Array<{sentenceTwo: string, alike: number}>}>} The input sentence and, in input order, each entry with its similarity score.
 *
 * @example
 * try {
 *   const result = await compareSentenceToArray(sentence, array, true);
 *   console.log(result);
 * } catch (error) {
 *   console.error(error);
 * }
 */
const compareSentenceToArray = async (
  sentence,
  array,
  doesCache2Exist = false,
) => {
  // Even with cached entry embeddings, `sentence` itself must be embedded,
  // so a non-empty array always needs the model.
  if (!model && (!doesCache2Exist || array.length > 0)) {
    modelNotLoadedErrorMessage();
    return;
  }

  const entries = [...array]; // Work on a copy so the original is not altered.
  const compared = [];
  let sentenceEmbedding = null;

  for (let i = 0; i < entries.length; i++) {
    const entry = entries[i];
    const { sentenceTwo, alike, embedding1Cache } = await classify(
      sentence,
      entry.sentenceTwo ? entry.sentenceTwo : entry,
      sentenceEmbedding,
      i !== 0, // The sentence embedding exists from the second comparison on.
      entry.embedding ? entry.embedding : null,
      doesCache2Exist,
    );
    if (i === 0) {
      sentenceEmbedding = embedding1Cache;
    }
    compared.push({ sentenceTwo, alike });
  }

  return {
    sentenceOne: sentence,
    array: compared,
  };
};
/**
 * Ranks an array of sentences by their similarity to a given sentence.
 *
 * Delegates the scoring to `compareSentenceToArray` (without cached
 * embeddings) and then orders the scored entries from most to least alike.
 * A copy of the input array is passed on, so the caller's array is untouched.
 *
 * @async
 * @function
 * @param {string} sentence - The sentence every array entry is compared to.
 * @param {Array<string>} array - The sentences to rank.
 * @returns {Promise<{sentenceOne: string, array: Array<{sentenceTwo: string, alike: number}>}>} The input sentence together with the scored entries:
 * - `sentenceTwo`: a sentence from the input array.
 * - `alike`: its cosine similarity score against the input sentence.
 * Entries are sorted by `alike`, highest first.
 * @throws {Error} If the model has not been loaded.
 *
 * @example
 * try {
 *   const result = await arrayInOrder("This is a sentence.", ["This is another sentence.", "Yet another sentence."]);
 *   console.log(result);
 * } catch (error) {
 *   console.error(error);
 * }
 */
const arrayInOrder = async (sentence, array) => {
  if (!model) {
    return modelNotLoadedErrorMessage();
  }

  const comparison = await compareSentenceToArray(sentence, [...array], false);
  const ranked = comparison.array;
  ranked.sort((left, right) => right.alike - left.alike);

  return {
    sentenceOne: comparison.sentenceOne,
    array: ranked,
  };
};
/**
 * Reports how far the model loading has come.
 *
 * Hands back the most recent progress payload recorded while the model was
 * being loaded; the value is `null` until a payload has been recorded.
 *
 * @function
 * @returns {?Object} The latest recorded progress payload, or `null`.
 *
 * @example
 * const progress = getProgress();
 * console.log(progress);
 */
function getProgress() {
  const latest = progress;
  return latest;
}
/**
 * Scores how alike two sentences are, using the loaded model.
 *
 * Both sentences are handed to `classify` with no cached embeddings, and the
 * resulting cosine similarity is returned alongside the two inputs.
 *
 * @async
 * @function
 * @param {string} sentenceOne - The first sentence to compare.
 * @param {string} sentenceTwo - The second sentence to compare.
 * @returns {Promise<{sentenceOne: string, sentenceTwo: string, alike: number}>} An object containing:
 * - `sentenceOne`: the first input sentence.
 * - `sentenceTwo`: the second input sentence.
 * - `alike`: the cosine similarity score between the two.
 * @throws {Error} If the model has not been loaded.
 *
 * @example
 * try {
 *   const result = await compareTwoSentences("This is a sentence.", "This is another sentence.");
 *   console.log(result);
 * } catch (error) {
 *   console.error(error);
 * }
 */
async function compareTwoSentences(sentenceOne, sentenceTwo) {
  if (!model) {
    return modelNotLoadedErrorMessage();
  }

  // Neither sentence has a cached embedding here, so both cache slots are empty.
  const comparison = await classify(sentenceOne, sentenceTwo, null, false, null, false);

  return { sentenceOne, sentenceTwo, alike: comparison.alike };
}
/**
 * Pre-computes embeddings for a list of sentences.
 *
 * Every sentence is run through the model (mean pooling, normalized), one
 * after another, and paired with its embedding as a plain array of numbers.
 * The input is iterated via a copy, so the caller's array is left as it was.
 *
 * @async
 * @function
 * @param {Array<string>} array - The sentences to embed.
 * @throws {Error} If the model is not loaded.
 * @returns {Promise<Array<{sentenceTwo: string, embedding: Array<number>}>>} One object per input sentence, in input order, holding the sentence and its embedding.
 *
 * @example
 * try {
 *   const result = await getCached(array);
 *   console.log(result);
 * } catch (error) {
 *   console.error(error);
 * }
 */
async function getCached(array) {
  if (!model) {
    return modelNotLoadedErrorMessage();
  }

  const embeddingOptions = { pooling: "mean", normalize: true };
  const cachedEntries = [];
  for (const sentence of [...array]) {
    const output = await model(sentence, embeddingOptions);
    cachedEntries.push({
      sentenceTwo: sentence,
      embedding: Array.from(output.data),
    });
  }
  return cachedEntries;
}
/**
 * Compares a sentence to an array of sentences whose embeddings are already cached.
 *
 * Every item is validated first, then the comparison is delegated to
 * `compareSentenceToArray` with the cache flag set. Only the embedding of
 * `sentence` itself has to be computed, which still requires the model.
 *
 * @async
 * @function
 * @param {string} sentence - The sentence to compare.
 * @param {Array<{sentenceTwo: string, embedding: Array<number>}>} cachedArray - The cached sentences to compare against.
 * @returns {Promise<{sentenceOne: string, array: Array<{sentenceTwo: string, alike: number}>}>} An object containing the original sentence and an array of comparison results.
 * @throws {Error} If an item lacks `sentenceTwo` or an iterable `embedding`, or if
 * `cachedArray` is not empty and the model has not been loaded.
 *
 * @example
 * const result = await cachedCompareSentenceToArray('Hello world', cachedSentences);
 * console.log(result);
 */
async function cachedCompareSentenceToArray(sentence, cachedArray) {
  // Validate up front so a malformed item fails with a clear message
  // instead of a cryptic error deep inside the comparison.
  for (const item of cachedArray) {
    if (!item.sentenceTwo) {
      throw new Error(
        "Each item in the cachedArray must have a sentenceTwo property",
      );
    }
    if (
      item.embedding == null ||
      typeof item.embedding[Symbol.iterator] !== "function"
    ) {
      throw new Error(
        "Each item in the cachedArray must have an iterable embedding property",
      );
    }
  }

  // The query sentence is not cached, so the model is needed whenever there is anything to compare.
  if (cachedArray.length > 0 && !model) {
    modelNotLoadedErrorMessage();
    return;
  }

  const { sentenceOne, array: returnedArray } = await compareSentenceToArray(
    sentence,
    cachedArray,
    true,
  );
  return {
    sentenceOne,
    array: returnedArray,
  };
}
/**
 * Asynchronously ranks an array of cached sentences by their similarity to a given sentence.
 *
 * Every item is validated first, then scored through `compareSentenceToArray`
 * with the cache flag set, and finally the scored entries are sorted by
 * similarity in descending order.
 *
 * This function differs from `arrayInOrder` in that it expects the array of sentences to already have cached embeddings.
 * It is useful when you have a large array of sentences and want to avoid recalculating their embeddings each time a new sentence is compared to the array.
 * Only the embedding of `sentence` itself has to be computed, which still requires the model.
 *
 * @async
 * @function
 * @param {string} sentence - The sentence to be compared.
 * @param {Array<{sentenceTwo: string, embedding: Array<number>}>} cachedArray - The sentences to be compared. Each element is an object with `sentenceTwo` and `embedding` properties.
 * @returns {Promise<{sentenceOne: string, array: Array<{sentenceTwo: string, alike: number}>}>} A Promise that resolves to an object containing the input sentence and the sorted array of sentences with their similarity scores.
 * @throws {Error} If an item lacks `sentenceTwo` or an iterable `embedding`, or if
 * `cachedArray` is not empty and the model has not been loaded.
 *
 * @example
 * try {
 *   const result = await cachedArrayInOrder(sentence, array);
 *   console.log(result);
 * } catch (error) {
 *   console.error(error);
 * }
 */
async function cachedArrayInOrder(sentence, cachedArray) {
  // Validate up front so a malformed item fails with a clear message
  // instead of a cryptic error deep inside the comparison.
  for (const item of cachedArray) {
    if (!item.sentenceTwo) {
      throw new Error(
        "Each item in the cachedArray must have a sentenceTwo property",
      );
    }
    if (
      item.embedding == null ||
      typeof item.embedding[Symbol.iterator] !== "function"
    ) {
      throw new Error(
        "Each item in the cachedArray must have an iterable embedding property",
      );
    }
  }

  // The query sentence is not cached, so the model is needed whenever there is anything to compare.
  if (cachedArray.length > 0 && !model) {
    modelNotLoadedErrorMessage();
    return;
  }

  const { sentenceOne, array: returnedArray } = await compareSentenceToArray(
    sentence,
    cachedArray,
    true,
  );
  returnedArray.sort((a, b) => b.alike - a.alike);
  return {
    sentenceOne,
    array: returnedArray,
  };
}
/**
 * Asynchronously gets the top results from a list of sentences based on their similarity to a given sentence.
 * This function differs from `arrayInOrder` in that it limits the results to the top 'numberOfResults' items.
 *
 * The sentences are scored one by one with `classify`; the embedding of
 * `sentence` is computed during the first comparison and reused afterwards.
 * A snapshot of `array` is taken up front, so the caller's array is not altered.
 *
 * @async
 * @param {string} sentence - The sentence to compare against.
 * @param {Array<string>} array - The array of sentences to compare.
 * @param {number} numberOfResults - The number of top results to return. Values above the array length are capped to it; fractional values are rounded down (but never below 1).
 * @throws {Error} If the model is not loaded, if numberOfResults is not numeric, or if it is less than or equal to 0.
 * @returns {Promise<{sentenceOne: string, array: Array<{sentenceTwo: string, alike: number}>}>} A promise that resolves to an object containing the original sentence and an array of the top results.
 * The top results array contains objects with the properties 'sentenceTwo' and 'alike', where 'sentenceTwo' is a sentence from the input array and 'alike' is its similarity score to the original sentence.
 */
async function getTop(sentence, array, numberOfResults) {
  if (!model) {
    modelNotLoadedErrorMessage();
    return;
  }

  // `undefined`/`NaN` slip past a plain `<= 0` comparison and would give the
  // result list a NaN capacity, so reject non-numeric values explicitly.
  const requested = Number(numberOfResults);
  if (Number.isNaN(requested)) {
    throw new Error("numberOfResults must be a number greater than 0");
  }
  if (requested <= 0) {
    throw new Error("numberOfResults is either 0 or less than 0");
  }

  const sentences = [...array]; // Snapshot used for both sizing and iteration.
  const limit = Math.min(
    Math.max(1, Math.floor(requested)),
    sentences.length,
  );
  const list = new LinkedListInAlikeOrder(limit);

  let sentenceEmbedding = null;
  for (let i = 0; i < sentences.length; i++) {
    const { sentenceTwo, alike, embedding1Cache } = await classify(
      sentence,
      sentences[i],
      sentenceEmbedding,
      i !== 0, // The sentence embedding exists from the second comparison on.
      null,
      false,
    );
    if (i === 0) {
      sentenceEmbedding = embedding1Cache;
    }
    list.addNode({ sentenceTwo, alike });
  }

  return {
    sentenceOne: sentence,
    array: list.getArray(),
  };
}
/**
 * Doubly linked list that keeps its nodes sorted by `alike` (highest first)
 * and never holds more than `maxLength` nodes: whenever an insertion makes
 * the list too long, the last (lowest) node is dropped.
 */
class LinkedListInAlikeOrder {
  head = null;
  tail = null;
  length = 0;
  maxLength = 0;

  /**
   * Create a LinkedListInAlikeOrder.
   * @param {number} maxLength - The maximum number of nodes the list may hold.
   */
  constructor(maxLength) {
    this.maxLength = maxLength;
  }

  /**
   * Create a new, unlinked node.
   * @param {Object} obj - The payload; expected to carry 'alike' and 'sentenceTwo'.
   * @return {Object} A copy of `obj` extended with `next` and `prev` links.
   * @private
   */
  _node(obj) {
    return {
      ...obj,
      next: null,
      prev: null,
    };
  }

  /**
   * Convert the linked list to an array, from head to tail.
   * @return {Array<{alike: number, sentenceTwo: string}>} The payloads without their links.
   */
  getArray() {
    const array = [];
    for (let node = this.head; node !== null; node = node.next) {
      const { alike, sentenceTwo } = node;
      array.push({ alike, sentenceTwo });
    }
    return array;
  }

  /**
   * Get the index where a node with the given score should be inserted.
   *
   * The new node goes in front of the first node whose score is less than or
   * equal to `alike`, so among equal scores the newest comes first.
   *
   * @param {number} alike - The 'alike' value of the new node.
   * @return {number} The insertion index; `maxLength` if the first `maxLength` nodes all score higher.
   * @private
   */
  _getIndex(alike) {
    let count = 0;
    let currentNode = this.head;
    while (count < this.maxLength) {
      if (currentNode === null || currentNode.alike <= alike) {
        return count;
      }
      currentNode = currentNode.next;
      count++;
    }
    return this.maxLength;
  }

  /**
   * Add a new node at its sorted position, dropping the last node if the list overflows.
   * @param {Object} obj - The object to be added as a new node.
   */
  addNode(obj) {
    this._insertAtIndex(this._getIndex(obj.alike), obj);
  }

  /**
   * Insert a new node at a specific index.
   * @param {number} index - The index where the new node should be inserted (0..length).
   * @param {Object} obj - The object to be added as a new node.
   * @private
   */
  _insertAtIndex(index, obj) {
    const newNode = this._node(obj);
    if (this.head === null) {
      // Empty list: the new node is both head and tail.
      this.head = newNode;
      this.tail = newNode;
    } else if (index === 0) {
      newNode.next = this.head;
      this.head.prev = newNode;
      this.head = newNode;
    } else if (index === this.length) {
      this.tail.next = newNode;
      newNode.prev = this.tail;
      this.tail = newNode;
    } else {
      // Walk to the node just before the insertion point and splice in after it.
      let previous = this.head;
      for (let position = 1; position < index; position++) {
        previous = previous.next;
      }
      newNode.next = previous.next;
      previous.next.prev = newNode;
      newNode.prev = previous;
      previous.next = newNode;
    }
    this.length++;
    this._audit();
  }

  /**
   * Check if the linked list is longer than the maximum length.
   * If it is, delete the last node.
   * @private
   */
  _audit() {
    if (this.length > this.maxLength) {
      this._deleteLastNode();
    }
  }

  /**
   * Delete the last node of the linked list. Does nothing on an empty list.
   * @private
   */
  _deleteLastNode() {
    if (this.tail === null) {
      return;
    }
    const newTail = this.tail.prev;
    this.tail.prev = null;
    if (newTail === null) {
      // The removed node was the only one, so the list is now empty.
      this.head = null;
    } else {
      newTail.next = null;
    }
    this.tail = newTail;
    this.length--;
  }
}
/**
 * Public entry point of the module: bundles every sentence-comparison helper
 * defined in this file into a single `vagueFinder` object.
 *
 * @namespace
 * @property {function} loadModel - Loads the model. See {@link loadModel}.
 * @property {function} getProgress - Returns the progress of the model loading process. See {@link getProgress}.
 * @property {function} compareTwoSentences - Compares two sentences. See {@link compareTwoSentences}.
 * @property {function} compareSentenceToArray - Compares a sentence to an array of sentences. See {@link compareSentenceToArray}.
 * @property {function} arrayInOrder - Compares a sentence to an array of sentences and returns the results in order of similarity. See {@link arrayInOrder}.
 * @property {function} getCached - Returns the given sentences paired with their embeddings. See {@link getCached}.
 * @property {function} cachedCompareSentenceToArray - Compares a sentence to an array of cached sentences. See {@link cachedCompareSentenceToArray}.
 * @property {function} cachedArrayInOrder - Compares a sentence to an array of cached sentences and returns the results in order of similarity. See {@link cachedArrayInOrder}.
 * @property {function} getTop - Compares a sentence to an array of sentences and returns only the top 'numberOfResults' results. See {@link getTop}.
 */
export const vagueFinder = {
  loadModel,
  getProgress,
  compareTwoSentences,
  compareSentenceToArray,
  arrayInOrder,
  getCached,
  cachedCompareSentenceToArray,
  cachedArrayInOrder,
  getTop,
};