closevector-web
Version:
CloseVector is fundamentally a vector database. We have made dedicated libraries available for both browsers and node.js, aiming for easy integration no matter your platform. One feature we've been working on is its potential for scalability. Instead of b
341 lines (295 loc) • 12 kB
text/typescript
import {
HnswlibModule,
loadHnswlib,
HierarchicalNSW as HierarchicalNSWT,
} from 'closevector-hnswlib-wasm';
import {
CloseVectorEmbeddings,
CloseVectorHNSWLibArgs,
CloseVectorSaveableVectorStore,
CloseVectorDocument,
SynchronousInMemoryDocstore,
CloseVectorHNSWLibBase,
CloseVectorCredentials,
} from 'closevector-common';
import { pathJoin, IDBFS, createUploadFileOperationUrl, createGetFileOperationUrl, createPublicGetFileOperationUrl } from './lib';
import { upload, download } from './loader';
export {
CloseVectorEmbeddings,
CloseVectorHNSWLibArgs,
CloseVectorSaveableVectorStore,
CloseVectorDocument,
SynchronousInMemoryDocstore,
CloseVectorHNSWLibBase,
CloseVectorCredentials,
} from 'closevector-common';
export type { HierarchicalNSW as HierarchicalNSWT } from 'closevector-hnswlib-wasm';
export type { HnswlibModule };
// Module-level cache for the loaded hnswlib WASM module; stays undefined until
// CloseVectorHNSWWeb.imports() has resolved successfully once.
let __lib: HnswlibModule | undefined;
/**
 * Vector store backed by an HNSW index from `closevector-hnswlib-wasm`.
 *
 * Documents are kept in a `SynchronousInMemoryDocstore` keyed by the numeric
 * label under which the matching vector was added to the index. The store can
 * be persisted with {@link CloseVectorHNSWWeb.save} / restored with
 * {@link CloseVectorHNSWWeb.load}, and moved to or from remote storage with
 * {@link CloseVectorHNSWWeb.saveToCloud} / {@link CloseVectorHNSWWeb.loadFromCloud}.
 */
export class CloseVectorHNSWWeb extends CloseVectorSaveableVectorStore {
  /** Predicate over stored documents used to restrict search results. */
  declare FilterType: (doc: CloseVectorDocument) => boolean;

  /** Underlying HNSW index; undefined until created lazily or supplied via `args.index`. */
  _index?: HierarchicalNSWT;

  /** Label -> document mapping for every vector added to the index. */
  docstore: SynchronousInMemoryDocstore;

  /** Index configuration (space, numDimensions, maxElements, ...) as passed to the constructor. */
  args: CloseVectorHNSWLibBase;

  /** Identifier of this store in remote storage; undefined until saved to or loaded from the cloud. */
  _uuid?: string;

  _vectorstoreType(): string {
    return 'hnswlib';
  }

  /**
   * @param embeddings Embedding provider used by `addDocuments`.
   * @param args Index configuration, optionally carrying a pre-built `index`,
   *   a `docstore`, and `credentials` that are forwarded to the base class.
   */
  constructor(
    embeddings: CloseVectorEmbeddings,
    args: CloseVectorHNSWLibArgs<HierarchicalNSWT> & { credentials?: CloseVectorCredentials }
  ) {
    super(embeddings, args.credentials);
    this._index = args.index;
    this.args = args;
    this.embeddings = embeddings;
    this.docstore = args.docstore ?? new SynchronousInMemoryDocstore();
  }

  /**
   * Embeds each document's `pageContent` and adds the resulting vectors
   * together with the documents via {@link CloseVectorHNSWWeb.addVectors}.
   */
  async addDocuments(documents: CloseVectorDocument[]): Promise<void> {
    const texts = documents.map(({ pageContent }) => pageContent);
    const vectors = await this.embeddings.embedDocuments(texts);
    return this.addVectors(vectors, documents);
  }

  /**
   * Constructs an (uninitialised) `HierarchicalNSW` for the given configuration.
   *
   * @throws Error when `space` or `numDimensions` is missing.
   */
  private static async getHierarchicalNSW(args: CloseVectorHNSWLibBase) {
    const { HierarchicalNSW } = await CloseVectorHNSWWeb.imports();
    if (!args.space) {
      throw new Error('closevector-hnswlib-wasm requires a space argument');
    }
    if (args.numDimensions === undefined) {
      throw new Error('closevector-hnswlib-wasm requires a numDimensions argument');
    }
    return new HierarchicalNSW(args.space, args.numDimensions, '');
  }

  /**
   * Creates the index on first use. When `numDimensions` has not been
   * configured, it is taken from the first vector and `maxElements` from the
   * batch size. Does nothing when an index already exists.
   */
  private async initIndex(vectors: number[][]) {
    if (this._index) {
      return;
    }
    if (this.args.numDimensions === undefined) {
      this.args.numDimensions = vectors[0].length;
      this.args.maxElements = vectors.length;
    }
    this.index = await CloseVectorHNSWWeb.getHierarchicalNSW(this.args);
    // NOTE(review): the literals are presumably hnswlib's M (48),
    // efConstruction (200) and random seed (100) — confirm against the
    // closevector-hnswlib-wasm `initIndex` signature.
    this.index.initIndex(vectors.length, 48, 200, 100);
    this.index.setEfSearch(32);
  }

  /**
   * The underlying HNSW index.
   *
   * @throws Error when no index has been created or supplied yet.
   */
  public get index(): HierarchicalNSWT {
    if (!this._index) {
      throw new Error('Vector store not initialised yet. Try calling `addTexts` first.');
    }
    return this._index;
  }

  private set index(index: HierarchicalNSWT) {
    this._index = index;
  }

  /**
   * Remote identifier of this store. Holds whatever `_uuid` currently is, so
   * it is undefined at runtime until `saveToCloud` or `loadFromCloud` has set it.
   */
  public get uuid(): string {
    return this._uuid;
  }

  private set uuid(uuid: string) {
    this._uuid = uuid;
  }

  /**
   * Adds vectors to the index and stores the matching documents under the
   * same labels (labels continue from the index's current element count).
   *
   * All validation happens before any point is inserted, so a rejected batch
   * never leaves the index partially updated.
   *
   * @throws Error when `vectors` and `documents` differ in length, or when any
   *   vector's length differs from the configured number of dimensions.
   */
  async addVectors(vectors: number[][], documents: CloseVectorDocument[]) {
    if (vectors.length === 0) {
      return;
    }
    // Checked before the index is created so a malformed call has no side effects.
    if (vectors.length !== documents.length) {
      throw new Error(`Vectors and metadatas must have the same length`);
    }
    await this.initIndex(vectors);
    // TODO here we could optionally normalise the vectors to unit length
    // so that dot product is equivalent to cosine similarity, like this
    // https://github.com/nmslib/hnswlib/issues/384#issuecomment-1155737730
    // While we only support OpenAI embeddings this isn't necessary
    const expectedDimensions = this.args.numDimensions;
    if (vectors.some((vector) => vector.length !== expectedDimensions)) {
      throw new Error(
        `Vectors must have the same length as the number of dimensions (${this.args.numDimensions})`
      );
    }

    const firstLabel = this.index.getCurrentCount();
    const needed = firstLabel + vectors.length;
    if (needed > this.index.getMaxElements()) {
      this.index.resizeIndex(needed);
    }

    const toSave: Record<string, CloseVectorDocument> = {};
    vectors.forEach((vector, offset) => {
      const label = firstLabel + offset;
      this.index.addPoint(vector, label, false);
      toSave[label] = documents[offset];
    });
    this.docstore.add(toSave);
  }

  /**
   * Returns up to `k` nearest stored documents for `query`, each paired with
   * the distance reported by the index.
   *
   * @param query Query vector; must match the configured number of dimensions.
   * @param k Number of neighbours requested; clamped (with a warning) to the
   *   number of elements currently in the index.
   * @param filter Optional predicate; only documents for which it returns
   *   true are eligible.
   * @throws Error on a dimension mismatch or when the index is not initialised.
   */
  async similaritySearchVectorWithScore(query: number[], k: number, filter?: this['FilterType']) {
    if (this.args.numDimensions && !this._index) {
      await this.initIndex([[]]);
    }
    if (query.length !== this.args.numDimensions) {
      throw new Error(
        `Query vector must have the same length as the number of dimensions (${this.args.numDimensions})`
      );
    }

    const total = this.index.getCurrentCount();
    let limit = k;
    if (k > total) {
      console.warn(
        `k (${k}) is greater than the number of elements in the index (${total}), setting k to ${total}`
      );
      limit = total;
    }

    // The index hands the callback a numeric label; resolve it to the stored
    // document and reject labels whose lookup yields a string instead.
    const labelFilter = filter
      ? (label: number): boolean => {
          const document = this.docstore.search(String(label));
          return typeof document !== 'string' ? filter(document) : false;
        }
      : undefined;

    const result = this.index.searchKnn(query, limit, labelFilter);
    return result.neighbors.map(
      (docIndex: unknown, resultIndex: number) =>
        [this.docstore.search(String(docIndex)), result.distances[resultIndex]] as [
          CloseVectorDocument,
          number,
        ]
    );
  }

  /**
   * Saves the store locally under `<uuid>.hnsw` and uploads it.
   *
   * @param options Optional settings. `uuid` overrides the current identifier,
   *   `description` defaults to the current ISO timestamp, `credentials`
   *   take precedence over the instance's own, and `onProgress` receives
   *   upload progress as `{ loaded, total }`.
   * @returns Whatever the `upload` helper resolves to.
   * @throws Error when neither `options.credentials` nor instance credentials exist.
   */
  async saveToCloud(
    options: {
      uuid?: string;
      public?: boolean;
      description?: string;
      credentials?: CloseVectorCredentials;
      onProgress?: (progress: { loaded: number; total: number }) => void;
    } = {}
  ) {
    const _credentials = options.credentials || this.credentials;
    if (!_credentials) {
      throw new Error('You must provide credentials');
    }
    if (options.uuid) {
      this.uuid = options.uuid;
    }
    const urlResp = await createUploadFileOperationUrl({
      uuid: options.uuid ?? this.uuid,
      description: options.description ?? new Date().toISOString(),
      accessKey: _credentials.key,
      secret: _credentials.secret,
      public: options.public,
    });
    // The service's answer is authoritative for the identifier.
    this.uuid = urlResp.uuid;

    const directory = this.uuid + '.hnsw';
    await this.save(directory);
    const resp = await upload({
      path: directory,
      url: urlResp.url,
      onProgress: (progress) => {
        if (options.onProgress) {
          options.onProgress({
            loaded: progress.uploaded,
            total: progress.total,
          });
        }
      },
    });
    return resp;
  }

  /**
   * Writes `args.json`, `docstore.json` and `hnswlib.index` under `directory`.
   *
   * `credentials` are deliberately left out of `args.json`: the constructor
   * keeps them on `this.args`, and the saved files are what `saveToCloud`
   * uploads, so persisting them would hand the secret to anyone who can
   * download the index.
   */
  async save(directory: string) {
    // should not add /hnswlib-index to directory, because it is added in wasm inside
    const indexPath = pathJoin(directory, 'hnswlib.index');
    const argsPath = pathJoin(directory, 'args.json');
    const docstorePath = pathJoin(directory, 'docstore.json');

    const persistableArgs: Record<string, unknown> = { ...this.args };
    delete persistableArgs.credentials;
    const argsContent = JSON.stringify(persistableArgs);
    const docstoreContent = JSON.stringify(Array.from(this.docstore._docs.entries()));

    await Promise.all([
      IDBFS.writeStringToFile(argsPath, argsContent),
      IDBFS.writeStringToFile(docstorePath, docstoreContent),
      this.index.writeIndex(indexPath),
    ]);
  }

  /**
   * Downloads a stored index and loads it from `<uuid>.hnsw`.
   *
   * Uses the public URL operation when `options.public` is truthy and the
   * authenticated one otherwise. `options.credentials` are also attached to
   * the returned instance so it can be saved back without re-supplying them.
   */
  static async loadFromCloud(options: {
    embeddings: CloseVectorEmbeddings;
    uuid: string;
    public?: boolean;
    credentials?: CloseVectorCredentials;
    onProgress?: (progress: { loaded: number; total: number }) => void;
  }) {
    const { embeddings, uuid, credentials } = options;
    const urlResp = options.public
      ? await createPublicGetFileOperationUrl({
          uuid,
          accessKey: credentials?.key,
        })
      : await createGetFileOperationUrl({
          uuid,
          accessKey: credentials?.key,
          secret: credentials?.secret,
        });
    // NOTE(review): presumably `download` unpacks the archive to where `load`
    // reads it from — confirm in ./loader.
    await download({
      url: urlResp.url,
      onProgress: options.onProgress,
    });
    const instance = await CloseVectorHNSWWeb.load(uuid + '.hnsw', embeddings, credentials);
    instance.uuid = uuid;
    return instance;
  }

  /**
   * Restores a store from the files written by {@link CloseVectorHNSWWeb.save}.
   *
   * @param directory Directory holding `args.json`, `docstore.json` and `hnswlib.index`.
   * @param embeddings Embedding provider for the restored store.
   * @param credentials Optional credentials for the restored store; when
   *   omitted, any `credentials` found in `args.json` (as written by older
   *   versions) are used.
   */
  static async load(
    directory: string,
    embeddings: CloseVectorEmbeddings,
    credentials?: CloseVectorCredentials
  ) {
    const argsPath = pathJoin(directory, 'args.json');
    const docstorePath = pathJoin(directory, 'docstore.json');
    const indexPath = pathJoin(directory, 'hnswlib.index');

    const lib = await CloseVectorHNSWWeb.imports();
    const args = JSON.parse(lib.EmscriptenFileSystemManager.getStringFromFile(argsPath));
    const docs = JSON.parse(lib.EmscriptenFileSystemManager.getStringFromFile(docstorePath));

    const index = await CloseVectorHNSWWeb.getHierarchicalNSW({
      ...args,
    });
    // Fall back to the number of stored documents when no capacity was saved.
    await index.readIndex(indexPath, args.maxElements || docs.length);

    args.docstore = new SynchronousInMemoryDocstore(new Map(docs));
    args.index = index;
    if (credentials) {
      args.credentials = credentials;
    }
    return new CloseVectorHNSWWeb(embeddings, args);
  }

  /**
   * Builds a store from raw texts.
   *
   * @param metadatas Either one metadata object per text, or a single object
   *   shared by every text.
   */
  static async fromTexts(
    texts: string[],
    metadatas: object[] | object,
    embeddings: CloseVectorEmbeddings,
    dbConfig?: {
      docstore?: SynchronousInMemoryDocstore;
    }
  ): Promise<CloseVectorHNSWWeb> {
    const docs: CloseVectorDocument[] = texts.map((pageContent, i) => ({
      pageContent,
      metadata: Array.isArray(metadatas) ? metadatas[i] : metadatas,
    }));
    return CloseVectorHNSWWeb.fromDocuments(docs, embeddings, dbConfig);
  }

  /**
   * Builds a store from documents using the `'cosine'` space, sized to the
   * number of documents, then embeds and adds them.
   */
  static async fromDocuments(
    docs: CloseVectorDocument[],
    embeddings: CloseVectorEmbeddings,
    dbConfig?: {
      docstore?: SynchronousInMemoryDocstore;
    }
  ): Promise<CloseVectorHNSWWeb> {
    const args: CloseVectorHNSWLibArgs<HierarchicalNSWT> = {
      docstore: dbConfig?.docstore,
      space: 'cosine',
      maxElements: docs.length,
    };
    const instance = new this(embeddings, args);
    await instance.addDocuments(docs);
    return instance;
  }

  /**
   * Loads the hnswlib WASM module, caching it at module level so later calls
   * return the same instance.
   *
   * @throws Error wrapping the underlying failure message when loading fails.
   */
  static async imports(): Promise<HnswlibModule> {
    try {
      if (__lib) {
        return __lib;
      }
      const lib = await loadHnswlib();
      __lib = lib;
      return lib;
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (err: any) {
      throw new Error(`Could not import closevector-hnswlib-wasm.\nError: ${err?.message}`);
    }
  }
}
/** Alias that exports the `CloseVectorHNSWWeb` class under the name `HNSWLib`. */
export const HNSWLib = CloseVectorHNSWWeb;