/**
* RAGmatic: Automatically and continuously vectorize your PostgreSQL tables
* with the flexibility of your own embedding pipelines
*/
interface Logger {
error(message: string, meta?: Record<string, any>): void;
warn(message: string, meta?: Record<string, any>): void;
info(message: string, meta?: Record<string, any>): void;
debug(message: string, meta?: Record<string, any>): void;
}
interface LoggerConfig$1 {
level?: "error" | "warn" | "info" | "debug" | "trace";
format?: "json" | "text";
service?: string;
trackerName?: string;
silent?: boolean;
}
/**
* Creates a configurable logger for RAGmatic
*
* @param config Configuration for the logger
* @returns Logger instance
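*
* @example
* // A minimal sketch; field values are illustrative, drawn from LoggerConfig above:
* const log = createLogger({ level: "debug", format: "json", service: "my-app" });
* log.info("worker started", { trackerName: "default" });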
*/
declare function createLogger(config?: LoggerConfig$1): Logger;
declare const logger: Logger;
interface LoggerConfig {
logger?: LoggerConfig$1;
}
/**
* Database client interface
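*
* @example
* // Illustrative sketch: a node-postgres Client is structurally compatible
* // (assumes the `pg` package is installed):
* import { Client } from "pg";
* const dbClient: DBClient = new Client({ connectionString: process.env.DATABASE_URL });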
*/
interface DBClient {
query(queryText: string, values?: any[]): Promise<any>;
connect(): Promise<void>;
end(): Promise<void>;
}
interface DBConfig {
connectionString?: string;
dbClient?: DBClient;
}
interface TableConfig {
documentsTable: string;
docIdType?: "INT" | "UUID" | "TEXT" | "BIGINT" | string;
embeddingDimension: number;
}
/**
* Configuration options for setting up the database
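*
* @example
* // Illustrative values; the table name and embedding dimension are assumptions:
* const config: Config = {
*   connectionString: process.env.DATABASE_URL,
*   documentsTable: "documents",
*   embeddingDimension: 1536,
*   trackerName: "default",
* };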
*/
interface Config extends DBConfig, LoggerConfig, TableConfig {
/**
* Identifier to allow multiple trackers for the same table
* @default 'default'
*/
trackerName: "default" | string;
/**
* Name of the shadow table to track changes
* @default 'shadows'
*/
shadowTable?: "shadows" | string;
/**
* Name of the table to store embedding chunks
* @default 'chunks'
*/
chunksTable?: "chunks" | string;
/**
* If true, skips creating the HNSW index for cosine distance, so you can set up an index manually.
* Read more: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
* @default false
*/
skipEmbeddingIndexSetup?: boolean;
}
/**
* Configuration options for the worker
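*
* @example
* // A minimal sketch; `embedText` is a hypothetical helper returning number[]:
* const workerConfig: WorkerConfig<{ id: number; content: string }> = {
*   connectionString: process.env.DATABASE_URL,
*   trackerName: "default",
*   chunkGenerator: async (doc) => [{ text: doc.content }],
*   embeddingGenerator: async (chunk) => ({
*     embedding: await embedText(chunk.text), // hypothetical helper
*     text: chunk.text,
*   }),
*   pollingIntervalMs: 1000,
* };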
*/
interface WorkerConfig<T> extends DBConfig, LoggerConfig {
/**
* Name of the tracker to use
*/
trackerName: string;
/**
* Function to generate an embedding vector (and any other metadata) for a new chunk
* @param chunk - A new chunk that was created by the chunkGenerator, deduplicated by the hashFunction
* @param index - The index of the chunk in the document
* @returns The data to store in the database including the embedding vector
* @description This function is used to generate an embedding for a single chunk.
* It is NOT called when a chunk's content hash is unchanged, to avoid expensive re-embedding work.
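*
* @example
* // A sketch assuming the OpenAI SDK and a `text` field on each chunk:
* const embeddingGenerator = async (chunk: ChunkData, index: number) => {
*   const res = await openai.embeddings.create({
*     model: "text-embedding-3-small",
*     input: chunk.text,
*   });
*   return { embedding: res.data[0].embedding, text: chunk.text };
* };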
*/
embeddingGenerator: (chunk: ChunkData, index: number) => Promise<EmbeddingData>;
/**
* Optional override for the default chunk generator. Splits a document into smaller chunks deterministically.
* @param doc - The document to generate chunks from
* @returns The generated chunks
* @description It is called every time a new document is added or updated in the database.
* Note that deduplication assumes the chunkGenerator is deterministic: it must return chunks in the same order for the same document.
* @default returns the document as a single chunk
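*
* @example
* // A naive fixed-size splitter (illustrative only; assumes a `content` field):
* const chunkGenerator = async (doc: { content: string }) =>
*   doc.content.match(/.{1,1000}/gs)?.map((text) => ({ text })) ?? [];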
*/
chunkGenerator?: (doc: T) => Promise<ChunkData[]>;
/**
* Optional override for the default hash function that is used to deduplicate chunks.
* @param chunk - The chunk to generate a hash for
* @returns A checksum of the chunk
* @description This function is used to deduplicate chunks in order to avoid expensive re-embedding work.
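*
* @example
* // A sketch using node:crypto; assumes chunks are JSON-serializable:
* import { createHash } from "node:crypto";
* const hashFunction = async (chunk: ChunkData) =>
*   createHash("sha256").update(JSON.stringify(chunk)).digest("hex");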
*/
hashFunction?: (chunk: ChunkData) => Promise<string>;
/**
* Polling interval in milliseconds
*/
pollingIntervalMs?: number;
/**
* Maximum number of dirty shadow records to process per polling cycle
* @default 5
*/
batchSize?: number;
/**
* Maximum number of retries for temporary errors
* @default 3
*/
maxRetries?: number;
/**
* Initial retry delay in milliseconds
* @default 1000
*/
initialRetryDelayMs?: number;
/**
* Maximum time, in minutes, that a job can be stalled before it is considered dead
* @default 1
*/
stalledJobTimeoutMinutes?: number;
}
/**
* Any JSON serializable data to embed
*/
interface ChunkDataBase extends Record<string, any> {
}
/**
* Extended chunk data to embed with blob data
*/
interface ChunkDataWithBlob extends ChunkDataBase {
/**
* Blob data to embed
* If you want to pass a blob and use the default hash function,
* you must pass the blob as a Buffer here
*/
blob: Buffer;
}
/**
* Chunk data to embed
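*
* @example
* // Any JSON-serializable shape works (illustrative):
* const chunk: ChunkData = { text: "Hello, world", title: "Greeting" };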
*/
type ChunkData = ChunkDataWithBlob | ChunkDataBase;
interface EmbeddingDataBase {
/**
* Generated embedding vector
*/
embedding: number[];
}
interface EmbeddingDataWithText extends EmbeddingDataBase {
/**
* Text content of the chunk
*/
text: string;
}
interface EmbeddingDataWithJson extends EmbeddingDataBase {
/**
* JSON data that was embedded
*/
json: Record<string, any>;
}
interface EmbeddingDataWithBlob extends EmbeddingDataBase {
/**
* Blob data that was embedded
*/
blob: Buffer;
}
/**
* Embedded chunk data to store in the database
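*
* @example
* // e.g. a text chunk paired with its vector (values truncated for illustration):
* const data: EmbeddingData = { embedding: [0.12, -0.08, 0.33], text: "Hello, world" };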
*/
type EmbeddingData = EmbeddingDataWithBlob | EmbeddingDataWithJson | EmbeddingDataWithText;
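/**
* A single document-embedding job record, as processed by the worker
*/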
type Job = {
doc_id: string;
vector_clock: number;
status: "pending" | "processing" | "completed" | "failed" | "skipped";
created_at: Date;
processing_started_at?: Date;
completed_at?: Date;
worker_id?: string;
error?: string;
retry_count: number;
};
/**
* The RAGmatic public API
*/
interface RAGmatic$1<T> {
start(): Promise<void>;
stop(): Promise<void>;
reprocessAll(): Promise<void>;
countRemainingDocuments(): Promise<number>;
destroy(): Promise<void>;
}
/**
* RAGmatic configuration
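*
* @example
* // A minimal sketch; the table name and `embedText` helper are assumptions:
* const config: RAGmaticConfig<{ id: number; content: string }> = {
*   name: "blog_posts_embeddings",
*   connectionString: process.env.DATABASE_URL,
*   tableToWatch: "blog_posts",
*   embeddingDimension: 1536,
*   recordToChunksFunction: async (doc) => [{ text: doc.content }],
*   chunkToEmbeddingFunction: async (chunk) => ({
*     embedding: await embedText(chunk.text), // hypothetical helper
*     text: chunk.text,
*   }),
* };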
*/
interface RAGmaticConfig<T> extends Omit<Config, "trackerName" | "documentsTable" | "shadowTable" | "chunksTable">, Omit<WorkerConfig<T>, "trackerName" | "chunkGenerator" | "embeddingGenerator"> {
name: string;
tableToWatch: string;
recordToChunksFunction: (doc: T) => Promise<ChunkData[]>;
chunkToEmbeddingFunction: (chunk: ChunkData) => Promise<EmbeddingData>;
}
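/**
* @example
* // Typical lifecycle (sketch, using a config like the one above):
* const ragmatic = await RAGmatic.create(config);
* await ragmatic.start(); // begin watching tableToWatch for changes
* const remaining = await ragmatic.countRemainingDocuments();
* await ragmatic.stop();
*/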
declare class RAGmatic<T> implements RAGmatic$1<T> {
private static instances;
private worker;
private name;
private connectionString;
private dbClient;
private constructor();
static create<T>(config: RAGmaticConfig<T>): Promise<RAGmatic<T>>;
destroy(): Promise<void>;
start(): Promise<void>;
stop(): Promise<void>;
reprocessAll(): Promise<void>;
countRemainingDocuments(): Promise<number>;
}
export { type ChunkData, type DBClient, type EmbeddingData, type Job, type LoggerConfig$1 as LoggerConfig, RAGmatic, type RAGmaticConfig, createLogger, logger };