ragmatic

Automatically and continuously vectorize your PostgreSQL tables with the flexibility of your own embedding pipelines

interface Logger {
  error(message: string, meta?: Record<string, any>): void;
  warn(message: string, meta?: Record<string, any>): void;
  info(message: string, meta?: Record<string, any>): void;
  debug(message: string, meta?: Record<string, any>): void;
}

interface LoggerConfig$1 {
  level?: "error" | "warn" | "info" | "debug" | "trace";
  format?: "json" | "text";
  service?: string;
  trackerName?: string;
  silent?: boolean;
}

/**
 * Creates a configurable logger for RAGmatic
 *
 * @param config Configuration for the logger
 * @returns Logger instance
 */
declare function createLogger(config?: LoggerConfig$1): Logger;

declare const logger: Logger;

interface LoggerConfig {
  logger?: LoggerConfig$1;
}

/**
 * Database client interface
 */
interface DBClient {
  query(queryText: string, values?: any[]): Promise<any>;
  connect(): Promise<void>;
  end(): Promise<void>;
}

interface DBConfig {
  connectionString?: string;
  dbClient?: DBClient;
}

interface TableConfig {
  documentsTable: string;
  docIdType?: "INT" | "UUID" | "TEXT" | "BIGINT" | string;
  embeddingDimension: number;
}

/**
 * Configuration options for setting up the database
 */
interface Config extends DBConfig, LoggerConfig, TableConfig {
  /**
   * Identifier to allow multiple trackers for the same table
   * @default 'default'
   */
  trackerName: "default" | string;
  /**
   * Name of the shadow table to track changes
   * @default 'shadows'
   */
  shadowTable?: "shadows" | string;
  /**
   * Name of the table to store embedding chunks
   * @default 'chunks'
   */
  chunksTable?: "chunks" | string;
  /**
   * If true, skips creating the HNSW index for cosine distance so you can set up an index manually.
   * Read more: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
   * @default false
   */
  skipEmbeddingIndexSetup?: boolean;
}

/**
 * Configuration options for the worker
 */
interface WorkerConfig<T> extends DBConfig, LoggerConfig {
  /**
   * Name of the tracker to use
   */
  trackerName: string;
  /**
   * Function to generate an embedding vector (and any other metadata) for a new chunk
   * @param chunk - A new chunk that was created by the chunkGenerator, deduplicated by the hashFunction
   * @param index - The index of the chunk in the document
   * @returns The data to store in the database, including the embedding vector
   * @description This function is used to generate an embedding for a single chunk.
   * It is NOT called when a chunk's content hash has not changed, to avoid expensive re-embedding work.
   */
  embeddingGenerator: (chunk: ChunkData, index: number) => Promise<EmbeddingData>;
  /**
   * Optional override for the default chunk generator. Splits a document into smaller chunks deterministically.
   * @param doc - The document to generate chunks from
   * @returns The generated chunks
   * @description It is called every time a new document is added or updated in the database.
   * Note that deduplication assumes the chunkGenerator returns chunks in the same order for the same document, deterministically.
   * @default returns the document as a single chunk
   */
  chunkGenerator?: (doc: T) => Promise<ChunkData[]>;
  /**
   * Optional override for the default hash function that is used to deduplicate chunks.
   * @param chunk - The chunk to generate a hash for
   * @returns A checksum of the chunk
   * @description This function is used to deduplicate chunks in order to avoid expensive re-embedding work.
   */
  hashFunction?: (chunk: ChunkData) => Promise<string>;
  /**
   * Polling interval in milliseconds
   */
  pollingIntervalMs?: number;
  /**
   * Maximum number of dirty shadow records to process per polling cycle
   * @default 5
   */
  batchSize?: number;
  /**
   * Maximum number of retries for temporary errors
   * @default 3
   */
  maxRetries?: number;
  /**
   * Initial retry delay in milliseconds
   * @default 1000
   */
  initialRetryDelayMs?: number;
  /**
   * Maximum time, in minutes, that a job can be stalled before being considered dead
   * @default 1
   */
  stalledJobTimeoutMinutes?: number;
}

/**
 * Any JSON-serializable data to embed
 */
interface ChunkDataBase extends Record<string, any> {}

/**
 * Extended chunk data to embed with blob data
 */
interface ChunkDataWithBlob extends ChunkDataBase {
  /**
   * Blob data to embed.
   * If you want to pass a blob and use the default hash function,
   * you need to pass the blob as a Buffer here.
   */
  blob: Buffer;
}

/**
 * Chunk data to embed
 */
type ChunkData = ChunkDataWithBlob | ChunkDataBase;

interface EmbeddingDataBase {
  /**
   * Generated embedding vector
   */
  embedding: number[];
}

interface EmbeddingDataWithText extends EmbeddingDataBase {
  /**
   * Text content of the chunk
   */
  text: string;
}

interface EmbeddingDataWithJson extends EmbeddingDataBase {
  /**
   * JSON data that was embedded
   */
  json: Record<string, any>;
}

interface EmbeddingDataWithBlob extends EmbeddingDataBase {
  /**
   * Blob data that was embedded
   */
  blob: Buffer;
}

/**
 * Embedded chunk data to store in the database
 */
type EmbeddingData = EmbeddingDataWithBlob | EmbeddingDataWithJson | EmbeddingDataWithText;

type Job = {
  doc_id: string;
  vector_clock: number;
  status: "pending" | "processing" | "completed" | "failed" | "skipped";
  created_at: Date;
  processing_started_at?: Date;
  completed_at?: Date;
  worker_id?: string;
  error?: string;
  retry_count: number;
};

/**
 * The RAGmatic public API
 */
interface RAGmatic$1<T> {
  start(): Promise<void>;
  stop(): Promise<void>;
  reprocessAll(): Promise<void>;
  countRemainingDocuments(): Promise<number>;
  destroy(): Promise<void>;
}

/**
 * RAGmatic configuration
 */
interface RAGmaticConfig<T>
  extends Omit<Config, "trackerName" | "documentsTable" | "shadowTable" | "chunksTable">,
    Omit<WorkerConfig<T>, "trackerName" | "chunkGenerator" | "embeddingGenerator"> {
  name: string;
  tableToWatch: string;
  recordToChunksFunction: (doc: T) => Promise<ChunkData[]>;
  chunkToEmbeddingFunction: (chunk: ChunkData) => Promise<EmbeddingData>;
}

declare class RAGmatic<T> implements RAGmatic$1<T> {
  private static instances;
  private worker;
  private name;
  private connectionString;
  private dbClient;
  private constructor();
  static create<T>(config: RAGmaticConfig<T>): Promise<RAGmatic<T>>;
  destroy(): Promise<void>;
  start(): Promise<void>;
  stop(): Promise<void>;
  reprocessAll(): Promise<void>;
  countRemainingDocuments(): Promise<number>;
}

export { type ChunkData, type DBClient, type EmbeddingData, type Job, type LoggerConfig$1 as LoggerConfig, RAGmatic, type RAGmaticConfig, createLogger, logger };
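
For orientation, here is a minimal usage sketch assembled from the declarations above. Treat it as an assumption-laden illustration rather than official documentation: the BlogPost row shape, table name, connection string, and the embed helper are hypothetical stand-ins for your own schema and embedding provider.

import { RAGmatic, type ChunkData, type EmbeddingData } from "ragmatic";

// Hypothetical row shape for the table being watched.
interface BlogPost {
  id: number;
  title: string;
  content: string;
}

// Hypothetical stand-in for a real embedding call (OpenAI, a local model, ...).
// It must return exactly `embeddingDimension` numbers.
async function embed(text: string): Promise<number[]> {
  return new Array(1536).fill(0);
}

const pipeline = await RAGmatic.create<BlogPost>({
  name: "blog_posts_to_embeddings", // identifies this pipeline
  tableToWatch: "blog_posts",
  connectionString: "postgres://user:pass@localhost:5432/mydb",
  embeddingDimension: 1536,
  logger: { level: "info", format: "text" },
  // Should be deterministic: chunk order and content drive the dedup hashing.
  recordToChunksFunction: async (post: BlogPost): Promise<ChunkData[]> =>
    post.content.split("\n\n").map((text) => ({ title: post.title, text })),
  // Only invoked for chunks whose content hash changed.
  chunkToEmbeddingFunction: async (chunk: ChunkData): Promise<EmbeddingData> => ({
    embedding: await embed(chunk.text),
    text: chunk.text,
  }),
});

await pipeline.start();                                // begin polling for changes
console.log(await pipeline.countRemainingDocuments()); // documents still pending
await pipeline.stop();                                 // pause processing

Per the comments in WorkerConfig, the chunking function must return chunks in a stable order for deduplication to work, and the embedding function is skipped whenever a chunk's content hash is unchanged.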