ragmatic

Automatically and continuously vectorize your PostgreSQL tables with the flexibility of your own embedding pipelines
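A minimal usage sketch of the RAGmatic class exported by this bundle, assuming a `documents` table with `id` and `content` columns, a 1536-dimension embedding space, a DATABASE_URL environment variable, and a hypothetical `embed()` helper standing in for your own embedding provider; everything except the exported RAGmatic API is illustrative:

const { RAGmatic } = require("ragmatic");

// Hypothetical embedding call: replace with your provider of choice.
// Must resolve to a number[] whose length matches embeddingDimension below.
async function embed(text) {
  return new Array(1536).fill(0);
}

async function main() {
  const ragmatic = await RAGmatic.create({
    name: "blog_posts",                    // tracker name; its tables live in schema ragmatic_blog_posts
    connectionString: process.env.DATABASE_URL,
    tableToWatch: "documents",             // table with an `id` column to track (assumed to exist)
    embeddingDimension: 1536,
    recordToChunksFunction: async (doc) => [{ text: doc.content }],
    chunkToEmbeddingFunction: async (chunk) => ({
      embedding: await embed(chunk.text),
      text: chunk.text,
    }),
  });
  await ragmatic.start();                  // begins polling the work queue and writing chunks
  console.log("pending documents:", await ragmatic.countRemainingDocuments());
}

main().catch(console.error);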

'use strict'; var pg = require('pg'); var crypto = require('crypto'); const DEFAULT_LOG_LEVEL = "info"; function createLogger(config = {}) { try { const winston = require("winston"); const level = config.level || process.env.RAGMATIC_LOG_LEVEL || DEFAULT_LOG_LEVEL; const isJson = (config.format || process.env.RAGMATIC_LOG_FORMAT || "text") === "json"; const service = config.service || config.trackerName || "ragmatic"; const silent = config.silent || process.env.RAGMATIC_LOG_SILENT === "true"; const textFormat = winston.format.printf( ({ level: level2, message, timestamp, ...rest }) => { const meta = Object.keys(rest).length ? ` ${JSON.stringify(rest)}` : ""; return `${timestamp} [${level2.toUpperCase()}] [${service}]: ${message}${meta}`; } ); return winston.createLogger({ level, format: winston.format.combine( winston.format.timestamp(), winston.format.errors({ stack: true }), isJson ? winston.format.json() : textFormat ), defaultMeta: { service }, silent, transports: [new winston.transports.Console()] }); } catch (error) { const logLevels = { error: 0, warn: 1, info: 2, debug: 3, trace: 4 }; const selectedLevel = config.level || process.env.RAGMATIC_LOG_LEVEL || DEFAULT_LOG_LEVEL; const levelValue = logLevels[selectedLevel] || 2; const silent = config.silent || process.env.RAGMATIC_LOG_SILENT === "true"; const service = config.service || config.trackerName || "ragmatic"; if (silent) { return { error: () => { }, warn: () => { }, info: () => { }, debug: () => { } }; } return { error: (message, meta) => { if (levelValue >= 0) console.error(`[ERROR] [${service}]: ${message}`, meta || ""); }, warn: (message, meta) => { if (levelValue >= 1) console.warn(`[WARN] [${service}]: ${message}`, meta || ""); }, info: (message, meta) => { if (levelValue >= 2) console.info(`[INFO] [${service}]: ${message}`, meta || ""); }, debug: (message, meta) => { if (levelValue >= 3) console.debug(`[DEBUG] [${service}]: ${message}`, meta || ""); } }; } } const logger = createLogger(); const PREFIX = "ragmatic_"; const SHADOW_TABLE = "shadows"; const CHUNK_TABLE = "chunks"; const WORK_QUEUE_TABLE = "work_queue"; const RAGMATIC_SCHEMA_VERSION = 1; const sql = (strings, ...values) => strings.reduce((acc, str, i) => acc + str + (values[i] ?? 
""), ""); async function setup(config) { const logger = createLogger({ ...config.logger, service: "ragmatic-setup", trackerName: config.trackerName }); logger.info("Starting database setup", { trackerName: config.trackerName }); const client = config.dbClient || new pg.Client({ connectionString: config.connectionString }); if (!config.dbClient && !config.connectionString) { const error = new Error( "Either dbClient or connectionString must be provided" ); logger.error("Setup failed", { error: error.message }); throw error; } let documentsSchema = "public"; let documentsTable = config.documentsTable; if (config.documentsTable.includes(".")) { documentsSchema = config.documentsTable.split(".")[0]; documentsTable = config.documentsTable.split(".")[1]; } documentsSchema = documentsSchema.replaceAll(/[^a-zA-Z0-9_]/g, "_"); documentsTable = documentsTable.replaceAll(/[^a-zA-Z0-9_]/g, "_"); const trackerName = config.trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_"); const embeddingDimension = config.embeddingDimension.toString().replaceAll(/[^0-9]/g, ""); const schemaName = `${PREFIX}${trackerName}`; const shadowTable = `${schemaName}.${(config.shadowTable || SHADOW_TABLE).replaceAll(/[^a-zA-Z0-9_]/g, "_")}`; const chunksTable = `${schemaName}.${(config.chunksTable || CHUNK_TABLE).replaceAll(/[^a-zA-Z0-9_]/g, "_")}`; const docIdType = (config.docIdType || "INT").replaceAll( /[^a-zA-Z0-9_]/g, "_" ); const skipEmbeddingIndexSetup = config.skipEmbeddingIndexSetup || false; logger.debug("Configuration prepared", { trackerName, documentsTable, schemaName, shadowTable, chunksTable, embeddingDimension, skipEmbeddingIndexSetup }); try { logger.debug("Connecting to database"); await client.connect?.(); await client.query(sql`BEGIN`); logger.debug("Creating vector extension if not exists"); await client.query(sql`CREATE EXTENSION IF NOT EXISTS vector`); logger.debug("Creating schema", { schema: schemaName }); await client.query(sql`CREATE SCHEMA IF NOT EXISTS ${schemaName}`); await client.query(sql` CREATE TABLE IF NOT EXISTS ${schemaName}.config ( id SERIAL PRIMARY KEY, key TEXT NOT NULL UNIQUE, value TEXT NOT NULL ); `); await client.query(sql` INSERT INTO ${schemaName}.config (key, value) VALUES ( 'documentsSchema', '${documentsSchema}' ), ( 'documentsTable', '${documentsTable}' ), ('docIdType', '${docIdType}'), ( 'embeddingDimension', '${embeddingDimension}' ), ( 'shadowTable', '${shadowTable}' ), ( 'chunksTable', '${chunksTable}' ), ( 'ragmaticSchemaVersion', '${RAGMATIC_SCHEMA_VERSION}' ) ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value; `); await client.query(sql` CREATE TABLE IF NOT EXISTS ${shadowTable} ( id SERIAL PRIMARY KEY, doc_id ${docIdType} NOT NULL, vector_clock BIGINT NOT NULL DEFAULT 1, -- At 1 billion increments per second, BIGINT lasts about 292 years. 
      UNIQUE (doc_id)
    );
    -- Allow constraints to be created
    DELETE FROM ${shadowTable}
    WHERE doc_id NOT IN (
      SELECT id FROM ${documentsSchema}.${documentsTable}
    );
    -- Always (re)create the constraint to ensure correctness
    -- this is important when someone drops the documents table and recreates it
    ALTER TABLE ${shadowTable}
    DROP CONSTRAINT IF EXISTS fk_documents_sync;
    ALTER TABLE ${shadowTable}
    ADD CONSTRAINT fk_documents_sync FOREIGN KEY (doc_id) REFERENCES ${documentsSchema}.${documentsTable} (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED;
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.shadowTable || SHADOW_TABLE}_vector_clock ON ${shadowTable} (vector_clock);
    `);
    await client.query(sql`
      CREATE OR REPLACE FUNCTION ${schemaName}.sync_documents_to_shadow () RETURNS trigger LANGUAGE plpgsql AS $$
      BEGIN
        IF TG_OP = 'INSERT' THEN
          INSERT INTO ${shadowTable} (doc_id, vector_clock) VALUES (NEW.id, 1);
        -- Keep in mind this locks the document row for the duration of the update to the shadow table,
        -- so avoid locking the shadow table for too long otherwise this will cause a bottleneck!
        -- also do not update the shadow table outside of the library if you want to avoid deadlocks
        ELSIF TG_OP = 'UPDATE' THEN
          UPDATE ${shadowTable} SET vector_clock = vector_clock + 1 WHERE doc_id = NEW.id;
        END IF;
        RETURN NULL;
      END;
      $$;
    `);
    await client.query(sql`
      DROP TRIGGER IF EXISTS sync_${schemaName}_to_shadow ON ${documentsSchema}.${documentsTable};
      CREATE TRIGGER sync_${schemaName}_to_shadow AFTER INSERT OR UPDATE ON ${documentsSchema}.${documentsTable} FOR EACH ROW EXECUTE FUNCTION ${schemaName}.sync_documents_to_shadow ();
    `);
    await client.query(sql`
      CREATE TABLE IF NOT EXISTS ${chunksTable} (
        id SERIAL PRIMARY KEY,
        doc_id ${docIdType} NOT NULL,
        vector_clock BIGINT NOT NULL DEFAULT 0,
        index INT NOT NULL,
        chunk_hash TEXT NOT NULL,
        chunk_text TEXT,
        chunk_blob BYTEA,
        chunk_json JSONB,
        embedding VECTOR (${embeddingDimension}) NOT NULL
      );
      -- Allow constraints to be created
      DELETE FROM ${chunksTable}
      WHERE doc_id NOT IN (
        SELECT id FROM ${documentsSchema}.${documentsTable}
      );
      -- Always (re)create the constraint to ensure correctness
      -- this is important when someone drops the documents table and recreates it
      ALTER TABLE ${chunksTable}
      DROP CONSTRAINT IF EXISTS fk_doc_chunks;
      ALTER TABLE ${chunksTable}
      ADD CONSTRAINT fk_doc_chunks FOREIGN KEY (doc_id) REFERENCES ${documentsSchema}.${documentsTable} (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED;
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_doc_id ON ${chunksTable} (doc_id);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_chunk_hash ON ${chunksTable} (chunk_hash);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_vector_clock ON ${chunksTable} (vector_clock);
    `);
    if (!skipEmbeddingIndexSetup) {
      logger.info("Creating HNSW index for vector similarity search", { table: chunksTable, embeddingDimension });
      try {
        await client.query(sql`
          CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_embedding ON ${chunksTable} USING hnsw (embedding vector_cosine_ops);
        `);
        logger.info("HNSW index created successfully");
      } catch (indexError) {
        logger.warn("Failed to create HNSW index", {
          error: indexError instanceof Error ? indexError.message : String(indexError),
          note: "This may be expected on certain PostgreSQL configurations. You can create the index manually later."
        });
      }
    }
    await client.query(sql`
      CREATE TABLE IF NOT EXISTS ${schemaName}.${WORK_QUEUE_TABLE} (
        id SERIAL PRIMARY KEY,
        doc_id ${docIdType} NOT NULL,
        vector_clock BIGINT NOT NULL,
        status TEXT NOT NULL DEFAULT 'pending', -- pending, processing, completed, failed, skipped
        created_at TIMESTAMP NOT NULL DEFAULT NOW(),
        processing_started_at TIMESTAMP,
        completed_at TIMESTAMP,
        worker_id TEXT DEFAULT NULL,
        error TEXT DEFAULT NULL,
        retry_count INT NOT NULL DEFAULT 0,
        UNIQUE (doc_id, vector_clock)
        -- NOTE: deletes are not cascaded from documents table to the work queue, but we need to keep the constraint:
        -- UNIQUE (doc_id, vector_clock) and the monotonicity of the vector_clock at all times
        -- even if eg.: you drop the docs table, recreate the table, run setup again, and then re-insert the doc_id with a smaller vector_clock the second time
        -- will only work if queue rows are cleaned up first
      );
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_status ON ${schemaName}.${WORK_QUEUE_TABLE} (status);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_doc_id ON ${schemaName}.${WORK_QUEUE_TABLE} (doc_id);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_vector_clock ON ${schemaName}.${WORK_QUEUE_TABLE} (vector_clock);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_stalled_jobs ON ${schemaName}.${WORK_QUEUE_TABLE} (status, processing_started_at);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_doc_id_vector_clock ON ${chunksTable} (doc_id, vector_clock);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_doc_id_vector_clock ON ${schemaName}.${WORK_QUEUE_TABLE} (doc_id, vector_clock DESC);
    `);
    await client.query(sql`
      CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_doc_id_index ON ${chunksTable} (doc_id, index);
    `);
    const res = await client.query(sql`
      SELECT rolsuper FROM pg_roles WHERE rolname = current_user;
    `);
    if (res.rows[0].rolsuper) {
      await client.query(sql`
        DO $do$
        BEGIN
          -- Create the function
          CREATE OR REPLACE FUNCTION ${schemaName}.drop_ragmatic_schema() RETURNS event_trigger LANGUAGE plpgsql AS $func$
          DECLARE obj record;
          BEGIN
            -- Loop through dropped objects
            FOR obj IN SELECT object_type, schema_name, object_name FROM pg_event_trigger_dropped_objects() LOOP
              IF obj.object_type = 'table' AND obj.schema_name = '${documentsSchema}' AND obj.object_name = '${documentsTable}' THEN
                DROP SCHEMA IF EXISTS ${schemaName} CASCADE;
              END IF;
            END LOOP;
          END;
          $func$;
          -- Create the event trigger
          DROP EVENT TRIGGER IF EXISTS sync_${schemaName}_on_drop_table;
          CREATE EVENT TRIGGER sync_${schemaName}_on_drop_table ON sql_drop EXECUTE FUNCTION ${schemaName}.drop_ragmatic_schema();
        END;
        $do$;
      `);
      logger.debug(
        "Event trigger created successfully. The ragmatic schema will be dropped if you drop the documents table."
      );
    } else {
      logger.warn(
        "Client does not have superuser privileges. Event trigger for table drops was not created. You can safely ignore this, but note that this ragmatic schema will not be dropped if you drop the documents table."
      );
    }
    await client.query(sql`
      DELETE FROM ${schemaName}.${WORK_QUEUE_TABLE}
    `);
    await client.query(sql`
      INSERT INTO ${shadowTable} (doc_id)
      SELECT id FROM ${documentsSchema}.${documentsTable} d
      WHERE NOT EXISTS (
        SELECT 1 FROM ${shadowTable} s WHERE s.doc_id = d.id
      );
    `);
    await client.query(sql`COMMIT`);
    logger.info("Database setup complete", { trackerName, documentsTable, schemaName, shadowTable, chunksTable, embeddingDimension, chunkIndexesCreated: !skipEmbeddingIndexSetup });
    logger.debug("Sample search query", {
      query: `
        SELECT *, 1 - (embedding <=> '[0.1, 0.2, ...]'::vector) as similarity
        FROM ${chunksTable}
        ORDER BY similarity
        LIMIT 5;
      `
    });
  } catch (err) {
    logger.error("Error during database setup", {
      error: err instanceof Error ? err.message : String(err),
      stack: err instanceof Error ? err.stack : void 0,
      trackerName,
      documentsTable,
      schemaName
    });
    await client.query(sql`ROLLBACK`);
    throw err;
  } finally {
    logger.debug("Closing database connection");
    await client.end?.();
  }
}
var ErrorType = /* @__PURE__ */ ((ErrorType2) => {
  ErrorType2["Temporary"] = "temporary";
  ErrorType2["Permanent"] = "permanent";
  return ErrorType2;
})(ErrorType || {});
class ProcessingError extends Error {
  constructor(message, type, cause) {
    super(message);
    this.type = type;
    this.cause = cause;
    this.name = "ProcessingError";
  }
}
class Worker {
  constructor(config) {
    this.config = config;
    this.running = null;
    this.connected = false;
    this.timer = null;
    this.createJobsTimer = null;
    this.createJobsRunning = null;
    // Config:
    this.workerId = crypto.randomUUID();
    // Config loaded from db:
    this.shadowTable = null;
    this.chunksTable = null;
    this.documentsTable = null;
    this.embeddingDimension = null;
    this.docIdType = null;
    this.pollingInterval = config.pollingIntervalMs || 1e3;
    this.maxRetries = config.maxRetries || 3;
    this.initialRetryDelay = config.initialRetryDelayMs || 1e3;
    this.batchSize = config.batchSize || 5;
    this.stalledJobTimeoutMinutes = config.stalledJobTimeoutMinutes || 1;
    if (!config.dbClient && !config.connectionString) {
      throw new Error("Either dbClient or connectionString must be provided");
    }
    this.pool = new pg.Pool({ connectionString: config.connectionString });
    this.schemaName = `${PREFIX}${config.trackerName.replace(/[^a-zA-Z0-9_]/g, "_")}`;
    this.chunkGenerator = config.chunkGenerator || defaultChunkGenerator;
    this.embeddingGenerator = config.embeddingGenerator || defaultEmbeddingGenerator;
    this.hashFunction = config.hashFunction || defaultHash;
    this.logger = createLogger({ ...config.logger, trackerName: config.trackerName });
    this.logger.debug("Worker instance created", { workerId: this.workerId, trackerName: config.trackerName, batchSize: this.batchSize, pollingInterval: this.pollingInterval });
  }
  async loadConfig() {
    const configRes = await this.pool.query(sql`
      SELECT key, value FROM ${this.schemaName}.config
    `);
    const configMap = configRes.rows.reduce(
      (acc, row) => {
        acc[row.key] = row.value || null;
        return acc;
      },
      {}
    );
    this.documentsTable = configMap.documentsTable;
    this.shadowTable = configMap.shadowTable;
    this.chunksTable = configMap.chunksTable;
    this.embeddingDimension = Number(configMap.embeddingDimension);
    this.docIdType = configMap.docIdType;
    if (!this.documentsTable || !this.shadowTable || !this.chunksTable || !this.embeddingDimension || !this.docIdType) {
      throw new ProcessingError(
        "Missing config values. Please run setup() first.",
        ErrorType.Permanent
      );
    }
  }
  // Start the worker by connecting to the database and beginning the polling loop.
  async start() {
    this.logger.info("Starting worker", { workerId: this.workerId });
    if (!("query" in this.pool)) {
      const error = new Error(
        "Invalid database client. Please use a pg.Client compatible client or pass a connection string instead."
      );
      this.logger.error("Failed to start worker", { error: error.message, workerId: this.workerId });
      throw error;
    }
    if (!this.connected) {
      try {
        const schemaCheck = await this.pool.query(sql`
          SELECT 1 FROM pg_namespace WHERE nspname = '${this.schemaName}'
        `);
        if (schemaCheck.rowCount === 0) {
          const error = new ProcessingError(
            `Schema ${this.schemaName} does not exist. Please run setupDatabaseTracker() first.`,
            ErrorType.Permanent
          );
          this.logger.error("Schema not found", { error: error.message, schema: this.schemaName, workerId: this.workerId });
          throw error;
        }
        this.logger.debug("Loading configuration from database");
        await this.loadConfig();
        this.connected = true;
        this.logger.info("Worker connected to database", { schema: this.schemaName, embeddingDimension: this.embeddingDimension, documentsTable: this.documentsTable, workerId: this.workerId });
      } catch (error) {
        this.logger.error("Failed to connect to database", { error: error instanceof Error ? error.message : String(error), schema: this.schemaName, workerId: this.workerId });
        throw error;
      }
    }
    this.logger.info("Worker started", { workerId: this.workerId });
    await this.runCreateJobs();
    await this.run();
  }
  async pause() {
    this.logger.info("Pausing worker", { workerId: this.workerId });
    if (this.createJobsTimer) {
      clearTimeout(this.createJobsTimer);
      this.createJobsTimer = null;
    }
    if (this.createJobsRunning) {
      await this.createJobsRunning;
      this.createJobsRunning = null;
    }
    if (this.timer) {
      clearTimeout(this.timer);
      this.timer = null;
    }
    if (this.running) {
      await this.running;
      this.running = null;
    }
    this.logger.info("Worker paused", { workerId: this.workerId });
  }
  // Stop the worker gracefully by stopping the polling loop and disconnecting.
  async stop() {
    this.logger.info("Stopping worker", { workerId: this.workerId });
    await this.pause();
    await this.pool.end();
    this.connected = false;
    this.logger.info("Worker stopped", { workerId: this.workerId });
  }
  async runCreateJobs() {
    if (this.createJobsRunning) return;
    this.createJobsTimer = setTimeout(async () => {
      if (!this.connected || this.createJobsRunning) return;
      this.createJobsRunning = this.createJobs();
      await this.createJobsRunning;
      this.createJobsRunning = null;
      await this.runCreateJobs();
    }, this.pollingInterval);
  }
  async run() {
    if (this.running) return;
    this.timer = setTimeout(async () => {
      if (!this.connected || this.running) return;
      this.running = this.poll();
      await this.running;
      this.running = null;
      await this.run();
    }, this.pollingInterval);
  }
  // The polling loop: poll the shadow table for dirty records, process them, and then schedule the next poll.
  async poll() {
    try {
      this.logger.debug("Starting polling cycle", { workerId: this.workerId });
      this.logger.debug("Finding and claiming pending jobs");
      const jobs = await this.findAndClaimJobs();
      if (jobs.length > 0) {
        this.logger.info("Claimed jobs for processing", { jobCount: jobs.length, workerId: this.workerId });
      } else {
        this.logger.debug("No jobs to process", { workerId: this.workerId });
      }
      await this.processJobs(jobs);
      this.logger.debug("Completed polling cycle", { workerId: this.workerId });
    } catch (err) {
      if (err instanceof ProcessingError && err.type === ErrorType.Temporary) {
        this.logger.warn("Temporary failure during processing", {
          error: err.message,
          cause: err.cause ? err.cause instanceof Error ? err.cause.message : String(err.cause) : void 0,
          workerId: this.workerId
        });
      } else {
        this.logger.error("Error during processing", {
          error: err instanceof Error ? err.message : String(err),
          stack: err instanceof Error ? err.stack : void 0,
          workerId: this.workerId
        });
      }
    }
  }
  async createJobs() {
    this.logger.debug("Creating jobs from outdated shadow records");
    const client = await this.pool.connect();
    try {
      await client.query(sql`
        SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL READ COMMITTED
      `);
      await client.query(sql`BEGIN`);
      this.logger.debug("Finding documents that need processing");
      const res = await client.query(sql`
        WITH latest_shadow_clocks AS (
          SELECT s.doc_id, s.vector_clock FROM ${this.shadowTable} s
        ),
        latest_chunk_clocks AS (
          SELECT doc_id, MAX(vector_clock) AS max_chunk_vector_clock FROM ${this.chunksTable} GROUP BY doc_id
        ),
        -- Select the documents that have a shadow clock greater than the max chunk clock
        -- aka. the documents that have outdated or missing chunks...
        work_needed AS (
          SELECT s.doc_id, s.vector_clock AS shadow_clock, COALESCE(c.max_chunk_vector_clock, 0) AS chunk_clock
          FROM latest_shadow_clocks s
          LEFT JOIN latest_chunk_clocks c ON s.doc_id = c.doc_id
          WHERE s.vector_clock > COALESCE(c.max_chunk_vector_clock, 0)
        ),
        -- ...and skip the (document, vector_clock) pairs that are already in the work queue
        current_work AS (
          SELECT doc_id, vector_clock FROM ${this.schemaName}.${WORK_QUEUE_TABLE}
        )
        SELECT w.doc_id, w.shadow_clock, w.chunk_clock
        FROM work_needed w
        LEFT JOIN current_work cw ON w.doc_id = cw.doc_id AND w.shadow_clock = cw.vector_clock
        WHERE cw.doc_id IS NULL -- Only select work not already in the queue
        ORDER BY
          w.shadow_clock - w.chunk_clock DESC, -- Prioritize documents most out of sync
          w.shadow_clock ASC -- Then older documents first
        LIMIT ${this.batchSize};
      `);
      if (res.rows.length > 0) {
        this.logger.info("Creating new jobs in work queue", { jobCount: res.rows.length, workerId: this.workerId, docIds: res.rows.map((r) => r.doc_id) });
        await client.query(sql`
          INSERT INTO ${this.schemaName}.${WORK_QUEUE_TABLE} (doc_id, vector_clock, status)
          VALUES ${res.rows.map(
            (r) => sql` ( ${r.doc_id}, ${r.shadow_clock}, 'pending' ) `
          ).join(",")}
          ON CONFLICT DO NOTHING;
        `);
      } else {
        this.logger.debug("No new jobs to create", { workerId: this.workerId });
      }
      await client.query(sql`COMMIT`);
    } catch (err) {
      await client.query(sql`ROLLBACK`);
      this.logger.error("Failed to create jobs", { error: err instanceof Error ?
err.message : String(err), workerId: this.workerId }); throw err; } finally { client.release(); } } async findUnclaimedOrStalledJobs(client) { this.logger.debug("Finding unclaimed or stalled jobs", { workerId: this.workerId }); const res = await client.query(sql` SELECT doc_id, vector_clock, status, processing_started_at, worker_id, error, retry_count, created_at FROM ${this.schemaName}.${WORK_QUEUE_TABLE} WHERE status = 'pending' OR status = 'processing' AND processing_started_at < NOW() - INTERVAL '${this.stalledJobTimeoutMinutes} minutes' FOR UPDATE SKIP LOCKED LIMIT ${this.batchSize} `); this.logger.debug("Found jobs", { jobIdsAndVectorClocks: res.rows.map( (r) => ({ doc_id: r.doc_id, vector_clock: r.vector_clock }) ), jobCount: res.rows.length, workerId: this.workerId }); return res.rows; } async claimJobs(client, jobs, worker_id) { if (jobs.length === 0) { return []; } const conditions = []; const params = [worker_id]; let paramIndex = 2; for (const job of jobs) { conditions.push( `(doc_id = $${paramIndex} AND vector_clock = $${paramIndex + 1})` ); params.push(job.doc_id, job.vector_clock); paramIndex += 2; } const whereClause = conditions.join(" OR "); const query = ` UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE} SET status = 'processing', processing_started_at = NOW(), worker_id = $1 WHERE (${whereClause}) AND ( status = 'pending' OR ( status = 'processing' AND processing_started_at < NOW() - INTERVAL '${this.stalledJobTimeoutMinutes} minutes' ) ) RETURNING * `; const res = await client.query(query, params); this.logger.debug("Claimed jobs", { jobIdsAndVectorClocks: res.rows.map( (r) => ({ doc_id: r.doc_id, vector_clock: r.vector_clock }) ), jobCount: res.rows.length, workerId: this.workerId }); return res.rows; } async findAndClaimJobs() { this.logger.debug("Finding and claiming jobs", { workerId: this.workerId }); const client = await this.pool.connect(); try { await client.query(sql` SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL READ COMMITTED `); await client.query(sql`BEGIN`); let jobs = await this.findUnclaimedOrStalledJobs(client); jobs = await this.claimJobs(client, jobs, this.workerId); await client.query(sql`COMMIT`); this.logger.debug("Claimed jobs", { jobCount: jobs.length, workerId: this.workerId }); return jobs; } catch (err) { this.logger.error("Failed to find and claim jobs", { error: err instanceof Error ? 
err.message : String(err), workerId: this.workerId }); await client.query(sql`ROLLBACK`); throw err; } finally { client.release(); } } async processJobs(jobs) { for (const job of jobs) { await this.processJob(job); } } async skipJob(doc_id, vector_clock, error) { this.logger.debug("Skipping job", { docId: doc_id, vectorClock: vector_clock, error }); await this.pool.query( sql` UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE} SET status = 'skipped', error = $3, completed_at = NOW() WHERE doc_id = $1 AND vector_clock = $2 `, [doc_id, vector_clock, error] ); } async failJob(doc_id, vector_clock, error) { this.logger.debug("Failing job", { docId: doc_id, vectorClock: vector_clock, error: error.message }); await this.pool.query( sql` UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE} SET status = 'failed', error = $3, completed_at = NOW() WHERE doc_id = $1 AND vector_clock = $2 `, [doc_id, vector_clock, error] ); } async getLatestJob(doc_id) { const res = await this.pool.query( sql` SELECT vector_clock FROM ${this.schemaName}.${WORK_QUEUE_TABLE} WHERE doc_id = $1 ORDER BY vector_clock DESC LIMIT 1 `, [doc_id] ); if (res.rows.length === 0) { return null; } return { vector_clock: res.rows[0].vector_clock }; } async processJob(job) { this.logger.debug("Processing job", { docId: job.doc_id, vectorClock: job.vector_clock, workerId: this.workerId }); const latestJob = await this.getLatestJob(job.doc_id); if (latestJob && latestJob.vector_clock > job.vector_clock) { this.logger.debug("Newer job found, skipping", { docId: job.doc_id, vectorClock: job.vector_clock, latestVectorClock: latestJob.vector_clock }); await this.skipJob(job.doc_id, job.vector_clock, "Newer job found"); return; } const docRes = await this.pool.query( sql` SELECT * FROM ${this.documentsTable} WHERE id = $1 `, [job.doc_id] ); if (docRes.rows.length === 0) { this.logger.debug("Document already deleted, skipping", { docId: job.doc_id, vectorClock: job.vector_clock }); await this.skipJob(job.doc_id, job.vector_clock, "Document deleted"); return; } const doc = docRes.rows[0]; const chunks = await this.chunkGenerator(doc); const newChunkHashes = await Promise.all( chunks.map(async (c, i) => { const hash = await this.hashFunction(c); return `${hash}-${i}`; }) ); const client = await this.pool.connect(); try { await client.query(sql`BEGIN`); const { deduplicatedNew, deduplicatedOld } = await this.dedupeAndRemoveOld(client, job.doc_id, newChunkHashes); const toEmbed = chunks.map((c, index) => ({ ...c, index, hash: newChunkHashes[index] })).filter((chunk, i) => deduplicatedNew.has(chunk.hash)); const newChunks = await Promise.all( toEmbed.map( async (chunk, i) => this.generateEmbeddingForChunk( chunk, i, chunk.hash, job.doc_id, job.vector_clock ) ) ); await this.insertNewChunks(client, job.doc_id, newChunks); await this.removeOldChunks( client, job.doc_id, Array.from(deduplicatedOld) ); await client.query( sql` UPDATE ${this.chunksTable} SET vector_clock = $1 WHERE doc_id = $2 `, [job.vector_clock, job.doc_id] ); const updateRes = await client.query( sql` WITH latest_shadow_clock AS ( SELECT vector_clock AS latest_vector_clock FROM ${this.shadowTable} WHERE doc_id = $1 ), updated AS ( UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE} SET status = 'completed', completed_at = NOW() WHERE doc_id = $1 AND worker_id = $3 AND vector_clock = $2 AND vector_clock = ( SELECT latest_vector_clock FROM latest_shadow_clock ) RETURNING * ) SELECT * FROM updated `, [job.doc_id, job.vector_clock, this.workerId] ); if (updateRes.rowCount === 0) { this.logger.debug( 
"Vector clock is no longer latest or worker is not the job owner, rolling back", { docId: job.doc_id, vectorClock: job.vector_clock, workerId: this.workerId } ); await client.query(sql`ROLLBACK`); const shadowRes = await client.query( sql` SELECT vector_clock FROM ${this.shadowTable} WHERE doc_id = $1 `, [job.doc_id] ); if (shadowRes.rows[0]?.vector_clock > job.vector_clock) { this.logger.debug( "Vector clock is no longer latest, newer job found, skipping", { docId: job.doc_id, vectorClock: job.vector_clock, latestVectorClock: shadowRes.rows[0]?.vector_clock } ); await this.skipJob( job.doc_id, job.vector_clock, "Vector clock is no longer latest, newer job found" ); } } else { await client.query(sql`COMMIT`); } } catch (err) { await client.query(sql`ROLLBACK`); if (job.retry_count < this.maxRetries) { await this.retryJob(job, err); return; } else { await this.failJob(job.doc_id, job.vector_clock, err); } } finally { client.release(); } } async retryJob(job, err) { this.logger.info("Retrying job", { docId: job.doc_id, vectorClock: job.vector_clock, error: err.message, workerId: this.workerId, retryCount: job.retry_count + 1 }); await this.pool.query( sql` UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE} SET status = 'pending', processing_started_at = NULL, worker_id = NULL, error = $3, retry_count = retry_count + 1 WHERE doc_id = $1 AND vector_clock = $2 `, [job.doc_id, job.vector_clock, err.message] ); } async dedupeAndRemoveOld(client, doc_id, newChunkHashes) { const existingChunks = await client.query( sql` SELECT chunk_hash FROM ${this.chunksTable} WHERE doc_id = $1 `, [doc_id] ); const existingHashes = new Set( existingChunks.rows.map((r) => r.chunk_hash) ); const deduplicatedNew = new Set( newChunkHashes.filter((hash) => !existingHashes.has(hash)) ); const deduplicatedOld = new Set( [...existingHashes].filter( (hash) => !newChunkHashes.includes(hash) ) ); return { deduplicatedNew, deduplicatedOld }; } async removeOldChunks(client, doc_id, chunk_hashes) { await client.query( sql` DELETE FROM ${this.chunksTable} WHERE doc_id = $1 AND chunk_hash = ANY ($2) `, [doc_id, chunk_hashes] ); } async generateEmbeddingForChunk(chunk, index, hash, doc_id, vector_clock) { try { const { embedding, ...rest } = await this.embeddingGenerator( chunk, index ); if (!Array.isArray(embedding)) { throw new ProcessingError( `Invalid embedding format: expected number[], got ${typeof embedding}`, ErrorType.Permanent ); } if (embedding.length !== this.embeddingDimension) { throw new ProcessingError( `Invalid embedding dimension: expected ${this.embeddingDimension}, got ${embedding.length}`, ErrorType.Permanent ); } if (!embedding.every((n) => typeof n === "number")) { throw new ProcessingError( "Invalid embedding: all elements must be numbers", ErrorType.Permanent ); } const pgvectorEmbedding = `[${embedding.join(",")}]`; return { ...rest, hash, embedding: pgvectorEmbedding, index, vector_clock }; } catch (err) { if (err instanceof ProcessingError) { throw err; } if (err instanceof Error) { throw new ProcessingError( `Error generating embedding for doc_id ${doc_id}: ${err.message}`, ErrorType.Temporary, err ); } throw new ProcessingError( `Error generating embedding for doc_id ${doc_id}: Unknown error`, ErrorType.Temporary, err ); } } async insertNewChunks(client, doc_id, chunks) { if (chunks.length === 0) return; const values = chunks.map( (_, i) => `($1, $${i * 7 + 2}, $${i * 7 + 3}, $${i * 7 + 4}, $${i * 7 + 5}, $${i * 7 + 6}, $${i * 7 + 7}, $${i * 7 + 8})` ).join(","); const params = [ doc_id, 
...chunks.flatMap((c) => [ c.hash, "text" in c ? c.text : null, "json" in c ? c.json : null, "blob" in c ? c.blob : null, c.embedding, c.index, c.vector_clock ]) ]; await client.query( sql` INSERT INTO ${this.chunksTable} ( doc_id, chunk_hash, chunk_text, chunk_json, chunk_blob, embedding, index, vector_clock ) VALUES ${values} `, params ); } } async function defaultHash(chunk) { let hash = ""; const { blob, ...rest } = chunk; if (blob instanceof Blob) { const arrayBuffer = await chunk.blob.arrayBuffer(); hash = crypto.createHash("md5").update(Buffer.from(arrayBuffer)).digest("hex"); } hash += crypto.createHash("md5").update(JSON.stringify(rest)).digest("hex"); return hash; } function defaultChunkGenerator(doc) { const { text, blob, ...json } = { ...doc }; return Promise.resolve([{ text, json: { ...json, text }, blob }]); } function defaultEmbeddingGenerator(chunk, index) { return Promise.resolve({ embedding: [], text: chunk.text, json: chunk.json, blob: chunk.blob }); } async function getTrackerConfig(connectionStringOrClient, trackerName) { const client = typeof connectionStringOrClient === "string" ? new pg.Client({ connectionString: connectionStringOrClient }) : connectionStringOrClient; const ownClient = typeof connectionStringOrClient === "string"; try { if (ownClient) { await client.connect(); } const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_"); const schemaName = `${PREFIX}${sanitizedTrackerName}`; const schemaQuery = ` SELECT schema_name FROM information_schema.schemata WHERE schema_name = '${schemaName}'; `; const schemaCheckResult = await client.query(schemaQuery); if (schemaCheckResult.rows.length === 0) { throw new Error(`Tracker "${trackerName}" not found`); } const configQuery = ` SELECT key, value FROM "${schemaName}".config; `; const configResult = await client.query(configQuery); if (configResult.rows.length === 0) { throw new Error(`Configuration for tracker "${trackerName}" not found`); } const config = {}; configResult.rows.forEach((row) => { config[row.key] = row.value; }); return { trackerName: sanitizedTrackerName, documentsTable: config.documentsTable || "", embeddingDimension: parseInt(config.embeddingDimension || "0", 10), shadowTable: config.shadowTable || "", chunksTable: config.chunksTable || "", docIdType: config.docIdType || "INT" }; } catch (error) { throw error; } finally { if (ownClient) { await client.end(); } } } async function countRemainingDocuments(connectionStringOrClient, trackerName) { const client = typeof connectionStringOrClient === "string" ? 
new pg.Client({ connectionString: connectionStringOrClient }) : connectionStringOrClient; const ownClient = typeof connectionStringOrClient === "string"; try { if (ownClient) { await client.connect(); } const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_"); const schemaName = `${PREFIX}${sanitizedTrackerName}`; const schemaQuery = ` SELECT schema_name FROM information_schema.schemata WHERE schema_name = '${schemaName}'; `; const schemaCheckResult = await client.query(schemaQuery); if (schemaCheckResult.rows.length === 0) { throw new Error(`Tracker "${trackerName}" not found`); } const countPendingJobsQuery = ` SELECT COUNT(*) as count FROM ${schemaName}.${WORK_QUEUE_TABLE} WHERE status = 'pending' OR status = 'processing'; `; const countWorkQueueResult = await client.query(countPendingJobsQuery); return parseInt(countWorkQueueResult.rows[0].count, 10); } catch (error) { throw error; } finally { if (ownClient) { await client.end(); } } } async function reprocessDocuments(connectionStringOrClient, trackerName) { const client = typeof connectionStringOrClient === "string" ? new pg.Client({ connectionString: connectionStringOrClient }) : connectionStringOrClient; const ownClient = typeof connectionStringOrClient === "string"; try { if (ownClient) { await client.connect(); } const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_"); const schemaName = `${PREFIX}${sanitizedTrackerName}`; const schemaQuery = ` SELECT schema_name FROM information_schema.schemata WHERE schema_name = '${schemaName}'; `; const schemaCheckResult = await client.query(schemaQuery); if (schemaCheckResult.rows.length === 0) { throw new Error(`Tracker "${trackerName}" not found`); } const config = await getTrackerConfig(client, trackerName); const documentsTable = config.documentsTable; await client.query("BEGIN"); const shadowTable = config.shadowTable; const docIdsQuery = ` SELECT id FROM ${documentsTable} `; const docIdsResult = await client.query(docIdsQuery); for (const doc of docIdsResult.rows) { const incQuery = ` UPDATE ${shadowTable} SET vector_clock = vector_clock + 1 WHERE doc_id = ${doc.id}; `; await client.query(incQuery); } const insertQuery = ` INSERT INTO ${shadowTable} (doc_id, vector_clock) SELECT id, 1 FROM ${documentsTable} d WHERE NOT EXISTS ( SELECT 1 FROM ${shadowTable} s WHERE s.doc_id = d.id ); `; await client.query(insertQuery); const chunksTable = config.chunksTable; const updateChunkHashesQuery = ` UPDATE ${chunksTable} SET chunk_hash = '' WHERE chunk_hash IS NOT NULL; `; await client.query(updateChunkHashesQuery); await client.query("COMMIT"); } catch (error) { await client.query("ROLLBACK"); throw error; } finally { if (ownClient) { await client.end(); } } } async function destroyTracker(connectionStringOrClient, trackerName) { const client = typeof connectionStringOrClient === "string" ? 
    new pg.Client({ connectionString: connectionStringOrClient }) : connectionStringOrClient;
  const ownClient = typeof connectionStringOrClient === "string";
  try {
    if (ownClient) {
      await client.connect();
    }
    const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_");
    const schemaName = `${PREFIX}${sanitizedTrackerName}`;
    const schemaQuery = `
      SELECT schema_name FROM information_schema.schemata WHERE schema_name = '${schemaName}';
    `;
    const schemaCheckResult = await client.query(schemaQuery);
    if (schemaCheckResult.rows.length === 0) {
      throw new Error(`Tracker "${trackerName}" not found`);
    }
    const configQuery = `
      SELECT value FROM "${schemaName}".config WHERE key = 'documentsTable';
    `;
    const configResult = await client.query(configQuery);
    if (configResult.rows.length === 0) {
      throw new Error(
        `Documents table configuration for tracker "${trackerName}" not found`
      );
    }
    const documentsTable = configResult.rows[0].value;
    await client.query("BEGIN");
    const dropTriggerQuery = `
      DROP TRIGGER IF EXISTS sync_${schemaName}_to_shadow ON ${documentsTable};
    `;
    await client.query(dropTriggerQuery);
    const dropSchemaQuery = `
      DROP SCHEMA "${schemaName}" CASCADE;
    `;
    await client.query(dropSchemaQuery);
    await client.query("COMMIT");
  } catch (error) {
    await client.query("ROLLBACK");
    throw error;
  } finally {
    if (ownClient) {
      await client.end();
    }
  }
}
const _RAGmatic = class _RAGmatic {
  constructor(worker, name, connectionString, dbClient) {
    this.worker = worker;
    this.name = name;
    this.connectionString = connectionString;
    this.dbClient = dbClient;
  }
  static async create(config) {
    if (_RAGmatic.instances.has(config.name)) {
      return _RAGmatic.instances.get(config.name);
    }
    await setup({
      connectionString: config.connectionString,
      dbClient: config.dbClient,
      trackerName: config.name,
      documentsTable: config.tableToWatch,
      docIdType: config.docIdType,
      embeddingDimension: config.embeddingDimension,
      skipEmbeddingIndexSetup: config.skipEmbeddingIndexSetup,
      logger: config.logger
    });
    const worker = new Worker({
      connectionString: config.connectionString,
      dbClient: config.dbClient,
      trackerName: config.name,
      chunkGenerator: config.recordToChunksFunction,
      hashFunction: config.hashFunction,
      embeddingGenerator: config.chunkToEmbeddingFunction,
      pollingIntervalMs: config.pollingIntervalMs,
      batchSize: config.batchSize,
      maxRetries: config.maxRetries,
      initialRetryDelayMs: config.initialRetryDelayMs,
      stalledJobTimeoutMinutes: config.stalledJobTimeoutMinutes,
      logger: config.logger
    });
    const ragmatic = new _RAGmatic(
      worker,
      config.name,
      config.connectionString,
      config.dbClient
    );
    _RAGmatic.instances.set(config.name, ragmatic);
    return ragmatic;
  }
  async destroy() {
    await destroyTracker(
      this.connectionString ?? this.dbClient,
      this.name
    );
    _RAGmatic.instances.delete(this.name);
  }
  async start() {
    return await this.worker.start();
  }
  async stop() {
    return await this.worker.stop();
  }
  async reprocessAll() {
    return await reprocessDocuments(
      this.connectionString ?? this.dbClient,
      this.name
    );
  }
  async countRemainingDocuments() {
    return countRemainingDocuments(
      this.connectionString ?? this.dbClient,
      this.name
    );
  }
};
// create singleton instances per tracker name
_RAGmatic.instances = /* @__PURE__ */ new Map();
let RAGmatic = _RAGmatic;
exports.RAGmatic = RAGmatic;
exports.createLogger = createLogger;
exports.logger = logger;
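
Once the worker has populated the chunks table, retrieval is plain SQL against that table using pgvector's cosine distance operator. A sketch, assuming the tracker above was named "blog_posts" (so the default schema is ragmatic_blog_posts and the table is chunks), a DATABASE_URL environment variable, and a query embedding produced by the same model used for indexing:

const { Pool } = require("pg");

// Sketch: nearest-neighbour search over the chunks ragmatic maintains.
async function search(queryEmbedding, limit = 5) {
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  // pgvector's text format, the same shape the worker writes.
  const vector = `[${queryEmbedding.join(",")}]`;
  const { rows } = await pool.query(
    `SELECT doc_id, chunk_text,
            1 - (embedding <=> $1::vector) AS similarity
       FROM ragmatic_blog_posts.chunks
      ORDER BY embedding <=> $1::vector
      LIMIT $2`,
    [vector, limit]
  );
  await pool.end();
  return rows;
}

Ordering by the distance expression itself (rather than the derived similarity column) keeps the query eligible for the HNSW index that setup() creates by default.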