// ragmatic
// Automatically and continuously vectorize your PostgreSQL tables with the flexibility of your own embedding pipelines
'use strict';
var pg = require('pg');
var crypto = require('crypto');
const DEFAULT_LOG_LEVEL = "info";
function createLogger(config = {}) {
try {
const winston = require("winston");
const level = config.level || process.env.RAGMATIC_LOG_LEVEL || DEFAULT_LOG_LEVEL;
const isJson = (config.format || process.env.RAGMATIC_LOG_FORMAT || "text") === "json";
const service = config.service || config.trackerName || "ragmatic";
const silent = config.silent || process.env.RAGMATIC_LOG_SILENT === "true";
const textFormat = winston.format.printf(
({ level: level2, message, timestamp, ...rest }) => {
const meta = Object.keys(rest).length ? ` ${JSON.stringify(rest)}` : "";
return `${timestamp} [${level2.toUpperCase()}] [${service}]: ${message}${meta}`;
}
);
return winston.createLogger({
level,
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
isJson ? winston.format.json() : textFormat
),
defaultMeta: { service },
silent,
transports: [new winston.transports.Console()]
});
} catch (error) {
const logLevels = { error: 0, warn: 1, info: 2, debug: 3, trace: 4 };
const selectedLevel = config.level || process.env.RAGMATIC_LOG_LEVEL || DEFAULT_LOG_LEVEL;
const levelValue = logLevels[selectedLevel] ?? 2; // ?? not ||, so "error" (value 0) is not replaced by the default
const silent = config.silent || process.env.RAGMATIC_LOG_SILENT === "true";
const service = config.service || config.trackerName || "ragmatic";
if (silent) {
return {
error: () => {
},
warn: () => {
},
info: () => {
},
debug: () => {
}
};
}
return {
error: (message, meta) => {
if (levelValue >= 0)
console.error(`[ERROR] [${service}]: ${message}`, meta || "");
},
warn: (message, meta) => {
if (levelValue >= 1)
console.warn(`[WARN] [${service}]: ${message}`, meta || "");
},
info: (message, meta) => {
if (levelValue >= 2)
console.info(`[INFO] [${service}]: ${message}`, meta || "");
},
debug: (message, meta) => {
if (levelValue >= 3)
console.debug(`[DEBUG] [${service}]: ${message}`, meta || "");
}
};
}
}
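// Minimal usage sketch for createLogger (the options shown are the ones read above; the values are illustrative):
//
//   const log = createLogger({ level: "debug", format: "json", service: "my-worker" });
//   log.info("hello", { docId: 42 });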
const logger = createLogger();
const PREFIX = "ragmatic_";
const SHADOW_TABLE = "shadows";
const CHUNK_TABLE = "chunks";
const WORK_QUEUE_TABLE = "work_queue";
const RAGMATIC_SCHEMA_VERSION = 1;
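// No-op template tag: simply concatenates the literal with its interpolated values.
// It exists so editors/formatters treat the embedded strings as SQL; it does NOT escape anything.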
const sql = (strings, ...values) => strings.reduce((acc, str, i) => acc + str + (values[i] ?? ""), "");
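// setup() creates the per-tracker schema `ragmatic_<trackerName>` with its config, shadow,
// chunks and work_queue tables, the sync trigger on the documents table, and the supporting
// indexes, then backfills shadow rows for existing documents.
//
// A minimal call sketch (the connection string and table name are illustrative placeholders):
//
//   await setup({
//     connectionString: "postgres://user:pass@localhost:5432/mydb",
//     trackerName: "articles",
//     documentsTable: "public.articles",
//     embeddingDimension: 1536,
//   });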
async function setup(config) {
const logger = createLogger({
...config.logger,
service: "ragmatic-setup",
trackerName: config.trackerName
});
logger.info("Starting database setup", { trackerName: config.trackerName });
if (!config.dbClient && !config.connectionString) {
const error = new Error(
"Either dbClient or connectionString must be provided"
);
logger.error("Setup failed", { error: error.message });
throw error;
}
const client = config.dbClient || new pg.Client({ connectionString: config.connectionString });
let documentsSchema = "public";
let documentsTable = config.documentsTable;
if (config.documentsTable.includes(".")) {
documentsSchema = config.documentsTable.split(".")[0];
documentsTable = config.documentsTable.split(".")[1];
}
documentsSchema = documentsSchema.replaceAll(/[^a-zA-Z0-9_]/g, "_");
documentsTable = documentsTable.replaceAll(/[^a-zA-Z0-9_]/g, "_");
const trackerName = config.trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_");
const embeddingDimension = config.embeddingDimension.toString().replaceAll(/[^0-9]/g, "");
const schemaName = `${PREFIX}${trackerName}`;
const shadowTable = `${schemaName}.${(config.shadowTable || SHADOW_TABLE).replaceAll(/[^a-zA-Z0-9_]/g, "_")}`;
const chunksTable = `${schemaName}.${(config.chunksTable || CHUNK_TABLE).replaceAll(/[^a-zA-Z0-9_]/g, "_")}`;
const docIdType = (config.docIdType || "INT").replaceAll(
/[^a-zA-Z0-9_]/g,
"_"
);
const skipEmbeddingIndexSetup = config.skipEmbeddingIndexSetup || false;
logger.debug("Configuration prepared", {
trackerName,
documentsTable,
schemaName,
shadowTable,
chunksTable,
embeddingDimension,
skipEmbeddingIndexSetup
});
try {
logger.debug("Connecting to database");
await client.connect?.();
await client.query(sql`BEGIN`);
logger.debug("Creating vector extension if not exists");
await client.query(sql`CREATE EXTENSION IF NOT EXISTS vector`);
logger.debug("Creating schema", { schema: schemaName });
await client.query(sql`CREATE SCHEMA IF NOT EXISTS ${schemaName}`);
await client.query(sql`
CREATE TABLE IF NOT EXISTS ${schemaName}.config (
id SERIAL PRIMARY KEY,
key TEXT NOT NULL UNIQUE,
value TEXT NOT NULL
);
`);
await client.query(sql`
INSERT INTO
${schemaName}.config (key, value)
VALUES
(
'documentsSchema',
'${documentsSchema}'
),
(
'documentsTable',
'${documentsTable}'
),
('docIdType', '${docIdType}'),
(
'embeddingDimension',
'${embeddingDimension}'
),
(
'shadowTable',
'${shadowTable}'
),
(
'chunksTable',
'${chunksTable}'
),
(
'ragmaticSchemaVersion',
'${RAGMATIC_SCHEMA_VERSION}'
)
ON CONFLICT (key) DO UPDATE
SET
value = EXCLUDED.value;
`);
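// Shadow table: one row per document. The trigger below bumps vector_clock on every
// insert/update of the watched table; workers later compare this clock against the max
// vector_clock stored on the chunks to detect documents whose embeddings are stale.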
await client.query(sql`
CREATE TABLE IF NOT EXISTS ${shadowTable} (
id SERIAL PRIMARY KEY,
doc_id ${docIdType} NOT NULL,
vector_clock BIGINT NOT NULL DEFAULT 1, -- At 1 billion increments per second, BIGINT lasts about 292 years.
UNIQUE (doc_id)
);
-- Remove orphaned shadow rows first so the foreign key constraint below can be (re)created
DELETE FROM ${shadowTable}
WHERE
doc_id NOT IN (
SELECT
id
FROM
${documentsSchema}.${documentsTable}
);
-- Always (re)create the constraint to ensure correctness
-- this is important when someone drops the documents table and recreates it
ALTER TABLE ${shadowTable}
DROP CONSTRAINT IF EXISTS fk_documents_sync;
ALTER TABLE ${shadowTable}
ADD CONSTRAINT fk_documents_sync FOREIGN KEY (doc_id) REFERENCES ${documentsSchema}.${documentsTable} (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED;
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.shadowTable || SHADOW_TABLE}_vector_clock ON ${shadowTable} (vector_clock);
`);
await client.query(sql`
CREATE OR REPLACE FUNCTION ${schemaName}.sync_documents_to_shadow () RETURNS trigger LANGUAGE plpgsql AS $$
BEGIN
IF TG_OP = 'INSERT' THEN
INSERT INTO ${shadowTable} (doc_id, vector_clock)
VALUES (NEW.id, 1);
-- Keep in mind this locks the document row for the duration of the update to the shadow table,
-- so avoid locking the shadow table for too long, otherwise this becomes a bottleneck.
-- Also, do not update the shadow table outside of the library if you want to avoid deadlocks.
ELSIF TG_OP = 'UPDATE' THEN
UPDATE ${shadowTable} SET vector_clock = vector_clock + 1 WHERE doc_id = NEW.id;
END IF;
RETURN NULL;
END;
$$;
`);
await client.query(sql`
DROP TRIGGER IF EXISTS sync_${schemaName}_to_shadow ON ${documentsSchema}.${documentsTable};
CREATE TRIGGER sync_${schemaName}_to_shadow
AFTER INSERT
OR
UPDATE ON ${documentsSchema}.${documentsTable} FOR EACH ROW
EXECUTE FUNCTION ${schemaName}.sync_documents_to_shadow ();
`);
await client.query(sql`
CREATE TABLE IF NOT EXISTS ${chunksTable} (
id SERIAL PRIMARY KEY,
doc_id ${docIdType} NOT NULL,
vector_clock BIGINT NOT NULL DEFAULT 0,
index INT NOT NULL,
chunk_hash TEXT NOT NULL,
chunk_text TEXT,
chunk_blob BYTEA,
chunk_json JSONB,
embedding VECTOR (${embeddingDimension}) NOT NULL
);
-- Remove orphaned chunk rows first so the foreign key constraint below can be (re)created
DELETE FROM ${chunksTable}
WHERE
doc_id NOT IN (
SELECT
id
FROM
${documentsSchema}.${documentsTable}
);
-- Always (re)create the constraint to ensure correctness
-- this is important when someone drops the documents table and recreates it
ALTER TABLE ${chunksTable}
DROP CONSTRAINT IF EXISTS fk_doc_chunks;
ALTER TABLE ${chunksTable}
ADD CONSTRAINT fk_doc_chunks FOREIGN KEY (doc_id) REFERENCES ${documentsSchema}.${documentsTable} (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED;
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_doc_id ON ${chunksTable} (doc_id);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_chunk_hash ON ${chunksTable} (chunk_hash);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_vector_clock ON ${chunksTable} (vector_clock);
`);
if (!skipEmbeddingIndexSetup) {
logger.info("Creating HNSW index for vector similarity search", {
table: chunksTable,
embeddingDimension
});
try {
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_embedding ON ${chunksTable} USING hnsw (embedding vector_cosine_ops);
`);
logger.info("HNSW index created successfully");
} catch (indexError) {
logger.warn("Failed to create HNSW index", {
error: indexError instanceof Error ? indexError.message : String(indexError),
note: "This may be expected on certain PostgreSQL configurations. You can create the index manually later."
});
}
}
await client.query(sql`
CREATE TABLE IF NOT EXISTS ${schemaName}.${WORK_QUEUE_TABLE} (
id SERIAL PRIMARY KEY,
doc_id ${docIdType} NOT NULL,
vector_clock BIGINT NOT NULL,
status TEXT NOT NULL DEFAULT 'pending', -- pending, processing, completed, failed, skipped
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
processing_started_at TIMESTAMP,
completed_at TIMESTAMP,
worker_id TEXT DEFAULT NULL,
error TEXT DEFAULT NULL,
retry_count INT NOT NULL DEFAULT 0,
UNIQUE (doc_id, vector_clock)
-- NOTE: deletes are not cascaded from the documents table to the work queue, yet the
-- UNIQUE (doc_id, vector_clock) constraint and the monotonicity of vector_clock must hold at all times.
-- E.g. if you drop the documents table, recreate it, run setup again, and then re-insert a doc_id
-- whose vector_clock is now smaller than before, that only works if the old queue rows are cleaned up first.
);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_status ON ${schemaName}.${WORK_QUEUE_TABLE} (status);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_doc_id ON ${schemaName}.${WORK_QUEUE_TABLE} (doc_id);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_vector_clock ON ${schemaName}.${WORK_QUEUE_TABLE} (vector_clock);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_stalled_jobs ON ${schemaName}.${WORK_QUEUE_TABLE} (status, processing_started_at);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_doc_id_vector_clock ON ${chunksTable} (doc_id, vector_clock);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${WORK_QUEUE_TABLE}_doc_id_vector_clock ON ${schemaName}.${WORK_QUEUE_TABLE} (doc_id, vector_clock DESC);
`);
await client.query(sql`
CREATE INDEX IF NOT EXISTS idx_${schemaName}_${config.chunksTable || CHUNK_TABLE}_doc_id_index ON ${chunksTable} (doc_id, index);
`);
const res = await client.query(sql`
SELECT
rolsuper
FROM
pg_roles
WHERE
rolname = current_user;
`);
if (res.rows[0].rolsuper) {
await client.query(sql`
DO $do$
BEGIN
-- Create the function
CREATE OR REPLACE FUNCTION ${schemaName}.drop_ragmatic_schema()
RETURNS event_trigger
LANGUAGE plpgsql
AS $func$
DECLARE
obj record;
BEGIN
-- Loop through dropped objects
FOR obj IN SELECT object_type, schema_name, object_name FROM pg_event_trigger_dropped_objects()
LOOP
IF obj.object_type = 'table' AND obj.schema_name = '${documentsSchema}' AND obj.object_name = '${documentsTable}' THEN
DROP SCHEMA IF EXISTS ${schemaName} CASCADE;
END IF;
END LOOP;
END;
$func$;
-- Create the event trigger
DROP EVENT TRIGGER IF EXISTS sync_${schemaName}_on_drop_table;
CREATE EVENT TRIGGER sync_${schemaName}_on_drop_table
ON sql_drop
EXECUTE FUNCTION ${schemaName}.drop_ragmatic_schema();
END;
$do$;
`);
logger.debug(
"Event trigger created successfully. The ragmatic schema will be dropped if you drop the documents table."
);
} else {
logger.warn(
"Client does not have superuser privileges. Event trigger for table drops was not created. You can safely ignore this, but note that this ragmatic schema will not be dropped if you drop the documents table."
);
}
await client.query(sql` DELETE FROM ${schemaName}.${WORK_QUEUE_TABLE} `);
await client.query(sql`
INSERT INTO
${shadowTable} (doc_id)
SELECT
id
FROM
${documentsSchema}.${documentsTable} d
WHERE
NOT EXISTS (
SELECT
1
FROM
${shadowTable} s
WHERE
s.doc_id = d.id
);
`);
await client.query(sql`COMMIT`);
logger.info("Database setup complete", {
trackerName,
documentsTable,
schemaName,
shadowTable,
chunksTable,
embeddingDimension,
chunkIndexesCreated: !skipEmbeddingIndexSetup
});
logger.debug("Sample search query", {
query: `
SELECT *, 1 - (embedding <=> '[0.1, 0.2, ...]'::vector) as similarity
FROM ${chunksTable}
ORDER BY similarity
LIMIT 5;
`
});
} catch (err) {
logger.error("Error during database setup", {
error: err instanceof Error ? err.message : String(err),
stack: err instanceof Error ? err.stack : void 0,
trackerName,
documentsTable,
schemaName
});
await client.query(sql`ROLLBACK`);
throw err;
} finally {
logger.debug("Closing database connection");
await client.end?.();
}
}
var ErrorType = /* @__PURE__ */ ((ErrorType2) => {
ErrorType2["Temporary"] = "temporary";
ErrorType2["Permanent"] = "permanent";
return ErrorType2;
})(ErrorType || {});
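// Temporary errors (e.g. a failed embedding call) are logged as warnings by the polling loop;
// Permanent errors (missing setup, malformed embeddings) are logged as errors. Failed jobs are
// retried up to maxRetries before being marked 'failed'.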
class ProcessingError extends Error {
constructor(message, type, cause) {
super(message);
this.type = type;
this.cause = cause;
this.name = "ProcessingError";
}
}
class Worker {
constructor(config) {
this.config = config;
this.running = null;
this.connected = false;
this.timer = null;
this.createJobsTimer = null;
this.createJobsRunning = null;
// Config:
this.workerId = crypto.randomUUID();
// Config loaded from db:
this.shadowTable = null;
this.chunksTable = null;
this.documentsTable = null;
this.embeddingDimension = null;
this.docIdType = null;
this.pollingInterval = config.pollingIntervalMs || 1e3;
this.maxRetries = config.maxRetries || 3;
this.initialRetryDelay = config.initialRetryDelayMs || 1e3;
this.batchSize = config.batchSize || 5;
this.stalledJobTimeoutMinutes = config.stalledJobTimeoutMinutes || 1;
if (!config.dbClient && !config.connectionString) {
throw new Error("Either dbClient or connectionString must be provided");
}
this.pool = new pg.Pool({ connectionString: config.connectionString });
this.schemaName = `${PREFIX}${config.trackerName.replace(/[^a-zA-Z0-9_]/g, "_")}`;
this.chunkGenerator = config.chunkGenerator || defaultChunkGenerator;
this.embeddingGenerator = config.embeddingGenerator || defaultEmbeddingGenerator;
this.hashFunction = config.hashFunction || defaultHash;
this.logger = createLogger({
...config.logger,
trackerName: config.trackerName
});
this.logger.debug("Worker instance created", {
workerId: this.workerId,
trackerName: config.trackerName,
batchSize: this.batchSize,
pollingInterval: this.pollingInterval
});
}
async loadConfig() {
const configRes = await this.pool.query(sql`
SELECT
key,
value
FROM
${this.schemaName}.config
`);
const configMap = configRes.rows.reduce(
(acc, row) => {
acc[row.key] = row.value || null;
return acc;
},
{}
);
this.documentsTable = configMap.documentsTable;
this.shadowTable = configMap.shadowTable;
this.chunksTable = configMap.chunksTable;
this.embeddingDimension = Number(configMap.embeddingDimension);
this.docIdType = configMap.docIdType;
if (!this.documentsTable || !this.shadowTable || !this.chunksTable || !this.embeddingDimension || !this.docIdType) {
throw new ProcessingError(
"Missing config values. Please run setup() first.",
ErrorType.Permanent
);
}
}
// Start the worker by connecting to the database and beginning the polling loop.
async start() {
this.logger.info("Starting worker", { workerId: this.workerId });
if (!("query" in this.pool)) {
const error = new Error(
"Invalid database client. Please use a pg.Client compatible client or pass a connection string instead."
);
this.logger.error("Failed to start worker", {
error: error.message,
workerId: this.workerId
});
throw error;
}
if (!this.connected) {
try {
const schemaCheck = await this.pool.query(sql`
SELECT
1
FROM
pg_namespace
WHERE
nspname = '${this.schemaName}'
`);
if (schemaCheck.rowCount === 0) {
const error = new ProcessingError(
`Schema ${this.schemaName} does not exist. Please run setup() first.`,
ErrorType.Permanent
);
this.logger.error("Schema not found", {
error: error.message,
schema: this.schemaName,
workerId: this.workerId
});
throw error;
}
this.logger.debug("Loading configuration from database");
await this.loadConfig();
this.connected = true;
this.logger.info("Worker connected to database", {
schema: this.schemaName,
embeddingDimension: this.embeddingDimension,
documentsTable: this.documentsTable,
workerId: this.workerId
});
} catch (error) {
this.logger.error("Failed to connect to database", {
error: error instanceof Error ? error.message : String(error),
schema: this.schemaName,
workerId: this.workerId
});
throw error;
}
}
this.logger.info("Worker started", { workerId: this.workerId });
await this.runCreateJobs();
await this.run();
}
async pause() {
this.logger.info("Pausing worker", { workerId: this.workerId });
if (this.createJobsTimer) {
clearTimeout(this.createJobsTimer);
this.createJobsTimer = null;
}
if (this.createJobsRunning) {
await this.createJobsRunning;
this.createJobsRunning = null;
}
if (this.timer) {
clearTimeout(this.timer);
this.timer = null;
}
if (this.running) {
await this.running;
this.running = null;
}
this.logger.info("Worker paused", { workerId: this.workerId });
}
// Stop the worker gracefully by stopping the polling loop and disconnecting.
async stop() {
this.logger.info("Stopping worker", { workerId: this.workerId });
await this.pause();
await this.pool.end();
this.connected = false;
this.logger.info("Worker stopped", { workerId: this.workerId });
}
async runCreateJobs() {
if (this.createJobsRunning) return;
this.createJobsTimer = setTimeout(async () => {
if (!this.connected || this.createJobsRunning) return;
try {
this.createJobsRunning = this.createJobs();
await this.createJobsRunning;
} catch (err) {
// createJobs() already logs its own failures; swallow the rejection here so a transient
// error does not become an unhandled rejection and silently stop the scheduling loop.
} finally {
this.createJobsRunning = null;
}
await this.runCreateJobs();
}, this.pollingInterval);
}
async run() {
if (this.running) return;
this.timer = setTimeout(async () => {
if (!this.connected || this.running) return;
this.running = this.poll();
await this.running;
this.running = null;
await this.run();
}, this.pollingInterval);
}
// The polling loop: find and claim pending (or stalled) jobs from the work queue, process them, and let run() schedule the next poll.
async poll() {
try {
this.logger.debug("Starting polling cycle", { workerId: this.workerId });
this.logger.debug("Finding and claiming pending jobs");
const jobs = await this.findAndClaimJobs();
if (jobs.length > 0) {
this.logger.info("Claimed jobs for processing", {
jobCount: jobs.length,
workerId: this.workerId
});
} else {
this.logger.debug("No jobs to process", { workerId: this.workerId });
}
await this.processJobs(jobs);
this.logger.debug("Completed polling cycle", { workerId: this.workerId });
} catch (err) {
if (err instanceof ProcessingError && err.type === ErrorType.Temporary) {
this.logger.warn("Temporary failure during processing", {
error: err.message,
cause: err.cause ? err.cause instanceof Error ? err.cause.message : String(err.cause) : void 0,
workerId: this.workerId
});
} else {
this.logger.error("Error during processing", {
error: err instanceof Error ? err.message : String(err),
stack: err instanceof Error ? err.stack : void 0,
workerId: this.workerId
});
}
}
}
async createJobs() {
this.logger.debug("Creating jobs from outdated shadow records");
const client = await this.pool.connect();
try {
await client.query(sql`
SET
SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL READ COMMITTED
`);
await client.query(sql`BEGIN`);
this.logger.debug("Finding documents that need processing");
const res = await client.query(sql`
WITH
latest_shadow_clocks AS (
SELECT
s.doc_id,
s.vector_clock
FROM
${this.shadowTable} s
),
latest_chunk_clocks AS (
SELECT
doc_id,
MAX(vector_clock) AS max_chunk_vector_clock
FROM
${this.chunksTable}
GROUP BY
doc_id
),
-- Select the documents that have a shadow clock greater than the max chunk clock
-- aka. the documents that have outdated or missing chunks...
work_needed AS (
SELECT
s.doc_id,
s.vector_clock AS shadow_clock,
COALESCE(c.max_chunk_vector_clock, 0) AS chunk_clock
FROM
latest_shadow_clocks s
LEFT JOIN latest_chunk_clocks c ON s.doc_id = c.doc_id
WHERE
s.vector_clock > COALESCE(c.max_chunk_vector_clock, 0)
),
-- ...and skip the (document, vector_clock) pairs that are already in the work queue
current_work AS (
SELECT
doc_id,
vector_clock
FROM
${this.schemaName}.${WORK_QUEUE_TABLE}
)
SELECT
w.doc_id,
w.shadow_clock,
w.chunk_clock
FROM
work_needed w
LEFT JOIN current_work cw ON w.doc_id = cw.doc_id
AND w.shadow_clock = cw.vector_clock
WHERE
cw.doc_id IS NULL -- Only select work not already in the queue
ORDER BY
w.shadow_clock - w.chunk_clock DESC, -- Prioritize documents most out of sync
w.shadow_clock ASC -- Then older documents first
LIMIT
${this.batchSize};
`);
if (res.rows.length > 0) {
this.logger.info("Creating new jobs in work queue", {
jobCount: res.rows.length,
workerId: this.workerId,
docIds: res.rows.map((r) => r.doc_id)
});
await client.query(sql`
INSERT INTO
${this.schemaName}.${WORK_QUEUE_TABLE} (doc_id, vector_clock, status)
VALUES
${res.rows.map(
(r) => sql`
(
${r.doc_id},
${r.shadow_clock},
'pending'
)
`
).join(",")}
ON CONFLICT DO NOTHING;
`);
} else {
this.logger.debug("No new jobs to create", { workerId: this.workerId });
}
await client.query(sql`COMMIT`);
} catch (err) {
await client.query(sql`ROLLBACK`);
this.logger.error("Failed to create jobs", {
error: err instanceof Error ? err.message : String(err),
workerId: this.workerId
});
throw err;
} finally {
client.release();
}
}
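// Claiming uses SELECT ... FOR UPDATE SKIP LOCKED so that concurrent workers never block on the
// same queue rows; 'processing' jobs whose processing_started_at is older than
// stalledJobTimeoutMinutes are treated as stalled and can be reclaimed by another worker.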
async findUnclaimedOrStalledJobs(client) {
this.logger.debug("Finding unclaimed or stalled jobs", {
workerId: this.workerId
});
const res = await client.query(sql`
SELECT
doc_id,
vector_clock,
status,
processing_started_at,
worker_id,
error,
retry_count,
created_at
FROM
${this.schemaName}.${WORK_QUEUE_TABLE}
WHERE
status = 'pending'
OR (
status = 'processing'
AND processing_started_at < NOW() - INTERVAL '${this.stalledJobTimeoutMinutes} minutes'
)
FOR UPDATE
SKIP LOCKED
LIMIT
${this.batchSize}
`);
this.logger.debug("Found jobs", {
jobIdsAndVectorClocks: res.rows.map(
(r) => ({
doc_id: r.doc_id,
vector_clock: r.vector_clock
})
),
jobCount: res.rows.length,
workerId: this.workerId
});
return res.rows;
}
async claimJobs(client, jobs, worker_id) {
if (jobs.length === 0) {
return [];
}
const conditions = [];
const params = [worker_id];
let paramIndex = 2;
for (const job of jobs) {
conditions.push(
`(doc_id = $${paramIndex} AND vector_clock = $${paramIndex + 1})`
);
params.push(job.doc_id, job.vector_clock);
paramIndex += 2;
}
const whereClause = conditions.join(" OR ");
const query = `
UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE}
SET
status = 'processing',
processing_started_at = NOW(),
worker_id = $1
WHERE
(${whereClause})
AND (
status = 'pending'
OR (
status = 'processing'
AND processing_started_at < NOW() - INTERVAL '${this.stalledJobTimeoutMinutes} minutes'
)
)
RETURNING *
`;
const res = await client.query(query, params);
this.logger.debug("Claimed jobs", {
jobIdsAndVectorClocks: res.rows.map(
(r) => ({
doc_id: r.doc_id,
vector_clock: r.vector_clock
})
),
jobCount: res.rows.length,
workerId: this.workerId
});
return res.rows;
}
async findAndClaimJobs() {
this.logger.debug("Finding and claiming jobs", {
workerId: this.workerId
});
const client = await this.pool.connect();
try {
await client.query(sql`
SET
SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL READ COMMITTED
`);
await client.query(sql`BEGIN`);
let jobs = await this.findUnclaimedOrStalledJobs(client);
jobs = await this.claimJobs(client, jobs, this.workerId);
await client.query(sql`COMMIT`);
this.logger.debug("Claimed jobs", {
jobCount: jobs.length,
workerId: this.workerId
});
return jobs;
} catch (err) {
this.logger.error("Failed to find and claim jobs", {
error: err instanceof Error ? err.message : String(err),
workerId: this.workerId
});
await client.query(sql`ROLLBACK`);
throw err;
} finally {
client.release();
}
}
async processJobs(jobs) {
for (const job of jobs) {
await this.processJob(job);
}
}
async skipJob(doc_id, vector_clock, error) {
this.logger.debug("Skipping job", {
docId: doc_id,
vectorClock: vector_clock,
error
});
await this.pool.query(
sql`
UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE}
SET
status = 'skipped',
error = $3,
completed_at = NOW()
WHERE
doc_id = $1
AND vector_clock = $2
`,
[doc_id, vector_clock, error]
);
}
async failJob(doc_id, vector_clock, error) {
this.logger.debug("Failing job", {
docId: doc_id,
vectorClock: vector_clock,
error: error instanceof Error ? error.message : String(error)
});
await this.pool.query(
sql`
UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE}
SET
status = 'failed',
error = $3,
completed_at = NOW()
WHERE
doc_id = $1
AND vector_clock = $2
`,
[doc_id, vector_clock, error instanceof Error ? error.message : String(error)]
);
}
async getLatestJob(doc_id) {
const res = await this.pool.query(
sql`
SELECT
vector_clock
FROM
${this.schemaName}.${WORK_QUEUE_TABLE}
WHERE
doc_id = $1
ORDER BY
vector_clock DESC
LIMIT
1
`,
[doc_id]
);
if (res.rows.length === 0) {
return null;
}
return { vector_clock: res.rows[0].vector_clock };
}
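// processJob: re-chunk the document, embed only the chunks whose hash changed, delete chunks that
// no longer exist, and commit only if this job's vector_clock still matches the latest shadow
// clock (optimistic concurrency). If a newer clock exists, the transaction is rolled back and the
// job is marked 'skipped' in favour of the newer job.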
async processJob(job) {
this.logger.debug("Processing job", {
docId: job.doc_id,
vectorClock: job.vector_clock,
workerId: this.workerId
});
const latestJob = await this.getLatestJob(job.doc_id);
// node-postgres returns BIGINT columns as strings, so compare the clocks numerically
if (latestJob && Number(latestJob.vector_clock) > Number(job.vector_clock)) {
this.logger.debug("Newer job found, skipping", {
docId: job.doc_id,
vectorClock: job.vector_clock,
latestVectorClock: latestJob.vector_clock
});
await this.skipJob(job.doc_id, job.vector_clock, "Newer job found");
return;
}
const docRes = await this.pool.query(
sql`
SELECT
*
FROM
${this.documentsTable}
WHERE
id = $1
`,
[job.doc_id]
);
if (docRes.rows.length === 0) {
this.logger.debug("Document already deleted, skipping", {
docId: job.doc_id,
vectorClock: job.vector_clock
});
await this.skipJob(job.doc_id, job.vector_clock, "Document deleted");
return;
}
const doc = docRes.rows[0];
const chunks = await this.chunkGenerator(doc);
const newChunkHashes = await Promise.all(
chunks.map(async (c, i) => {
const hash = await this.hashFunction(c);
return `${hash}-${i}`;
})
);
const client = await this.pool.connect();
try {
await client.query(sql`BEGIN`);
const { deduplicatedNew, deduplicatedOld } = await this.dedupeAndRemoveOld(client, job.doc_id, newChunkHashes);
const toEmbed = chunks.map((c, index) => ({ ...c, index, hash: newChunkHashes[index] })).filter((chunk, i) => deduplicatedNew.has(chunk.hash));
const newChunks = await Promise.all(
toEmbed.map(
async (chunk, i) => this.generateEmbeddingForChunk(
chunk,
i,
chunk.hash,
job.doc_id,
job.vector_clock
)
)
);
await this.insertNewChunks(client, job.doc_id, newChunks);
await this.removeOldChunks(
client,
job.doc_id,
Array.from(deduplicatedOld)
);
await client.query(
sql`
UPDATE ${this.chunksTable}
SET
vector_clock = $1
WHERE
doc_id = $2
`,
[job.vector_clock, job.doc_id]
);
const updateRes = await client.query(
sql`
WITH
latest_shadow_clock AS (
SELECT
vector_clock AS latest_vector_clock
FROM
${this.shadowTable}
WHERE
doc_id = $1
),
updated AS (
UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE}
SET
status = 'completed',
completed_at = NOW()
WHERE
doc_id = $1
AND worker_id = $3
AND vector_clock = $2
AND vector_clock = (
SELECT
latest_vector_clock
FROM
latest_shadow_clock
)
RETURNING
*
)
SELECT
*
FROM
updated
`,
[job.doc_id, job.vector_clock, this.workerId]
);
if (updateRes.rowCount === 0) {
this.logger.debug(
"Vector clock is no longer latest or worker is not the job owner, rolling back",
{
docId: job.doc_id,
vectorClock: job.vector_clock,
workerId: this.workerId
}
);
await client.query(sql`ROLLBACK`);
const shadowRes = await client.query(
sql`
SELECT
vector_clock
FROM
${this.shadowTable}
WHERE
doc_id = $1
`,
[job.doc_id]
);
if (Number(shadowRes.rows[0]?.vector_clock) > Number(job.vector_clock)) {
this.logger.debug(
"Vector clock is no longer latest, newer job found, skipping",
{
docId: job.doc_id,
vectorClock: job.vector_clock,
latestVectorClock: shadowRes.rows[0]?.vector_clock
}
);
await this.skipJob(
job.doc_id,
job.vector_clock,
"Vector clock is no longer latest, newer job found"
);
}
} else {
await client.query(sql`COMMIT`);
}
} catch (err) {
await client.query(sql`ROLLBACK`);
if (job.retry_count < this.maxRetries) {
await this.retryJob(job, err);
return;
} else {
await this.failJob(job.doc_id, job.vector_clock, err);
}
} finally {
client.release();
}
}
async retryJob(job, err) {
this.logger.info("Retrying job", {
docId: job.doc_id,
vectorClock: job.vector_clock,
error: err instanceof Error ? err.message : String(err),
workerId: this.workerId,
retryCount: job.retry_count + 1
});
await this.pool.query(
sql`
UPDATE ${this.schemaName}.${WORK_QUEUE_TABLE}
SET
status = 'pending',
processing_started_at = NULL,
worker_id = NULL,
error = $3,
retry_count = retry_count + 1
WHERE
doc_id = $1
AND vector_clock = $2
`,
[job.doc_id, job.vector_clock, err instanceof Error ? err.message : String(err)]
);
}
async dedupeAndRemoveOld(client, doc_id, newChunkHashes) {
const existingChunks = await client.query(
sql`
SELECT
chunk_hash
FROM
${this.chunksTable}
WHERE
doc_id = $1
`,
[doc_id]
);
const existingHashes = new Set(
existingChunks.rows.map((r) => r.chunk_hash)
);
const deduplicatedNew = new Set(
newChunkHashes.filter((hash) => !existingHashes.has(hash))
);
const deduplicatedOld = new Set(
[...existingHashes].filter(
(hash) => !newChunkHashes.includes(hash)
)
);
return { deduplicatedNew, deduplicatedOld };
}
async removeOldChunks(client, doc_id, chunk_hashes) {
await client.query(
sql`
DELETE FROM ${this.chunksTable}
WHERE
doc_id = $1
AND chunk_hash = ANY ($2)
`,
[doc_id, chunk_hashes]
);
}
async generateEmbeddingForChunk(chunk, index, hash, doc_id, vector_clock) {
try {
const { embedding, ...rest } = await this.embeddingGenerator(
chunk,
index
);
if (!Array.isArray(embedding)) {
throw new ProcessingError(
`Invalid embedding format: expected number[], got ${typeof embedding}`,
ErrorType.Permanent
);
}
if (embedding.length !== this.embeddingDimension) {
throw new ProcessingError(
`Invalid embedding dimension: expected ${this.embeddingDimension}, got ${embedding.length}`,
ErrorType.Permanent
);
}
if (!embedding.every((n) => typeof n === "number")) {
throw new ProcessingError(
"Invalid embedding: all elements must be numbers",
ErrorType.Permanent
);
}
const pgvectorEmbedding = `[${embedding.join(",")}]`;
return {
...rest,
hash,
embedding: pgvectorEmbedding,
index,
vector_clock
};
} catch (err) {
if (err instanceof ProcessingError) {
throw err;
}
if (err instanceof Error) {
throw new ProcessingError(
`Error generating embedding for doc_id ${doc_id}: ${err.message}`,
ErrorType.Temporary,
err
);
}
throw new ProcessingError(
`Error generating embedding for doc_id ${doc_id}: Unknown error`,
ErrorType.Temporary,
err
);
}
}
async insertNewChunks(client, doc_id, chunks) {
if (chunks.length === 0) return;
const values = chunks.map(
(_, i) => `($1, $${i * 7 + 2}, $${i * 7 + 3}, $${i * 7 + 4}, $${i * 7 + 5}, $${i * 7 + 6}, $${i * 7 + 7}, $${i * 7 + 8})`
).join(",");
const params = [
doc_id,
...chunks.flatMap((c) => [
c.hash,
"text" in c ? c.text : null,
"json" in c ? c.json : null,
"blob" in c ? c.blob : null,
c.embedding,
c.index,
c.vector_clock
])
];
await client.query(
sql`
INSERT INTO
${this.chunksTable} (
doc_id,
chunk_hash,
chunk_text,
chunk_json,
chunk_blob,
embedding,
index,
vector_clock
)
VALUES
${values}
`,
params
);
}
}
async function defaultHash(chunk) {
let hash = "";
const { blob, ...rest } = chunk;
if (blob instanceof Blob) {
const arrayBuffer = await chunk.blob.arrayBuffer();
hash = crypto.createHash("md5").update(Buffer.from(arrayBuffer)).digest("hex");
}
hash += crypto.createHash("md5").update(JSON.stringify(rest)).digest("hex");
return hash;
}
function defaultChunkGenerator(doc) {
const { text, blob, ...json } = { ...doc };
return Promise.resolve([{ text, json: { ...json, text }, blob }]);
}
function defaultEmbeddingGenerator(chunk, index) {
return Promise.resolve({
embedding: [],
text: chunk.text,
json: chunk.json,
blob: chunk.blob
});
}
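// The default generator above returns an empty embedding and only exists as a placeholder; real
// usage must supply an embeddingGenerator whose output length matches embeddingDimension.
// A sketch of a custom generator, assuming a hypothetical embedText(text) -> Promise<number[]>
// helper (not part of this file):
//
//   async function myEmbeddingGenerator(chunk, index) {
//     const embedding = await embedText(chunk.text);
//     return { embedding, text: chunk.text, json: chunk.json, blob: chunk.blob };
//   }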
async function getTrackerConfig(connectionStringOrClient, trackerName) {
const client = typeof connectionStringOrClient === "string" ? new pg.Client({
connectionString: connectionStringOrClient
}) : connectionStringOrClient;
const ownClient = typeof connectionStringOrClient === "string";
try {
if (ownClient) {
await client.connect();
}
const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_");
const schemaName = `${PREFIX}${sanitizedTrackerName}`;
const schemaQuery = `
SELECT schema_name
FROM information_schema.schemata
WHERE schema_name = '${schemaName}';
`;
const schemaCheckResult = await client.query(schemaQuery);
if (schemaCheckResult.rows.length === 0) {
throw new Error(`Tracker "${trackerName}" not found`);
}
const configQuery = `
SELECT key, value
FROM "${schemaName}".config;
`;
const configResult = await client.query(configQuery);
if (configResult.rows.length === 0) {
throw new Error(`Configuration for tracker "${trackerName}" not found`);
}
const config = {};
configResult.rows.forEach((row) => {
config[row.key] = row.value;
});
return {
trackerName: sanitizedTrackerName,
documentsTable: config.documentsTable || "",
embeddingDimension: parseInt(config.embeddingDimension || "0", 10),
shadowTable: config.shadowTable || "",
chunksTable: config.chunksTable || "",
docIdType: config.docIdType || "INT"
};
} finally {
if (ownClient) {
await client.end();
}
}
}
async function countRemainingDocuments(connectionStringOrClient, trackerName) {
const client = typeof connectionStringOrClient === "string" ? new pg.Client({
connectionString: connectionStringOrClient
}) : connectionStringOrClient;
const ownClient = typeof connectionStringOrClient === "string";
try {
if (ownClient) {
await client.connect();
}
const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_");
const schemaName = `${PREFIX}${sanitizedTrackerName}`;
const schemaQuery = `
SELECT schema_name
FROM information_schema.schemata
WHERE schema_name = '${schemaName}';
`;
const schemaCheckResult = await client.query(schemaQuery);
if (schemaCheckResult.rows.length === 0) {
throw new Error(`Tracker "${trackerName}" not found`);
}
const countPendingJobsQuery = `
SELECT COUNT(*) as count
FROM ${schemaName}.${WORK_QUEUE_TABLE}
WHERE status = 'pending' OR status = 'processing';
`;
const countWorkQueueResult = await client.query(countPendingJobsQuery);
return parseInt(countWorkQueueResult.rows[0].count, 10);
} finally {
if (ownClient) {
await client.end();
}
}
}
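// reprocessDocuments: bump every shadow vector_clock (and insert missing shadow rows) so all
// documents are picked up again, and blank the stored chunk hashes so every chunk is re-embedded
// instead of being deduplicated away.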
async function reprocessDocuments(connectionStringOrClient, trackerName) {
const client = typeof connectionStringOrClient === "string" ? new pg.Client({
connectionString: connectionStringOrClient
}) : connectionStringOrClient;
const ownClient = typeof connectionStringOrClient === "string";
try {
if (ownClient) {
await client.connect();
}
const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_");
const schemaName = `${PREFIX}${sanitizedTrackerName}`;
const schemaQuery = `
SELECT schema_name
FROM information_schema.schemata
WHERE schema_name = '${schemaName}';
`;
const schemaCheckResult = await client.query(schemaQuery);
if (schemaCheckResult.rows.length === 0) {
throw new Error(`Tracker "${trackerName}" not found`);
}
const config = await getTrackerConfig(client, trackerName);
const documentsTable = config.documentsTable;
await client.query("BEGIN");
const shadowTable = config.shadowTable;
const docIdsQuery = `
SELECT id FROM ${documentsTable}
`;
const docIdsResult = await client.query(docIdsQuery);
for (const doc of docIdsResult.rows) {
const incQuery = `
UPDATE ${shadowTable}
SET vector_clock = vector_clock + 1
WHERE doc_id = $1;
`;
await client.query(incQuery, [doc.id]);
}
const insertQuery = `
INSERT INTO ${shadowTable} (doc_id, vector_clock)
SELECT id, 1
FROM ${documentsTable} d
WHERE NOT EXISTS (
SELECT 1
FROM ${shadowTable} s
WHERE s.doc_id = d.id
);
`;
await client.query(insertQuery);
const chunksTable = config.chunksTable;
const updateChunkHashesQuery = `
UPDATE ${chunksTable}
SET chunk_hash = ''
WHERE chunk_hash IS NOT NULL;
`;
await client.query(updateChunkHashesQuery);
await client.query("COMMIT");
} catch (error) {
await client.query("ROLLBACK");
throw error;
} finally {
if (ownClient) {
await client.end();
}
}
}
async function destroyTracker(connectionStringOrClient, trackerName) {
const client = typeof connectionStringOrClient === "string" ? new pg.Client({
connectionString: connectionStringOrClient
}) : connectionStringOrClient;
const ownClient = typeof connectionStringOrClient === "string";
try {
if (ownClient) {
await client.connect();
}
const sanitizedTrackerName = trackerName.replaceAll(/[^a-zA-Z0-9_]/g, "_");
const schemaName = `${PREFIX}${sanitizedTrackerName}`;
const schemaQuery = `
SELECT schema_name
FROM information_schema.schemata
WHERE schema_name = '${schemaName}';
`;
const schemaCheckResult = await client.query(schemaQuery);
if (schemaCheckResult.rows.length === 0) {
throw new Error(`Tracker "${trackerName}" not found`);
}
const configQuery = `
SELECT value
FROM "${schemaName}".config
WHERE key = 'documentsTable';
`;
const configResult = await client.query(configQuery);
if (configResult.rows.length === 0) {
throw new Error(
`Documents table configuration for tracker "${trackerName}" not found`
);
}
const documentsTable = configResult.rows[0].value;
await client.query("BEGIN");
const dropTriggerQuery = `
DROP TRIGGER IF EXISTS sync_${schemaName}_to_shadow ON ${documentsTable};
`;
await client.query(dropTriggerQuery);
const dropSchemaQuery = `
DROP SCHEMA "${schemaName}" CASCADE;
`;
await client.query(dropSchemaQuery);
await client.query("COMMIT");
} catch (error) {
await client.query("ROLLBACK");
throw error;
} finally {
if (ownClient) {
await client.end();
}
}
}
const _RAGmatic = class _RAGmatic {
constructor(worker, name, connectionString, dbClient) {
this.worker = worker;
this.name = name;
this.connectionString = connectionString;
this.dbClient = dbClient;
}
static async create(config) {
if (_RAGmatic.instances.has(config.name)) {
return _RAGmatic.instances.get(config.name);
}
await setup({
connectionString: config.connectionString,
dbClient: config.dbClient,
trackerName: config.name,
documentsTable: config.tableToWatch,
docIdType: config.docIdType,
embeddingDimension: config.embeddingDimension,
skipEmbeddingIndexSetup: config.skipEmbeddingIndexSetup,
logger: config.logger
});
const worker = new Worker({
connectionString: config.connectionString,
dbClient: config.dbClient,
trackerName: config.name,
chunkGenerator: config.recordToChunksFunction,
hashFunction: config.hashFunction,
embeddingGenerator: config.chunkToEmbeddingFunction,
pollingIntervalMs: config.pollingIntervalMs,
batchSize: config.batchSize,
maxRetries: config.maxRetries,
initialRetryDelayMs: config.initialRetryDelayMs,
stalledJobTimeoutMinutes: config.stalledJobTimeoutMinutes,
logger: config.logger
});
const ragmatic = new _RAGmatic(
worker,
config.name,
config.connectionString,
config.dbClient
);
_RAGmatic.instances.set(config.name, ragmatic);
return ragmatic;
}
async destroy() {
await destroyTracker(
this.connectionString ?? this.dbClient,
this.name
);
_RAGmatic.instances.delete(this.name);
}
async start() {
return await this.worker.start();
}
async stop() {
return await this.worker.stop();
}
async reprocessAll() {
return await reprocessDocuments(
this.connectionString ?? this.dbClient,
this.name
);
}
async countRemainingDocuments() {
return countRemainingDocuments(
this.connectionString ?? this.dbClient,
this.name
);
}
};
// create singleton instances per tracker name
_RAGmatic.instances = /* @__PURE__ */ new Map();
let RAGmatic = _RAGmatic;
exports.RAGmatic = RAGmatic;
exports.createLogger = createLogger;
exports.logger = logger;
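// End-to-end usage sketch (the connection string, table and the embedText() helper below are
// illustrative placeholders, not part of this module):
//
//   const ragmatic = await RAGmatic.create({
//     name: "articles",
//     connectionString: "postgres://user:pass@localhost:5432/mydb",
//     tableToWatch: "public.articles",
//     embeddingDimension: 1536,
//     recordToChunksFunction: async (doc) => [{ text: doc.body, json: { title: doc.title } }],
//     chunkToEmbeddingFunction: async (chunk) => ({ ...chunk, embedding: await embedText(chunk.text) }),
//   });
//   await ragmatic.start();                   // begins polling and embedding
//   await ragmatic.countRemainingDocuments(); // pending/processing jobs left
//   await ragmatic.stop();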