UNPKG

traceprompt-node

Version:

Client-side encrypted, audit-ready logging for LLM applications

1,675 lines (1,662 loc) 73.7 kB
import { performance } from 'perf_hooks'; import * as fs from 'fs'; import { createReadStream } from 'fs'; import * as path2 from 'path'; import path2__default from 'path'; import * as yaml from 'yaml'; import { buildClient, CommitmentPolicy, KmsKeyringNode } from '@aws-crypto/client-node'; import { Registry, Histogram, Counter, Gauge } from 'prom-client'; import winston from 'winston'; import { blake3 } from '@napi-rs/blake-hash'; import fs2 from 'fs/promises'; import { randomUUID, createHmac } from 'crypto'; import { createInterface } from 'readline'; import { fetch as fetch$1 } from 'undici'; import winkNLP from 'wink-nlp'; import model from 'wink-eng-lite-web-model'; import its from 'wink-nlp/src/its.js'; import nlp2 from 'compromise'; var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, { get: (a, b) => (typeof require !== "undefined" ? require : a)[b] }) : x)(function(x) { if (typeof require !== "undefined") return require.apply(this, arguments); throw Error('Dynamic require of "' + x + '" is not supported'); }); async function fetchHmacSecret(apiKey, ingestUrl) { try { const hmacUrl = `${ingestUrl.replace("/v1/ingest", "")}/v1/hmac-secret`; const response = await fetch(hmacUrl, { method: "GET", headers: { "x-api-key": apiKey, "Content-Type": "application/json" } }); if (!response.ok) { let errorMessage = `${response.status} ${response.statusText}`; try { const errorBody = await response.json(); if (errorBody.message) { errorMessage = errorBody.message; } else if (errorBody.error) { errorMessage = errorBody.error; } } catch { } throw new Error(`Failed to fetch HMAC secret: ${errorMessage}`); } const result = await response.json(); if (!result.success || !result.data?.hmacSecret) { throw new Error("Invalid HMAC secret response"); } return result.data.hmacSecret; } catch (error) { throw new Error(`Failed to fetch HMAC secret: ${error.message}`); } } async function resolveOrgFromApiKey(apiKey, ingestUrl) { try { const whoamiUrl = `${ingestUrl.replace("/v1/ingest", "")}/v1/whoami`; const response = await fetch(whoamiUrl, { method: "GET", headers: { "x-api-key": apiKey, "Content-Type": "application/json" } }); if (!response.ok) { let errorMessage = `${response.status} ${response.statusText}`; try { const errorBody = await response.json(); if (errorBody.message) { errorMessage = errorBody.message; } else if (errorBody.error) { errorMessage = errorBody.error; } } catch { } throw new Error(`Failed to resolve organization: ${errorMessage}`); } const result = await response.json(); if (!result.success) { throw new Error("Failed to resolve organization from API key"); } const orgId = result.data.orgId; if (!orgId) { throw new Error("No organization ID found in API key response"); } const cmkArn = result.data.kmsKeyArn; console.log(`\u2713 Traceprompt auto-resolved organization: ${orgId}`); return { orgId, cmkArn }; } catch (error) { throw new Error( `Failed to auto-resolve organization from API key: ${error instanceof Error ? error.message : String(error)}` ); } } function readYaml(filePath) { try { const abs = path2.resolve(process.cwd(), filePath); if (!fs.existsSync(abs)) return {}; const raw = fs.readFileSync(abs, "utf8"); return yaml.parse(raw) ?? {}; } catch { return {}; } } var ConfigManagerClass = class { async load(userCfg = {}) { if (this._cfg) return; if (this._loadPromise) { await this._loadPromise; return; } this._loadPromise = this._doLoad(userCfg); await this._loadPromise; } async _doLoad(userCfg = {}) { const fileCfg = process.env["TRACEPROMPT_RC"] ? readYaml(process.env["TRACEPROMPT_RC"]) : {}; const envCfg = { ...process.env["TRACEPROMPT_API_KEY"] && { apiKey: process.env["TRACEPROMPT_API_KEY"] }, ...process.env["TRACEPROMPT_INGEST_URL"] && { ingestUrl: process.env["TRACEPROMPT_INGEST_URL"] }, ...process.env["TRACEPROMPT_BATCH_SIZE"] && { batchSize: Number(process.env["TRACEPROMPT_BATCH_SIZE"]) }, ...process.env["TRACEPROMPT_FLUSH_INTERVAL_MS"] && { flushIntervalMs: Number(process.env["TRACEPROMPT_FLUSH_INTERVAL_MS"]) }, ...process.env["TRACEPROMPT_LOG_LEVEL"] && { logLevel: process.env["TRACEPROMPT_LOG_LEVEL"] } }; const merged = { apiKey: "", ingestUrl: "https://api-staging.traceprompt.com/v1/ingest", batchSize: 25, flushIntervalMs: 2e3, staticMeta: {}, logLevel: "info", // Default to info level ...fileCfg, ...envCfg, ...userCfg }; if (!merged.apiKey) throw new Error("Traceprompt: apiKey is required"); let orgId; let cmkArn; let hmacSecret; try { const resolved = await resolveOrgFromApiKey( merged.apiKey, merged.ingestUrl ); orgId = resolved.orgId; cmkArn = resolved.cmkArn; hmacSecret = await fetchHmacSecret(merged.apiKey, merged.ingestUrl); } catch (error) { throw new Error( `Failed to auto-resolve organization or HMAC secret: ${error instanceof Error ? error.message : String(error)}` ); } if (merged.batchSize <= 0) merged.batchSize = 25; if (merged.flushIntervalMs <= 0) merged.flushIntervalMs = 2e3; this._cfg = { ...merged, orgId, cmkArn, hmacSecret, apiKey: merged.apiKey, ingestUrl: merged.ingestUrl }; } get cfg() { if (!this._cfg) { throw new Error("Traceprompt: initTracePrompt() must be called first"); } return this._cfg; } }; async function initTracePrompt(cfg) { await ConfigManager.load(cfg); } var ConfigManager = new ConfigManagerClass(); function buildKeyring() { const { cmkArn } = ConfigManager.cfg; return new KmsKeyringNode({ generatorKeyId: cmkArn }); } var registry = new Registry(); var encryptHist = new Histogram({ name: "traceprompt_encrypt_ms", help: "Latency of client-side AES-GCM envelope encryption (ms)", buckets: [0.05, 0.1, 0.25, 0.5, 1, 2, 5], registers: [registry] }); new Histogram({ name: "traceprompt_token_count", help: "Tokens counted per prompt/response", buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1e3], registers: [registry] }); var flushFailures = new Counter({ name: "traceprompt_flush_failures_total", help: "Number of failed POSTs to the Traceprompt ingest API", registers: [registry] }); var queueGauge = new Gauge({ name: "traceprompt_queue_depth", help: "Number of events currently buffered in memory", registers: [registry] }); var logger = null; function createLogger() { const cfg = ConfigManager.cfg; const logLevel = cfg.logLevel || "verbose"; return winston.createLogger({ level: logLevel, format: winston.format.combine( winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }), winston.format.errors({ stack: true }), winston.format.printf(({ level, message, timestamp, stack }) => { const prefix = `[${timestamp}] [Traceprompt] [${level.toUpperCase()}]`; if (stack) { return `${prefix} ${message} ${stack}`; } return `${prefix} ${message}`; }) ), transports: [ new winston.transports.Console({ handleExceptions: true, handleRejections: true }) ], exitOnError: false }); } function getLogger() { if (!logger) { logger = createLogger(); } return logger; } var log = { error: (message, meta) => getLogger().error(message, meta), warn: (message, meta) => getLogger().warn(message, meta), info: (message, meta) => getLogger().info(message, meta), verbose: (message, meta) => getLogger().verbose(message, meta), debug: (message, meta) => getLogger().debug(message, meta), silly: (message, meta) => getLogger().silly(message, meta) }; // src/crypto/encryptor.ts var { encrypt, decrypt } = buildClient( CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT ); async function encryptBuffer(plain) { const keyring = buildKeyring(); const endTimer = encryptHist.startTimer(); try { log.info("Encrypting buffer", { orgId: ConfigManager.cfg.orgId }); const { result, messageHeader } = await encrypt(keyring, plain, { encryptionContext: { org_id: ConfigManager.cfg.orgId } }); const bundle = { ciphertext: Buffer.from(result).toString("base64"), encryptedDataKey: Buffer.from( messageHeader.encryptedDataKeys[0].encryptedDataKey ).toString("base64"), suiteId: messageHeader.suiteId }; return bundle; } finally { endTimer(); } } async function decryptBundle(bundle) { const keyring = buildKeyring(); const { plaintext } = await decrypt( keyring, Buffer.from(bundle.ciphertext, "base64") ); return plaintext; } function computeLeaf(data) { if (data === void 0) { data = "null"; } return blake3(data).toString("hex"); } var encodeFn = null; var tokenCountHist2 = new Histogram({ name: "traceprompt_tokens_per_string", help: "Number of tokens counted per string passed to countTokens()", buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1e3], registers: [registry] }); function countTokens(text) { if (encodeFn) { const t = encodeFn(text); tokenCountHist2.observe(t); return t; } if (maybeInitTiktoken()) { const t = encodeFn(text); tokenCountHist2.observe(t); return t; } const words = text.trim().split(/\s+/g).length; const tokens = Math.ceil(words * 1.33); tokenCountHist2.observe(tokens); return tokens; } var triedTiktoken = false; function maybeInitTiktoken() { if (encodeFn || triedTiktoken) return !!encodeFn; triedTiktoken = true; try { const { encoding_for_model } = __require("@dqbd/tiktoken"); const enc = encoding_for_model("cl100k_base"); encodeFn = (s) => enc.encode(s).length; return true; } catch { return false; } } // src/utils/retry.ts async function retry(fn, attempts = 5, baseDelay = 250, onError) { let attempt = 0; while (true) { try { attempt++; return await fn(); } catch (err) { onError?.(err, attempt); if (attempt >= attempts) throw err; const exp = baseDelay * 2 ** (attempt - 1); const jitter = Math.random() * exp; await new Promise((res) => setTimeout(res, jitter)); } } } function deterministicStringify(obj) { if (obj === null || typeof obj !== "object") { return JSON.stringify(obj); } if (Array.isArray(obj)) { return "[" + obj.map(deterministicStringify).join(",") + "]"; } const keys = Object.keys(obj).sort(); const pairs = keys.map( (key) => JSON.stringify(key) + ":" + deterministicStringify(obj[key]) ); return "{" + pairs.join(",") + "}"; } function generateHmacSignature(payload, secret) { const payloadString = deterministicStringify(payload); const secretBuffer = Buffer.from(secret, "base64"); return createHmac("sha256", secretBuffer).update(payloadString).digest("hex"); } // src/network/transport.ts var Transport = { async post(path3, body, retries = 5, headers) { await sendJson({ path: path3, body, retries, method: "POST", headers }); } }; async function sendJson(opts) { const { ingestUrl, apiKey, hmacSecret } = ConfigManager.cfg; const url = new URL(opts.path, ingestUrl).toString(); const extra = opts.headers ?? {}; let requestBody = opts.body; if (opts.path === "/v1/ingest" && requestBody) { const records = requestBody.records; const hmacSignature = generateHmacSignature(records, hmacSecret); requestBody = { ...requestBody, hmacSignature }; } log.verbose(`Sending request to ${opts.path}`, { url, method: opts.method ?? "POST", retries: opts.retries ?? 5, hasBody: !!requestBody, hasHmacSignature: opts.path === "/v1/ingest" }); await retry( async () => { const res = await fetch$1(url, { method: opts.method ?? "POST", headers: { "content-type": "application/json", "user-agent": "traceprompt-sdk/0.1.0", "x-api-key": apiKey, ...extra }, body: JSON.stringify(requestBody) }); if (res.status >= 400) { const msg = await res.text(); const errorMessage = `HTTP ${res.status} - ${msg}`; if (res.status >= 500) { log.warn(`Server error (will retry): ${errorMessage}`, { status: res.status, url, response: msg }); } else if (res.status === 429) { log.warn(`Rate limited (will retry): ${errorMessage}`, { status: res.status, url, response: msg }); } else if (res.status === 401 || res.status === 403) { log.error(`Authentication/authorization error: ${errorMessage}`, { status: res.status, url, response: msg, hint: "Check your API key and organization permissions" }); } else { log.error(`Client error: ${errorMessage}`, { status: res.status, url, response: msg }); } throw new Error(`Traceprompt: ${errorMessage}`); } log.debug(`Request successful`, { status: res.status, url }); }, opts.retries ?? 5, 250, (error, attempt) => { log.verbose(`Request attempt ${attempt} failed, retrying...`, { error: error instanceof Error ? error.message : String(error), attempt, maxRetries: opts.retries ?? 5, url }); } ); log.verbose(`Request completed successfully`, { url }); } // src/queue/persistentBatcher.ts function getConfig() { return ConfigManager.cfg; } function getDir() { const cfg = getConfig(); return path2__default.resolve(cfg.dataDir ?? ".traceprompt", "queue"); } function getLogPath() { return path2__default.join(getDir(), "outbox.log"); } function getMaxRamRecords() { const cfg = getConfig(); return (cfg.batchSize || 10) * 2; } var MAX_FILE_BYTES = 5 * 1024 * 1024; var bootstrapDone = false; var pLimitPromise = null; var closing = false; async function getPLimit() { if (!pLimitPromise) { pLimitPromise = import('p-limit').then((module) => module.default); } return pLimitPromise; } async function bootstrap() { if (bootstrapDone) return; await fs2.mkdir(getDir(), { recursive: true }); bootstrapDone = true; } var ring = []; var head = 0; var len = 0; var ringInitialized = false; function initializeRing() { if (ringInitialized) return; const maxRecords = getMaxRamRecords(); ring = new Array(maxRecords); ringInitialized = true; } function ringPush(item) { initializeRing(); const maxRecords = getMaxRamRecords(); ring[(head + len) % maxRecords] = item; if (len < maxRecords) { len++; return; } head = (head + 1) % maxRecords; } function ringDrip(n) { initializeRing(); const maxRecords = getMaxRamRecords(); const out = []; while (out.length < n && len > 0) { out.push(ring[head]); head = (head + 1) % maxRecords; len--; } return out; } async function append(item) { if (closing) { throw new Error("Traceprompt SDK is shutting down, rejecting new events"); } await bootstrap(); initializeTimer(); const rec = JSON.stringify({ id: randomUUID(), ...item }) + "\n"; try { await fs2.appendFile(getLogPath(), rec, "utf8"); log.debug("Record appended to outbox", { outboxPath: getLogPath(), recordSize: rec.length }); } catch (error) { log.error("Failed to append record to outbox", { error: error instanceof Error ? error.message : String(error), outboxPath: getLogPath() }); throw error; } ringPush(item); queueGauge.set(len); log.verbose("Record added to ring buffer", { ringSize: len, maxRingSize: getMaxRamRecords() }); try { const { size } = await fs2.stat(getLogPath()); if (size > MAX_FILE_BYTES) { log.error("Outbox file size exceeded limit - applying backpressure", { currentSize: size, maxSize: MAX_FILE_BYTES, outboxPath: getLogPath() }); throw new Error( "Traceprompt SDK backpressure: local outbox full, ingest unreachable." ); } if (size > MAX_FILE_BYTES * 0.8) { log.warn("Outbox file size approaching limit", { currentSize: size, maxSize: MAX_FILE_BYTES, percentFull: Math.round(size / MAX_FILE_BYTES * 100), outboxPath: getLogPath() }); } } catch (e) { if (e.code !== "ENOENT") { log.warn("Failed to check outbox file size", { error: e instanceof Error ? e.message : String(e), outboxPath: getLogPath() }); throw e; } } } var limit = null; async function flushOnce() { await bootstrap(); initializeTimer(); if (!limit) { const pLimit = await getPLimit(); limit = pLimit(1); } return limit(async () => { const cfg = getConfig(); const batchSize = cfg.batchSize || 10; let batch = []; const ringRecords = ringDrip(batchSize); if (ringRecords.length > 0) { log.verbose("Using ring buffer records for flush", { ringRecords: ringRecords.length, batchSize }); batch = ringRecords.map((record) => ({ id: randomUUID(), ...record })); } let diskLines = []; let totalDiskRecords = 0; if (batch.length < batchSize) { const needed = batchSize - batch.length; try { const rl = createInterface({ input: createReadStream(getLogPath()) }); const diskBatch = []; for await (const line of rl) { if (!line.trim()) continue; if (diskBatch.length < needed) { diskBatch.push(JSON.parse(line)); } diskLines.push(line); totalDiskRecords++; if (diskBatch.length >= needed && totalDiskRecords >= needed * 2) { break; } } rl.close(); if (diskBatch.length > 0) { log.verbose("Supplementing with disk records", { ringRecords: batch.length, diskRecords: diskBatch.length, totalDiskRecordsRead: totalDiskRecords }); batch.push(...diskBatch); } } catch (error) { if (error.code === "ENOENT") { if (batch.length === 0) { log.debug("No records in ring buffer or disk, nothing to flush"); return; } } else { log.warn("Error reading outbox file", { error: error.message, outboxPath: getLogPath() }); } } } else { try { const rl = createInterface({ input: createReadStream(getLogPath()) }); for await (const line of rl) { if (line.trim()) { diskLines.push(line); totalDiskRecords++; } } rl.close(); } catch (error) { if (error.code !== "ENOENT") { log.warn("Error counting disk records", { error: error.message, outboxPath: getLogPath() }); } } } if (batch.length === 0) { log.debug("No records available for flush"); return; } const totalPending = totalDiskRecords + (ringRecords.length > batch.length ? 0 : len); queueGauge.set(totalPending); log.info("Starting batch flush", { batchSize: batch.length, fromRingBuffer: Math.min(ringRecords.length, batch.length), fromDisk: Math.max(0, batch.length - ringRecords.length), totalPendingAfterFlush: totalPending - batch.length, outboxPath: getLogPath() }); const body = { records: batch.map(({ payload, leafHash }) => ({ payload, leafHash })) }; try { await Transport.post("/v1/ingest", body, { "Idempotency-Key": batch[0].leafHash }); if (totalDiskRecords > 0) { const diskRecordsUsed = Math.max(0, batch.length - ringRecords.length); if (diskRecordsUsed > 0) { let allDiskLines; if (diskLines.length === totalDiskRecords) { allDiskLines = diskLines; } else { try { const text = await fs2.readFile(getLogPath(), "utf8"); allDiskLines = text.trim().split("\n").filter(Boolean); } catch (error) { log.error("Failed to read outbox file for cleanup", { error: error instanceof Error ? error.message : String(error), outboxPath: getLogPath() }); return; } } const remaining = allDiskLines.slice(diskRecordsUsed); if (remaining.length > 0) { await fs2.writeFile(getLogPath(), remaining.join("\n") + "\n"); log.info("Batch flush successful, updated outbox", { flushedRecords: batch.length, fromRingBuffer: ringRecords.length, fromDisk: diskRecordsUsed, remainingOnDisk: remaining.length }); queueGauge.set(totalPending - batch.length); } else { await fs2.writeFile(getLogPath(), ""); log.info("Batch flush successful, outbox cleared", { flushedRecords: batch.length, fromRingBuffer: ringRecords.length, fromDisk: diskRecordsUsed }); queueGauge.set(totalPending - batch.length); } } else { log.info("Batch flush successful, used only ring buffer", { flushedRecords: batch.length, diskRecordsRemaining: totalDiskRecords }); queueGauge.set(totalPending - batch.length); } } else { log.info("Batch flush successful, used only ring buffer", { flushedRecords: batch.length }); queueGauge.set(totalPending - batch.length); } } catch (e) { const errorMessage = e instanceof Error ? e.message : String(e); if (ringRecords.length > 0) { log.warn("Flush failed, restoring ring buffer records to disk", { ringRecordsToRestore: ringRecords.length }); const ringRecordsAsLines = ringRecords.map( (record) => JSON.stringify({ id: randomUUID(), ...record }) ); try { let existingContent = ""; try { existingContent = await fs2.readFile(getLogPath(), "utf8"); } catch { } const allLines = [...ringRecordsAsLines]; if (existingContent.trim()) { allLines.push( ...existingContent.trim().split("\n").filter(Boolean) ); } await fs2.writeFile(getLogPath(), allLines.join("\n") + "\n"); } catch (restoreError) { log.error("Failed to restore ring buffer records to disk", { error: restoreError instanceof Error ? restoreError.message : String(restoreError), lostRecords: ringRecords.length }); } } if (errorMessage.includes("HTTP 5")) { log.warn("Server error during batch flush, will retry", { error: errorMessage, batchSize: batch.length, totalPending }); } else if (errorMessage.includes("HTTP 429")) { log.warn("Rate limited during batch flush, will retry", { error: errorMessage, batchSize: batch.length, totalPending }); } else if (errorMessage.includes("HTTP 4")) { log.error("Client error during batch flush", { error: errorMessage, batchSize: batch.length, totalPending, hint: "Check API configuration and request format" }); } else { log.error("Network error during batch flush", { error: errorMessage, batchSize: batch.length, totalPending }); } flushFailures.inc(); throw e; } }); } var timerInitialized = false; var flushTimer = null; function initializeTimer() { if (timerInitialized) return; timerInitialized = true; const cfg = getConfig(); log.info("Initializing periodic flush timer", { flushIntervalMs: cfg.flushIntervalMs }); flushTimer = setInterval( () => flushOnce().catch((error) => { log.verbose("Periodic flush failed, will retry on next interval", { error: error instanceof Error ? error.message : String(error), nextRetryIn: cfg.flushIntervalMs }); }), cfg.flushIntervalMs ); flushTimer.unref(); } async function flushWithRetry(opts) { for (let attempt = 1; attempt <= opts.maxRetries; attempt++) { try { await flushOnce(); return; } catch (error) { if (attempt === opts.maxRetries) throw error; const delayMs = Math.min(500 * Math.pow(2, attempt - 1), 4e3); log.debug("Flush attempt failed, retrying", { attempt, maxRetries: opts.maxRetries, delayMs, error: error instanceof Error ? error.message : String(error) }); await new Promise((resolve2) => setTimeout(resolve2, delayMs)); } } } async function drainOutboxWithRetry(opts) { const startTime = Date.now(); let attempt = 0; while (Date.now() - startTime < opts.maxTimeoutMs) { attempt++; try { const outboxContent = await fs2.readFile(getLogPath(), "utf8").catch(() => ""); if (!outboxContent.trim()) { log.info("Outbox is empty, drain complete"); return; } await flushWithRetry({ maxRetries: opts.maxRetries }); } catch (error) { log.warn("Outbox drain attempt failed", { attempt, error: error instanceof Error ? error.message : String(error) }); const delayMs = Math.min(500 * Math.pow(2, attempt - 1), 4e3); await new Promise((resolve2) => setTimeout(resolve2, delayMs)); } } throw new Error(`Outbox drain timed out after ${opts.maxTimeoutMs}ms`); } async function gracefulShutdown() { log.info("Starting graceful shutdown"); closing = true; if (flushTimer) { clearInterval(flushTimer); log.debug("Cleared periodic flush timer"); } log.info("Flushing in-memory ring buffer"); await flushWithRetry({ maxRetries: 3 }); log.info("Draining persistent outbox"); await drainOutboxWithRetry({ maxRetries: 5, maxTimeoutMs: 3e4 }); log.info("Graceful shutdown completed successfully"); } process.on("SIGTERM", async () => { try { await gracefulShutdown(); process.exit(0); } catch (error) { log.error("Graceful shutdown failed", { error: error instanceof Error ? error.message : String(error) }); flushFailures.inc(); process.exit(1); } }); process.on("SIGINT", async () => { try { await gracefulShutdown(); process.exit(0); } catch (error) { log.error("Graceful shutdown failed", { error: error instanceof Error ? error.message : String(error) }); flushFailures.inc(); process.exit(1); } }); var PersistentBatcher = { enqueue: append, flush: flushOnce, gracefulShutdown }; // src/piiDetector/preprocessor.ts function preprocess(raw) { const norm = raw.normalize("NFC"); let cleaned = ""; const idx = []; for (let i = 0; i < norm.length; i++) { const ch = norm[i]; if (/\s/.test(ch)) { if (cleaned[cleaned.length - 1] !== " ") { cleaned += " "; idx.push(i); } } else { cleaned += ch; idx.push(i); } } const map = { origPos(n) { return idx[n] ?? n; } }; return { text: cleaned, map }; } // src/piiDetector/utils/luhn.ts function luhnValid(num) { const digits = num.replace(/\D+/g, "").split("").map(Number).reverse(); let sum = 0; for (let i = 0; i < digits.length; i++) { let n = digits[i]; if (i % 2 === 1) { n *= 2; if (n > 9) n -= 9; } sum += n; } return sum % 10 === 0; } // src/piiDetector/utils/aba.ts function abaValid(routing) { if (!/^\d{9}$/.test(routing)) return false; const weights = [3, 7, 1]; let sum = 0; for (let i = 0; i < 9; i++) { sum += +routing[i] * weights[i % 3]; } return sum % 10 === 0; } // src/piiDetector/recognizers/regexRecognizer.ts var EMAIL_RE = /\b[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/gu; var PHONE_RE = /(?<!account\s)(?<!sort\s?code\s)(?:\+?\d{1,3}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]?)?(?:\(\d{2,4}\)|\d{2,4})[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]?\d{3,4}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]?\d{3,4}(?!\s+\d{4})\b/gu; var SSN_RE = /\b\d{3}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015-]\d{2}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015-]\d{4}\b/gu; var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b/gu; var VISA_RE = /\b4\d{12}(\d{3})?\b/gu; var AMEX_RE = /\b3[47]\d{13}\b/gu; var CC_GROUP_RE = /\b(?:\d{4}[-\s]?){3}\d{4}\b/gu; var PAN_GENERIC_RE = /\b\d{4}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]\d{4}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]\d{4}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]\d{3,7}\b/gu; var PAN_MASKED_RE = /\b\d{4}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]?\d{2}\*{2}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]?\*{4}[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\s-]?\d{4}\b/gu; var IBAN_RE = /\b[A-Z]{2}\d{2}(?:[ \dA-Z]){11,30}\b/gu; var SWIFT_RE = /\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\b/gu; var ROUTING_RE = /\b\d{3}[\s-]?\d{6}(?=\s*(?:routing|aba))|(?:routing|aba)\s+\d{3}[\s-]?\d{6}\b/giu; var SORT_ACC_RE = /\b\d{2}-\d{2}-\d{2}(?:\s+\w+){0,4}?\s*\d{8}\b/gi; var ACCT_RE = /(?:acct|account)\s+\d{8,12}\b/gi; var POSTCODE_RE = /\b[A-Z]{1,2}\d{1,2}[A-Z]?\s?\d[A-Z]{2}\b/gi; var NINO_RE = /\b[ABCEGHJ-PRSTW-ZQ]{2}\d{6}[A-D]\b/gi; var PASSPORT_RE = /\b(?:[A-Z]{1,2}\d{7}|[A-Z]\d{8}|[A-Z0-9]{9}|\d{9})\b/gu; var DL_RE = /\b\d{8,15}[A-Z]{0,2}\b/gu; var DL_CA_RE = /\bD\d{7}\b/gu; var MAC_RE = /\b(?:[0-9A-F]{2}[:-]){5}[0-9A-F]{2}\b/gi; var IMEI_RE = /\b\d{15}\b/g; var MRN_RE = /\b(?:MRN|Patient\s*(?:ID|No\.?))\s*#?\s*\d{6,10}\b/gi; var DOB_RE = /\b(?:DOB|Date\s*of\s*birth|D\.O\.B\.?)\s*[:\-]?\s*(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})\b/gi; var INS_RE = /\b(?:policy|member|insurance)\s*(?:id|no|number|#)\s*[:\-]?\s*[A-Z0-9]{3,}(?:[-–][A-Z0-9]{2,})?(?=\b|[,.;])(?!\s*card)\b/gi; function push(out, type, m, map, conf = 1) { out.push({ type, start: map.origPos(m.index), end: map.origPos(m.index + m[0].length), text: m[0], confidence: conf, source: "regex", risk: "general" }); } function overlapsExisting(list, s, e) { return list.some((x) => !(e <= x.start || s >= x.end)); } var regexRecognizer = { id: "regex", detect(text, map) { const out = []; for (const m of text.matchAll(EMAIL_RE)) push(out, "EMAIL", m, map); for (const m of text.matchAll(SSN_RE)) push(out, "SSN", m, map, 0.98); for (const m of text.matchAll(IPV4_RE)) push(out, "IP", m, map, 0.95); for (const m of text.matchAll(ROUTING_RE)) { const contextWindow = text.slice( Math.max(0, m.index - 10), m.index + m[0].length + 15 ); if (/routing|aba/i.test(contextWindow)) { if (abaValid(m[0])) push(out, "US_ROUTING", m, map, 0.9); } } for (const m of text.matchAll(VISA_RE)) if (luhnValid(m[0])) push(out, "CREDIT_CARD", m, map, 0.95); for (const m of text.matchAll(AMEX_RE)) if (luhnValid(m[0])) push(out, "CREDIT_CARD", m, map, 0.95); for (const m of text.matchAll(CC_GROUP_RE)) { const digits = m[0].replace(/\D+/g, ""); if (digits.length === 16 && luhnValid(digits)) { push(out, "CREDIT_CARD", m, map, 0.95); } } for (const m of text.matchAll(PAN_GENERIC_RE)) { const digits = m[0].replace(/\D+/g, ""); if (digits.length < 13 || digits.length > 19) continue; if (/[\*x]/i.test(m[0])) continue; if (luhnValid(digits)) { if (!overlapsExisting( out, map.origPos(m.index), map.origPos(m.index + m[0].length) )) push(out, "CREDIT_CARD", m, map, 0.95); } } for (const m of text.matchAll(PAN_MASKED_RE)) { if (!overlapsExisting( out, map.origPos(m.index), map.origPos(m.index + m[0].length) )) push(out, "CREDIT_CARD_PARTIAL", m, map, 0.7); } for (const m of text.matchAll(IBAN_RE)) push(out, "IBAN", m, map, 0.9); for (const m of text.matchAll(SWIFT_RE)) push(out, "SWIFT_BIC", m, map, 0.9); for (const m of text.matchAll(SORT_ACC_RE)) push(out, "UK_BANK_ACCT", m, map, 0.9); for (const m of text.matchAll(NINO_RE)) push(out, "NINO", m, map, 0.9); for (const m of text.matchAll(ROUTING_RE)) push(out, "US_ROUTING", m, map, 0.9); for (const m of text.matchAll(ACCT_RE)) push(out, "BANK_ACCOUNT", m, map, 0.85); for (const m of text.matchAll(MAC_RE)) { const ctx = text.slice( Math.max(0, m.index - 10), m.index + m[0].length + 10 ); if (/mac|address|ethernet|wifi|device/i.test(ctx)) { push(out, "MAC_ADDRESS", m, map, 0.85); } } for (const m of text.matchAll(IMEI_RE)) { const ctx = text.slice( Math.max(0, m.index - 10), m.index + m[0].length + 10 ); if (/imei|device|phone|mobile/i.test(ctx)) { push(out, "IMEI", m, map, 0.9); } } for (const m of text.matchAll(PHONE_RE)) { const digits = m[0].replace(/\D+/g, ""); if (digits.length < 9 || digits.length > 12) continue; if (overlapsExisting( out, map.origPos(m.index), map.origPos(m.index + m[0].length) )) continue; const pre = text.slice(Math.max(0, m.index - 15), m.index).toLowerCase(); if (/aba\s|acct\s|routing\s|checking\s|sin\s|ein\s|insee\s|dni\s|nhs\s|mbi\s|npi\s|svnr\s|ohip\s|medicare\s|mac\s|imei\s|member\s|plan\s|policy\s|insurance\s/.test( pre )) continue; push(out, "PHONE", m, map, 0.9); } for (const m of text.matchAll(PASSPORT_RE)) { const ctx = text.slice( Math.max(0, m.index - 15), m.index + m[0].length + 5 ); if (/passport|passeport|travel|document|P<|pièce/i.test(ctx)) { push(out, "PASSPORT", m, map, 0.9); } } for (const m of text.matchAll(DL_RE)) { const ctx = text.slice(Math.max(0, m.index - 12), m.index); if (/\bDL\b|driver|licen[cs]e/i.test(ctx)) push(out, "DRIVER_LICENSE", m, map, 0.8); } for (const m of text.matchAll(DL_CA_RE)) { const ctx = text.slice(Math.max(0, m.index - 10), m.index); if (/\bDL\b|driver/i.test(ctx)) push(out, "DRIVER_LICENSE", m, map, 0.8); } for (const m of text.matchAll(MRN_RE)) push(out, "MEDICAL_ID", m, map, 0.9); for (const m of text.matchAll(DOB_RE)) push(out, "DOB", m, map, 0.85); for (const m of text.matchAll(INS_RE)) push(out, "INSURANCE_ID", m, map, 0.8); for (const m of text.matchAll(POSTCODE_RE)) push(out, "POSTCODE", m, map, 0.7); return out; } }; var nlp = winkNLP(model); var nerRecognizer = { id: "wink-ner", detect(text, map) { const doc = nlp.readDoc(text); return doc.entities().out(its.detail).filter((e) => e.type === "PERSON" || e.type === "LOCATION").map((e) => ({ type: e.type === "PERSON" ? "FULL_NAME" : "ADDRESS", start: map.origPos(e.start), end: map.origPos(e.end), text: text.slice(e.start, e.end), confidence: 0.7, source: this.id, risk: "sensitive" })); } }; // src/piiDetector/idPatterns.json var idPatterns_default = [ { type: "DNI", regex: "\\b\\d{8}[A-Z]\\b", context: ["dni", "national"], validate: "dniCheck" }, { type: "INSEE_SSN", regex: "\\b[12]\\s?\\d{2}\\s?\\d{2}\\s?\\d{2}\\s?\\d{3}\\s?\\d{3}\\b", context: ["social security", "insee", "num\xE9ro"], validate: "inseeCheck" }, { type: "EU_NATIONAL_ID", regex: "\\b\\d{6}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{3}\\.\\d{2}\\b", context: [ "eid", "rijksregisternummer", "national", "belgian", "netherlands" ], validate: "beEidCheck" }, { type: "EIN", regex: "\\b\\d{2}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{7}\\b", context: ["ein", "tax id", "tin", "employer", "federal"], validate: null }, { type: "UK_DL", regex: "\\b[A-Z]{5}\\d{6}[A-Z0-9]{5}\\b", context: ["driving licence", "driver licence", "dvla"], validate: null }, { type: "ON_DL", regex: "\\b[A-Z]\\d{4}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015-]\\d{5}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015-]\\d{5}\\b", context: ["driver licence", "licence", "ontario"], validate: null }, { type: "CA_SIN", regex: "\\b\\d{3}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{3}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{3}\\b", context: ["sin", "social insurance"], validate: null }, { type: "PERSONNUMMER", regex: "\\b\\d{6}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{4}\\b", context: ["personnummer"], validate: "luhn10" }, { type: "NHS_NUMBER", regex: "\\b\\d{3}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{3}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{4}\\b", context: ["nhs number", "chi"], validate: "nhsMod11" }, { type: "MBI", regex: "\\b[0-9][A-HJ-NP-TV-Z][0-9A-HJ-NP-TV-Z][0-9][A-HJ-NP-TV-Z]{2}[0-9][A-HJ-NP-TV-Z]{2}[0-9]{2}\\b", context: ["medicare", "mbi"], validate: null }, { type: "NPI", regex: "\\b\\d{10}\\b", context: ["npi", "provider"], validate: "luhn10" }, { type: "ON_HEALTH", regex: "\\b\\d{10}\\b", context: ["health card", "ohip"], validate: "luhn10" }, { type: "SVNR", regex: "\\b\\d{2}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{6}[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?[A-Z][\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\s-]?\\d{3}\\b", context: ["svnr", "versicherungsnummer"], validate: "svnrMod11" } ]; // src/piiDetector/utils/checksums.ts function dniCheck(num) { const letters = "TRWAGMYFPDXBNJZSQVHLCKE"; const n = parseInt(num.slice(0, 8), 10); return num[8] === letters[n % 23]; } function inseeCheck(num) { const clean = num.replace(/\s/g, ""); const key = parseInt(clean.slice(-2), 10); const body = parseInt(clean.slice(0, -2), 10); return 97 - body % 97 === key; } function beEidCheck(num) { const clean = num.replace(/[-.]/g, ""); const base = parseInt(clean.slice(0, 9), 10); const chk = parseInt(clean.slice(9), 10); return 97 - base % 97 === chk; } function luhn10(num) { const clean = num.replace(/\D+/g, ""); const digits = clean.split("").map(Number).reverse(); let sum = 0; for (let i = 0; i < digits.length; i++) { let n = digits[i]; if (i % 2 === 1) { n *= 2; if (n > 9) n -= 9; } sum += n; } return sum % 10 === 0; } function nhsMod11(num) { const digits = num.replace(/\D/g, "").slice(0, 9); if (digits.length !== 9) return false; const sum = [...digits].reduce((acc, d, i) => acc + Number(d) * (10 - i), 0); const chk = 11 - sum % 11; const expectedCheck = chk === 11 ? 0 : chk; return expectedCheck === Number(num.replace(/\D/g, "")[9]); } function svnrMod11(num) { const digits = num.replace(/\D/g, "").slice(0, 10); if (digits.length !== 10) return false; let sum = 0; for (let i = 0; i < 10; i++) { sum += Number(digits[i]) * (2 + i); } const chk = sum % 11; return chk === Number(num.replace(/\D/g, "").slice(-1)); } function imeiLuhn(num) { return luhn10(num); } function npiLuhn(num) { return luhn10(num); } // src/piiDetector/recognizers/idRecognizer.ts var compiled = idPatterns_default.map((rule) => ({ ...rule, re: new RegExp(rule.regex, "gu") })); var validators = { dniCheck, inseeCheck, beEidCheck, luhn10, nhsMod11, svnrMod11, imeiLuhn, npiLuhn }; var idRecognizer = { id: "nat-id", detect(text, map) { const out = []; for (const rule of compiled) { for (const m of text.matchAll(rule.re)) { if (rule.context) { const contextPattern = new RegExp(rule.context.join("|"), "i"); const contextWindow = text.slice( Math.max(0, m.index - 25), m.index + m[0].length + 25 ); if (!contextPattern.test(contextWindow)) continue; if (rule.type === "EIN" && /routing|aba|acct|checking|bank/i.test(contextWindow)) continue; } let confidence = 0.9; if (rule.validate) { const validator = validators[rule.validate]; const cleanNum = m[0].replace( /[\s\u002D\u2010\u2011\u2012\u2013\u2014\u2015.-/]/g, "" ); if (validator && !validator(cleanNum)) { confidence = 0.7; } } out.push({ type: rule.type, start: map.origPos(m.index), end: map.origPos(m.index + m[0].length), text: m[0], confidence, source: this.id, risk: "critical" }); } } return out; } }; function findTextPositions(text, searchText) { const positions = []; let index = 0; while (index < text.length) { const foundIndex = text.indexOf(searchText, index); if (foundIndex === -1) break; positions.push({ start: foundIndex, end: foundIndex + searchText.length }); index = foundIndex + 1; } return positions; } var compromiseRecognizer = { id: "compromise", detect(text, map) { const entities = []; const doc = nlp2(text); const people = doc.people().json(); for (const person of people) { const personDoc = nlp2(person.text); if (personDoc.has("#Verb") || personDoc.has("#Adjective") || personDoc.has("#Adverb")) { continue; } const isInstructionalTerm = personDoc.has("#Imperative") || /^(emergency|health|monitor|symptoms|seek|call|avoid|inform|positioning|stay|sit)$/i.test( person.text.trim() ); if (isInstructionalTerm) continue; const positions = findTextPositions(text, person.text); for (const pos of positions) { entities.push({ type: person.text.includes(" ") ? "FULL_NAME" : "FIRST_NAME", start: map.origPos(pos.start), end: map.origPos(pos.end), text: person.text, confidence: 0.85, // High confidence from compromise.js people detection source: this.id, risk: "sensitive" }); } } const organizations = doc.organizations().json(); for (const org of organizations) { if (/^\([^)]*\)$/.test(org.text) || // Skip text in parentheses like "(e.g., 911 in the U.S.)" /^(U\.S\.|UK|USA|Canada|Europe)[\)\.]?$/i.test(org.text.trim())) { continue; } const positions = findTextPositions(text, org.text); for (const pos of positions) { entities.push({ type: "FULL_NAME", // We'll mark these differently with source start: map.origPos(pos.start), end: map.origPos(pos.end), text: org.text, confidence: 0.75, source: "compromise-org", // Different source to identify as business risk: "sensitive" }); } } return entities; } }; function isLikelyNotAName(text, fullContext) { try { const doc = nlp2(fullContext); const wordInContext = doc.match(text); if (!wordInContext.found) return false; if (wordInContext.has("#Verb")) return true; if (wordInContext.has("#Adjective")) return true; if (wordInContext.has("#Adverb")) return true; if (wordInContext.has("#Preposition")) return true; if (wordInContext.has("#Conjunction")) return true; if (wordInContext.has("#Determiner")) return true; if (wordInContext.has("#Modal")) return true; if (wordInContext.has("#Auxiliary")) return true; if (wordInContext.has("#Imperative")) return true; if (wordInContext.has("#CommonNoun")) return true; if (wordInContext.has("#Gerund")) return true; const medicalTerms = /^(emergency|health|medical|symptoms|monitor|treatment|therapy|diagnosis|prescription|medication|hospital|clinic|doctor|patient|procedure|surgery|examination|consultation|ambulance|paramedic|nurse|vital|condition|disease|illness|infection|virus|bacteria|fever|pain|breathing|respiratory|cardiac|blood|pressure|heart|lung|brain|liver|kidney|diabetes|cancer|stroke|seizure|allergy|injection|vaccine|test|scan|xray|mri|ultrasound|laboratory|specimen|sample|result|report|chart|record)$/i; if (medicalTerms.test(text.trim())) return true; const instructionalTerms = /^(seek|call|avoid|inform|position|stay|sit|take|give|provide|contact|reach|report|listen|watch|observe|check|verify|confirm|ensure|prevent|reduce|increase|decrease|improve|maintain|continue|stop|start|begin|end|finish|complete|follow|perform|execute|implement|apply|use|utilize|operate|handle|manage|control|direct|guide|assist|help|support|advise|recommend|suggest|indicate|show|demonstrate|explain|describe|discuss|review|examine|evaluate|assess|analyze|consider|determine|decide|choose|select|prefer|require|need|want|wish|hope|expect|anticipate|prepare|plan|organize|arrange|schedule|coordinate|communicate|inform|notify|alert|warn|remind|update|progress|develop|create|establish|build|construct)$/i; if (instructionalTerms.test(text.trim())) return true; const contextAroundWord = fullContext.substring( Math.max(0, fullContext.indexOf(text) - 20), fullContext.indexOf(text) + text.length + 20 ); if (/[0-9]+\.\s*\*?\*?/.test(contextAroundWord) || /#{1,6}\s/.test(contextAroundWord) || /\*\*.*\*\*/.test(contextAroundWord)) { return true; } const docData = wordInContext.json(); if (docData && docData.length > 0 && docData[0].terms) { const tags = docData[0].terms[0].tags || []; const nonNameTags = [ "Verb", "Adjective", "Adverb", "Preposition", "Conjunction", "Determiner", "Modal", "Auxiliary", "Imperative", "CommonNoun", "Gerund", "Infinitive", "PastTense", "PresentTense", "FutureTense", "Comparative", "Superlative", "Possessive", "Plural" ]; return tags.some((tag) => nonNameTags.includes(tag)); } return false; } catch (error) { console.warn("Error in isLikelyNotAName:", error); return false; } } // src/piiDetector/recognizers/nameRecognizer.ts var nlp3 = winkNLP(model); var NAME_PREFIXES = /* @__PURE__ */ new Set([ "mr", "mrs", "ms", "miss", "dr", "prof", "professor", "sir", "lady", "lord", "rev", "father", "sister", "brother", "captain", "major", "colonel", "general", // Additional titles from feedback "sen", "rep", "judge", "officer", "sgt", "st", // Saint "detective", "deputy", "chief" ]); var NAME_SUFFIXES = /* @__PURE__ */ new Set([ "jr", "sr", "ii", "iii", "iv", "phd", "md", "esq", "cpa", "rn" ]); var buildStopSet = (words) => new Set(words.map((w) => w.toLowerCase())); var COMMON_WORDS_RAW = [ "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "if", "in", "into", "is", "it", "its", "itself", "let", "me", "more", "most", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "with", "would", "you", "your", "yours", "yourself", "yourselves", // Days and months "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december", // Common false positives "email", "phone", "address", "contact", "company", "team", "group", "department", "office", "building", "hello", "thanks", "please", "regards", "best", "dear", "sincerely", "yours", "welcome", "goodbye", "meeting", "call", "conference", "session", "appointment", "interview", "discussion", // Additional words commonly flagged as false positives "close", "update", "delete", "create", "remove", "add", "set", "get", "help",