traceprompt-node
Version:
Client-side encrypted, audit-ready logging for LLM applications
220 lines (203 loc) • 8.95 kB
text/typescript
import { Recognizer, Entity } from "../../types";
import { luhnValid } from "../utils/luhn";
import { abaValid } from "../utils/aba";
// ─── Structured PII patterns ────────────────────────────────────────────
// Email address: a local part made of word characters and `. % + -`, an `@`,
// a domain of ASCII letters/digits/dots/hyphens, then a final dot followed by
// at least two ASCII letters. Anchored on word boundaries at both ends.
// The `g` flag is required because the pattern is consumed via `matchAll`.
const EMAIL_RE = /\b[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/gu;
const PHONE_RE =
/(? !(e <= x.start || s >= x.end));
}
// ─── Exported recogniser ────────────────────────────────────────────────
/**
 * Regex-driven recogniser for structured PII.
 *
 * `detect` runs a fixed sequence of pattern scans over `text` and collects the
 * resulting entities via `push`. The ORDER of the scans matters: several later
 * scans (generic PAN, masked PAN, phone) skip any match whose span overlaps an
 * entity that an earlier scan already produced, so earlier categories take
 * precedence over later ones.
 *
 * Offsets passed to the overlap check are translated with `map.origPos`, i.e.
 * overlap is decided in the coordinate space that `map` maps into.
 */
export const regexRecognizer: Recognizer = {
  id: "regex",

  /**
   * Scan `text` for structured PII.
   *
   * @param text - The text to scan.
   * @param map  - Position mapper; `map.origPos(i)` is used to translate match
   *               offsets before overlap checks and is forwarded to `push`.
   * @returns Every entity collected by the scans, in scan order.
   */
  detect(text, map) {
    const out: Entity[] = [];

    // ── local helpers ──────────────────────────────────────────────────

    /** True when the match's (mapped) span overlaps an entity already in `out`. */
    const overlaps = (m: RegExpMatchArray): boolean =>
      overlapsExisting(
        out,
        map.origPos(m.index!),
        map.origPos(m.index! + m[0].length)
      );

    /** The match plus `before` chars of leading and `after` chars of trailing context. */
    const around = (
      m: RegExpMatchArray,
      before: number,
      after: number
    ): string =>
      text.slice(Math.max(0, m.index! - before), m.index! + m[0].length + after);

    /** Up to `width` chars immediately preceding the match (match itself excluded). */
    const preceding = (m: RegExpMatchArray, width: number): string =>
      text.slice(Math.max(0, m.index! - width), m.index!);

    /** Strip every non-digit character. */
    const digitsOf = (s: string): string => s.replace(/\D+/g, "");

    // ── basic patterns ─────────────────────────────────────────────────
    // EMAIL is pushed without an explicit confidence, so `push` applies
    // whatever default it defines.
    for (const m of text.matchAll(EMAIL_RE)) push(out, "EMAIL", m, map);
    for (const m of text.matchAll(SSN_RE)) push(out, "SSN", m, map, 0.98);
    for (const m of text.matchAll(IPV4_RE)) push(out, "IP", m, map, 0.95);

    // ── ABA routing, validated pass ────────────────────────────────────
    // Runs before the phone scan so that a routing number is already in `out`
    // when the phone scan performs its overlap check. Requires a nearby
    // "routing"/"aba" keyword (10 chars before, 15 after) AND a passing
    // `abaValid` check.
    // Start offsets emitted here are remembered so the unconditional routing
    // pass further down does not emit the very same span a second time.
    const routingEmittedAt = new Set<number>();
    for (const m of text.matchAll(ROUTING_RE)) {
      if (!/routing|aba/i.test(around(m, 10, 15))) continue;
      if (!abaValid(m[0])) continue;
      push(out, "US_ROUTING", m, map, 0.9);
      routingEmittedAt.add(m.index!);
    }

    // ── finance / IDs ──────────────────────────────────────────────────
    // Brand-specific card patterns; the raw match is handed to `luhnValid`.
    for (const m of text.matchAll(VISA_RE))
      if (luhnValid(m[0])) push(out, "CREDIT_CARD", m, map, 0.95);
    for (const m of text.matchAll(AMEX_RE))
      if (luhnValid(m[0])) push(out, "CREDIT_CARD", m, map, 0.95);

    // Grouped card numbers: accepted only when exactly 16 digits remain after
    // stripping separators and those digits pass the Luhn check.
    for (const m of text.matchAll(CC_GROUP_RE)) {
      const digits = digitsOf(m[0]);
      if (digits.length === 16 && luhnValid(digits)) {
        push(out, "CREDIT_CARD", m, map, 0.95);
      }
    }

    // Generic PAN: covers Discover, JCB, fancy dashes, etc.
    for (const m of text.matchAll(PAN_GENERIC_RE)) {
      const digits = digitsOf(m[0]);
      // Ignore if <13 or >19 digits (after stripping separators)
      if (digits.length < 13 || digits.length > 19) continue;
      // Skip masked cards containing '*' or 'x'
      if (/[\*x]/i.test(m[0])) continue;
      if (!luhnValid(digits)) continue;
      // Prevent duplicate if already captured by VISA_RE/AMEX_RE/CC_GROUP_RE
      if (overlaps(m)) continue;
      push(out, "CREDIT_CARD", m, map, 0.95);
    }

    // Masked credit cards (optional - lower confidence)
    for (const m of text.matchAll(PAN_MASKED_RE)) {
      if (overlaps(m)) continue;
      push(out, "CREDIT_CARD_PARTIAL", m, map, 0.7);
    }

    for (const m of text.matchAll(IBAN_RE)) push(out, "IBAN", m, map, 0.9);
    for (const m of text.matchAll(SWIFT_RE))
      push(out, "SWIFT_BIC", m, map, 0.9);
    for (const m of text.matchAll(SORT_ACC_RE))
      push(out, "UK_BANK_ACCT", m, map, 0.9);
    for (const m of text.matchAll(NINO_RE)) push(out, "NINO", m, map, 0.9);

    // ABA routing, fallback pass: emits every ROUTING_RE match without the
    // keyword/checksum guard. Matches that the validated pass above already
    // emitted are skipped so each span is reported at most once.
    // NOTE(review): because this pass is unconditional, the keyword/checksum
    // guard above only affects ordering, not which spans end up reported —
    // confirm whether this fallback is still intended.
    for (const m of text.matchAll(ROUTING_RE)) {
      if (routingEmittedAt.has(m.index!)) continue;
      push(out, "US_ROUTING", m, map, 0.9);
    }

    for (const m of text.matchAll(ACCT_RE))
      push(out, "BANK_ACCOUNT", m, map, 0.85);

    // ── device identifiers (before phone to avoid conflicts) ───────────
    // Both require a device-related keyword within 10 chars either side.
    for (const m of text.matchAll(MAC_RE)) {
      if (/mac|address|ethernet|wifi|device/i.test(around(m, 10, 10))) {
        push(out, "MAC_ADDRESS", m, map, 0.85);
      }
    }
    for (const m of text.matchAll(IMEI_RE)) {
      if (/imei|device|phone|mobile/i.test(around(m, 10, 10))) {
        push(out, "IMEI", m, map, 0.9);
      }
    }

    // ── phone with overlap and context guards ──────────────────────────
    // A phone candidate is rejected when one of these labels (followed by a
    // whitespace char) appears in the 15 lower-cased chars preceding it.
    const nonPhoneLabel =
      /aba\s|acct\s|routing\s|checking\s|sin\s|ein\s|insee\s|dni\s|nhs\s|mbi\s|npi\s|svnr\s|ohip\s|medicare\s|mac\s|imei\s|member\s|plan\s|policy\s|insurance\s/;
    for (const m of text.matchAll(PHONE_RE)) {
      const digits = digitsOf(m[0]);
      if (digits.length < 9 || digits.length > 12) continue; // skip CCNs
      if (overlaps(m)) continue;
      if (nonPhoneLabel.test(preceding(m, 15).toLowerCase())) continue;
      push(out, "PHONE", m, map, 0.9);
    }

    // ── documents / personal IDs ───────────────────────────────────────
    // Passport: keyword within 15 chars before or 5 chars after the match.
    for (const m of text.matchAll(PASSPORT_RE)) {
      if (/passport|passeport|travel|document|P<|pièce/i.test(around(m, 15, 5))) {
        push(out, "PASSPORT", m, map, 0.9);
      }
    }
    // Driver licence: keyword must appear in the 12 chars BEFORE the match.
    for (const m of text.matchAll(DL_RE)) {
      if (/\bDL\b|driver|licen[cs]e/i.test(preceding(m, 12)))
        push(out, "DRIVER_LICENSE", m, map, 0.8);
    }
    // California DL: keyword must appear in the 10 chars BEFORE the match.
    for (const m of text.matchAll(DL_CA_RE)) {
      if (/\bDL\b|driver/i.test(preceding(m, 10)))
        push(out, "DRIVER_LICENSE", m, map, 0.8);
    }

    // ── health (NHS moved to idRecognizer) ─────────────────────────────
    for (const m of text.matchAll(MRN_RE)) push(out, "MEDICAL_ID", m, map, 0.9);
    for (const m of text.matchAll(DOB_RE)) push(out, "DOB", m, map, 0.85);
    for (const m of text.matchAll(INS_RE))
      push(out, "INSURANCE_ID", m, map, 0.8);

    // ── misc ───────────────────────────────────────────────────────────
    for (const m of text.matchAll(POSTCODE_RE))
      push(out, "POSTCODE", m, map, 0.7);

    return out;
  },
};