secuprompt
Version:
Protect your AI from Prompt Injection
76 lines (75 loc) • 2.42 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.score_integrity = void 0;
const data_1 = require("../data");
const embedding_1 = require("../core/embedding");
/**
 * Escape every regular-expression metacharacter in `txt` so that the result
 * matches the original text literally when embedded in a pattern.
 *
 * @param {string} txt - Literal text to escape.
 * @returns {string} `txt` with each metacharacter prefixed by a backslash.
 */
const esc = (txt) => txt.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
/**
 * Build a single global, case-insensitive regex that matches any of the
 * literal phrases in `list`.
 *
 * @param {string[]} list - Literal phrases to match; the array is not mutated.
 * @returns {RegExp} A regex with the `g` and `i` flags. When `list` holds no
 *   non-empty phrase the returned regex never matches.
 */
const make_reg = (list) => {
    // An empty phrase would become an empty alternative, which matches at
    // every position with zero length and stalls any `exec` loop over it.
    const phrases = list.filter((phrase) => phrase.length > 0);
    if (phrases.length === 0) {
        // `(?!)` is an empty negative lookahead: it can never succeed.
        return new RegExp("(?!)", "gi");
    }
    // Alternation is leftmost-first, so a short phrase listed before a longer
    // one that starts with it would always win. Try longer phrases first.
    // `phrases` is already a copy, so sorting in place leaves `list` intact.
    phrases.sort((a, b) => b.length - a.length);
    return new RegExp(phrases.map((phrase) => esc(phrase)).join("|"), "gi");
};
/**
 * Modality rules as `[regex, polarity]` pairs, built once at module load.
 * The regex for the "negative" phrase list is paired with polarity -1 and
 * the regex for the "positive" phrase list with polarity 1, in that order.
 */
const modal_rules = [
    ["negative", -1],
    ["positive", 1],
].map(([kind, polarity]) => [make_reg(data_1.modality_map[kind]), polarity]);
/**
 * Find every modal phrase in `txt` and capture the short "topic" that
 * follows it.
 *
 * The text is lower-cased, then each regex in `modal_rules` is run over it.
 * For every match the topic is the text after the matched phrase, limited to
 * 60 characters, cut at the first `.`, `!`, `?` or `,`, and trimmed. Matches
 * whose topic is empty are dropped.
 *
 * @param {string} txt - Text to scan.
 * @returns {{topic: string, pol: number}[]} One entry per match, grouped by
 *   rule in `modal_rules` order; `pol` is the polarity paired with the rule.
 */
const extract_directives = (txt) => {
    const found = [];
    const low = txt.toLowerCase();
    for (const [reg, pol] of modal_rules) {
        // The regexes are shared and global, so `lastIndex` carries state
        // between calls; always restart from the beginning.
        reg.lastIndex = 0;
        let m = null;
        while ((m = reg.exec(low)) !== null) {
            if (m[0].length === 0) {
                // A zero-length match does not advance `lastIndex`; without
                // this step the loop would never terminate. Such a match is
                // not a modal phrase, so skip it.
                reg.lastIndex += 1;
                continue;
            }
            const start = m.index + m[0].length;
            const topic = low
                .slice(start, start + 60)
                .split(/[.!?,]/)[0]
                .trim();
            if (topic) {
                found.push({ topic, pol });
            }
        }
    }
    return found;
};
/**
 * Count polarity flips between the directives found in the system text and
 * those found in the user text.
 *
 * A (system directive, user directive) pair counts as one flip when both
 * topics are non-empty, the user topic starts with the first 10 characters
 * of the system topic, and the two polarities have different signs.
 *
 * @param {string} sys - System prompt text.
 * @param {string} user - User prompt text.
 * @returns {number} Number of flipped pairs (0 when there are none).
 */
const detect_flip = (sys, user) => {
    const sys_dirs = extract_directives(sys);
    const user_dirs = extract_directives(user);
    const is_flip = (sd, ud) => Boolean(ud.topic && sd.topic) &&
        ud.topic.startsWith(sd.topic.slice(0, 10)) &&
        Math.sign(sd.pol) !== Math.sign(ud.pol);
    return sys_dirs.reduce((count, sd) => count + user_dirs.filter((ud) => is_flip(sd, ud)).length, 0);
};
/**
 * Segment `txt` with `seg_text` and embed each resulting segment with
 * `embed`.
 *
 * @param {string} txt - Text to segment and embed.
 * @returns {Array} One `embed` result per segment, in segment order.
 */
const clause_vecs = (txt) => (0, embedding_1.seg_text)(txt)
    // Call `embed` with the segment only: passing it straight to `map` would
    // also hand it the element index and the whole array as extra arguments.
    .map((segment) => (0, embedding_1.embed)(segment));
/**
 * Average, over the user text's clause vectors, of each vector's best
 * `cosine` value against any clause vector of the system text.
 *
 * The per-vector best starts at 0, so values below 0 count as 0.
 *
 * @param {string} sys - System prompt text.
 * @param {string} user - User prompt text.
 * @returns {number} The mean best value, or 0 when either text yields no
 *   clause vectors.
 */
const overlap_score = (sys, user) => {
    const sys_vecs = clause_vecs(sys);
    const user_vecs = clause_vecs(user);
    if (!(sys_vecs.length && user_vecs.length)) {
        return 0;
    }
    const total = user_vecs.reduce((acc, uv) => {
        const best = sys_vecs.reduce((top, sv) => Math.max(top, (0, embedding_1.cosine)(uv, sv)), 0);
        return acc + best;
    }, 0);
    return total / user_vecs.length;
};
/**
 * Score how strongly the user text appears to override or mirror the
 * system text.
 *
 * With at least one flip the raw score is
 * `0.7 + 0.1 * (flips - 1) + overlap * 0.3`, capped at 1. With no flips it
 * is `overlap - 0.4`, floored at 0. The raw score is passed through
 * `normalize` before being returned.
 *
 * @param {string} sys - System prompt text.
 * @param {string} user - User prompt text.
 * @returns {{score: number, detail: string[]}} `detail` lists
 *   "modality_override" when any flip was found and
 *   "high_instruction_overlap" when the overlap exceeds 0.65, in that order.
 */
const score_integrity = (sys, user) => {
    const overlap = overlap_score(sys, user);
    const flips = detect_flip(sys, user);
    const has_flip = Boolean(flips);
    const detail = [
        has_flip && "modality_override",
        overlap > 0.65 && "high_instruction_overlap",
    ].filter(Boolean);
    const raw = has_flip
        ? Math.min(1, 0.7 + 0.1 * (flips - 1) + overlap * 0.3)
        : Math.max(0, overlap - 0.4);
    return { score: (0, embedding_1.normalize)(raw), detail };
};
exports.score_integrity = score_integrity;