secuprompt
Version:
Protect your AI from Prompt Injection
79 lines (78 loc) • 3.52 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.sanitize_user_input = exports.score_segments = exports.analyze_user_sentences = void 0;
const embedding_1 = require("../core/embedding");
const signature_1 = require("./signature");
const semantic_1 = require("./semantic");
const integrity_1 = require("./integrity");
const sentence_split = (txt) => txt
.split(/(?<=[\.!\?])/)
.map(s => s.trim())
.filter(Boolean);
const injection_hints = [
{ label: "hint_ignore_chain", reg: /ignore (all|any|previous).*(instruction|rule)/i },
{ label: "hint_reveal_system", reg: /reveal (the )?(system|developer) (prompt|message)/i },
{ label: "hint_role_swap", reg: /act as|pretend you are|from now on/i },
{ label: "hint_unrestricted", reg: /unfiltered|unrestricted|without limitation|no rules/i },
{ label: "hint_override_policy", reg: /override.*policy|bypass.*policy/i },
{ label: "hint_even_when_forbidden", reg: /even when (?:it\s)?is forbidden|obey me/i },
{ label: "hint_system_terms", reg: /developer|system prompt|policy stack|instruction set/i },
{ label: "hint_hidden", reg: /hidden directive|hidden instruction|unsafe payload/i }
];
const weight_signature = 0.55;
const weight_semantic = 0.25;
const weight_integrity = 0.2;
const removal_threshold = 0.1;
const analyze_sentence = (system, sentence) => {
const sig = (0, signature_1.score_signatures)(sentence);
const sem = (0, semantic_1.score_semantic)(sentence);
const integ = (0, integrity_1.score_integrity)(system, sentence);
const hints = injection_hints.filter(({ reg }) => reg.test(sentence));
const hint_bonus = Math.min(0.4, hints.length * 0.15);
let score = (0, embedding_1.normalize)(sig.score * weight_signature + sem.score * weight_semantic + integ.score * weight_integrity + hint_bonus);
if (hints.length)
score = 1;
const reasons = [
...sig.detail,
...sem.detail,
...integ.detail,
...hints.map(h => h.label)
];
return { text: sentence, score, reasons };
};
const analyze_user_sentences = (system, user) => {
const sentences = sentence_split(user);
return sentences.map(s => analyze_sentence(system, s));
};
exports.analyze_user_sentences = analyze_user_sentences;
const score_segments = (system, user) => {
const sentences = (0, exports.analyze_user_sentences)(system, user);
if (!sentences.length)
return { score: 0, detail: [] };
const maxScore = Math.max(...sentences.map(s => s.score));
const risky = sentences
.map((seg, idx) => ({ seg, idx }))
.filter(item => item.seg.score >= removal_threshold)
.map(({ seg, idx }) => `segment_${idx}_risk_${seg.score.toFixed(2)}`);
return { score: (0, embedding_1.normalize)(maxScore), detail: risky };
};
exports.score_segments = score_segments;
const sanitize_user_input = (system, user) => {
const sentences = (0, exports.analyze_user_sentences)(system, user);
if (!sentences.length)
return { sanitized: user.trim(), removed: [], changed: false };
const safe = [];
const removed = [];
sentences.forEach(seg => {
if (seg.score >= removal_threshold)
removed.push({ text: seg.text, reasons: seg.reasons });
else
safe.push(seg.text);
});
return {
sanitized: safe.join(" ").replace(/\s+/g, " ").trim(),
removed,
changed: removed.length > 0
};
};
exports.sanitize_user_input = sanitize_user_input;