// vibe-guard
// Version:
// Vibe-Guard Security Scanner - 28 essential security rules to catch vulnerabilities before they catch you! Zero dependencies, instant setup, works everywhere, optimized performance. Detects SQL injection, XSS, exposed secrets, CSRF, CORS issues, contain… (description truncated)
// 456 lines • 21.5 kB
// JavaScript
;
// Marks the CommonJS `exports` object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder value; the class itself is assigned to this export after its
// definition further down in the file.
exports.AiDataLeakagePreventionRule = void 0;
const types_1 = require("../types");
/**
 * File-extension -> language-name lookup used by `detectLanguage`.
 * A Map is used instead of an object literal so that inherited property
 * names (for example "constructor") are never mistaken for known extensions.
 */
const AI_LEAKAGE_LANGUAGE_BY_EXTENSION = new Map([
    ['js', 'javascript'],
    ['jsx', 'javascript'],
    ['ts', 'typescript'],
    ['tsx', 'typescript'],
    ['py', 'python'],
    ['php', 'php'],
    ['rb', 'ruby'],
    ['go', 'go'],
    ['java', 'java'],
    ['cs', 'csharp']
]);
/**
 * Line prefixes (after trimming) that mark a line as a comment, per language.
 * Covers every language `detectLanguage` can return except 'unknown'.
 * The bare '*' prefix (block-comment continuation lines) is intentionally
 * left out for Go, where a statement may legitimately start with a pointer
 * dereference such as `*ptr = value`.
 */
const AI_LEAKAGE_COMMENT_PREFIXES = new Map([
    ['javascript', ['//', '/*', '*']],
    ['typescript', ['//', '/*', '*']],
    ['python', ['#']],
    ['php', ['//', '/*', '#']],
    ['java', ['//', '/*', '*']],
    ['csharp', ['//', '/*', '*']],
    ['go', ['//', '/*']],
    ['ruby', ['#']]
]);
/**
 * Frameworks looked for in JavaScript/TypeScript and Python files, in
 * precedence order. Each entry is [frameworkName, spellingsSearchedFor].
 */
const AI_LEAKAGE_COMMON_FRAMEWORKS = [
    ['openai', ['openai', 'OpenAI']],
    ['anthropic', ['anthropic', 'Anthropic']],
    ['langchain', ['langchain', 'LangChain']],
    ['transformers', ['transformers', 'Transformers']]
];
/** Frameworks additionally looked for in Python files (checked after the common ones). */
const AI_LEAKAGE_PYTHON_FRAMEWORKS = [
    ['pandas', ['pandas', 'Pandas']],
    ['numpy', ['numpy', 'NumPy']]
];
/** Patterns whose presence anywhere in a file counts as "data protection present". */
const AI_LEAKAGE_PROTECTION_PATTERNS = [
    /encrypt/i,
    /protect/i,
    /secure/i,
    /filter/i,
    /sanitize/i,
    /anonymize/i,
    /mask/i,
    /redact/i,
    /dlp/i,
    /data[_-]?loss[_-]?prevention/i,
    /privacy/i,
    /gdpr/i,
    /compliance/i
];
/** Patterns whose presence anywhere in a file counts as "protected environment". */
const AI_LEAKAGE_PROTECTED_ENV_PATTERNS = [
    /sandbox/i,
    /isolated/i,
    /contained/i,
    /restricted/i,
    /monitored/i,
    /audited/i,
    /logged/i,
    /safe/i,
    /protected/i
];
/** Substrings (case-sensitive) that mark nearby code as development-only. */
const AI_LEAKAGE_DEVELOPMENT_MARKERS = [
    // NOTE: 'dev' is a substring of 'development', so the first entry never
    // changes the outcome; it is kept to mirror the original list.
    'development',
    'dev',
    'staging',
    'localhost',
    '127.0.0.1',
    'NODE_ENV',
    'DEBUG'
];
/** Remediation text per issue type, keyed by the `type` of a leakage pattern. */
const AI_LEAKAGE_SUGGESTIONS = new Map([
    ['Training Data Exposure', 'Implement data access controls and encryption for training data. Use secure data storage and access logging.'],
    ['Sensitive Training Data', 'Apply data classification and encryption to sensitive training data. Use secure data pipelines and access controls.'],
    ['Sensitive Data in AI Output', 'Implement output filtering and content moderation. Use data loss prevention (DLP) tools to detect and block sensitive data.'],
    ['Unfiltered AI Output', 'Implement content filtering and output sanitization. Use AI safety measures and content moderation.'],
    ['Data Classification Bypass', 'Enforce mandatory data classification. Implement automated classification and prevent bypass mechanisms.'],
    ['Sensitive Data in Model', 'Implement model security measures. Use model encryption and secure model deployment practices.'],
    ['Unencrypted AI Artifacts', 'Encrypt AI models and artifacts. Use secure model storage and transmission protocols.'],
    ['Sensitive Data Logging', 'Implement secure logging practices. Use log encryption and sensitive data masking.'],
    ['Unfiltered Data Export', 'Implement data export controls and filtering. Use data anonymization and access controls.'],
    ['Unsanitized API Response', 'Implement API response sanitization. Use content filtering and data validation.']
]);
/** Extra sentence appended to a suggestion for specific detected frameworks. */
const AI_LEAKAGE_FRAMEWORK_NOTES = new Map([
    ['openai', ' Use OpenAI content filtering and data handling best practices.'],
    ['anthropic', ' Use Anthropic safety features and data protection measures.'],
    ['langchain', ' Use LangChain data protection and output filtering capabilities.'],
    ['pandas', ' Use Pandas data anonymization and filtering functions.']
]);
/**
 * Returns true when, for every keyword group, the lower-cased text contains
 * at least one keyword of that group.
 *
 * @param {string} text - Text to inspect (compared case-insensitively).
 * @param {string[][]} keywordGroups - Groups of lower-case keywords.
 * @returns {boolean} Whether each group is represented in the text.
 */
function aiLeakageHasKeywordFromEveryGroup(text, keywordGroups) {
    // Lower-case once instead of once per keyword comparison.
    const haystack = text.toLowerCase();
    return keywordGroups.every((group) => group.some((keyword) => haystack.includes(keyword)));
}
/**
 * Rule that flags text patterns suggesting data leakage in AI-related code
 * (exposed training data, sensitive data in model output, unencrypted
 * artifacts, and similar), then filters the matches by surrounding context.
 *
 * Relies on `findMatches` and `createIssue`, which are not defined in this
 * class and are expected to come from `types_1.BaseRule`.
 */
class AiDataLeakagePreventionRule extends types_1.BaseRule {
    constructor() {
        super(...arguments);
        this.name = 'ai-data-leakage-prevention';
        this.description = 'Detects potential data leakage in AI systems and training data exposure with context-aware analysis';
        this.severity = 'high';
        // Each entry: regex to search for, issue type label, base confidence,
        // severity reported for the issue, and a secondary keyword validation.
        this.leakagePatterns = [
            // Training data exposure (critical)
            {
                pattern: /(?:training[_-]?data|dataset|corpus)\s*[:=]\s*['"`]?[^'"`]*(?:expose|leak|public|unrestricted)['"`]?/gi,
                type: 'Training Data Exposure',
                confidence: 0.9,
                severity: 'critical',
                validation: (text) => this.validateTrainingDataExposure(text)
            },
            // Sensitive training data (critical)
            {
                pattern: /(?:sensitive|confidential|proprietary)\s*[:=]\s*['"`]?[^'"`]*(?:training|dataset|corpus)['"`]?/gi,
                type: 'Sensitive Training Data',
                confidence: 0.85,
                severity: 'critical',
                validation: (text) => this.validateSensitiveTrainingData(text)
            },
            // Model output containing sensitive data (high)
            {
                pattern: /(?:model|ai|llm)\s*[:=]\s*['"`]?[^'"`]*(?:output|response|generation)\s*[:=]\s*['"`]?[^'"`]*(?:sensitive|confidential|proprietary)['"`]?/gi,
                type: 'Sensitive Data in AI Output',
                confidence: 0.8,
                severity: 'high',
                validation: (text) => this.validateSensitiveOutput(text)
            },
            // Unfiltered AI responses (high)
            {
                pattern: /(?:ai|model|llm)\s*[:=]\s*['"`]?[^'"`]*(?:unfiltered|unrestricted|raw)\s*[:=]\s*['"`]?[^'"`]*(?:output|response)['"`]?/gi,
                type: 'Unfiltered AI Output',
                confidence: 0.75,
                severity: 'high',
                validation: (text) => this.validateUnfilteredOutput(text)
            },
            // Data classification bypass (high)
            {
                pattern: /(?:bypass|ignore|skip)\s*[:=]\s*['"`]?[^'"`]*(?:classification|label|sensitivity)['"`]?/gi,
                type: 'Data Classification Bypass',
                confidence: 0.85,
                severity: 'high',
                validation: (text) => this.validateClassificationBypass(text)
            },
            // AI model containing sensitive data (critical)
            {
                pattern: /(?:model|weights|parameters)\s*[:=]\s*['"`]?[^'"`]*(?:contain|include|embed)\s*[:=]\s*['"`]?[^'"`]*(?:sensitive|confidential)['"`]?/gi,
                type: 'Sensitive Data in Model',
                confidence: 0.9,
                severity: 'critical',
                validation: (text) => this.validateSensitiveModel(text)
            },
            // Unencrypted AI artifacts (high)
            {
                pattern: /(?:model|weights|artifacts)\s*[:=]\s*['"`]?[^'"`]*(?:unencrypted|plaintext|raw)['"`]?/gi,
                type: 'Unencrypted AI Artifacts',
                confidence: 0.8,
                severity: 'high',
                validation: (text) => this.validateUnencryptedArtifacts(text)
            },
            // Logging sensitive data (high)
            {
                pattern: /(?:log|console|print|echo)\s*[:=]\s*['"`]?[^'"`]*(?:sensitive|confidential|proprietary|personal)['"`]?/gi,
                type: 'Sensitive Data Logging',
                confidence: 0.85,
                severity: 'high',
                validation: (text) => this.validateSensitiveLogging(text)
            },
            // Data export without filtering (medium)
            {
                pattern: /(?:export|save|write)\s*[:=]\s*['"`]?[^'"`]*(?:all|complete|full)\s*[:=]\s*['"`]?[^'"`]*(?:data|dataset)['"`]?/gi,
                type: 'Unfiltered Data Export',
                confidence: 0.7,
                severity: 'medium',
                validation: (text) => this.validateUnfilteredExport(text)
            },
            // API response without sanitization (medium)
            {
                pattern: /(?:api|response|return)\s*[:=]\s*['"`]?[^'"`]*(?:raw|unfiltered|complete)\s*[:=]\s*['"`]?[^'"`]*(?:data|result)['"`]?/gi,
                type: 'Unsanitized API Response',
                confidence: 0.75,
                severity: 'medium',
                validation: (text) => this.validateUnsanitizedResponse(text)
            }
        ];
        // Tested against the lines surrounding a match (see isSafeContext);
        // any hit suppresses the finding.
        this.falsePositivePatterns = [
            // Development and testing patterns
            /example/i,
            /demo/i,
            /test/i,
            /mock/i,
            /sample/i,
            /placeholder/i,
            /comment/i,
            /todo/i,
            /fixme/i,
            /\/\/.*/i,
            /#.*/i,
            /\/\*.*\*\//i,
            /<!--.*-->/i,
            /development/i,
            /dev/i,
            /staging/i,
            /localhost/i,
            // Documentation and tutorials
            /documentation/i,
            /docs?/i,
            /tutorial/i,
            /guide/i,
            /readme/i,
            /example[_-]?config/i,
            /sample[_-]?config/i,
            /\.md$/i,
            /\.rst$/i,
            /\.txt$/i,
            // Safe environments
            /sandbox/i,
            /isolated/i,
            /contained/i,
            /restricted/i,
            /safe[_-]?mode/i,
            /demo[_-]?mode/i,
            /test[_-]?environment/i,
            // Data protection patterns (likely safe)
            /encrypt/i,
            /protect/i,
            /secure/i,
            /filter/i,
            /sanitize/i,
            /anonymize/i,
            /mask/i,
            /redact/i,
            /dlp/i,
            /data[_-]?loss[_-]?prevention/i
        ];
    }
    /**
     * Scans one file for leakage patterns and returns the issues found.
     *
     * @param {object} fileContent - Object providing `path`, `content` and
     *   `lines` (the latter is read by `analyzeContext`).
     * @returns {Array} Issues built with `createIssue`, one per accepted match.
     */
    check(fileContent) {
        const issues = [];
        const language = this.detectLanguage(fileContent.path);
        const framework = this.detectFramework(fileContent.content, language);
        // File-wide flags, computed once and shared by every match's context.
        const fileHasDataProtection = this.hasDataProtection(fileContent.content);
        const fileIsProtectedEnvironment = this.isProtectedEnvironment(fileContent.content);
        for (const { pattern, type, confidence, severity, validation } of this.leakagePatterns) {
            const matches = this.findMatches(fileContent.content, pattern);
            for (const { match, line, column, lineContent } of matches) {
                const matchedText = match[0];
                const context = this.analyzeContext(fileContent, line, column, language, framework, fileHasDataProtection, fileIsProtectedEnvironment);
                // Skip matches located in a context considered safe.
                if (this.isSafeContext(context)) {
                    continue;
                }
                // Skip matches that fail the type-specific keyword validation.
                if (!validation(matchedText)) {
                    continue;
                }
                // Adjust the pattern's base confidence using the context.
                const finalConfidence = this.calculateConfidence(confidence, context);
                if (finalConfidence >= 0.5) {
                    issues.push(this.createIssue(fileContent.path, line, column, lineContent, `${severity.toUpperCase()}: ${type} detected (confidence: ${Math.round(finalConfidence * 100)}%): ${this.getLineContext(lineContent, column)}`, this.generateSuggestion(type, context), severity));
                }
            }
        }
        return issues;
    }
    /**
     * Builds the context record describing where a match sits.
     *
     * @param {object} fileContent - File object; `lines` and `path` are read.
     * @param {number} line - Line number of the match; used as a 1-based index into `lines`.
     * @param {number} column - Column of the match within the line.
     * @param {string} language - Result of `detectLanguage`.
     * @param {string|undefined} framework - Result of `detectFramework`.
     * @param {boolean} hasDataProtection - File-wide data-protection flag.
     * @param {boolean} isProtectedEnvironment - File-wide protected-environment flag.
     * @returns {object} Context flags plus the surrounding code snippet.
     */
    analyzeContext(fileContent, line, column, language, framework, hasDataProtection, isProtectedEnvironment) {
        const allLines = fileContent.lines;
        const currentLine = allLines[line - 1] || '';
        // Window of up to five lines: two before the match line through two after it.
        const surroundingLines = allLines.slice(Math.max(0, line - 3), line + 2);
        return {
            isInComment: this.isInComment(currentLine, language),
            isInString: this.isInString(currentLine, column),
            isInTestFile: this.isInTestFile(fileContent.path),
            isInDocumentation: this.isInDocumentation(fileContent.path),
            isInDevelopment: this.isInDevelopment(surroundingLines),
            surroundingCode: surroundingLines.join('\n'),
            language,
            framework,
            hasDataProtection: hasDataProtection || false,
            isProtectedEnvironment: isProtectedEnvironment || false
        };
    }
    /**
     * Decides whether a match should be ignored because of its context.
     *
     * @param {object} context - Record produced by `analyzeContext`.
     * @returns {boolean} True when any suppressing condition holds.
     */
    isSafeContext(context) {
        if (context.isInComment || context.isInTestFile || context.isInDocumentation || context.isInDevelopment) {
            return true;
        }
        if (this.falsePositivePatterns.some((pattern) => pattern.test(context.surroundingCode))) {
            return true;
        }
        // Because these two flags suppress the match outright, the matching
        // reductions in calculateConfidence are not reached from check().
        return Boolean(context.hasDataProtection || context.isProtectedEnvironment);
    }
    /**
     * Maps a file path to a language name using its final extension.
     *
     * @param {string} filePath - Path of the file being scanned.
     * @returns {string} Language name, or 'unknown' for unrecognised extensions.
     */
    detectLanguage(filePath) {
        // split() always yields at least one element, so pop() is never undefined.
        const extension = filePath.split('.').pop().toLowerCase();
        return AI_LEAKAGE_LANGUAGE_BY_EXTENSION.get(extension) || 'unknown';
    }
    /**
     * Looks for the name of a known AI/data framework in the file content.
     * Only JavaScript, TypeScript and Python files are inspected; the first
     * framework in precedence order whose name occurs in the content wins.
     *
     * @param {string} content - Full file content.
     * @param {string} language - Result of `detectLanguage`.
     * @returns {string|undefined} Framework name, or undefined when none applies.
     */
    detectFramework(content, language) {
        let candidates;
        if (language === 'javascript' || language === 'typescript') {
            candidates = AI_LEAKAGE_COMMON_FRAMEWORKS;
        }
        else if (language === 'python') {
            candidates = [...AI_LEAKAGE_COMMON_FRAMEWORKS, ...AI_LEAKAGE_PYTHON_FRAMEWORKS];
        }
        else {
            return undefined;
        }
        const found = candidates.find(([, spellings]) => spellings.some((spelling) => content.includes(spelling)));
        return found ? found[0] : undefined;
    }
    /**
     * Reports whether a line starts (after trimming) with a comment prefix
     * of the given language.
     *
     * @param {string} line - Source line to inspect.
     * @param {string} language - Result of `detectLanguage`.
     * @returns {boolean} True for comment lines; false for unknown languages.
     */
    isInComment(line, language) {
        const trimmed = line.trim();
        const prefixes = AI_LEAKAGE_COMMENT_PREFIXES.get(language);
        if (!prefixes) {
            return false;
        }
        return prefixes.some((prefix) => trimmed.startsWith(prefix));
    }
    /**
     * Heuristic string detection: true when an odd number of quote characters
     * (', " or `) precede the column. Escapes and mixed quote kinds are not
     * distinguished.
     *
     * @param {string} line - Source line.
     * @param {number} column - Column up to which quotes are counted.
     * @returns {boolean} Whether the position appears to be inside a string.
     */
    isInString(line, column) {
        const prefix = line.substring(0, column);
        const quoteCount = (prefix.match(/['"`]/g) || []).length;
        return quoteCount % 2 === 1;
    }
    /**
     * @param {string} filePath - Path of the file being scanned.
     * @returns {boolean} True when the path contains 'test', 'spec' or 'mock'
     *   anywhere (case-sensitive substring match).
     */
    isInTestFile(filePath) {
        return ['test', 'spec', 'mock'].some((marker) => filePath.includes(marker));
    }
    /**
     * @param {string} filePath - Path of the file being scanned.
     * @returns {boolean} True when the path matches any documentation-style pattern.
     */
    isInDocumentation(filePath) {
        const docPatterns = [
            /docs?\//i,
            /documentation/i,
            /examples?/i,
            /samples?/i,
            /tutorials?/i,
            /guides?/i,
            /readme/i,
            /\.md$/i,
            /\.rst$/i,
            /\.txt$/i
        ];
        return docPatterns.some((pattern) => pattern.test(filePath));
    }
    /**
     * @param {string[]} lines - Lines surrounding a match.
     * @returns {boolean} True when any line contains a development marker.
     */
    isInDevelopment(lines) {
        return lines.some((line) => AI_LEAKAGE_DEVELOPMENT_MARKERS.some((marker) => line.includes(marker)));
    }
    /**
     * @param {string} content - Full file content.
     * @returns {boolean} True when any data-protection pattern occurs in the content.
     */
    hasDataProtection(content) {
        return AI_LEAKAGE_PROTECTION_PATTERNS.some((pattern) => pattern.test(content));
    }
    /**
     * @param {string} content - Full file content.
     * @returns {boolean} True when any protected-environment pattern occurs in the content.
     */
    isProtectedEnvironment(content) {
        return AI_LEAKAGE_PROTECTED_ENV_PATTERNS.some((pattern) => pattern.test(content));
    }
    /**
     * Scales a base confidence by context: x0.6 with data protection, x0.7 in
     * a protected environment, x1.1 for a known framework; capped at 1.0.
     *
     * @param {number} baseConfidence - Confidence declared on the pattern.
     * @param {object} context - Record produced by `analyzeContext`.
     * @returns {number} Adjusted confidence, at most 1.0.
     */
    calculateConfidence(baseConfidence, context) {
        let adjusted = baseConfidence;
        if (context.hasDataProtection) {
            adjusted *= 0.6;
        }
        if (context.isProtectedEnvironment) {
            adjusted *= 0.7;
        }
        if (context.framework) {
            adjusted *= 1.1;
        }
        return Math.min(adjusted, 1.0);
    }
    // Validation methods: each requires one keyword from every group to be
    // present in the matched text (case-insensitive substring comparison).
    /** @param {string} text @returns {boolean} */
    validateTrainingDataExposure(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['training', 'dataset', 'corpus'],
            ['expose', 'leak', 'public', 'unrestricted']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateSensitiveTrainingData(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['sensitive', 'confidential', 'proprietary'],
            ['training', 'dataset', 'corpus']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateSensitiveOutput(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['model', 'ai', 'llm'],
            ['output', 'response', 'generation'],
            ['sensitive', 'confidential', 'proprietary']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateUnfilteredOutput(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['ai', 'model', 'llm'],
            ['unfiltered', 'unrestricted', 'raw'],
            ['output', 'response']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateClassificationBypass(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['bypass', 'ignore', 'skip'],
            ['classification', 'label', 'sensitivity']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateSensitiveModel(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['model', 'weights', 'parameters'],
            ['contain', 'include', 'embed'],
            ['sensitive', 'confidential']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateUnencryptedArtifacts(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['model', 'weights', 'artifacts'],
            ['unencrypted', 'plaintext', 'raw']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateSensitiveLogging(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['log', 'console', 'print', 'echo'],
            ['sensitive', 'confidential', 'proprietary', 'personal']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateUnfilteredExport(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['export', 'save', 'write'],
            ['all', 'complete', 'full'],
            ['data', 'dataset']
        ]);
    }
    /** @param {string} text @returns {boolean} */
    validateUnsanitizedResponse(text) {
        return aiLeakageHasKeywordFromEveryGroup(text, [
            ['api', 'response', 'return'],
            ['raw', 'unfiltered', 'complete'],
            ['data', 'result']
        ]);
    }
    /**
     * Extracts a trimmed snippet of up to 20 characters on each side of the column.
     *
     * @param {string} lineContent - Full text of the matched line.
     * @param {number} column - Column of the match.
     * @returns {string} Trimmed excerpt of the line around the column.
     */
    getLineContext(lineContent, column) {
        const from = Math.max(0, column - 20);
        const to = Math.min(lineContent.length, column + 20);
        return lineContent.substring(from, to).trim();
    }
    /**
     * Builds the remediation text for an issue type, optionally extended
     * with framework-specific advice.
     *
     * @param {string} type - Issue type label from a leakage pattern.
     * @param {object} context - Record produced by `analyzeContext`; `framework` is read.
     * @returns {string} Suggestion text.
     */
    generateSuggestion(type, context) {
        let suggestion = AI_LEAKAGE_SUGGESTIONS.get(type) || 'Implement data loss prevention (DLP) policies. Use sensitivity labels, output filtering, and encryption for AI artifacts. Monitor AI outputs for sensitive data exposure.';
        if (context.framework) {
            suggestion += ` For ${context.framework}, consider using framework-specific data protection features.`;
            // Not every detectable framework has a dedicated note.
            const frameworkNote = AI_LEAKAGE_FRAMEWORK_NOTES.get(context.framework);
            if (frameworkNote) {
                suggestion += frameworkNote;
            }
        }
        return suggestion;
    }
}
// Publish the rule class on the CommonJS exports object.
exports.AiDataLeakagePreventionRule = AiDataLeakagePreventionRule;
//# sourceMappingURL=ai-data-leakage-prevention.js.map