@himorishige/noren-core
Version:
Core PII detection, masking, and tokenization library built on Web Standards
321 lines (320 loc) • 11 kB
JavaScript
// JSON/structured data detection for Noren Core
// Streaming JSON parser with PII detection based on key names and values
// PII key patterns for different types of sensitive data
const PII_KEY_PATTERNS = {
email: new Set(['email', 'mail', 'contact', 'e_mail', 'メール', 'mailaddress']),
phone: new Set(['phone', 'tel', 'mobile', 'telephone', '電話', '携帯', 'cellphone']),
address: new Set(['address', 'addr', '住所', '所在地', 'location', 'street']),
credit_card: new Set(['card', 'card_number', 'cc', 'cardnum', 'カード番号', 'creditcard']),
ssn: new Set(['ssn', 'social_security', 'social', 'socialsecurity']),
name: new Set([
'name',
'fullname',
'first_name',
'last_name',
'氏名',
'名前',
'firstname',
'lastname',
]),
id: new Set(['id', 'user_id', 'customer_id', 'personal_id', 'identification']),
birthday: new Set(['birthday', 'birth_date', 'dob', 'date_of_birth', '生年月日', 'birthdate']),
};
/**
* Streaming JSON detector for PII data
* Uses key-based detection with fallback to text parsing
*/
export class JSONDetector {
currentPath = [];
hits = [];
arrayIndices = [];
/**
* Detect PII in JSON string
*/
detectInJson(jsonString, utils) {
this.hits = [];
this.currentPath = [];
this.arrayIndices = [];
try {
const parsed = JSON.parse(jsonString);
this.traverseObject(parsed, utils, 0, jsonString);
return {
hits: this.hits,
isValidJson: true,
fallbackToText: false,
};
}
catch (_error) {
// Try NDJSON (newline-delimited JSON)
if (this.tryNDJSON(jsonString, utils)) {
return {
hits: this.hits,
isValidJson: true,
fallbackToText: false,
};
}
// Fallback to text detection
return {
hits: [],
isValidJson: false,
fallbackToText: true,
};
}
}
/**
* Try parsing as NDJSON (newline-delimited JSON)
*/
tryNDJSON(text, utils) {
const lines = text.split('\n').filter((line) => line.trim());
for (let i = 0; i < lines.length; i++) {
try {
const parsed = JSON.parse(lines[i]);
this.currentPath = [`[${i}]`];
this.arrayIndices = [i];
this.traverseObject(parsed, utils, 0, text);
// Reset for next iteration
this.currentPath = [];
this.arrayIndices = [];
}
catch (_error) {
return false;
}
}
return true;
}
/**
* Recursively traverse object/array structures
*/
traverseObject(obj, utils, depth = 0, originalJson) {
// Prevent infinite recursion
if (depth > 10)
return;
if (Array.isArray(obj)) {
this.traverseArray(obj, utils, depth, originalJson);
}
else if (obj && typeof obj === 'object') {
this.traverseObjectProperties(obj, utils, depth, originalJson);
}
}
/**
* Traverse array elements
*/
traverseArray(arr, utils, depth, originalJson) {
for (let i = 0; i < arr.length; i++) {
this.currentPath.push(`[${i}]`);
this.arrayIndices.push(i);
this.traverseObject(arr[i], utils, depth + 1, originalJson);
this.currentPath.pop();
this.arrayIndices.pop();
}
}
/**
* Traverse object properties
*/
traverseObjectProperties(obj, utils, depth, originalJson) {
for (const [key, value] of Object.entries(obj)) {
this.currentPath.push(key);
if (typeof value === 'string' && value.length > 0) {
this.checkStringValue(key, value, utils, originalJson);
}
else if (value && typeof value === 'object') {
this.traverseObject(value, utils, depth + 1, originalJson);
}
this.currentPath.pop();
}
}
/**
* Check if string value contains PII based on key name and content
*/
checkStringValue(key, value, utils, originalJson) {
const keyLower = key.toLowerCase();
const jsonPath = this.getJsonPath();
// Calculate actual positions in original JSON string
const { start, end } = this.findValuePosition(value, originalJson || JSON.stringify(value));
// Detect PII type based on key name
const detectedType = this.detectPiiTypeFromKey(keyLower);
if (detectedType) {
// Key-based detection - higher confidence
this.addJsonHit({
type: detectedType,
value,
jsonPath,
keyName: key,
confidence: 0.9,
risk: this.getRiskLevel(detectedType),
reasons: ['json_key_match', `key_pattern_${detectedType}`],
features: {
keyBased: true,
detectedFromKey: key,
},
start,
end,
});
}
else {
// Content-based detection using existing detectors
this.detectInStringValue(value, key, jsonPath, utils, start, end);
}
}
/**
* Detect PII type from key name
*/
detectPiiTypeFromKey(keyLower) {
for (const [piiType, patterns] of Object.entries(PII_KEY_PATTERNS)) {
if (patterns.has(keyLower)) {
// Map phone to phone_e164 to match existing types
if (piiType === 'phone')
return 'phone_e164';
return piiType;
}
}
// Check partial matches
if (keyLower.includes('email') || keyLower.includes('mail'))
return 'email';
if (keyLower.includes('phone') || keyLower.includes('tel'))
return 'phone_e164';
if (keyLower.includes('card') || keyLower.includes('credit'))
return 'credit_card';
if (keyLower.includes('address') || keyLower.includes('addr'))
return 'address';
return null;
}
/**
* Use existing detectors on string values
*/
detectInStringValue(value, key, jsonPath, utils, start = 0, end = 0) {
// Create a mock DetectUtils for this specific value
const _mockUtils = {
src: value,
hasCtx: utils.hasCtx,
push: (hit) => {
this.addJsonHit({
type: hit.type,
value: hit.value,
jsonPath,
keyName: key,
confidence: (hit.confidence || 0.7) * 0.8, // Slightly lower confidence for content-based
risk: hit.risk,
reasons: [...(hit.reasons || []), 'json_content_match'],
features: {
...hit.features,
keyBased: false,
foundInKey: key,
},
start,
end,
});
},
canPush: utils.canPush,
};
// Run built-in detection on the value
// Note: We would need to import and use the actual detectors here
// For now, we'll do basic pattern matching
this.basicPatternMatch(value, key, jsonPath, ['json_content_match'], start, end);
}
/**
* Basic pattern matching for common PII types
*/
basicPatternMatch(value, key, jsonPath, additionalReasons = [], start = 0, end = 0) {
// Email pattern
const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
const emailMatches = value.matchAll(emailPattern);
for (const match of emailMatches) {
this.addJsonHit({
type: 'email',
value: match[0],
jsonPath,
keyName: key,
confidence: 0.8,
risk: 'medium',
reasons: [...additionalReasons, 'json_email_pattern', 'regex_match'],
features: {
keyBased: false,
foundInKey: key,
},
start,
end,
});
}
// Phone pattern (simple)
const phonePattern = /\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b/g;
const phoneMatches = value.matchAll(phonePattern);
for (const match of phoneMatches) {
this.addJsonHit({
type: 'phone_e164',
value: match[0],
jsonPath,
keyName: key,
confidence: 0.7,
risk: 'medium',
reasons: [...additionalReasons, 'json_phone_pattern', 'regex_match'],
features: {
keyBased: false,
foundInKey: key,
},
start,
end,
});
}
}
/**
* Add a JSON-specific hit
*/
addJsonHit(params) {
this.hits.push({
type: params.type,
start: params.start || 0,
end: params.end || params.value.length,
value: params.value,
risk: params.risk,
confidence: params.confidence,
reasons: params.reasons,
features: params.features,
jsonPath: params.jsonPath,
keyName: params.keyName,
});
}
/**
* Get current JSON path
*/
getJsonPath() {
if (this.currentPath.length === 0)
return '$';
return `$.${this.currentPath.join('.')}`;
}
/**
* Find the position of a value in the original JSON string
* Simple fallback - just return default positions for now
*/
findValuePosition(value, originalJson) {
// For now, just return simple positions
// This is a complex problem that would require more sophisticated JSON parsing
const index = originalJson.indexOf(value);
if (index >= 0) {
return { start: index, end: index + value.length };
}
return { start: 0, end: value.length };
}
/**
* Get risk level for PII type
*/
getRiskLevel(piiType) {
switch (piiType) {
case 'credit_card':
case 'ssn':
return 'high';
case 'email':
case 'phone_e164':
case 'name':
return 'medium';
default:
return 'low';
}
}
}
/**
* JSON detector factory function
*/
export function createJSONDetector() {
return new JSONDetector();
}