@himorishige/noren-plugin-jp
Version:
Japan-specific PII detection plugin for Noren (phone numbers, postal codes, MyNumber)
186 lines (185 loc) • 8.23 kB
JavaScript
import { validateMyNumber } from './validators.js';
/**
 * Decide whether a span that matched the postal-code pattern is really
 * (part of) a phone number and should therefore not be reported as postal.
 *
 * A candidate is treated as a phone number when any of these hold:
 *  - it starts with the international prefix "+81";
 *  - it starts with "0" followed by a digit AND more digits follow the
 *    candidate (optionally after a hyphen), i.e. it is only the head of a
 *    longer number such as "090-1234" in "090-1234-5678";
 *  - a phone-related label appears in the 30 characters before it.
 *
 * A leading 0 alone is deliberately NOT sufficient: Japanese postal codes for
 * Hokkaido and northern Tohoku start with 0 (e.g. 060-0001), and treating
 * every such code as a phone number would hide them from detection.
 *
 * @param {string} text - Full text being scanned.
 * @param {number} matchIndex - Start offset of the candidate within `text`.
 * @param {number} matchLength - Length of the candidate in UTF-16 code units.
 * @returns {boolean} `true` when the candidate looks like a phone number.
 */
function isLikelyPhoneNumber(text, matchIndex, matchLength) {
  const matchEnd = matchIndex + matchLength;
  const match = text.slice(matchIndex, matchEnd);

  // International notation is unambiguous.
  if (match.startsWith('+81')) {
    return true;
  }

  // Domestic prefix: only phone-like when the number continues past the
  // candidate (two characters are enough to see an optional hyphen + digit).
  const continuesWithDigits = /^-?\d/.test(text.slice(matchEnd, matchEnd + 2));
  if (/^0\d/.test(match) && continuesWithDigits) {
    return true;
  }

  // Check if there's a phone-related label nearby (within 30 characters before).
  // Lower-casing makes the ASCII labels case-insensitive.
  const beforeText = text.slice(Math.max(0, matchIndex - 30), matchIndex).toLowerCase();
  const phoneLabels = ['tel', '電話', 'phone', 'fax', '携帯', 'mobile'];
  return phoneLabels.some((label) => beforeText.includes(label));
}
// Pre-compiled regex patterns for JP detectors with Japanese-aware boundaries.
//
// Every pattern except `internationalPhone` is guarded by a lookbehind, and
// every pattern by a lookahead, that reject a neighbouring ASCII digit (0-9)
// or full-width digit (U+FF10-U+FF19), so a candidate is never cut out of the
// middle of a longer digit run. The full-width range is written with \u
// escapes on purpose: a literal full-width range is silently collapsed to a
// second "0-9" by Unicode (NFKC) normalisation, which disables the
// Japanese-aware part of the boundary.
//
// `postal` additionally rejects a preceding currency sign so that amounts
// such as "¥1234567" are not read as postal codes.
//
// All patterns carry the `g` flag because they are consumed via
// String.prototype.matchAll, which requires it.
const JP_PATTERNS = {
  // 7 digits, optional hyphen after the third: 123-4567 / 1234567
  postal: /(?<![0-9\uFF10-\uFF19¥$€£¢])\d{3}-?\d{4}(?![0-9\uFF10-\uFF19])/g,
  // 060/070/080/090 + 4 + 4 digits, hyphens optional
  cellPhone: /(?<![0-9\uFF10-\uFF19])0(?:60|70|80|90)-?\d{4}-?\d{4}(?![0-9\uFF10-\uFF19])/g,
  // 0 + 1-2 digit area part + 3-4 digits + 4 digits, hyphens optional.
  // Note: cellular numbers satisfy this pattern as well.
  landlinePhone: /(?<![0-9\uFF10-\uFF19])0[1-9]\d?-?\d{3,4}-?\d{4}(?![0-9\uFF10-\uFF19])/g,
  // +81 followed by three digit groups separated by '-', '.', whitespace or nothing
  internationalPhone: /\+81[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{3,4}(?![0-9\uFF10-\uFF19])/g,
  // Exactly 12 consecutive digits
  myNumber: /(?<![0-9\uFF10-\uFF19])\d{12}(?![0-9\uFF10-\uFF19])/g,
};
// Context hint words per detector family, stored as Sets.
// The phone and myNumber sets are converted with Array.from() before being
// handed to `hasCtx`, so insertion order below is the order callers see.
const JP_CONTEXTS = {
  // Words suggesting an address / postal code is nearby.
  postal: new Set([
    '〒',
    '住所',
    'address',
    'zip',
  ]),
  // Words suggesting a telephone number is nearby.
  phone: new Set([
    'tel',
    '電話',
    'phone',
  ]),
  // Words suggesting an Individual Number (MyNumber) is nearby.
  myNumber: new Set([
    'マイナンバー',
    '個人番号',
    'mynumber',
  ]),
};
/**
 * Detectors contributed by the JP plugin.
 *
 * Each entry has an `id`, a numeric `priority` and a `match` callback. `match`
 * receives `{ src, push, hasCtx }`:
 *  - `src`    the text to scan;
 *  - `push`   called once per hit with
 *             `{ type, start, end, value, risk, confidence, reasons, features }`;
 *  - `hasCtx` called with an array of hint strings; its result is used as a
 *             boolean "context present" flag (presumably it reports whether
 *             any hint occurs in the text — semantics are defined by the host,
 *             TODO confirm).
 */
export const detectors = [
  {
    id: 'jp.postal',
    priority: 5, // Lower priority than phone detection
    match: ({ src, push, hasCtx }) => {
      const hasPostalContext = hasCtx(['〒', '郵便', '住所', 'zip', 'postal']);

      for (const m of src.matchAll(JP_PATTERNS.postal)) {
        if (m.index == null) {
          continue;
        }
        // Skip if this looks like a phone number
        if (isLikelyPhoneNumber(src, m.index, m[0].length)) {
          continue;
        }

        // Check for postal symbol (〒) within 10 characters before
        const beforeText = src.slice(Math.max(0, m.index - 10), m.index);
        const hasPostalSymbol = beforeText.includes('〒');

        // Confidence ladder: explicit 〒 mark > general postal context > bare pattern.
        // The lowest rung (0.4) is still reported, so no threshold check is needed.
        const confidence = hasPostalSymbol ? 0.9 : hasPostalContext ? 0.6 : 0.4;

        push({
          type: 'postal_jp',
          start: m.index,
          end: m.index + m[0].length,
          value: m[0],
          risk: 'low',
          confidence,
          reasons: [
            'postal_pattern',
            ...(hasPostalSymbol ? ['has_postal_symbol'] : []),
            ...(hasPostalContext ? ['has_context'] : []),
          ],
          features: {
            hasPostalSymbol,
            hasContext: hasPostalContext,
            // Canonical NNN-NNNN form regardless of how the input was written
            normalized: m[0].replace(/[^\d]/g, '').replace(/(\d{3})(\d{4})/, '$1-$2'),
          },
        });
      }
    },
  },
  {
    id: 'jp.phone',
    priority: 10, // Higher priority than postal detection
    match: ({ src, push, hasCtx }) => {
      const hasContext = hasCtx(Array.from(JP_CONTEXTS.phone));

      // Spans already reported as cellular, keyed "start:end". Every cellular
      // number also satisfies the landline pattern; without this bookkeeping
      // the same span would be pushed twice with conflicting phoneType, risk
      // and confidence.
      const cellularSpans = new Set();

      // Cell phone detection
      for (const m of src.matchAll(JP_PATTERNS.cellPhone)) {
        if (m.index == null) {
          continue;
        }
        const end = m.index + m[0].length;
        cellularSpans.add(`${m.index}:${end}`);
        push({
          type: 'phone_jp',
          start: m.index,
          end,
          value: m[0],
          risk: 'medium',
          confidence: 0.8, // Cell phones are fairly reliable
          reasons: ['cell_phone_pattern', hasContext ? 'context_match' : 'no_context'],
          features: {
            phoneType: 'cellular',
            hasContext,
            normalized: m[0].replace(/[^\d]/g, '').replace(/(\d{3})(\d{4})(\d{4})/, '$1-$2-$3'),
          },
        });
      }

      // Landline phone detection
      for (const m of src.matchAll(JP_PATTERNS.landlinePhone)) {
        if (m.index == null) {
          continue;
        }
        const end = m.index + m[0].length;
        // Already reported above as a cellular number — do not report twice.
        if (cellularSpans.has(`${m.index}:${end}`)) {
          continue;
        }
        push({
          type: 'phone_jp',
          start: m.index,
          end,
          value: m[0],
          risk: 'low',
          confidence: hasContext ? 0.7 : 0.5,
          reasons: ['landline_pattern', hasContext ? 'context_match' : 'no_context'],
          features: {
            phoneType: 'landline',
            hasContext,
            // NOTE(review): the regrouping is greedy on the bare digits and does
            // not necessarily reproduce the area-code split of the input
            // (e.g. "03-1234-5678" becomes "031-234-5678").
            normalized: m[0].replace(/[^\d]/g, '').replace(/(\d{2,4})(\d{3,4})(\d{4})/, '$1-$2-$3'),
          },
        });
      }

      // International phone detection - +81 prefix is reliable without context.
      // The pattern itself starts with "+81", so every match carries the prefix
      // and is reported at high confidence whether or not context is present.
      for (const m of src.matchAll(JP_PATTERNS.internationalPhone)) {
        if (m.index == null) {
          continue;
        }
        push({
          type: 'phone_jp',
          start: m.index,
          end: m.index + m[0].length,
          value: m[0],
          risk: 'medium',
          confidence: 0.9,
          reasons: [
            'international_pattern',
            hasContext ? 'context_match' : 'international_prefix',
          ],
          features: {
            phoneType: 'international',
            hasContext,
            hasInternationalPrefix: true,
            // Keep the leading '+' and the digits, drop separators
            normalized: m[0].replace(/[^\d+]/g, ''),
          },
        });
      }
    },
  },
  {
    id: 'jp.mynumber',
    priority: -10,
    match: ({ src, push, hasCtx }) => {
      // A bare 12-digit run is too ambiguous on its own: MyNumber candidates
      // are only reported when the context hints are present.
      if (!hasCtx(Array.from(JP_CONTEXTS.myNumber))) {
        return;
      }

      for (const m of src.matchAll(JP_PATTERNS.myNumber)) {
        if (m.index == null) {
          continue;
        }
        // `validateMyNumber` is imported from ./validators.js; only its
        // `valid`, `confidence`, `reason` and `normalized` fields are read here.
        const validation = validateMyNumber(m[0]);

        // Context is guaranteed at this point, so the candidate is reported
        // even when validation fails; the outcome is exposed via
        // `checksumValid` / `validationReason`.
        push({
          type: 'mynumber_jp',
          start: m.index,
          end: m.index + m[0].length,
          value: m[0],
          risk: 'high',
          // Falls back to 0.6 when the validator supplies no (or a falsy) confidence
          confidence: validation.confidence || 0.6,
          reasons: ['mynumber_pattern', validation.reason, 'context_match'],
          features: {
            checksumValid: validation.valid,
            hasContext: true,
            normalized: validation.normalized || m[0],
            validationReason: validation.reason,
          },
        });
      }
    },
  },
];
/**
 * Masking functions keyed by hit type. Each returns the replacement text for
 * a detected value.
 */
export const maskers = {
  /** Postal codes are replaced by a fixed-shape placeholder. */
  postal_jp() {
    return '•••-••••';
  },

  /** Phone numbers keep their separators and '+'; every digit becomes a bullet. */
  phone_jp({ value }) {
    return value.replace(/\d/g, '•');
  },

  /** MyNumber values are replaced wholesale by a redaction tag. */
  mynumber_jp() {
    return '[REDACTED:MYNUMBER]';
  },
};