@himorishige/noren-devtools
Version:
Development and testing tools for the Noren PII detection library
385 lines (384 loc) • 15 kB
JavaScript
/**
* Accuracy measurement framework for PII detection evaluation
* Streamlined implementation with unified ground truth management and metrics calculation
*/
import { createEvaluationReport, printReport } from './report-common.js';
import { mean } from './stats-common.js';
// ===== Ground Truth Manager =====
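/**
 * Stores annotated ground truth entries, validating annotation bounds, overlaps,
 * and values on insert, with JSON import/export.
 *
 * Illustrative entry shape (field names taken from the dataset builders below; values are example data only):
 * @example
 * manager.addEntry({
 *   id: 'doc_1',
 *   text: 'Contact admin@example.org',
 *   annotations: [
 *     { start: 8, end: 25, type: 'email', value: 'admin@example.org', confidence: 1.0 },
 *   ],
 *   metadata: { source: 'test', domain: 'email', language: 'en', created_at: Date.now() },
 * });
 */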
export class GroundTruthManager {
entries = new Map();
addEntry(entry) {
// Validate annotation bounds
for (const annotation of entry.annotations) {
if (annotation.start < 0 ||
annotation.end > entry.text.length ||
annotation.start >= annotation.end) {
throw new Error(`Invalid annotation bounds in entry ${entry.id}`);
}
}
// Check for overlapping annotations
for (let i = 0; i < entry.annotations.length; i++) {
for (let j = i + 1; j < entry.annotations.length; j++) {
const annotation1 = entry.annotations[i];
const annotation2 = entry.annotations[j];
if (annotation1.start < annotation2.end && annotation1.end > annotation2.start) {
throw new Error(`Overlapping annotations in entry ${entry.id}`);
}
}
}
// Validate annotation values
for (const annotation of entry.annotations) {
const actualValue = entry.text.slice(annotation.start, annotation.end);
if (actualValue !== annotation.value) {
throw new Error(`Annotation value mismatch in entry ${entry.id}`);
}
}
this.entries.set(entry.id, entry);
}
getEntry(id) {
return this.entries.get(id);
}
getAllEntries() {
return Array.from(this.entries.values());
}
getEntriesByFilter(filter) {
return this.getAllEntries().filter(filter);
}
clear() {
this.entries.clear();
}
exportToJson() {
return JSON.stringify({
version: '1.0',
exported_at: Date.now(),
entries: this.getAllEntries(),
}, null, 2);
}
importFromJson(jsonData) {
const data = JSON.parse(jsonData);
if (!data.entries || !Array.isArray(data.entries)) {
throw new Error('Invalid JSON format: missing entries array');
}
for (const entry of data.entries) {
this.addEntry(entry);
}
}
}
// ===== Evaluation Engine =====
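/**
 * Scores detection results against ground truth using IoU-based span matching
 * and reports precision, recall, and F1 per entry, per type, and in aggregate.
 *
 * Illustrative usage (assumes a Noren registry whose detect() resolves to { hits }, as consumed below):
 * @example
 * const engine = new EvaluationEngine(groundTruthManager);
 * const { aggregate, results } = await engine.evaluateAgainstGroundTruth(registry, groundTruthManager);
 * console.log(aggregate.f1_score);
 */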
export class EvaluationEngine {
groundTruthManager;
config;
constructor(groundTruthManager, config) {
this.groundTruthManager = groundTruthManager;
this.config = config;
}
evaluateEntry(entryId, detections, config) {
if (!this.groundTruthManager) {
throw new Error('GroundTruthManager not provided to constructor');
}
const entry = this.groundTruthManager.getEntry(entryId);
if (!entry) {
throw new Error(`Ground truth entry not found: ${entryId}`);
}
return this.evaluateEntry_internal(entry, detections, { ...this.config, ...config });
}
async evaluateAgainstGroundTruth(registry, groundTruthManager, config = {}) {
const entries = groundTruthManager.getAllEntries();
const sampleSize = Math.min(config.sample_size || entries.length, entries.length);
const sampledEntries = entries.slice(0, sampleSize);
console.log(`🔍 Evaluating ${sampledEntries.length} entries against ground truth...`);
const results = [];
for (let i = 0; i < sampledEntries.length; i++) {
const entry = sampledEntries[i];
console.log(` Progress: ${(((i + 1) / sampledEntries.length) * 100).toFixed(1)}% (${i + 1}/${sampledEntries.length})`);
try {
const detections = await registry.detect(entry.text);
const detectionResults = detections.hits.map((hit) => ({
start: hit.start,
end: hit.end,
type: hit.type,
value: hit.value,
confidence: hit.confidence || 0.5,
risk: hit.risk,
}));
const result = this.evaluateEntry_internal(entry, detectionResults, config);
results.push(result);
}
catch (error) {
console.warn(`Skipping entry ${entry.id}: ${error}`);
}
}
const aggregate = this.aggregateResults(results);
// Generate report
this.printEvaluationReport(aggregate);
return { aggregate, results };
}
evaluateEntry_internal(entry, detections, config) {
const overlapThreshold = config.overlap_threshold || 0.5;
// Filter detections by confidence threshold
const filteredDetections = config.confidence_threshold
? detections.filter((d) => (d.confidence ?? 0) >= (config.confidence_threshold ?? 0))
: detections;
// Filter by excluded types
const finalDetections = config.exclude_types
? filteredDetections.filter((d) => !config.exclude_types?.includes(d.type))
: filteredDetections;
const annotations = config.exclude_types
? entry.annotations.filter((a) => !config.exclude_types?.includes(a.type))
: entry.annotations;
const truePositives = [];
const falsePositives = [];
const matchedAnnotations = new Set();
// Find true positives and false positives
for (const detection of finalDetections) {
let bestMatch = null;
for (let i = 0; i < annotations.length; i++) {
const annotation = annotations[i];
if (matchedAnnotations.has(i))
continue;
const overlap = this.calculateOverlap(detection, annotation);
if (overlap >= overlapThreshold && detection.type === annotation.type) {
if (!bestMatch || overlap > bestMatch.overlap) {
bestMatch = { annotation, index: i, overlap };
}
}
}
if (bestMatch) {
truePositives.push({
detected: detection,
ground_truth: bestMatch.annotation,
overlap_ratio: bestMatch.overlap,
});
matchedAnnotations.add(bestMatch.index);
}
else {
falsePositives.push(detection);
}
}
// Find false negatives (unmatched annotations)
const falseNegatives = annotations.filter((_, index) => !matchedAnnotations.has(index));
// Calculate metrics
const tp = truePositives.length;
const fp = falsePositives.length;
const fn = falseNegatives.length;
const precision = tp / Math.max(tp + fp, 1);
const recall = tp / Math.max(tp + fn, 1);
const f1_score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
return {
entry_id: entry.id,
true_positives: truePositives,
false_positives: falsePositives,
false_negatives: falseNegatives,
precision,
recall,
f1_score,
};
}
aggregateResults(results) {
let totalTP = 0;
let totalFP = 0;
let totalFN = 0;
const typeMetrics = {};
const tpConfidences = [];
const fpConfidences = [];
for (const result of results) {
totalTP += result.true_positives.length;
totalFP += result.false_positives.length;
totalFN += result.false_negatives.length;
// Collect confidence data and per-type metrics
for (const tp of result.true_positives) {
tpConfidences.push(tp.detected.confidence);
const type = tp.ground_truth.type;
if (!typeMetrics[type])
typeMetrics[type] = { tp: 0, fp: 0, fn: 0 };
typeMetrics[type].tp++;
}
for (const fp of result.false_positives) {
fpConfidences.push(fp.confidence);
const type = fp.type;
if (!typeMetrics[type])
typeMetrics[type] = { tp: 0, fp: 0, fn: 0 };
typeMetrics[type].fp++;
}
for (const fn of result.false_negatives) {
const type = fn.type;
if (!typeMetrics[type])
typeMetrics[type] = { tp: 0, fp: 0, fn: 0 };
typeMetrics[type].fn++;
}
}
// Calculate aggregate scores
const precision = totalTP / Math.max(totalTP + totalFP, 1);
const recall = totalTP / Math.max(totalTP + totalFN, 1);
const f1_score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
// Calculate per-type metrics
const finalTypeMetrics = {};
for (const [type, counts] of Object.entries(typeMetrics)) {
const typePrecision = counts.tp / Math.max(counts.tp + counts.fp, 1);
const typeRecall = counts.tp / Math.max(counts.tp + counts.fn, 1);
const typeF1 = typePrecision + typeRecall > 0 ? (2 * typePrecision * typeRecall) / (typePrecision + typeRecall) : 0;
finalTypeMetrics[type] = {
tp: counts.tp,
fp: counts.fp,
fn: counts.fn,
precision: typePrecision,
recall: typeRecall,
f1_score: typeF1,
};
}
// Confidence analysis using mean from stats-common
const avgTPConfidence = tpConfidences.length > 0 ? mean(tpConfidences) : 0;
const avgFPConfidence = fpConfidences.length > 0 ? mean(fpConfidences) : 0;
// Confidence buckets
const buckets = [
{ range: '0.0-0.5', tp: 0, fp: 0, precision: 0 },
{ range: '0.5-0.7', tp: 0, fp: 0, precision: 0 },
{ range: '0.7-0.9', tp: 0, fp: 0, precision: 0 },
{ range: '0.9-1.0', tp: 0, fp: 0, precision: 0 },
];
for (const conf of tpConfidences) {
const bucketIndex = conf < 0.5 ? 0 : conf < 0.7 ? 1 : conf < 0.9 ? 2 : 3;
buckets[bucketIndex].tp++;
}
for (const conf of fpConfidences) {
const bucketIndex = conf < 0.5 ? 0 : conf < 0.7 ? 1 : conf < 0.9 ? 2 : 3;
buckets[bucketIndex].fp++;
}
for (const bucket of buckets) {
bucket.precision = bucket.tp / Math.max(bucket.tp + bucket.fp, 1);
}
return {
total_entries: results.length,
total_annotations: totalTP + totalFN,
total_detections: totalTP + totalFP,
true_positives: totalTP,
false_positives: totalFP,
false_negatives: totalFN,
precision,
recall,
f1_score,
type_metrics: finalTypeMetrics,
confidence_analysis: {
avg_tp_confidence: avgTPConfidence,
avg_fp_confidence: avgFPConfidence,
confidence_buckets: buckets,
},
};
}
calculateOverlap(detection, annotation) {
const overlapStart = Math.max(detection.start, annotation.start);
const overlapEnd = Math.min(detection.end, annotation.end);
if (overlapStart >= overlapEnd)
return 0;
const overlapLength = overlapEnd - overlapStart;
const detectionLength = detection.end - detection.start;
const annotationLength = annotation.end - annotation.start;
// Use intersection over union (IoU)
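// e.g. detection [0, 10) vs annotation [5, 15): overlap 5, union 10 + 10 - 5 = 15, IoU = 5 / 15 ≈ 0.33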
const unionLength = detectionLength + annotationLength - overlapLength;
return overlapLength / unionLength;
}
printEvaluationReport(metrics) {
const accuracyMetrics = {
precision: metrics.precision,
recall: metrics.recall,
f1Score: metrics.f1_score,
truePositives: metrics.true_positives,
falsePositives: metrics.false_positives,
falseNegatives: metrics.false_negatives,
};
const report = createEvaluationReport('PII Detection Evaluation', accuracyMetrics);
printReport(report);
}
}
// ===== Test Dataset Builder =====
export function createSyntheticEntry(id, patterns) {
let text = `This is a test document (ID: ${id}) with PII patterns: `;
const annotations = [];
for (let i = 0; i < patterns.length; i++) {
const { type, pattern } = patterns[i];
const start = text.length;
text += pattern;
const end = text.length;
annotations.push({
start,
end,
type,
value: pattern,
confidence: 1.0,
metadata: {
annotator: 'synthetic',
difficulty: 'medium',
},
});
if (i < patterns.length - 1) {
text += ', ';
}
}
text += '. End of test document.';
return {
id,
text,
annotations,
metadata: {
source: 'synthetic',
domain: 'test',
language: 'en',
created_at: Date.now(),
},
};
}
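// Illustrative call: createSyntheticEntry('t1', [{ type: 'email', pattern: 'a@b.com' }])
// yields one entry whose single annotation covers 'a@b.com' in the generated text.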
export function createEmailTestDataset() {
const manager = new GroundTruthManager();
const testEntries = [
{
id: 'email_test_1',
text: 'Please contact john.doe@company.com for more information.',
annotations: [
{
start: 15,
end: 35,
type: 'email',
value: 'john.doe@company.com',
confidence: 1.0,
},
],
},
{
id: 'email_test_2',
text: 'Send reports to admin@example.org and backup@test.co.jp',
annotations: [
{
start: 16,
end: 33,
type: 'email',
value: 'admin@example.org',
confidence: 1.0,
},
{
start: 38,
end: 55,
type: 'email',
value: 'backup@test.co.jp',
confidence: 1.0,
},
],
},
];
for (const entry of testEntries) {
manager.addEntry({
...entry,
annotations: entry.annotations.map((a) => ({
...a,
metadata: { annotator: 'test', difficulty: 'easy' },
})),
metadata: {
source: 'test',
domain: 'email',
language: 'en',
created_at: Date.now(),
},
});
}
return manager;
}
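// Illustrative sketch: score hand-built detections against one bundled entry.
// The detection shape mirrors what evaluateAgainstGroundTruth derives from registry hits.
//
// const manager = createEmailTestDataset();
// const engine = new EvaluationEngine(manager);
// const single = engine.evaluateEntry('email_test_1', [
//   { start: 15, end: 35, type: 'email', value: 'john.doe@company.com', confidence: 0.9 },
// ]);
// console.log(single.precision, single.recall, single.f1_score);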