UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

278 lines 12 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const index_1 = require("../../src/normalizers/index");
const index_2 = require("../../src/matchers/index");

/**
 * Integration suite that feeds SemanticNormalizer output into
 * FuzzyMatcher.compare and checks match counts, confidence scores, tolerance
 * of malformed/empty input, throughput, international input and
 * per-normalizer configuration.
 */
describe('Semantic Normalizers Integration', () => {
  // Single source of truth for the similarity cut-off used both to configure
  // the matcher and in the assertions that re-check its results.
  const MATCH_THRESHOLD = 0.85;

  let normalizer;
  let matcher;

  // Fresh instances per test so no state can leak between cases.
  beforeEach(() => {
    normalizer = new index_1.SemanticNormalizer();
    matcher = new index_2.FuzzyMatcher({ threshold: MATCH_THRESHOLD });
  });

  describe('Real-world data matching scenarios', () => {
    // Cross-compares every normalized email of dataset1 with every normalized
    // email of dataset2 and expects exactly three pairs flagged as matches.
    test('should match email variations across datasets', () => {
      const dataset1 = [
        'john.doe@gmail.com',
        'jane.smith+newsletter@outlook.com',
        'admin@company.org',
      ];
      const dataset2 = [
        'johndoe@gmail.com',
        'jane.smith@outlook.com',
        'admin@company.org',
      ];

      // Normalize the right-hand dataset once rather than once per left-hand entry.
      const candidates = dataset2.map((email2) => ({
        email2,
        norm2: normalizer.normalizeEmail(email2),
      }));

      const matches = [];
      for (const email1 of dataset1) {
        const norm1 = normalizer.normalizeEmail(email1);
        for (const { email2, norm2 } of candidates) {
          const fuzzyResult = matcher.compare(norm1.normalized, norm2.normalized);
          if (fuzzyResult.isMatch) {
            matches.push({
              original: email1,
              normalized: norm1.normalized,
              match: email2,
              similarity: fuzzyResult.similarity,
            });
          }
        }
      }

      expect(matches).toHaveLength(3);
      expect(matches.every((m) => m.similarity >= MATCH_THRESHOLD)).toBe(true);
    });

    // Only pairs where both sides normalize to a valid phone are compared;
    // at least one pair must exceed a similarity of 0.9.
    test('should handle international phone number matching', () => {
      const phones1 = [
        '+1 (555) 123-4567',
        '+44 20 7123 4567',
        '+49 30 12345678',
      ];
      const phones2 = [
        '555-123-4567',
        '+442071234567',
        '03012345678',
      ];

      const candidates = phones2.map((phone2) => ({
        phone2,
        norm2: normalizer.normalizePhone(phone2),
      }));

      const matches = [];
      for (const phone1 of phones1) {
        const norm1 = normalizer.normalizePhone(phone1);
        for (const { phone2, norm2 } of candidates) {
          if (norm1.isValid && norm2.isValid) {
            const fuzzyResult = matcher.compare(norm1.normalized, norm2.normalized);
            if (fuzzyResult.similarity > 0.9) {
              matches.push({ phone1, phone2, similarity: fuzzyResult.similarity });
            }
          }
        }
      }

      expect(matches.length).toBeGreaterThan(0);
    });

    // Uses a looser 0.8 similarity cut-off than the matcher's own threshold
    // and requires at least two name pairs to clear it.
    test('should match name variations with cultural considerations', () => {
      const names1 = [
        'John Smith',
        'María García-López',
        'Dr. Michael O\'Connor Jr.',
        'Sarah Johnson',
      ];
      const names2 = [
        'Jon Smyth',
        'Maria Garcia Lopez',
        'Michael O\'Connor',
        'Sara Johnson',
      ];

      const candidates = names2.map((name2) => ({
        name2,
        norm2: normalizer.normalizeName(name2),
      }));

      const matches = [];
      for (const name1 of names1) {
        const norm1 = normalizer.normalizeName(name1);
        for (const { name2, norm2 } of candidates) {
          const fuzzyResult = matcher.compare(norm1.normalized, norm2.normalized);
          if (fuzzyResult.similarity >= 0.8) {
            matches.push({
              name1,
              name2,
              similarity: fuzzyResult.similarity,
              confidence: fuzzyResult.confidence,
            });
          }
        }
      }

      expect(matches.length).toBeGreaterThanOrEqual(2);
      expect(matches.every((m) => m.similarity >= 0.8)).toBe(true);
    });

    // Abbreviated vs. spelled-out address forms; at least one pair must reach
    // a similarity of 0.8 after normalization.
    test('should handle address normalization and matching', () => {
      const addresses1 = [
        '123 Main St, New York, NY 10001',
        '456 Oak Ave, Los Angeles, California 90210',
      ];
      const addresses2 = [
        '123 Main Street, New York, New York 10001',
        '456 Oak Avenue, LA, CA 90210',
      ];

      const candidates = addresses2.map((addr2) => ({
        addr2,
        norm2: normalizer.normalizeAddress(addr2),
      }));

      const matches = [];
      for (const addr1 of addresses1) {
        const norm1 = normalizer.normalizeAddress(addr1);
        for (const { addr2, norm2 } of candidates) {
          const fuzzyResult = matcher.compare(norm1.normalized, norm2.normalized);
          if (fuzzyResult.similarity >= 0.8) {
            matches.push({ addr1, addr2, similarity: fuzzyResult.similarity });
          }
        }
      }

      expect(matches.length).toBeGreaterThanOrEqual(1);
    });
  });

  describe('Confidence scoring validation', () => {
    // Each well-formed input must be normalized with confidence >= 0.8 when
    // dispatched through the generic normalizeField(input, type) entry point.
    test('should provide accurate confidence scores', () => {
      const testCases = [
        { type: 'email', input: 'john.doe+test@gmail.com', expectedConfidence: 0.8 },
        { type: 'phone', input: '+1-555-123-4567', expectedConfidence: 0.8 },
        { type: 'name', input: 'Dr. John Smith Jr.', expectedConfidence: 0.8 },
      ];

      testCases.forEach(({ type, input, expectedConfidence }) => {
        const result = normalizer.normalizeField(input, type);
        expect(result.confidence).toBeGreaterThanOrEqual(expectedConfidence);
      });
    });
  });

  describe('Accuracy validation', () => {
    // Compares raw (un-normalized) string pairs and requires that at least
    // 90% of them are classified as expected at MATCH_THRESHOLD.
    test('should achieve >90% accuracy on fuzzy matching', () => {
      const testPairs = [
        // Should match (true positives)
        ['hello', 'helo', true],
        ['John Smith', 'Jon Smyth', true],
        ['Microsoft', 'Mircosoft', true],
        ['test@gmail.com', 'test@googlemail.com', true],
        ['color', 'colour', true],
        // Should not match (true negatives)
        ['hello', 'world', false],
        ['John Smith', 'Jane Doe', false],
        ['Microsoft', 'Apple', false],
        ['test@gmail.com', 'admin@yahoo.com', false],
        ['red', 'blue', false],
      ];

      const correct = testPairs.filter(([s1, s2, shouldMatch]) => {
        const result = matcher.compare(s1, s2);
        const isMatch = result.similarity >= MATCH_THRESHOLD;
        return isMatch === shouldMatch;
      }).length;

      const accuracy = correct / testPairs.length;
      console.log(`Fuzzy matching accuracy: ${(accuracy * 100).toFixed(1)}%`);
      expect(accuracy).toBeGreaterThanOrEqual(0.9);
    });
  });

  describe('Error handling and edge cases', () => {
    // Malformed values must not throw and must come back with confidence < 0.5.
    test('should handle malformed input gracefully', () => {
      const malformedInputs = [
        { type: 'email', value: 'not-an-email' },
        { type: 'phone', value: 'abc-def-ghij' },
        { type: 'uuid', value: 'not-a-uuid' },
      ];

      malformedInputs.forEach(({ type, value }) => {
        let result;
        // Keep the "does not throw" check separate from the confidence check so
        // a confidence failure is reported as such instead of as a thrown error.
        expect(() => {
          result = normalizer.normalizeField(value, type);
        }).not.toThrow();
        expect(result.confidence).toBeLessThan(0.5);
      });
    });

    // NOTE(review): despite the title, only empty/whitespace strings are
    // exercised here; no null/undefined value is passed — confirm whether
    // that is intentional.
    test('should handle empty and null inputs', () => {
      const emptyInputs = ['', ' ', '\t\n'];

      emptyInputs.forEach((input) => {
        expect(() => {
          normalizer.normalizeEmail(input);
          normalizer.normalizePhone(input);
          normalizer.normalizeName(input);
          normalizer.normalizeAddress(input);
          normalizer.normalizeUuid(input);
        }).not.toThrow();
      });
    });
  });

  describe('Performance under load', () => {
    // Normalizes 1000 synthetic records (email, phone, name each) and requires
    // the average wall-clock cost to stay below 1 ms per record.
    // NOTE(review): wall-clock budgets can be sensitive to machine load.
    test('should maintain performance with large datasets', () => {
      const largeDataset = Array.from({ length: 1000 }, (_, i) => ({
        email: `user${i}@example${i % 10}.com`,
        phone: `555${String(i).padStart(7, '0')}`,
        name: `User ${i}`,
      }));

      const start = performance.now();
      largeDataset.forEach((record) => {
        normalizer.normalizeEmail(record.email);
        normalizer.normalizePhone(record.phone);
        normalizer.normalizeName(record.name);
      });
      const end = performance.now();

      const totalTime = end - start;
      const avgTime = totalTime / largeDataset.length;
      console.log(`Large dataset processing: ${totalTime.toFixed(2)}ms total, ${avgTime.toFixed(3)}ms per record`);
      expect(avgTime).toBeLessThan(1.0); // Less than 1ms per record
    });
  });

  describe('International data support', () => {
    // Each fixture sets exactly one of email/name/phone/address; the matching
    // normalizer is chosen by whichever key is present.
    test('should handle international characters correctly', () => {
      const internationalData = [
        { email: 'müller@münchen.de', expected: true },
        { name: 'José María García', expected: true },
        { phone: '+33 1 42 34 56 78', expected: true },
        { address: 'Champs-Élysées, Paris, France', expected: true },
      ];

      internationalData.forEach(({ email, name, phone, address, expected }) => {
        if (email) {
          const result = normalizer.normalizeEmail(email);
          expect(result.confidence > 0.8).toBe(expected);
        }
        if (name) {
          const result = normalizer.normalizeName(name);
          expect(result.confidence > 0.8).toBe(expected);
        }
        if (phone) {
          // Phones are checked via isValid rather than a confidence score.
          const result = normalizer.normalizePhone(phone);
          expect(result.isValid).toBe(expected);
        }
        if (address) {
          const result = normalizer.normalizeAddress(address);
          expect(result.confidence > 0.8).toBe(expected);
        }
      });
    });
  });

  describe('Configuration flexibility', () => {
    // Builds a normalizer with per-type options and checks that each option is
    // reflected in the corresponding normalizer's output.
    test('should allow custom configuration per normalizer', () => {
      const customNormalizer = new index_1.SemanticNormalizer({
        email: { removeDotsGmail: false },
        phone: { defaultCountryCode: '44' },
        name: { removeMiddleInitials: true },
      });

      const emailResult = customNormalizer.normalizeEmail('john.doe@gmail.com');
      expect(emailResult.normalized).toBe('john.doe@gmail.com');

      const phoneResult = customNormalizer.normalizePhone('2071234567');
      expect(phoneResult.countryCode).toBe('44');

      const nameResult = customNormalizer.normalizeName('John M. Doe');
      expect(nameResult.normalized).not.toContain('M.');
    });
  });
});
//# sourceMappingURL=semantic-normalizers.test.js.map