/*
 * semantic-ds-toolkit
 * Version:
 * Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
 * 256 lines • 8.81 kB
 * JavaScript
 */
;
// Set the `__esModule` flag on the CommonJS exports object (the marker
// conventionally emitted when ES-module source is compiled to CommonJS).
Object.defineProperty(exports, "__esModule", { value: true });
// Reserve the export slot with `undefined`; the class itself is assigned
// further down, after its definition has been evaluated.
exports.NameNormalizer = void 0;
// `normalizeName` is a function declaration, so it is hoisted and can be
// exported before the line on which it is written.
exports.normalizeName = normalizeName;
/**
 * Lookup from a lower-cased, period-free title token to its canonical
 * spelling (for example `doctor` and `dr` both map to `Dr.`).
 *
 * Keys are written as object properties; `Object.entries` keeps their
 * insertion order, so the resulting Map iterates in the order listed here.
 */
const PREFIXES = new Map(Object.entries({
  mr: 'Mr.',
  mrs: 'Mrs.',
  ms: 'Ms.',
  miss: 'Miss',
  dr: 'Dr.',
  prof: 'Prof.',
  professor: 'Prof.',
  doctor: 'Dr.',
  rev: 'Rev.',
  reverend: 'Rev.',
  sir: 'Sir',
  lord: 'Lord',
  lady: 'Lady',
  hon: 'Hon.',
  honorable: 'Hon.',
  sen: 'Sen.',
  senator: 'Sen.',
  rep: 'Rep.',
  representative: 'Rep.',
  gov: 'Gov.',
  governor: 'Gov.',
  pres: 'Pres.',
  president: 'Pres.',
  capt: 'Capt.',
  captain: 'Capt.',
  col: 'Col.',
  colonel: 'Col.',
  gen: 'Gen.',
  general: 'Gen.',
  lt: 'Lt.',
  lieutenant: 'Lt.',
  maj: 'Maj.',
  major: 'Maj.',
  sgt: 'Sgt.',
  sergeant: 'Sgt.',
}));
/**
 * Lookup from a lower-cased, period-free trailing token to its canonical
 * spelling: generational markers, roman numerals and credential
 * abbreviations (for example `phd` maps to `Ph.D.`).
 *
 * Keys are written as object properties; `Object.entries` keeps their
 * insertion order, so the resulting Map iterates in the order listed here.
 */
const SUFFIXES = new Map(Object.entries({
  jr: 'Jr.',
  junior: 'Jr.',
  sr: 'Sr.',
  senior: 'Sr.',
  ii: 'II',
  iii: 'III',
  iv: 'IV',
  v: 'V',
  vi: 'VI',
  vii: 'VII',
  viii: 'VIII',
  ix: 'IX',
  x: 'X',
  md: 'M.D.',
  phd: 'Ph.D.',
  dds: 'D.D.S.',
  dvm: 'D.V.M.',
  jd: 'J.D.',
  cpa: 'C.P.A.',
  esq: 'Esq.',
  esquire: 'Esq.',
  rn: 'R.N.',
  lpn: 'L.P.N.',
  pa: 'P.A.',
  np: 'N.P.',
}));
/**
 * Lower-cased name particles such as `von`, `de` or `mc`.
 *
 * NOTE(review): nothing in the code visible in this file reads this set;
 * confirm whether it is meant to feed the last-name parsing or can be removed.
 *
 * The entries are grouped in rows and flattened, which yields the same
 * members in the same iteration order as one flat list.
 */
const PARTICLE_PREFIXES = new Set([
  ['de', 'del', 'della', 'delle', 'di', 'da', 'dal', 'dalla'],
  ['von', 'van', 'der', 'den', 'ter', 'te'],
  ['le', 'la', 'les', 'du', 'des'],
  ['al', 'el', 'bin', 'ibn', 'abu'],
  ['mac', 'mc', "o'", 'ó', 'ní', 'nic'],
].flat());
/**
 * Parses a free-form personal name into components (prefix, first, middle,
 * last, suffix), re-assembles a canonical spelling from them, and proposes
 * alternative renderings of the same name.
 */
class NameNormalizer {
  /** Effective settings: the constructor defaults overlaid with the caller's overrides. */
  options;

  /**
   * @param {object} [options] - Overrides merged over the defaults below.
   * @param {boolean} [options.removeMiddleInitials=false] - Drop a one-letter
   *   middle token ("Q" or "Q.") while parsing. When set, `assembleName` also
   *   leaves `middleName` out of the assembled string.
   * @param {boolean} [options.standardizePrefixes=true] - Recognise a leading
   *   title through the PREFIXES table.
   * @param {boolean} [options.standardizeSuffixes=true] - Recognise a trailing
   *   suffix through the SUFFIXES table.
   * @param {boolean} [options.handleHyphenated=true] - Emit one variation per
   *   part of a hyphenated last name.
   * @param {boolean} [options.normalizeCase=true] - Lower-case the input and
   *   capitalise the first letter of every word.
   * @param {boolean} [options.removeAccents=false] - Strip combining
   *   diacritics before parsing.
   */
  constructor(options = {}) {
    this.options = {
      removeMiddleInitials: false,
      standardizePrefixes: true,
      standardizeSuffixes: true,
      handleHyphenated: true,
      normalizeCase: true,
      removeAccents: false,
      ...options,
    };
  }

  /**
   * Normalizes a single name.
   *
   * @param {string|null|undefined} name - Raw name. `null` and `undefined`
   *   are treated like an empty string instead of throwing.
   * @returns {{normalized: string, original: string, confidence: number,
   *   components: object, variations: string[]}} The assembled name, the
   *   trimmed input, a heuristic score, the parsed components and a
   *   de-duplicated list of alternative renderings.
   */
  normalize(name) {
    const original = String(name ?? '').trim();
    if (!original) {
      return {
        normalized: '',
        original,
        confidence: 0,
        components: {},
        variations: [],
      };
    }

    let normalized = original;
    if (this.options.removeAccents) {
      normalized = this.removeAccents(normalized);
    }
    if (this.options.normalizeCase) {
      normalized = this.normalizeCase(normalized);
    }

    const components = this.parseNameComponents(normalized);
    const assembledName = this.assembleName(components);

    const variations = [];
    // The assembled form is only listed as a variation when assembling
    // actually changed the cleaned-up text.
    if (assembledName !== normalized) {
      variations.push(assembledName);
    }
    this.generateVariations(components, variations);

    return {
      normalized: assembledName,
      original,
      confidence: this.calculateConfidence(original, assembledName, components),
      components,
      variations: [...new Set(variations)],
    };
  }

  /**
   * Decomposes the text (NFD) and removes combining marks U+0300–U+036F.
   *
   * @param {string} text
   * @returns {string}
   */
  removeAccents(text) {
    return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  }

  /**
   * Lower-cases the name and upper-cases the first letter of every word.
   *
   * A "word start" is a letter that does not follow a letter, combining mark,
   * digit or underscore. This matches the ASCII behaviour of `\b\w` while
   * staying correct for accented letters: "renée" becomes "Renée", whereas an
   * ASCII-only word boundary would produce "RenéE".
   *
   * @param {string} name
   * @returns {string}
   */
  normalizeCase(name) {
    return name
      .toLowerCase()
      .replace(/(?<![\p{L}\p{M}\p{N}_])\p{L}/gu, (char) => char.toUpperCase());
  }

  /**
   * Splits a whitespace-separated name into components.
   *
   * A prefix or suffix is only recognised when at least one other token
   * remains to serve as the name itself, so a non-empty input always yields a
   * `firstName` (for example "Vi" stays a first name rather than becoming the
   * suffix "VI", and "Mr X" parses as prefix + first name).
   *
   * @param {string} name
   * @returns {{prefix?: string, firstName?: string, middleName?: string,
   *   lastName?: string, suffix?: string}}
   */
  parseNameComponents(name) {
    const parts = name.split(/\s+/).filter((part) => part.length > 0);
    const components = {};
    if (parts.length === 0) {
      return components;
    }

    let startIndex = 0;
    let endIndex = parts.length - 1;

    if (this.options.standardizePrefixes && parts.length > 1 && this.isPrefix(parts[0])) {
      components.prefix = PREFIXES.get(parts[0].toLowerCase().replace(/\./g, '')) || parts[0];
      startIndex = 1;
    }
    // Strictly greater: the last remaining token must stay available as a name.
    if (this.options.standardizeSuffixes && endIndex > startIndex && this.isSuffix(parts[endIndex])) {
      components.suffix = SUFFIXES.get(parts[endIndex].toLowerCase().replace(/\./g, '')) || parts[endIndex];
      endIndex--;
    }

    const nameParts = parts.slice(startIndex, endIndex + 1);
    if (nameParts.length === 1) {
      components.firstName = nameParts[0];
    } else if (nameParts.length === 2) {
      components.firstName = nameParts[0];
      components.lastName = nameParts[1];
    } else if (nameParts.length >= 3) {
      components.firstName = nameParts[0];
      // An initial is a single character, or a single letter followed by a
      // period ("Q."), which is how initials are usually written.
      const second = nameParts[1];
      const secondIsInitial = second.length === 1 || /^\p{L}\.$/u.test(second);
      if (this.options.removeMiddleInitials && secondIsInitial) {
        components.lastName = nameParts.slice(2).join(' ');
      } else {
        components.middleName = nameParts.slice(1, -1).join(' ');
        components.lastName = nameParts[nameParts.length - 1];
      }
    }
    return components;
  }

  /**
   * @param {string} word
   * @returns {boolean} Whether the word, lower-cased and without periods, is a PREFIXES key.
   */
  isPrefix(word) {
    const clean = word.toLowerCase().replace(/\./g, '');
    return PREFIXES.has(clean);
  }

  /**
   * @param {string} word
   * @returns {boolean} Whether the word, lower-cased and without periods, is a SUFFIXES key.
   */
  isSuffix(word) {
    const clean = word.toLowerCase().replace(/\./g, '');
    return SUFFIXES.has(clean);
  }

  /**
   * Joins the present components with single spaces in the order
   * prefix, first, middle, last, suffix. The middle name is left out when
   * `removeMiddleInitials` is enabled.
   *
   * @param {object} components
   * @returns {string}
   */
  assembleName(components) {
    const parts = [];
    if (components.prefix) {
      parts.push(components.prefix);
    }
    if (components.firstName) {
      parts.push(components.firstName);
    }
    if (components.middleName && !this.options.removeMiddleInitials) {
      parts.push(components.middleName);
    }
    if (components.lastName) {
      parts.push(components.lastName);
    }
    if (components.suffix) {
      parts.push(components.suffix);
    }
    return parts.join(' ');
  }

  /**
   * Appends alternative renderings of the name to `variations` (mutated in
   * place; duplicates are allowed here and removed by `normalize`).
   *
   * @param {object} components
   * @param {string[]} variations - Output list that receives the renderings.
   * @returns {void}
   */
  generateVariations(components, variations) {
    const { prefix, firstName, middleName, lastName, suffix } = components;

    if (firstName && lastName) {
      variations.push(`${lastName}, ${firstName}`);
      if (middleName) {
        variations.push(`${lastName}, ${firstName} ${middleName}`);
        // One initial per middle token, e.g. "Mary Ann" -> "M. A."
        const middleInitial = middleName.split(' ').map((part) => `${part[0]}.`).join(' ');
        variations.push(`${firstName} ${middleInitial} ${lastName}`);
        variations.push(`${lastName}, ${firstName} ${middleInitial}`);
      }
      if (!prefix) {
        variations.push(`${firstName} ${lastName}`);
      }
      if (!suffix) {
        const parts = [prefix, firstName, middleName, lastName].filter(Boolean);
        variations.push(parts.join(' '));
      }
    }

    if (firstName && middleName && lastName) {
      const firstInitial = `${firstName[0]}.`;
      variations.push(`${firstInitial} ${middleName} ${lastName}`);
      variations.push(`${firstInitial} ${lastName}`);
    }

    if (this.options.handleHyphenated && lastName && lastName.includes('-')) {
      for (const part of lastName.split('-')) {
        if (firstName) {
          variations.push(`${firstName} ${part}`);
        }
      }
    }
  }

  /**
   * Heuristic score for how trustworthy the normalization is.
   *
   * Returns 1.0 when nothing changed. Otherwise starts from 0.9, multiplies
   * by 0.7 when fewer than two components were found, multiplies by 1.1 when
   * both first and last name are present, subtracts a penalty proportional to
   * the relative length change, and clamps the result to [0.1, 1.0].
   *
   * @param {string} original - Trimmed input.
   * @param {string} normalized - Assembled output.
   * @param {object} components - Parsed components.
   * @returns {number}
   */
  calculateConfidence(original, normalized, components) {
    if (original === normalized) {
      return 1.0;
    }
    let confidence = 0.9;
    const componentCount = Object.keys(components).filter((key) => components[key]).length;
    if (componentCount < 2) {
      confidence *= 0.7;
    }
    if (components.firstName && components.lastName) {
      confidence *= 1.1;
    }
    const lengthDifference = Math.abs(original.length - normalized.length);
    confidence -= (lengthDifference / original.length) * 0.1;
    return Math.max(0.1, Math.min(1.0, confidence));
  }

  /**
   * @param {object} components
   * @returns {string} "First Last" when both are present, otherwise the assembled name.
   */
  getDisplayName(components) {
    if (components.firstName && components.lastName) {
      return `${components.firstName} ${components.lastName}`;
    }
    return this.assembleName(components);
  }

  /**
   * @param {object} components
   * @returns {string} "Last, First" plus the first character of the middle
   *   name as an initial when one exists; otherwise the assembled name.
   */
  getLastFirst(components) {
    if (components.firstName && components.lastName) {
      const middleInitial = components.middleName ? ` ${components.middleName[0]}.` : '';
      return `${components.lastName}, ${components.firstName}${middleInitial}`;
    }
    return this.assembleName(components);
  }
}
// Publish the class on the CommonJS exports object now that the class
// declaration above has been evaluated.
exports.NameNormalizer = NameNormalizer;
/**
 * One-shot convenience wrapper: builds a throwaway NameNormalizer with the
 * given options and normalizes a single name with it.
 *
 * @param {string} name - Raw name to normalize.
 * @param {object} [options] - Passed unchanged to the NameNormalizer constructor.
 * @returns {object} Whatever `NameNormalizer#normalize` returns for `name`.
 */
function normalizeName(name, options) {
  return new NameNormalizer(options).normalize(name);
}
//# sourceMappingURL=name.js.map