/*
 * semantic-ds-toolkit
 * Version: (unspecified)
 * Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
 * 344 lines • 14.2 kB
 * JavaScript
 */
;
// CommonJS export preamble.
// Sets the `__esModule` flag on the exports object (the marker commonly used
// for ES-module interop by transpilers and bundlers).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the class export; the real value is assigned after the
// class definition further down in this file.
exports.AddressNormalizer = void 0;
// `normalizeAddress` is a function declaration and therefore hoisted, so it
// can be exported here before its definition appears.
exports.normalizeAddress = normalizeAddress;
// Maps lower-case street-suffix spellings (abbreviations and full words) to
// the canonical capitalized suffix name. Keys are looked up after the caller
// lower-cases the token and strips periods.
// The table is also scanned in insertion order by
// AddressNormalizer.getShortStreetType, so the order of the aliases listed
// for a given suffix is significant.
const STREET_TYPES = new Map([
['st', 'Street'], ['str', 'Street'], ['street', 'Street'],
['ave', 'Avenue'], ['av', 'Avenue'], ['avenue', 'Avenue'],
['blvd', 'Boulevard'], ['boul', 'Boulevard'], ['boulevard', 'Boulevard'],
['rd', 'Road'], ['road', 'Road'],
['dr', 'Drive'], ['drv', 'Drive'], ['drive', 'Drive'],
['ln', 'Lane'], ['lane', 'Lane'],
['ct', 'Court'], ['crt', 'Court'], ['court', 'Court'],
['cir', 'Circle'], ['circ', 'Circle'], ['circle', 'Circle'],
['pl', 'Place'], ['plc', 'Place'], ['place', 'Place'],
['way', 'Way'], ['wy', 'Way'],
['ter', 'Terrace'], ['terr', 'Terrace'], ['terrace', 'Terrace'],
['pkwy', 'Parkway'], ['pky', 'Parkway'], ['parkway', 'Parkway'],
['hwy', 'Highway'], ['highway', 'Highway'],
['fwy', 'Freeway'], ['freeway', 'Freeway'],
['expy', 'Expressway'], ['expressway', 'Expressway'],
['trl', 'Trail'], ['trail', 'Trail'],
['path', 'Path'], ['pth', 'Path'],
['walk', 'Walk'], ['wlk', 'Walk'],
['sq', 'Square'], ['square', 'Square'],
['loop', 'Loop'], ['lp', 'Loop'],
['bend', 'Bend'], ['bnd', 'Bend'],
['crk', 'Creek'], ['creek', 'Creek'],
['xing', 'Crossing'], ['crossing', 'Crossing'],
['pt', 'Point'], ['point', 'Point'],
['ridge', 'Ridge'], ['rdg', 'Ridge'],
['hill', 'Hill'], ['hl', 'Hill'],
['valley', 'Valley'], ['vly', 'Valley'],
['grove', 'Grove'], ['grv', 'Grove'],
['park', 'Park'], ['pk', 'Park'],
['gardens', 'Gardens'], ['gdns', 'Gardens'],
['heights', 'Heights'], ['hts', 'Heights'],
['meadows', 'Meadows'], ['mdws', 'Meadows'],
['woods', 'Woods'], ['wds', 'Woods']
]);
// Maps lower-case directional spellings (one- and two-letter abbreviations
// and full words) to the full capitalized direction name. Consulted by
// AddressNormalizer.normalizeDirections with the token lower-cased and its
// periods removed.
const DIRECTIONS = new Map([
['n', 'North'], ['no', 'North'], ['north', 'North'],
['s', 'South'], ['so', 'South'], ['south', 'South'],
['e', 'East'], ['ea', 'East'], ['east', 'East'],
['w', 'West'], ['we', 'West'], ['west', 'West'],
['ne', 'Northeast'], ['northeast', 'Northeast'],
['nw', 'Northwest'], ['northwest', 'Northwest'],
['se', 'Southeast'], ['southeast', 'Southeast'],
['sw', 'Southwest'], ['southwest', 'Southwest']
]);
// Maps lower-case US state identifiers (two-letter code and full name, plus
// the District of Columbia) to the capitalized full state name.
// Each state's two-letter code is listed BEFORE its full name:
// AddressNormalizer.getStateAbbreviation returns the first key whose value
// matches, so this ordering is what makes it return the code.
const US_STATES = new Map([
['al', 'Alabama'], ['alabama', 'Alabama'],
['ak', 'Alaska'], ['alaska', 'Alaska'],
['az', 'Arizona'], ['arizona', 'Arizona'],
['ar', 'Arkansas'], ['arkansas', 'Arkansas'],
['ca', 'California'], ['california', 'California'],
['co', 'Colorado'], ['colorado', 'Colorado'],
['ct', 'Connecticut'], ['connecticut', 'Connecticut'],
['de', 'Delaware'], ['delaware', 'Delaware'],
['fl', 'Florida'], ['florida', 'Florida'],
['ga', 'Georgia'], ['georgia', 'Georgia'],
['hi', 'Hawaii'], ['hawaii', 'Hawaii'],
['id', 'Idaho'], ['idaho', 'Idaho'],
['il', 'Illinois'], ['illinois', 'Illinois'],
['in', 'Indiana'], ['indiana', 'Indiana'],
['ia', 'Iowa'], ['iowa', 'Iowa'],
['ks', 'Kansas'], ['kansas', 'Kansas'],
['ky', 'Kentucky'], ['kentucky', 'Kentucky'],
['la', 'Louisiana'], ['louisiana', 'Louisiana'],
['me', 'Maine'], ['maine', 'Maine'],
['md', 'Maryland'], ['maryland', 'Maryland'],
['ma', 'Massachusetts'], ['massachusetts', 'Massachusetts'],
['mi', 'Michigan'], ['michigan', 'Michigan'],
['mn', 'Minnesota'], ['minnesota', 'Minnesota'],
['ms', 'Mississippi'], ['mississippi', 'Mississippi'],
['mo', 'Missouri'], ['missouri', 'Missouri'],
['mt', 'Montana'], ['montana', 'Montana'],
['ne', 'Nebraska'], ['nebraska', 'Nebraska'],
['nv', 'Nevada'], ['nevada', 'Nevada'],
['nh', 'New Hampshire'], ['new hampshire', 'New Hampshire'],
['nj', 'New Jersey'], ['new jersey', 'New Jersey'],
['nm', 'New Mexico'], ['new mexico', 'New Mexico'],
['ny', 'New York'], ['new york', 'New York'],
['nc', 'North Carolina'], ['north carolina', 'North Carolina'],
['nd', 'North Dakota'], ['north dakota', 'North Dakota'],
['oh', 'Ohio'], ['ohio', 'Ohio'],
['ok', 'Oklahoma'], ['oklahoma', 'Oklahoma'],
['or', 'Oregon'], ['oregon', 'Oregon'],
['pa', 'Pennsylvania'], ['pennsylvania', 'Pennsylvania'],
['ri', 'Rhode Island'], ['rhode island', 'Rhode Island'],
['sc', 'South Carolina'], ['south carolina', 'South Carolina'],
['sd', 'South Dakota'], ['south dakota', 'South Dakota'],
['tn', 'Tennessee'], ['tennessee', 'Tennessee'],
['tx', 'Texas'], ['texas', 'Texas'],
['ut', 'Utah'], ['utah', 'Utah'],
['vt', 'Vermont'], ['vermont', 'Vermont'],
['va', 'Virginia'], ['virginia', 'Virginia'],
['wa', 'Washington'], ['washington', 'Washington'],
['wv', 'West Virginia'], ['west virginia', 'West Virginia'],
['wi', 'Wisconsin'], ['wisconsin', 'Wisconsin'],
['wy', 'Wyoming'], ['wyoming', 'Wyoming'],
['dc', 'District of Columbia'], ['district of columbia', 'District of Columbia']
]);
// Lower-case secondary-unit designators (apartment, suite, floor, room,
// building, lot, space, trailer and their abbreviations).
// The same designators appear in the unit-matching pattern used by
// AddressNormalizer.parseStreetAddress; keep the two in step.
const UNIT_TYPES = new Set([
'apt', 'apartment', 'unit', 'ste', 'suite', 'floor', 'fl', 'room', 'rm',
'bldg', 'building', 'lot', 'space', 'spc', 'trailer', 'trlr'
]);
/**
 * Parses free-form US postal addresses into components (street number, street
 * name, street type, unit, city, state, postal code) and re-assembles them in
 * a canonical "street, city, state, postal code" form.
 *
 * Relies on the module-level lookup tables STREET_TYPES, DIRECTIONS, US_STATES
 * and UNIT_TYPES.
 */
class AddressNormalizer {
    /** Effective options: the defaults set in the constructor, overridden by the caller's values. */
    options;
    /** Case-insensitive pattern matching "<unit designator> <identifier>", built from UNIT_TYPES. */
    #unitPattern;

    /**
     * @param {object} [options] - Overrides for the default flags
     *   (standardizeStreetTypes, standardizeDirections, standardizeStates,
     *   removeExtraSpaces, normalizeCase, expandAbbreviations; all default to
     *   true). `expandAbbreviations` is stored but not consulted by this class.
     */
    constructor(options = {}) {
        this.options = {
            standardizeStreetTypes: true,
            standardizeDirections: true,
            standardizeStates: true,
            removeExtraSpaces: true,
            normalizeCase: true,
            expandAbbreviations: true,
            ...options
        };
        const escapeForRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const designators = [...UNIT_TYPES]
            .sort((a, b) => b.length - a.length)
            .map(escapeForRegExp)
            .join('|');
        // The designator must be a whole word ("Fl" must not match inside
        // "Flower") unless a digit follows directly ("Apt4B"). The identifier
        // must start with a digit or be a single letter optionally followed by
        // digits ("4B", "200", "A", "B2", "A-12"), so that ordinary words after
        // a designator-like word ("Space Center Blvd") are not taken as units.
        this.#unitPattern = new RegExp(
            `\\b(${designators})(?![a-z])\\.?\\s*#?\\s*(\\d[a-z0-9-]*|[a-z](?:-?\\d[a-z0-9-]*)?)\\b`,
            'i'
        );
    }

    /**
     * Normalizes one address.
     *
     * @param {string} address - Raw address text. `null`/`undefined` are treated
     *   as an empty address; other non-strings are converted with `String()`.
     * @returns {{normalized: string, original: string, confidence: number,
     *   components: object, variations: string[]}} The canonical address, the
     *   trimmed input, a heuristic confidence in [0, 1], the parsed components
     *   and a de-duplicated list of alternative spellings.
     */
    normalize(address) {
        const original = String(address ?? '').trim();
        if (!original) {
            return {
                normalized: '',
                original,
                confidence: 0,
                components: {},
                variations: []
            };
        }
        let working = original;
        if (this.options.removeExtraSpaces) {
            working = working.replace(/\s+/g, ' ');
        }
        if (this.options.normalizeCase) {
            working = this.normalizeCase(working);
        }
        const components = this.parseAddressComponents(working);
        const assembledAddress = this.assembleAddress(components);
        const variations = [];
        if (assembledAddress !== working) {
            variations.push(assembledAddress);
        }
        this.generateVariations(components, variations);
        return {
            normalized: assembledAddress,
            original,
            confidence: this.calculateConfidence(original, assembledAddress, components),
            components,
            variations: [...new Set(variations)]
        };
    }

    /**
     * Title-cases the text: lower-cases everything, then capitalizes the first
     * letter of each word.
     *
     * A letter counts as a word start only when it is not preceded by a letter,
     * digit, combining mark or apostrophe, so "john's" becomes "John's" (not
     * "John'S"), "5th" stays "5th" and accented words such as "élan" become
     * "Élan". Single-letter elided prefixes keep their inner capital
     * ("o'brien" becomes "O'Brien").
     *
     * @param {string} address
     * @returns {string}
     */
    normalizeCase(address) {
        return address
            .toLowerCase()
            .replace(/(?<![\p{L}\p{N}\p{M}'\u2019])\p{L}/gu, (letter) => letter.toUpperCase())
            .replace(
                /(?<![\p{L}\p{N}\p{M}'\u2019])(\p{L}['\u2019])(\p{L})/gu,
                (_match, prefix, letter) => prefix + letter.toUpperCase()
            );
    }

    /**
     * Splits an address into components.
     *
     * Order of extraction: postal code, then state (when `standardizeStates` is
     * on), then the remaining comma-separated segments. With two or more
     * segments the last one is the city and the rest form the street line.
     *
     * @param {string} address - Address text (already case/space normalized by `normalize`).
     * @returns {object} Components object; only the keys that were found are set.
     */
    parseAddressComponents(address) {
        const components = {};
        let remaining = address;

        // Use the LAST ZIP-shaped token: the first one may be a five-digit
        // house number ("12345 Oak Ave, Dallas, TX 75201").
        const zipMatches = [...remaining.matchAll(/\b\d{5}(?:-\d{4})?\b/g)];
        const zipMatch = zipMatches[zipMatches.length - 1];
        if (zipMatch) {
            // A lone ZIP-shaped token at the very start, followed by more text,
            // is a house number rather than a postal code.
            const isLeadingHouseNumber = zipMatch.index === 0 && zipMatch[0].length < remaining.trim().length;
            if (!isLeadingHouseNumber) {
                components.postalCode = zipMatch[0];
                remaining = remaining.slice(0, zipMatch.index) + remaining.slice(zipMatch.index + zipMatch[0].length);
            }
        }
        // Drop separators left dangling by the removal ("..., IL, " -> "..., IL").
        remaining = remaining.replace(/[\s,]+$/, '').trim();

        if (this.options.standardizeStates) {
            const found = this.findTrailingState(remaining);
            if (found) {
                components.state = found.state;
                remaining = found.rest.replace(/[\s,]+$/, '').trim();
            }
        }

        // Empty segments (from doubled or trailing commas) must not become the city.
        const segments = remaining.split(',').map((segment) => segment.trim()).filter(Boolean);
        if (segments.length >= 2) {
            components.city = segments[segments.length - 1];
            this.parseStreetAddress(segments.slice(0, -1).join(', '), components);
        }
        else if (segments.length === 1) {
            this.parseStreetAddress(segments[0], components);
        }
        return components;
    }

    /**
     * Looks for a US state at the end of `text`.
     *
     * Only the text after the last comma is examined. The whole of that tail is
     * tried first; when the address contains a comma, progressively shorter
     * word suffixes are tried too ("Portland OR" gives Oregon, leaving
     * "Portland"), provided the words in front of the state contain no digits.
     * Without a comma only the whole text may be a state, so that "5 Oak Ct"
     * is not read as Connecticut.
     *
     * @param {string} text
     * @returns {{state: string, rest: string} | null} The full state name and
     *   the text with the state removed, or null when no state was found.
     */
    findTrailingState(text) {
        const lastComma = text.lastIndexOf(',');
        const head = lastComma === -1 ? '' : text.slice(0, lastComma);
        const tailWords = text.slice(lastComma + 1).split(/\s+/).filter(Boolean);
        const lastStart = lastComma === -1 ? 0 : tailWords.length - 1;
        for (let start = 0; start <= lastStart; start += 1) {
            const cityWords = tailWords.slice(0, start);
            if (cityWords.some((word) => /\d/.test(word))) {
                break;
            }
            const candidate = tailWords.slice(start).join(' ').toLowerCase().replace(/\./g, '');
            const state = US_STATES.get(candidate);
            if (state) {
                const rest = [head, cityWords.join(' ')].filter(Boolean).join(', ');
                return { state, rest };
            }
        }
        return null;
    }

    /**
     * Parses a street line into unit, street number, street name and street
     * type, writing the results into `components`.
     *
     * @param {string} street - Street line (may contain commas, e.g. "123 Main St, Apt 4").
     * @param {object} components - Components object to populate (mutated).
     */
    parseStreetAddress(street, components) {
        let remaining = street;
        const unitMatch = remaining.match(this.#unitPattern);
        if (unitMatch) {
            const [matched, designator, identifier] = unitMatch;
            components.unit = `${designator.charAt(0).toUpperCase()}${designator.slice(1)} ${identifier}`;
            // Cut out exactly the matched span (by index, not by text search).
            remaining = `${remaining.slice(0, unitMatch.index)} ${remaining.slice(unitMatch.index + matched.length)}`;
        }
        // Tidy commas: collapse runs and strip leading/trailing ones so that
        // "123 Main St, " ends in the street type again.
        remaining = remaining
            .replace(/(?:\s*,)+\s*/g, ', ')
            .replace(/^[,\s]+|[,\s]+$/g, '');

        const numberMatch = remaining.match(/^(\d+[a-z]?)\s+/i);
        if (numberMatch) {
            components.streetNumber = numberMatch[1];
            remaining = remaining.slice(numberMatch[0].length).trim();
        }

        const words = remaining.split(/\s+/).filter(Boolean);
        if (words.length === 0) {
            return;
        }
        const lastKey = words[words.length - 1].toLowerCase().replace(/[.,]/g, '');
        // A single remaining word is the street's name even when it looks like
        // a suffix ("123 Park").
        if (this.options.standardizeStreetTypes && words.length > 1 && STREET_TYPES.has(lastKey)) {
            components.streetType = STREET_TYPES.get(lastKey);
            components.streetName = words.slice(0, -1).join(' ').replace(/,$/, '');
        }
        else {
            components.streetName = words.join(' ');
        }
        if (this.options.standardizeDirections) {
            this.normalizeDirections(components);
        }
    }

    /**
     * Expands a leading and/or trailing directional in `components.streetName`
     * ("N Main" becomes "North Main"). Words in the middle of the name and
     * single-word names are left untouched, so names such as "E" are not
     * rewritten.
     *
     * @param {object} components - Components object (mutated).
     */
    normalizeDirections(components) {
        if (!components.streetName) {
            return;
        }
        const words = components.streetName.split(/\s+/);
        if (words.length < 2) {
            return;
        }
        const expand = (word) => DIRECTIONS.get(word.toLowerCase().replace(/\./g, '')) ?? word;
        const lastIndex = words.length - 1;
        words[0] = expand(words[0]);
        words[lastIndex] = expand(words[lastIndex]);
        components.streetName = words.join(' ');
    }

    /**
     * Builds the canonical address string: the street line (number, name, type,
     * unit joined by spaces) followed by city, state and postal code, all
     * joined by ", ". Missing components are skipped.
     *
     * @param {object} components
     * @returns {string}
     */
    assembleAddress(components) {
        const streetLine = [
            components.streetNumber,
            components.streetName,
            components.streetType,
            components.unit
        ].filter(Boolean).join(' ');
        return [
            streetLine,
            components.city,
            components.state,
            components.postalCode
        ].filter(Boolean).join(', ');
    }

    /**
     * Appends alternative spellings of the address to `variations`:
     * abbreviated street type (with city / city and state), the bare street
     * line when there is no unit, and a "city, ST, postal code" form.
     *
     * @param {object} components
     * @param {string[]} variations - Output array (mutated).
     */
    generateVariations(components, variations) {
        const { streetNumber, streetName, streetType, unit, city, state, postalCode } = components;
        if (streetNumber && streetName) {
            if (streetType) {
                const shortType = this.getShortStreetType(streetType);
                if (shortType !== streetType) {
                    const shortAddress = `${streetNumber} ${streetName} ${shortType}`;
                    if (city) {
                        variations.push(`${shortAddress}, ${city}`);
                    }
                    if (city && state) {
                        variations.push(`${shortAddress}, ${city}, ${state}`);
                    }
                }
            }
            if (!unit) {
                const baseAddress = [streetNumber, streetName, streetType].filter(Boolean).join(' ');
                variations.push(baseAddress);
                if (city) {
                    variations.push(`${baseAddress}, ${city}`);
                }
            }
        }
        if (state && this.options.standardizeStates) {
            const stateAbbrev = this.getStateAbbreviation(state);
            if (stateAbbrev !== state) {
                const parts = [city, stateAbbrev, postalCode].filter(Boolean);
                if (parts.length > 0) {
                    variations.push(parts.join(', '));
                }
            }
        }
    }

    /**
     * Returns a capitalized abbreviation for a canonical street type
     * ("Street" gives "St"), or the input unchanged when the table has none.
     *
     * The first table key that maps to the type AND differs from the type's own
     * lower-cased name is used; this matters for types whose full word is
     * listed before the abbreviation ("Way" gives "Wy", "Ridge" gives "Rdg").
     *
     * @param {string} streetType - Canonical street type as stored in STREET_TYPES values.
     * @returns {string}
     */
    getShortStreetType(streetType) {
        const fullKey = streetType.toLowerCase();
        for (const [abbrev, full] of STREET_TYPES) {
            if (full === streetType && abbrev !== fullKey) {
                return abbrev.charAt(0).toUpperCase() + abbrev.slice(1);
            }
        }
        return streetType;
    }

    /**
     * Returns the upper-cased first US_STATES key that maps to `state` (the
     * two-letter code, given the table's ordering), or `state` unchanged when
     * it is not in the table.
     *
     * @param {string} state - Full state name.
     * @returns {string}
     */
    getStateAbbreviation(state) {
        for (const [abbrev, full] of US_STATES) {
            if (full === state) {
                return abbrev.toUpperCase();
            }
        }
        return state;
    }

    /**
     * Heuristic confidence score for a normalization result.
     *
     * Returns 1.0 when nothing changed. Otherwise starts at 0.8, adds bonuses
     * for the number of non-empty components (0.15 for four or more, 0.1 for
     * two or more), for having street number and name (0.1), city and state
     * (0.1) and a postal code (0.05), subtracts a small penalty proportional to
     * the relative length change, and clamps the result to [0.1, 1.0].
     *
     * @param {string} original - Trimmed input (must be non-empty).
     * @param {string} normalized - Assembled canonical address.
     * @param {object} components
     * @returns {number}
     */
    calculateConfidence(original, normalized, components) {
        if (original === normalized) {
            return 1.0;
        }
        let confidence = 0.8;
        const componentCount = Object.keys(components).filter((key) => components[key]).length;
        if (componentCount >= 4) {
            confidence += 0.15;
        }
        else if (componentCount >= 2) {
            confidence += 0.1;
        }
        if (components.streetNumber && components.streetName) {
            confidence += 0.1;
        }
        if (components.city && components.state) {
            confidence += 0.1;
        }
        if (components.postalCode) {
            confidence += 0.05;
        }
        const lengthDifference = Math.abs(original.length - normalized.length);
        confidence -= (lengthDifference / original.length) * 0.05;
        return Math.max(0.1, Math.min(1.0, confidence));
    }

    /**
     * Builds a compact form of the address: street line with abbreviated street
     * type (only when both number and name are present), city, state code and
     * postal code, joined by ", ". The unit is not included.
     *
     * @param {object} components
     * @returns {string}
     */
    getShortForm(components) {
        const parts = [];
        if (components.streetNumber && components.streetName) {
            const streetType = components.streetType ? this.getShortStreetType(components.streetType) : '';
            parts.push([components.streetNumber, components.streetName, streetType].filter(Boolean).join(' '));
        }
        if (components.city) {
            parts.push(components.city);
        }
        if (components.state) {
            parts.push(this.getStateAbbreviation(components.state));
        }
        if (components.postalCode) {
            parts.push(components.postalCode);
        }
        return parts.join(', ');
    }
}
// Publish the class on the CommonJS exports object now that its definition
// has been evaluated.
exports.AddressNormalizer = AddressNormalizer;
/**
 * One-shot convenience wrapper: constructs an AddressNormalizer with the given
 * options and normalizes a single address with it.
 *
 * @param {string} address - Address text to normalize.
 * @param {object} [options] - Passed unchanged to the AddressNormalizer constructor.
 * @returns {object} Whatever `AddressNormalizer#normalize` returns for `address`.
 */
function normalizeAddress(address, options) {
    return new AddressNormalizer(options).normalize(address);
}
//# sourceMappingURL=address.js.map