pelias-openaddresses
Version:
Pelias import pipeline for OpenAddresses.
73 lines (58 loc) • 2.33 kB
JavaScript
const _ = require('lodash');
const fs = require('fs');
const path = require('path');
/**
load a libpostal dictionary from disk
eg: https://raw.githubusercontent.com/openvenues/libpostal/master/resources/dictionaries/en/street_types.txt
libpostal format:
The leftmost string is treated as the canonical/normalized version.
Synonyms if any, are appended to the right, delimited by the pipe character.
see: https://github.com/openvenues/libpostal/tree/master/resources/dictionaries
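output example (with includeSelfReferences enabled; by default the
canonical 'bruecke' => 'bruecke' entry is omitted):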
{
'bruecke': 'bruecke',
'brücke': 'bruecke',
'brucke': 'bruecke',
'br.': 'bruecke'
}
*/
// Regular expression matching common punctuation characters. The global flag
// means every occurrence in a string is matched, not just the first one.
// Applied to each synonym when its length is checked against the `minLength` option.
const PUNCTUATION_REGEX = /[.,\/#!$%\^&\*;:{}=\-_`~()]/g;
module.exports = (opts) => {
/**
* options
*
* countryCode (string) -- country-code corresponding to a subdirectory in the the ./directories folder
* filename (string) -- the name of the file to load inside the directory mentioed above
* includeSelfReferences (bool) -- whether to also include the canonical synonym in the map
* minLength (int) -- minimum valid length for a synonym in the dictionary
*/
const options = _.defaults({}, opts, {
includeSelfReferences: false,
minLength: 0
});
try {
const filepath = path.resolve(__dirname, 'dictionaries', options.countryCode, options.filename);
const file = fs.readFileSync(filepath).toString();
const lines = file.trim().split('\n');
const map = lines.reduce((obj, line) => {
var cols = line.trim().split('|');
// remove multi-word synonyms from all but the first position
cols = cols.filter((col, pos) => (pos === 0) || !/[\s]/.test(col));
cols.forEach((col, pos) => {
if (!options.includeSelfReferences && 0 === pos) { return; } // skip first column ( the expansion )
if (col.replace(PUNCTUATION_REGEX).length < (options.minLength || 0)) { return; } // skip very short synonyms
// warn user when duplicate terms are added to the map
if (obj.hasOwnProperty(col)){
console.warn(`[${options.filename}] trying to replace ${col}=>${obj[col]} with ${col}=>${cols[0]}`);
}
obj[col] = cols[0];
});
return obj;
}, {});
return map;
}
catch (e) {
return {};
}
};