UNPKG

pelias-csv-importer

Version:

Pelias import pipeline for data from CSV files

236 lines (192 loc) 6.45 kB
'use strict'; const _ = require("lodash"); const through = require( 'through2' ); const logger = require( 'pelias-logger' ).get( 'csv-importer' ); const peliasModel = require( 'pelias-model' ); const NAME_REGEX = /^name(_json)?_([a-z]{2})$/i; function getPopularity(record) { const popularityString = getCaseInsensitive('popularity', record); const popularity = parseInt(popularityString); if (popularityString && (isNaN(popularity) || !/^\d+$/.test(popularityString))) { throw new Error("Popularity must be an int, got " + popularityString) } else { return popularity; } } function getPostalCode(record) { return getCaseInsensitive('zipcode', record) || getCaseInsensitive('postcode', record) || getCaseInsensitive('postalcode', record); } function _getByStringOrJson(prefix) { return (record, suffix = '') => { let result = [] const value = getCaseInsensitive(`${prefix}${suffix}`, record); if (value && typeof value === 'string') { result.push(value.trim()); } const value_json = getCaseInsensitiveAsJSON(`${prefix}_json${suffix}`, record); if (value_json && value_json instanceof Array) { result = result.concat(value_json); } return result; } } const getNames = _getByStringOrJson('name'); const getCategories = _getByStringOrJson('category'); function getLayer(record) { return getCaseInsensitive('layer', record) || getCaseInsensitive('layer_id', record) || 'venue'; } function getSource(record) { return getCaseInsensitive('source', record) || 'csv'; } function getHousenumber(record) { return getCaseInsensitive('housenumber', record) || getCaseInsensitive('number', record); } function getStreet(record) { return getCaseInsensitive('street', record); } function getCrossStreet(record) { return getCaseInsensitive('cross_street', record); } function getCaseInsensitive(field, record) { if (typeof field !== 'string') { return; } const lower = field.toLowerCase(); const upper = field.toUpperCase(); return record[lower] || record[upper]; } function getCaseInsensitiveAsJSON(field, record) { const value = getCaseInsensitive(field, record); if (value && typeof value === 'string') { return JSON.parse(value); } return value; } function getPrefixedFields(prefix, record) { let result = {}; Object.keys(record).forEach(function(field) { if (field.startsWith(prefix)) { const newKey = field.replace(prefix, ''); result[newKey] = record[field]; } }); return result; } function getMultiLangNames(record) { const result = { default: record }; Object.keys(record) .filter(field => /^name_/i.test(field)) .forEach(field => { const value = NAME_REGEX.exec(field) const isJson = value && value[1]; let lang = value && value[2]; if (lang && typeof lang === 'string') { lang = lang.toLowerCase(); result[lang] = result[lang] || {}; if (isJson && typeof isJson === 'string') { result[lang][`name_json_${lang}`] = record[field]; } else { result[lang][`name_${lang}`] = record[field]; } } }) return Object.entries(result); } function getCentroid(record) { const lat = getCaseInsensitive('lat', record); const lon = getCaseInsensitive('lon', record); if (lat && lon) { return { lat: lat, lon: lon }; } } function processRecord(record, next_uid, stats) { const id_number = getCaseInsensitive('id', record) || getCaseInsensitive('hash', record) || next_uid; const model_id = `${id_number}`; try { const layer = getLayer(record); const source = getSource(record); const pelias_document = new peliasModel.Document( source, layer, model_id ); const parent = getCaseInsensitiveAsJSON( `parent_json`, record ); if (_.isPlainObject(parent)) { _.each(parent, (values, field) => { _.each(values, (value) => { pelias_document.addParent(field, value.name, value.id, value.abbr, value.source) }) }) } getMultiLangNames(record).forEach(([lang, value]) => { const names = getNames(value, lang === 'default' ? '' : `_${lang}`); if (names && names.length > 0) { pelias_document.setName(lang, names[0]); for (let i = 1; i < names.length; i++) { pelias_document.setNameAlias(lang, names[i]); } } }) const centroid = getCentroid(record); if (centroid) { pelias_document.setCentroid( centroid ); } else { throw 'Invalid centroid'; //centroid is required } const street = getStreet(record); if (street) { pelias_document.setAddress('street', street); } const cross_street = getCrossStreet(record) if (cross_street) { pelias_document.setAddress('cross_street', cross_street); } const houseNumber = getHousenumber(record); if (houseNumber) { pelias_document.setAddress('number', houseNumber); } const postcode = getPostalCode(record); if (postcode) { pelias_document.setAddress('zip', postcode); } const popularity = getPopularity(record); if (popularity) { pelias_document.setPopularity(popularity); } const addendumFields = getPrefixedFields('addendum_json_', record); Object.keys(addendumFields).forEach(function(namespace) { if (addendumFields[namespace]) { pelias_document.setAddendum(namespace, JSON.parse(addendumFields[namespace])); } }); getCategories(record) .forEach(category => pelias_document.addCategory(category)); stats.goodRecordCount++; return pelias_document; } catch ( ex ){ logger.warn(`Record import failed on line ${next_uid} due to ${ex.message || ex}.`, record) stats.badRecordCount++; } } /* * Create a stream of Documents from valid, cleaned CSV records */ function createDocumentStream(id_prefix, stats) { /** * Used to track the UID of individual records passing through the stream if * there is no HASH that can be used as a more unique identifier. See * `peliasModel.Document.setId()` for information about UIDs. */ let uid = 0; return through.obj( function write( record, enc, next ){ const pelias_document = processRecord(record, uid, stats); uid++; if (pelias_document) { this.push( pelias_document ); } next(); } ); } module.exports = { create: createDocumentStream };