UNPKG

lrs-xml-parser

Version:

TCO 16 Megahack - XML Parser

431 lines (387 loc) 15.9 kB
/* * Copyright (C) 2016 TopCoder, Inc. All Rights Reserved. */ /** * Contains the core functions for parsing the LRS XML files. * * @author TCSDEVELOPER * @version 1.0 */ 'use strict'; const config = require('../config/config'); const _ = require('underscore'); const async = require('async'); const fs = require('fs'); const parse = require('xml-parser'); const path = require('path'); const logger = require('winston'); const NaicsSchema = require('./models/Naics'); const ProgramSchema = require('./models/Program'); const RuleSchema = require('./models/Rule'); const mongoose = require('mongoose'); const Naics = mongoose.model('Naics', NaicsSchema); const Program = mongoose.model('Program', ProgramSchema); const Rule = mongoose.model('Rule', RuleSchema); mongoose.connect(config.mongo_uri); module.exports = { getNaicsCodes, getPrograms, getCleanAirActRules }; /** * Get the regulation ID from the specified rdfId string. * * @param {String} rdfId the rdfId retrieved from the XML file * * @return {String} the regulation ID */ function getRegulationId(rdfId) { return rdfId.substring(1, rdfId.indexOf('-')); } /** * Helper method to get the value of a parsed XML node. * * @param {String} name the name of the node * @param {Array} children the list of children to retrieve the value from * * @return {String} the node value, or null if the node was not found */ function getXmlNodeValue(name, children) { for (let i = 0; i < children.length; i++) { if (name === children[i].name) { return children[i].content; } } return null; } /** * Check if the specified value falls between 'Part 50' and 'Part 98' inclusive. * * @param {String} value the value to check * * @return {Boolean} true if the value is a valid part, false otherwise */ function isValidPart(value) { if (!value) { return false; } if ('Part' !== value.substring(0, 4)) { return false; } const arr = value.split(' '); if (arr.length > 2) { return false; } const partNumber = parseInt(arr[1], 10); return (partNumber >= 50 && partNumber <= 98); } /** * Get a list of NAICS codes from the specified XML file and persist to MongoDB. * * @param {String} xmlFilename the full path to the XML file containing the codes * @param {Function} callback to be executed upon completion */ function getNaicsCodes(xmlFilename, callback) { async.waterfall([ (cb) => { fs.readFile(xmlFilename, 'utf8', cb); }, (xml, cb) => { const object = parse(xml); const children = object.root.children; const termIds = []; const naicsCodesMap = {}; const naicsTitlesMap = {}; // First pass, get the NAICS codes, names and the linking term IDs for (let i = 0; i < children.length; i++) { const child = children[i]; if ('skos:Concept' === child.name) { const termId = getXmlNodeValue('zthes:termID', child.children); if (termId && !_.contains(termIds, termId)) { termIds.push(termId); } } if ('rdf:Description' === child.name && child.attributes && child.attributes['rdf:about']) { if ('NAICSUSCode' === child.attributes['rdf:about'].substring(0, 11)) { naicsCodesMap[child.attributes['rdf:about']] = getXmlNodeValue('zthes:label', child.children); } if ('NAICSUSTitle' === child.attributes['rdf:about'].substring(0, 12)) { naicsTitlesMap[child.attributes['rdf:about']] = getXmlNodeValue('zthes:label', child.children); } } } // Second pass, build the list of codes and const naicsCodes = []; for (let i = 0; i < termIds.length; i++) { const linkId = termIds[i]; const codeKey = 'NAICSUSCode-' + linkId; const titleKey = 'NAICSUSTitle-' + linkId; if (naicsCodesMap[codeKey]) { naicsCodes.push({ code: naicsCodesMap[codeKey], title: naicsTitlesMap[titleKey], termId: linkId }); } } cb(null, naicsCodes); }, (naicsCodes, cb) => { // Save the naicsCodes to MongoDB Naics.collection.insert(naicsCodes, cb); } ], (err, naicsCodes) => { if (err) { logger.error(err); return callback(err, null); } return callback(null, naicsCodes); }); } /** * Get a list of programs from the specified XML file and the associated rules / regulation IDs * and persist to MongoDB. * * @param {String} xmlFilename the full path to the XML file containing the programs * @param {String} baseFolder the full path to the folder containing the CFR2015Title40XXX files * @param {Function} callback to be executed upon completion */ function getPrograms(xmlFilename, baseFolder, callback) { const termIds = []; const termIdPrograms = {}; async.waterfall([ (cb) => { fs.readFile(xmlFilename, 'utf8', cb); }, (xml, cb) => { const object = parse(xml); const children = object.root.children; // First pass, get the program names and the linking term IDs for (let i = 0; i < children.length; i++) { const child = children[i]; if ('skos:Concept' === child.name) { const label = getXmlNodeValue('skos:prefLabel', child.children); const termId = getXmlNodeValue('zthes:termID', child.children); if (label && termId) { termIdPrograms[termId] = { name: label, rules: [], regulationIds: [] }; if (!_.contains(termIds, termId)) { termIds.push(termId); } } } } cb(); }, (cb) => { fs.readdir(baseFolder, cb); }, (filenames, cb) => { const cfrFilenames = []; for (let i = 0; i < filenames.length; i++) { const filename = filenames[i]; if ('CFR2015Title40' === filename.substring(0, 14)) { cfrFilenames.push(filename); } } cb(null, cfrFilenames); }, (cfrFilenames, cb) => { async.each(cfrFilenames, (filename, ecb) => { const cfrFile = path.join(baseFolder, filename); getProgramRules(cfrFile, termIds, (err, rulesMap) => { for (let termId in rulesMap) { if (rulesMap.hasOwnProperty(termId)) { termIdPrograms[termId].rules = rulesMap[termId]; for (let i = 0; i < rulesMap[termId].length; i++) { termIdPrograms[termId].regulationIds.push(getRegulationId(rulesMap[termId][i])); } } } ecb(); }); }, (err) => { if (err) { return cb(err, null); } const programs = []; for (let termId in termIdPrograms) { if (termIdPrograms.hasOwnProperty(termId)) { programs.push(termIdPrograms[termId]); } } return cb(null, programs); }); }, (programs, cb) => { Program.collection.insert(programs, cb); } ], (err, programs) => { if (err) { logger.error(err); return callback(err, null); } logger.info(JSON.stringify(programs, null, 2)); return callback(null, programs); }); } /** * Get a list of programs rules from the specified file and associate them with their respective termIds. * * @param {String} xmlFilename the full path to the XML file containing the regulations * @param {String} termIds the list of term IDs to check for association * @param {Function} callback to be executed upon completion */ function getProgramRules(xmlFilename, termIds, callback) { async.waterfall([ (cb) => { fs.readFile(xmlFilename, 'utf8', cb); }, (xml, cb) => { const object = parse(xml); const children = object.root.children; const termIdRules = {}; for (let i = 0; i < termIds.length; i++) { const termId = termIds[i]; if (!termIdRules[termId]) { termIdRules[termId] = []; } for (let j = 0; j < children.length; j++) { const child = children[j]; if ('skos:Concept' === child.name) { for (let k = 0; k < child.children.length; k++) { const skosChild = child.children[k]; if ('skm:PC' === skosChild.name) { if (('#' + termId) === skosChild.attributes['rdf:resource'] && skosChild.attributes['rdf:ID']) { termIdRules[termId].push(skosChild.attributes['rdf:ID']); } } } } } } cb(null, termIdRules); } ], (err, rulesMap) => { if (err) { logger.error(err); return callback(err, null); } return callback(null, rulesMap); }); } /** * Get a list of rules from XML files in the specified base foolder. * * @param {String} baseFolder the full path to the folder containing the CFR2015Title40XXX files * @param {Function} callback to be executed upon completion */ function getCleanAirActRules(baseFolder, callback) { const cleanAirRules = []; const termIdRules = {}; const cfrTitlesMap = {}; const cfrUrlsMap = {}; const cfrHeadingsMap = {}; async.waterfall([ (cb) => { fs.readdir(baseFolder, cb); }, (filenames, cb) => { const cfrFilenames = []; for (let i = 0; i < filenames.length; i++) { const filename = filenames[i]; if ('CFR2015Title40' === filename.substring(0, 14)) { cfrFilenames.push(filename); } } cb(null, cfrFilenames); }, (cfrFilenames, cb) => { async.each(cfrFilenames, (filename, ecb) => { const cfrFile = path.join(baseFolder, filename); async.waterfall([ (wcb) => { fs.readFile(cfrFile, 'utf8', wcb); }, (xml, wcb) => { const object = parse(xml); const children = object.root.children; for (let i = 0; i < children.length; i++) { const child = children[i]; if ('skos:Concept' === child.name) { for (let k = 0; k < child.children.length; k++) { const skosChild = child.children[k]; if ('skm:PC' === skosChild.name && skosChild.attributes['rdf:resource'] && '#' === skosChild.attributes['rdf:resource'].substring(0, 1) && skosChild.attributes['rdf:ID']) { const termId = skosChild.attributes['rdf:resource'].substring(1); if (!termIdRules[termId]) { termIdRules[termId] = []; } termIdRules[termId].push(skosChild.attributes['rdf:ID']); } } } if ('rdf:Description' === child.name && child.attributes && child.attributes['rdf:about']) { const attribValue = child.attributes['rdf:about']; if ('Title-' === attribValue.substring(0, 6)) { cfrTitlesMap[attribValue] = getXmlNodeValue('zthes:label', child.children); } if ('URL-' === attribValue.substring(0, 4)) { cfrUrlsMap[attribValue] = getXmlNodeValue('zthes:label', child.children); } if ('Heading-' === attribValue.substring(0, 8)) { cfrHeadingsMap[attribValue] = getXmlNodeValue('zthes:label', child.children); } } } wcb(); } ], (err) => { if (err) { logger.error(err); return ecb(err, null); } return ecb(); }); }, (err) => { if (err) { return cb(err, null); } for (let termId in termIdRules) { if (termIdRules.hasOwnProperty(termId)) { const rules = termIdRules[termId]; for (let i = 0; i < rules.length; i++) { const regulationId = getRegulationId(rules[i]); const titleKey = 'Title-' + regulationId; const urlKey = 'URL-' + regulationId; const headingKey = 'Heading-' + regulationId; if (cfrTitlesMap[titleKey] && cfrUrlsMap[urlKey] && isValidPart(cfrHeadingsMap[headingKey])) { cleanAirRules.push({ regulationId: regulationId, termId: termId, title: cfrTitlesMap[titleKey], url: cfrUrlsMap[urlKey] }); } } } } return cb(null, cleanAirRules); }); }, (rules, cb) => { // Save the rules to MongoDB Rule.collection.insert(rules, cb); } ], (err, rules) => { if (err) { logger.error(err); return callback(err, null); } logger.info(JSON.stringify(cleanAirRules, null, 2)); return callback(null, rules); }); }