lrs-xml-parser
Version:
TCO 16 Megahack - XML Parser
431 lines (387 loc) • 15.9 kB
JavaScript
/*
* Copyright (C) 2016 TopCoder, Inc. All Rights Reserved.
*/
/**
* Contains the core functions for parsing the LRS XML files.
*
* @author TCSDEVELOPER
* @version 1.0
*/
;
const config = require('../config/config');
const _ = require('underscore');
const async = require('async');
const fs = require('fs');
const parse = require('xml-parser');
const path = require('path');
const logger = require('winston');
const NaicsSchema = require('./models/Naics');
const ProgramSchema = require('./models/Program');
const RuleSchema = require('./models/Rule');
const mongoose = require('mongoose');
// Compile the Mongoose models used below to persist the parsed data.
const Naics = mongoose.model('Naics', NaicsSchema);
const Program = mongoose.model('Program', ProgramSchema);
const Rule = mongoose.model('Rule', RuleSchema);
// Runs when this module is loaded: connects Mongoose using the configured URI.
mongoose.connect(config.mongo_uri);
// Public API of this module. The functions are declared further down; function
// declarations are hoisted, so they can already be referenced here.
module.exports = {
getNaicsCodes,
getPrograms,
getCleanAirActRules
};
/**
 * Get the regulation ID from the specified rdfId string.
 *
 * The first character of the rdfId is skipped as a one-character prefix and
 * the text up to (not including) the first '-' is returned. When the rdfId
 * contains no '-', everything after the first character is returned.
 *
 * @param {String} rdfId the rdfId retrieved from the XML file
 *
 * @return {String} the regulation ID
 */
function getRegulationId(rdfId) {
  const dashIndex = rdfId.indexOf('-');
  // indexOf yields -1 when there is no dash; substring(1, -1) would then swap
  // its arguments and return only the prefix character, so fall back to the
  // end of the string instead.
  const end = dashIndex === -1 ? rdfId.length : dashIndex;
  return rdfId.substring(1, end);
}
/**
 * Look up a parsed XML child node by name and return its content.
 *
 * @param {String} name the node name to search for
 * @param {Array} children the parsed child nodes to search through
 *
 * @return {String} the content of the first child with that name, or null if there is none
 */
function getXmlNodeValue(name, children) {
  const match = children.find((node) => node.name === name);
  return match === undefined ? null : match.content;
}
/**
 * Check if the specified value falls between 'Part 50' and 'Part 98' inclusive.
 *
 * The value must be exactly the word 'Part', a single space, and a decimal
 * part number. Anything else (a different leading word such as 'Partial',
 * extra words, trailing characters after the number, or a non-string value)
 * is rejected.
 *
 * @param {String} value the value to check
 *
 * @return {Boolean} true if the value is a valid part, false otherwise
 */
function isValidPart(value) {
  if (typeof value !== 'string') {
    return false;
  }
  // Anchored on both ends so that 'Partial 75' or 'Part 75x' cannot slip
  // through a prefix check followed by a lenient parseInt.
  const match = /^Part (\d+)$/.exec(value);
  if (match === null) {
    return false;
  }
  const partNumber = parseInt(match[1], 10);
  return partNumber >= 50 && partNumber <= 98;
}
/**
 * Get a list of NAICS codes from the specified XML file and persist to MongoDB.
 *
 * The file is read, parsed, reduced to a list of { code, title, termId }
 * records and handed to Naics.collection.insert.
 *
 * @param {String} xmlFilename the full path to the XML file containing the codes
 * @param {Function} callback to be executed upon completion; called with (err, null) on
 *        failure, or (null, result) where result is whatever the insert call yields
 */
function getNaicsCodes(xmlFilename, callback) {
  async.waterfall([
    (cb) => {
      fs.readFile(xmlFilename, 'utf8', cb);
    },
    (xml, cb) => {
      let naicsCodes;
      // Parsing runs synchronously inside an asynchronous callback, so a throw
      // here (e.g. no root element) would otherwise be an uncaught exception
      // rather than an error delivered to the caller.
      try {
        naicsCodes = extractNaicsCodes(xml);
      } catch (err) {
        return cb(err);
      }
      return cb(null, naicsCodes);
    },
    (naicsCodes, cb) => {
      // Save the naicsCodes to MongoDB
      Naics.collection.insert(naicsCodes, cb);
    }
  ], (err, result) => {
    if (err) {
      logger.error(err);
      return callback(err, null);
    }
    return callback(null, result);
  });
}

/**
 * Build the list of NAICS code records from the XML text.
 *
 * @param {String} xml the raw XML document
 *
 * @return {Array} one { code, title, termId } object per term ID that has a code label
 */
function extractNaicsCodes(xml) {
  const children = parse(xml).root.children;
  // A Set de-duplicates term IDs while keeping first-seen order.
  const termIds = new Set();
  const naicsCodesMap = {};
  const naicsTitlesMap = {};

  // First pass, get the NAICS codes, names and the linking term IDs
  for (const child of children) {
    if ('skos:Concept' === child.name) {
      const termId = getXmlNodeValue('zthes:termID', child.children);
      if (termId) {
        termIds.add(termId);
      }
    }
    const about = child.attributes && child.attributes['rdf:about'];
    if ('rdf:Description' === child.name && about) {
      if (about.startsWith('NAICSUSCode')) {
        naicsCodesMap[about] = getXmlNodeValue('zthes:label', child.children);
      }
      if (about.startsWith('NAICSUSTitle')) {
        naicsTitlesMap[about] = getXmlNodeValue('zthes:label', child.children);
      }
    }
  }

  // Second pass, build the list of codes and titles by joining on the term ID.
  // Term IDs without a code label are skipped; a missing title is left undefined.
  const naicsCodes = [];
  for (const linkId of termIds) {
    const code = naicsCodesMap['NAICSUSCode-' + linkId];
    if (code) {
      naicsCodes.push({
        code: code,
        title: naicsTitlesMap['NAICSUSTitle-' + linkId],
        termId: linkId
      });
    }
  }
  return naicsCodes;
}
/**
 * Get a list of programs from the specified XML file and the associated rules / regulation IDs
 * and persist to MongoDB.
 *
 * Every file in baseFolder whose name starts with 'CFR2015Title40' is scanned
 * through getProgramRules, and the rule IDs found for each term ID are
 * appended to that program's rules, with the matching regulation IDs appended
 * to regulationIds. The files are processed with async.each, so the order in
 * which different files contribute is not fixed.
 *
 * @param {String} xmlFilename the full path to the XML file containing the programs
 * @param {String} baseFolder the full path to the folder containing the CFR2015Title40XXX files
 * @param {Function} callback to be executed upon completion; called with (err, null) on
 *        failure, or (null, result) where result is whatever the insert call yields
 */
function getPrograms(xmlFilename, baseFolder, callback) {
  const termIds = [];
  const termIdPrograms = {};
  async.waterfall([
    (cb) => {
      fs.readFile(xmlFilename, 'utf8', cb);
    },
    (xml, cb) => {
      // Parsing is synchronous; convert a throw into an error for the caller
      // instead of letting it escape as an uncaught exception.
      try {
        const children = parse(xml).root.children;
        // First pass, get the program names and the linking term IDs
        for (const child of children) {
          if ('skos:Concept' !== child.name) {
            continue;
          }
          const label = getXmlNodeValue('skos:prefLabel', child.children);
          const termId = getXmlNodeValue('zthes:termID', child.children);
          if (label && termId) {
            // Keep termIds free of duplicates; a repeated term ID replaces the
            // earlier program entry.
            if (!Object.prototype.hasOwnProperty.call(termIdPrograms, termId)) {
              termIds.push(termId);
            }
            termIdPrograms[termId] = { name: label, rules: [], regulationIds: [] };
          }
        }
      } catch (err) {
        return cb(err);
      }
      return cb();
    },
    (cb) => {
      fs.readdir(baseFolder, cb);
    },
    (filenames, cb) => {
      const cfrFilenames = filenames.filter((filename) => filename.startsWith('CFR2015Title40'));
      async.each(cfrFilenames, (filename, ecb) => {
        const cfrFile = path.join(baseFolder, filename);
        getProgramRules(cfrFile, termIds, (err, rulesMap) => {
          if (err) {
            // Do not silently drop a file that could not be read or parsed.
            return ecb(err);
          }
          Object.keys(rulesMap).forEach((termId) => {
            const program = termIdPrograms[termId];
            // Append rather than assign: each file reports a (possibly empty)
            // list for every term ID, and assigning would discard the rules
            // collected from the other files while regulationIds kept growing.
            rulesMap[termId].forEach((rdfId) => {
              program.rules.push(rdfId);
              program.regulationIds.push(getRegulationId(rdfId));
            });
          });
          return ecb();
        });
      }, (err) => {
        if (err) {
          return cb(err, null);
        }
        const programs = Object.keys(termIdPrograms).map((termId) => termIdPrograms[termId]);
        return cb(null, programs);
      });
    },
    (programs, cb) => {
      // Save the programs to MongoDB
      Program.collection.insert(programs, cb);
    }
  ], (err, result) => {
    if (err) {
      logger.error(err);
      return callback(err, null);
    }
    logger.info(JSON.stringify(result, null, 2));
    return callback(null, result);
  });
}
/**
 * Get a list of programs rules from the specified file and associate them with their respective termIds.
 *
 * For every 'skm:PC' element found under a 'skos:Concept' element, the
 * 'rdf:resource' attribute (minus its leading '#') is matched against the
 * given term IDs and the element's 'rdf:ID' is recorded under the matching one.
 *
 * @param {String} xmlFilename the full path to the XML file containing the regulations
 * @param {Array} termIds the list of term IDs to check for association
 * @param {Function} callback to be executed upon completion; called with (err, null) on
 *        failure, or (null, rulesMap) where rulesMap has one key per term ID holding the
 *        array of matching rdf:ID values (empty when nothing matched)
 */
function getProgramRules(xmlFilename, termIds, callback) {
  async.waterfall([
    (cb) => {
      fs.readFile(xmlFilename, 'utf8', cb);
    },
    (xml, cb) => {
      const termIdRules = {};
      // Parsing is synchronous; convert a throw into an error for the caller
      // instead of letting it escape as an uncaught exception.
      try {
        const children = parse(xml).root.children;
        // Every requested term ID gets an entry, even when nothing matches.
        for (const termId of termIds) {
          termIdRules[termId] = [];
        }
        // Single pass over the document with a keyed lookup, instead of
        // rescanning every element once per term ID.
        for (const child of children) {
          if ('skos:Concept' !== child.name) {
            continue;
          }
          for (const skosChild of child.children) {
            if ('skm:PC' !== skosChild.name) {
              continue;
            }
            const resource = skosChild.attributes['rdf:resource'];
            const rdfId = skosChild.attributes['rdf:ID'];
            if (typeof resource !== 'string' || '#' !== resource.charAt(0) || !rdfId) {
              continue;
            }
            const termId = resource.substring(1);
            if (Object.prototype.hasOwnProperty.call(termIdRules, termId)) {
              termIdRules[termId].push(rdfId);
            }
          }
        }
      } catch (err) {
        return cb(err);
      }
      return cb(null, termIdRules);
    }
  ], (err, rulesMap) => {
    if (err) {
      logger.error(err);
      return callback(err, null);
    }
    return callback(null, rulesMap);
  });
}
/**
 * Get a list of rules from XML files in the specified base folder and persist to MongoDB.
 *
 * Every file whose name starts with 'CFR2015Title40' is parsed. The rule links
 * ('skm:PC' elements) are grouped by term ID and the 'Title-', 'URL-' and
 * 'Heading-' labels are indexed by their 'rdf:about' value. A rule is kept
 * only when its regulation ID has both a title and a URL and its heading
 * passes isValidPart.
 *
 * @param {String} baseFolder the full path to the folder containing the CFR2015Title40XXX files
 * @param {Function} callback to be executed upon completion; called with (err, null) on
 *        failure, or (null, result) where result is whatever the insert call yields
 */
function getCleanAirActRules(baseFolder, callback) {
  const cleanAirRules = [];
  const termIdRules = {};
  const cfrTitlesMap = {};
  const cfrUrlsMap = {};
  const cfrHeadingsMap = {};

  // Record the rule links and the Title-/URL-/Heading- labels of one CFR document.
  const indexCfrDocument = (xml) => {
    const children = parse(xml).root.children;
    for (const child of children) {
      if ('skos:Concept' === child.name) {
        for (const skosChild of child.children) {
          const resource = skosChild.attributes['rdf:resource'];
          const rdfId = skosChild.attributes['rdf:ID'];
          if ('skm:PC' === skosChild.name && resource && '#' === resource.substring(0, 1) && rdfId) {
            const termId = resource.substring(1);
            if (!termIdRules[termId]) {
              termIdRules[termId] = [];
            }
            termIdRules[termId].push(rdfId);
          }
        }
      }
      const about = child.attributes && child.attributes['rdf:about'];
      if ('rdf:Description' === child.name && about) {
        if (about.startsWith('Title-')) {
          cfrTitlesMap[about] = getXmlNodeValue('zthes:label', child.children);
        }
        if (about.startsWith('URL-')) {
          cfrUrlsMap[about] = getXmlNodeValue('zthes:label', child.children);
        }
        if (about.startsWith('Heading-')) {
          cfrHeadingsMap[about] = getXmlNodeValue('zthes:label', child.children);
        }
      }
    }
  };

  async.waterfall([
    (cb) => {
      fs.readdir(baseFolder, cb);
    },
    (filenames, cb) => {
      const cfrFilenames = filenames.filter((filename) => filename.startsWith('CFR2015Title40'));
      async.each(cfrFilenames, (filename, ecb) => {
        fs.readFile(path.join(baseFolder, filename), 'utf8', (readErr, xml) => {
          if (readErr) {
            logger.error(readErr);
            return ecb(readErr);
          }
          // Parsing is synchronous; convert a throw into an error for the
          // caller instead of letting it escape as an uncaught exception.
          try {
            indexCfrDocument(xml);
          } catch (parseErr) {
            logger.error(parseErr);
            return ecb(parseErr);
          }
          return ecb();
        });
      }, (err) => {
        if (err) {
          return cb(err, null);
        }
        // All files are indexed; join the rule links with their labels.
        Object.keys(termIdRules).forEach((termId) => {
          termIdRules[termId].forEach((rdfId) => {
            const regulationId = getRegulationId(rdfId);
            const title = cfrTitlesMap['Title-' + regulationId];
            const url = cfrUrlsMap['URL-' + regulationId];
            const heading = cfrHeadingsMap['Heading-' + regulationId];
            if (title && url && isValidPart(heading)) {
              cleanAirRules.push({
                regulationId: regulationId,
                termId: termId,
                title: title,
                url: url
              });
            }
          });
        });
        return cb(null, cleanAirRules);
      });
    },
    (rules, cb) => {
      // Save the rules to MongoDB
      Rule.collection.insert(rules, cb);
    }
  ], (err, result) => {
    if (err) {
      logger.error(err);
      return callback(err, null);
    }
    logger.info(JSON.stringify(cleanAirRules, null, 2));
    return callback(null, result);
  });
}