// calfire
// Unofficial CalFire data scraper.
// Listing metadata: 166 lines (145 loc), 5.92 kB, JavaScript. Version: not given.
;
const cheerio = require('cheerio');
const fs = require('fs');
const defaults = require('json-schema-defaults');
const slugify = require('slug');
const path = require('path');
const incidentSchema = require('incident-schema');
/**
 * Loose coercion helpers for values scraped from incident pages.
 * Every helper returns a converted value or a fallback (the original
 * input or `undefined`); none of them throws on ordinary input.
 */
const TYPE = {
  /**
   * Undo YAML-parser damage on multi-line strings.
   *
   * Sometimes, when using the YAML parser, a valid string containing
   * colons looks like a K/V pair and comes back as an object. Such an
   * object is flattened back to text as `key:\nvalue` for each entry
   * (entries are concatenated with no extra separator).
   *
   * @param {*} obj - Value to normalise.
   * @returns {*} The flattened string for objects; any other value
   *   (including `null` and `undefined`) is returned unchanged.
   */
  string(obj) {
    // typeof null is 'object'; pass it through instead of crashing in Object.keys.
    if (obj === null || typeof obj !== 'object') {
      return obj;
    }
    let str = '';
    Object.keys(obj).forEach((k) => {
      str += `${k}:\n${obj[k]}`;
    });
    return str;
  },

  /**
   * Try to convert a value to a number, falling back on the original value.
   *
   * @param {*} obj - Value to convert.
   * @returns {*} A number when `obj` is a non-blank numeric string
   *   (including "0"); otherwise the result of `(obj * 1) || obj`.
   */
  number(obj) {
    if (typeof obj === 'string' && obj.trim() !== '') {
      const parsed = Number(obj);
      // "0" parses to a falsy 0, so test for NaN rather than truthiness.
      return Number.isNaN(parsed) ? obj : parsed;
    }
    return (obj * 1) || obj;
  },

  /**
   * Convert a numeric value to its canonical string form.
   *
   * @param {*} obj - Value to convert.
   * @returns {string|undefined} e.g. "012" -> "12", 0 -> "0"; `undefined`
   *   for null, undefined, empty string, or anything non-numeric.
   */
  numberString(obj) {
    if (obj === null || obj === undefined || obj === '') {
      return undefined;
    }
    const parsed = this.number(obj);
    if (typeof parsed === 'number' && !Number.isNaN(parsed)) {
      return `${parsed}`;
    }
    return undefined;
  },

  /**
   * Parse a value into a Date.
   *
   * @param {*} obj - Anything accepted by the Date constructor.
   * @returns {Date|undefined} A valid Date, or `undefined` when the
   *   input is null/undefined or does not parse to a valid date.
   */
  date(obj) {
    if (obj === null || obj === undefined) {
      return undefined;
    }
    const d = new Date(obj);
    // An unparseable input yields an "Invalid Date" whose time value is NaN.
    return Number.isNaN(d.getTime()) ? undefined : d;
  },
};
/**
 * Parse the HTML of a CalFire incident page into an incident record.
 *
 * The record starts as `defaults(incidentSchema)` and is filled in from the
 * page heading and the `table#incident_information` rows. The code assumes
 * the schema defaults provide `links`, `stats.damage`, `stats.resources`,
 * `location` and `contact.phoneNumbers` containers.
 *
 * @param {string} html - Markup handed to `cheerio.load`.
 * @returns {Object} The populated incident record. Fields whose table row
 *   is missing or unparseable are left `undefined`.
 */
function parseIncident(html){
  const $ = cheerio.load(html);
  const incident = defaults(incidentSchema);

  // Valid Date for parseable text, otherwise undefined (never an Invalid Date).
  const toDate = (raw) => {
    if (!raw) {
      return undefined;
    }
    const parsed = new Date(raw);
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
  };

  // First capture group of `pattern` in `text`, or undefined when there is no match.
  const firstCapture = (text, pattern) => ((text || '').match(pattern) || [])[1];

  incident.name = $('h3.incident_h3').text();

  // Normalise label cells so their text ends in exactly one trailing colon.
  // Only cells whose first child is a text node are touched; empty cells and
  // cells that start with an element are left as they are.
  $('td.emphasized').each((i, el) => {
    const firstChild = el.children && el.children[0];
    if (firstChild && typeof firstChild.data === 'string') {
      firstChild.data = `${firstChild.data.trim().replace(/:/, "")}:`;
    }
  });

  // Translate the incident information table to a label -> value object.
  const json = {};
  $('table#incident_information tr:not(.header_tr)').each((i, el) => {
    const $row = $(el);
    const key = $row.find('td.emphasized').text().trim().replace(/:/, "");
    const value = $row.find('td:not(.emphasized)').first().text().trim();
    json[key] = value;
  });

  // Gather relevant links from the table.
  $('table#incident_information a').each((i, el) => {
    const $anchor = $(el);
    const href = $anchor.attr('href');
    if (!href) {
      // Anchors without an href carry no link to record.
      return;
    }
    const link = {
      href,
      description: $anchor.text(),
      tags: []
    };
    if (/inciweb\.nwcg\.gov\/incident/.test(href)) {
      link.title = "Inciweb Incident Page";
      link.tags.push("inciweb");
    } else if (/rvcfire\.org/.test(href)) {
      link.title = "Riverside County FD Incident Page";
      link.tags.push("riverside");
    } else if (/twitter/.test(href)) {
      link.title = `${$anchor.text()} Twitter`;
      link.tags.push("twitter");
    }
    incident.links.push(link);
  });

  // Basics
  incident.updatedAt = toDate(json['Last Updated']);
  incident.startedAt = toDate(json['Date/Time Started']);
  incident.administrativeUnit = json['Administrative Unit'];
  incident.cause = json['Cause'];
  incident.cooperatingAgencies = json['Cooperating Agencies'];
  incident.roadClosures = TYPE.string(json['Road Closures']);
  incident.schoolClosures = TYPE.string(json['School Closures']);
  incident.evacuations = TYPE.string(json['Evacuation Info'] || json['Evacuations']);

  // Slug: name, raw county text and year; parts that are missing are left
  // out rather than rendered as "undefined". The year falls back from the
  // start date to the update date to the current date.
  const year = (incident.startedAt || incident.updatedAt || new Date()).getFullYear();
  const county = json['County'];
  const slugSource = [incident.name, county, year].filter(Boolean).join(' ');
  incident.slug = slugify(slugSource, {lower: true});

  // Stats - Damage
  const acresAndContainment = json['Acres Burned - Containment'];
  // Accept thousands separators and decimals ("96,949 acres", "12.5 acres").
  const acresText = firstCapture(acresAndContainment, /(\d[\d,]*(?:\.\d+)?) acres/i);
  incident.stats.damage.acres = TYPE.numberString(acresText ? acresText.replace(/,/g, '') : acresText);
  // A dedicated "Estimated - Containment" row wins over the combined row.
  incident.stats.damage.contained = json['Estimated - Containment']
    || firstCapture(acresAndContainment, /(\d{1,3}%) contained/i);
  incident.stats.damage.injuries = json['Injuries'];
  incident.stats.damage.structuresDestroyed = json['Structures Destroyed'];
  incident.stats.damage.structuresThreatened = json['Structures Threatened'];

  // Stats - Resources
  incident.stats.resources.airtankers = json['Total Airtankers'];
  incident.stats.resources.dozers = json['Total Dozers'];
  incident.stats.resources.fireCrews = json['Total Fire Crews'];
  incident.stats.resources.fireEngines = json['Total Fire Engines'];
  incident.stats.resources.firefighters = json['Total Fire Personnel'];
  incident.stats.resources.helicopters = json['Total Helicopters'];
  incident.stats.resources.waterTenders = json['Total Water Tenders'];

  // Location
  incident.location.county = firstCapture(county, /([A-Za-z\s]+) County/i);
  incident.location.description = TYPE.string(json['Location']);
  // The row is labelled "Long/Lat": longitude is captured first, latitude second.
  const coordinates = (json['Long/Lat'] || '').match(/(\-*\d+\.\d+)\/(\-*\d+\.\d+)/) || [];
  incident.location.coordinates = {
    lat: coordinates[2],
    long: coordinates[1]
  };

  // Parse phone numbers: the first "<number> (<title>)" pair in each matching row.
  $("table#incident_information tr:contains('Phone Numbers')").each((i, el) => {
    const value = $(el).find('td:not(.emphasized)').first().text().trim();
    const phoneNumber = (value || "").match(/(\(*[\d|A-Z]{3}\)*[\s|\-][\d|A-Z]{3}[\s|\-][\d|A-Z]{4})\s*\(([A-z|\s|&|\w|\-|\_]*)\s*\)*/);
    if (phoneNumber) {
      incident.contact.phoneNumbers.push({
        title: phoneNumber[2],
        number: phoneNumber[1]
      });
    }
  });

  return incident;
}
// The module's sole export is the parser function itself.
module.exports = parseIncident;