UNPKG

factbook

Version:

Serves as an independent data scraping module, complete with ontology and full scraping ability for the CIA World Factbook site

112 lines (111 loc) 6.78 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
var funktologies_1 = require("funktologies");
var getUuid = require("uuid-by-string");
var constants_1 = require("../constants/constants");
var globalStore_1 = require("../constants/globalStore");

/**
 * Converts one half of a coordinate pair, written as whitespace-separated
 * "degrees [minutes [seconds]] hemisphere" (e.g. "13 10 N"), to signed
 * decimal degrees.
 *
 * Minutes and seconds are divided by 60 and 3600 respectively. (Earlier
 * revisions glued the minutes after a decimal point, so "13 10 N" came out as
 * 13.10 instead of 13.1667.)
 *
 * @param {string} text - One side of the comma-separated pair.
 * @param {string} negativeHemisphere - Hemisphere letter that makes the value
 *   negative ('S' for latitude, 'W' for longitude).
 * @returns {number} Decimal degrees rounded to 6 places, or NaN when the text
 *   does not contain at least one numeric token followed by a hemisphere.
 */
function dmsToDecimalDegrees(text, negativeHemisphere) {
    var tokens = String(text).trim().split(/\s+/);
    // At least one number plus the trailing hemisphere token is required.
    if (tokens.length < 2) {
        return NaN;
    }
    var hemisphere = tokens[tokens.length - 1];
    var numericCount = Math.min(tokens.length - 1, 3);
    var magnitude = 0;
    var divisor = 1;
    for (var i = 0; i < numericCount; i++) {
        // Non-numeric tokens become NaN here and propagate to the result.
        magnitude += Number(tokens[i]) / divisor;
        divisor *= 60;
    }
    var sign = hemisphere.includes(negativeHemisphere) ? -1 : 1;
    // Round so the repeating fractions produced by /60 stay readable.
    return sign * Math.round(magnitude * 1e6) / 1e6;
}

/**
 * Parses text of the form "<latitude>, <longitude>" into decimal degrees.
 *
 * @param {string} content - Coordinate text taken from the page.
 * @returns {{lat: number, lng: number}|null} The parsed pair, or null when the
 *   text has no comma or either half is not a finite number.
 */
function parseCoordinatePair(content) {
    var halves = content.split(',');
    if (halves.length < 2) {
        return null;
    }
    var lat = dmsToDecimalDegrees(halves[0], 'S');
    var lng = dmsToDecimalDegrees(halves[1], 'W');
    if (!Number.isFinite(lat) || !Number.isFinite(lng)) {
        return null;
    }
    return { lat: lat, lng: lng };
}

/**
 * Returns the location entity registered in the global store under `geoId`,
 * creating and registering it when absent, and makes sure the country's
 * objectProperties hold a HAS_LOCATION reference to it.
 *
 * @param {string} countryId - Key into `store.countries`.
 * @param {string} geoId - '@id' of the geographic location entity.
 * @param {string} label - Label handed to `entityMaker` when the entity has to
 *   be created.
 * @returns {Object} The location entity held in `store.locations[geoId]`.
 */
function resolveGeoLocation(countryId, geoId, label) {
    var hasLocation = constants_1.consts.ONTOLOGY.HAS_LOCATION;
    var countryProps = globalStore_1.store.countries[countryId].objectProperties;
    var alreadyLinked = countryProps.some(function (objProp) {
        return Boolean(objProp && objProp[hasLocation] && objProp[hasLocation]['@id'] === geoId);
    });
    var wrapper = {};
    var entity = globalStore_1.store.locations[geoId];
    if (entity) {
        wrapper[hasLocation] = entity;
    }
    else {
        // The entity is read back from the maker's result under the predicate key.
        // Creating it here also covers a country that already links the location
        // while the store lacks it, which previously left the entity undefined
        // and crashed on the first property write.
        wrapper = funktologies_1.entityMaker(hasLocation, constants_1.consts.ONTOLOGY.ONT_GEO_LOCATION, geoId, label);
        entity = wrapper[hasLocation];
        globalStore_1.store.locations[geoId] = entity;
    }
    if (!alreadyLinked) {
        countryProps.push(funktologies_1.entityRefMaker(hasLocation, wrapper));
    }
    return entity;
}

/**
 * Parses `content` and writes latitude, longitude and the combined
 * "lat, lng" string into the entity's datatypeProperties.
 * Does nothing when `content` is empty or cannot be parsed.
 *
 * @param {Object} geoAttr - Location entity to update in place.
 * @param {string|undefined} content - Raw coordinate text.
 * @returns {void}
 */
function applyCoordinates(geoAttr, content) {
    if (!content) {
        return;
    }
    var pair = parseCoordinatePair(content);
    if (!pair) {
        // Malformed text: leave the entity untouched rather than store NaN.
        return;
    }
    if (!geoAttr.datatypeProperties) {
        geoAttr.datatypeProperties = {};
    }
    var props = geoAttr.datatypeProperties;
    props[constants_1.consts.WGS84_POS.LAT] = pair.lat;
    props[constants_1.consts.WGS84_POS.LONG] = pair.lng;
    props[constants_1.consts.WGS84_POS.LAT_LONG] = pair.lat + ", " + pair.lng;
}

/**
 * Handles a coordinates field that holds a single location for the country.
 *
 * @param {Object} cheerio - Cheerio selection wrapping the field element.
 * @param {string} country - Country name; hashed into the location id and used
 *   in the entity label.
 * @param {string} countryId - Key into `store.countries`.
 * @returns {void}
 */
function parseSingleCoordinates(cheerio, country, countryId) {
    var geoId = constants_1.consts.ONTOLOGY.INST_GEO_LOCATION + getUuid(country);
    var content = cheerio.find('div.category_data.subfield.text').text().trim();
    var geoAttr = resolveGeoLocation(countryId, geoId, "Geographic Location for " + country);
    applyCoordinates(geoAttr, content);
}

/**
 * Handles a coordinates field that lists several named locations, one per
 * <p> element, each read as "<strong>Name:</strong> coordinates".
 *
 * @param {Function} cheerioElem - Cheerio root function for the page.
 * @param {string} country - Country name; hashed into each location id and
 *   used in the entity labels.
 * @param {string} countryId - Key into `store.countries`.
 * @param {Object} scope - Element whose <p> descendants are iterated.
 * @returns {void}
 */
function parseMultipleCoordinates(cheerioElem, country, countryId, scope) {
    cheerioElem(scope).find('p').each(function (index, element) {
        var paragraph = cheerioElem(element);
        // Everything after the first ':' is the coordinate text.
        var content = paragraph.text().trim().split(':')[1];
        // The last character of the <strong> text is dropped (the trailing ':').
        var strongTag = paragraph.find('strong').text().trim().slice(0, -1);
        if (!strongTag) {
            return;
        }
        var geoId = constants_1.consts.ONTOLOGY.INST_GEO_LOCATION + getUuid(country) + getUuid(strongTag);
        var geoAttr = resolveGeoLocation(countryId, geoId, "Geographic Location for " + country + " - " + strongTag);
        applyCoordinates(geoAttr, content);
    });
}

/**
 * Reads the "#field-geographic-coordinates" section(s) of a country page and
 * records the coordinates on the matching location entities in the global
 * store.
 *
 * @param {Function} cheerioElem - Cheerio root function for the page.
 * @param {string} country - Country name.
 * @param {string} countryId - Key into `store.countries`.
 * @returns {void}
 */
function getGeographyCoordinates(cheerioElem, country, countryId) {
    cheerioElem('#field-geographic-coordinates').each(function (index, element) {
        var hasMultLocations = cheerioElem(element).find('div.category_data.subfield.text > p');
        // Multiple p tags suggests the nation has multiple locations in different parts of the world.
        // This means distinct description and geographic coordinates. Each must be handled separately.
        if (hasMultLocations.length) {
            parseMultipleCoordinates(cheerioElem, country, countryId, element);
        }
        else {
            parseSingleCoordinates(cheerioElem(element), country, countryId);
        }
    });
}
exports.getGeographyCoordinates = getGeographyCoordinates;