wikipedia-airport-scraper
Version:
Get airport codes and flight connections from Wikipedia airport pages
213 lines (175 loc) • 7.52 kB
JavaScript
import * as cheerio from 'cheerio'
import dayjs from 'dayjs'
import round from 'lodash.round'
// Matches a written-out date with an English month name, e.g. "1 January 2025",
// "January 1, 2025" or just "January 2025". The day part is optional and the match is case-insensitive.
const dateRegex = /([0-9]{1,2} )?(January|February|March|April|May|June|July|August|September|October|November|December) ([0-9]{1,2}, )?[0-9]{4}/i
/**
 * Extract airport details from the HTML of a Wikipedia airport page.
 *
 * @param {string} body - HTML source of the page, handed to `cheerio.load`.
 * @returns {{name: string, iataCode: (string|null), icaoCode: (string|null), coordinates: {latitude: *, longitude: *}, flights: Array}}
 *   `iataCode`/`icaoCode` are null when no code is found. `latitude`/`longitude` hold the value
 *   returned by `convertDegrees`, or null when the page has no parsable coordinate.
 *   `flights` is empty when the located table does not start with the "Airlines" / "Destinations" headers.
 */
const scrape = (body) => {
  const $ = cheerio.load(body)

  // Get name and IATA/ICAO codes
  const name = $('h1 span.mw-page-title-main').text().trim()
  // The code sits in a span.nickname next to the link whose title names the code type
  const findCode = (linkTitle, pattern) => {
    const codeText = $('span.nickname', $(`li a[title="${linkTitle}"]`).parent()).text()
    return pattern.exec(codeText)?.[0] ?? null
  }
  const iataCode = findCode('IATA airport code', /[A-Z]{3}/)
  const icaoCode = findCode('ICAO airport code', /[A-Z]{4}/)

  // Get coordinates
  const $coordinates = $('span.geo-dms')
  const coordsRegex = /([0-9]{1,3})°([0-9]{1,2}\.?[0-9]{0,3}′)?([0-9]{1,2}\.?[0-9]{0,3}″)?([ENSW])/
  // Pages without a coordinate (or with an unexpected format) give null instead of aborting the scrape
  const readCoordinate = (selector) => {
    const parts = coordsRegex.exec($(selector, $coordinates).text().trim())
    return parts === null ? null : convertDegrees(...parts.slice(1))
  }
  const coordinates = {
    latitude: readCoordinate('span.latitude'),
    longitude: readCoordinate('span.longitude')
  }

  // Get flights
  const $passengerTable = getPassengerTable($)
  // Check headings to make sure we have the right table
  const $headers = $('th', $passengerTable)
  const isCorrectTable = $headers.eq(0).text() === 'Airlines' && $headers.eq(1).text() === 'Destinations'
  const flights = isCorrectTable ? getFlights($passengerTable, $) : []

  return {
    name,
    iataCode,
    icaoCode,
    coordinates,
    flights
  }
}
/**
 * Convert a degrees / minutes / seconds coordinate into signed decimal degrees.
 *
 * @param {string} degrees - Whole degrees, e.g. "51".
 * @param {string} [minutes] - Minutes, optionally fractional and followed by a unit sign, e.g. "28.5′".
 * @param {string} [seconds] - Seconds, optionally fractional and followed by a unit sign, e.g. "39.2″".
 * @param {string} direction - Compass letter; "S" and "W" produce a negative result.
 * @returns {number} Decimal degrees rounded to 6 places.
 */
const convertDegrees = (degrees, minutes, seconds, direction) => {
  // parseFloat keeps the fractional part and stops at the trailing unit sign;
  // a missing part parses to NaN and is counted as 0
  const toNumber = (part) => Number.parseFloat(part) || 0
  const degreesDecimal = round(Number.parseInt(degrees, 10) + (toNumber(minutes) / 60) + (toNumber(seconds) / 3600), 6)
  return ['S', 'W'].includes(direction) ? degreesDecimal * -1 : degreesDecimal
}
/**
 * Locate the element that should hold the passenger airlines/destinations table.
 *
 * Collects the tables and level-3 headings that follow a level-2 heading with one of the
 * known section ids, then picks the element right after the "Passenger" heading. When no
 * such heading exists, the first collected element is used.
 *
 * @param $ - cheerio instance loaded with the page.
 * @returns A cheerio selection with the chosen element. The selection is empty when nothing
 *   was collected or when "Passenger" is the last collected element. The caller is expected
 *   to verify that the element really is the airlines table.
 */
const getPassengerTable = ($) => {
  const headlineTextIds = [
    'Airlines_and_destinations',
    'Airlines_and_Destinations',
    'Airline_and_destination',
    'Airline_and_Destination',
    'Airline_and_destinations',
    'Airline_and_Destinations',
    'Charters_and_destinations',
    'Charters_and_Destinations',
    'Destinations'
  ]
  const sectionSelector = headlineTextIds.map((id) => `h2#${id}`).join(', ')
  const $tablesAndHeadings = $('div.mw-heading2')
    .has(sectionSelector)
    .nextUntil('div.mw-heading2')
    .filter('div.mw-heading3, table')
  // Heading text per collected element; elements without an h3 (the tables) give ''
  const headings = $tablesAndHeadings
    .toArray()
    .map((element) => $('h3', $(element)).text())
  // indexOf yields -1 when there is no "Passenger" heading, which selects the first element.
  // .eq() returns an empty selection for an out-of-range index, so a page without the section does not throw.
  return $tablesAndHeadings.eq(headings.indexOf('Passenger') + 1)
}
/**
 * Build the list of flights from the airlines/destinations table.
 *
 * Every body row without a header cell is read as: first cell = airline, second cell = destinations.
 * The child nodes of the destinations cell are classified as
 *   - airports: links, or comma-separated plain text that does not start with "(",
 *   - markers: bold text such as "Seasonal:" that opens a new block of destinations,
 *   - modifiers: remaining plain text such as "(begins 1 May 2025)".
 *
 * @param $passengerTable - cheerio selection holding the table.
 * @param $ - cheerio instance loaded with the page.
 * @returns {Array<{airline: {name: string, link: (string|null)}, destination: Object}>}
 *   One entry per airline/destination pair. `destination` carries `shortName`, the fields from
 *   `getFullNameAndLink`, `isCharter`, `isSeasonal`, `suspended`, `startDate` and `endDate`.
 */
const getFlights = ($passengerTable, $) => {
  const $rows = $('tbody tr:not(:has(th))', $passengerTable)
  return $rows.map(function () {
    const $cols = $('td', $(this))
    const $airlineCol = $cols.eq(0)
    const $destinationsCol = $cols.eq(1)

    // Get airline data
    const $airlineLink = $('a[title]', $airlineCol)
    const rawAirlineLink = $airlineLink.attr('href') || null
    // Links carrying action=edit are treated as "no page", so no link is reported for them
    const hasNoPage = rawAirlineLink === null || /action=edit/.test(rawAirlineLink)
    const airline = {
      // Last fallback is the plain cell text with every footnote marker ([1], [a], ...) removed
      name: $airlineLink.text() || $('span.nowrap', $airlineCol).text() || $airlineCol.text().replace(/\[[0-9a-z]+\]/g, '').trim(),
      link: hasNoPage ? null : rawAirlineLink.replace('/wiki/', '')
    }

    const destinationsNodes = $destinationsCol.contents()
      .map(function () {
        let tagName = $(this).prop('tagName') || null
        const airportLink = $(this).attr('href') || null
        let value = this.nodeValue ? this.nodeValue.trim() : $(this).text()
        // Some airports will be listed with no link. We should try to detect those cases
        if (tagName === null && value !== ',' && value.includes(',') && !value.startsWith('(')) {
          tagName = 'A'
          // Trim before filtering so whitespace-only pieces do not become empty airport names
          value = value.split(',').map((d) => d.trim()).filter((d) => d !== '')
        }
        return {
          tagName,
          link: airportLink,
          value
        }
      })
      .get()
      .filter((d) => !['BR', 'SUP'].includes(d.tagName))
      .filter(({ tagName, value }) => !(tagName === null && ['', ',', ', '].includes(value)))
      .map((d, index) => ({ index, ...d }))
      // Split into blocks: a new block starts at every bold marker
      .reduce((acc, curr, i) => {
        if (i === 0 || curr.tagName === 'B') acc.push([])
        acc[acc.length - 1].push(curr)
        return acc
      }, [])
      .flatMap((nodes, blockIndex) => nodes.map((node) => ({ ...node, blockIndex })))

    // Get markers and modifiers
    const markers = destinationsNodes.filter((node) => node.tagName === 'B')
    const modifiers = destinationsNodes
      .filter((node) => node.tagName === null)
      .map((modifier) => {
        const dateMatch = modifier.value.match(dateRegex)
        return {
          ...modifier,
          formattedDate: dateMatch ? dayjs(dateMatch[0]).format('YYYY-MM-DD') : null
        }
      })

    // Get airports
    const airportEntries = destinationsNodes.filter((node) => node.tagName === 'A')
    const destinations = airportEntries.flatMap((airport) => {
      const { index, blockIndex, value: shortName, link } = airport
      // A marker applies to every airport in its block
      const hasMarker = (...patterns) => markers.some((marker) => marker.blockIndex === blockIndex && patterns.some((pattern) => pattern.test(marker.value)))
      // A modifier applies to the airport it directly follows...
      const ownModifier = (pattern) => modifiers.find((modifier) => pattern.test(modifier.value) && modifier.index === index + 1)
      // ...unless it is worded for the whole block ("both ...", "all ...")
      const blockModifier = (pattern) => modifiers.find((modifier) => pattern.test(modifier.value) && modifier.blockIndex === blockIndex)

      // Process markers
      const isCharter = hasMarker(/^([A-Za-z\s]+)?[cC]harter/, /^Hajj\s?&\s?Umrah/)
      const isSeasonal = hasMarker(/^Seasonal/, /^Hajj\s?&\s?Umrah/)

      // Process modifiers
      const suspended = ownModifier(/\((temporarily )?suspended/) !== undefined ||
        blockModifier(/\((both suspended|all suspended)/) !== undefined
      const startDate = ownModifier(/\((begins|resumes)/)?.formattedDate ||
        blockModifier(/\((both begin|both resume)/)?.formattedDate ||
        null
      const endDate = ownModifier(/\((ends)/)?.formattedDate || null

      const destination = {
        shortName,
        ...getFullNameAndLink(link),
        isCharter,
        isSeasonal,
        suspended,
        startDate,
        endDate
      }

      // Unlinked airports arrive as a list of names: emit one flight per name
      if (Array.isArray(shortName)) {
        return shortName.map((sn) => ({
          airline,
          destination: {
            ...destination,
            shortName: sn
          }
        }))
      }
      return {
        airline,
        destination
      }
    })
    return destinations
  }).toArray()
}
/**
 * Derive a readable page name and a page link from a Wikipedia href.
 *
 * @param {(string|null)} link - href of a destination link, e.g. "/wiki/Heathrow_Airport"
 *   or "/w/index.php?title=Foo_Airport&action=edit&redlink=1".
 * @returns {{fullName: (string|null), link: (string|null)}}
 *   - no href: both fields are null
 *   - href with a `title` query parameter: the name comes from `title`, `link` is null
 *   - otherwise: the name is the decoded path without "/wiki/", `link` is that path still encoded
 */
const getFullNameAndLink = (link) => {
  if (!link) {
    return {
      fullName: null,
      link: null
    }
  }

  // Only trust the query string when it actually carries a title; this avoids calling
  // replaceAll on null for hrefs whose query lacks one
  const title = new URL(link, 'https://en.m.wikipedia.org/').searchParams.get('title')
  if (title !== null) {
    return {
      fullName: title.replaceAll('_', ' '),
      link: null
    }
  }

  const pageName = link.replace('/wiki/', '')
  return {
    // decodeURIComponent also decodes reserved characters (e.g. %26 -> &) that decodeURI leaves escaped
    fullName: decodeURIComponent(pageName.replaceAll('_', ' ')),
    link: pageName
  }
}
export default scrape