UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

149 lines (148 loc) 5.73 kB
import { JSDOM } from "jsdom";
import { DateTime } from "luxon";
import path from "path";
import { ID_DATE_FORMAT, STANDARD_DATE_FORMAT } from "../scripts/datautil";

/** Time zone in which the agenda's local times are interpreted. */
const FR_TZ = "Europe/Paris";

/**
 * Tell whether an agenda event element is a plenary sitting.
 *
 * @param {Element} eventElement - An `.evt` element of the agenda page.
 * @returns {boolean} True when the element carries the `evt-seance` class.
 */
function eventIsSeance(eventElement) {
  return eventElement.classList.contains("evt-seance");
}

/**
 * Map the first `evt-*` CSS class of an event to a human-readable event type.
 *
 * @param {Iterable<string>} eventClasses - The class list of the event element.
 * @returns {string|null} The event type label, or null when there is no
 *   `evt-*` class or the first one found is not a known type.
 */
function getEventType(eventClasses) {
  const typeClass = [...eventClasses].find((className) => className.startsWith("evt-")) || null;
  switch (typeClass) {
    case "evt-seance":
      return "Séance publique";
    case "evt-instanz":
      return "Commissions";
    case "evt-cemi":
      return "Mission de contrôle";
    case "evt-deleg":
      return "Offices et délégations";
    case "evt-bureau":
      return "Instances décisionnelles";
  }
  return null;
}

/**
 * Find the URL of the legislative file among the links of an event.
 *
 * @param {Iterable<Element>} lienElements - Anchor elements of the event.
 * @returns {string|null} The `href` of the first link whose text contains
 *   "dossier législatif", or null when there is no such link.
 */
function getUrlDossierSenat(lienElements) {
  const urlElement = [...lienElements].find((lienElement) =>
    lienElement.textContent?.includes("dossier législatif"),
  );
  return urlElement ? urlElement.getAttribute("href") : null;
}

/**
 * Compute the ordinal label of a sitting among the sittings of the same page.
 *
 * @param {Element} eventElement - The sitting element to rank.
 * @param {Element[]} seancesElements - All sitting elements, in document order.
 * @returns {string} "Unique" when it is the only sitting, otherwise
 *   "Première" to "Cinquième" according to its position, or "Non défini" when
 *   the element is not in the list or is beyond the fifth position.
 */
function getQuantieme(eventElement, seancesElements) {
  const seanceIndex = seancesElements.indexOf(eventElement);
  if (seancesElements.length === 1 && seanceIndex === 0) {
    return "Unique";
  }
  switch (seanceIndex) {
    case 0:
      return "Première";
    case 1:
      return "Deuxième";
    case 2:
      return "Troisième";
    case 3:
      return "Quatrième";
    case 4:
      return "Cinquième";
  }
  return "Non défini";
}

/**
 * Normalize time string to become a simple start time ("H'h'mm") or a duration ("'de 'H'h'mm' à 'H'h'mm").
 *
 * Surrounding whitespace is stripped first, because most of the patterns
 * below are anchored at the start of the string and the final value is
 * parsed strictly.
 *
 * @param {string|null|undefined} timeStr - Raw time text taken from the agenda.
 * @returns {string|undefined} The normalized text, or undefined when
 *   `timeStr` is null or undefined.
 */
function normalizeTime(timeStr) {
  return timeStr
    ?.trim()
    .replace(/^À l'issue de l'espace réservé .* et au plus tard\s/i, "") // Must be processed first
    .replace(/^(?:le )?matin/i, "10h00") // We chose "matin" to mean 10h00
    .replace(/^(?:l')?après-midi/i, "16h00") // We chose "après-midi" to mean 16h00
    .replace(/^(?:le )?soir/i, "20h00") // We chose "soir" to mean 20h00
    .replace(/^(?:la )?nuit/i, "22h00") // We chose "nuit" to mean 22h00
    .replace(/^à\s/gi, "")
    .replace(/heures/gi, "h00")
    .replace(/\set.*/i, "") // Keep only what precedes " et ..."
    .replace(/,.*/, "") // Keep only what precedes the first comma
    .replace(/\s\(hors hémicycle\)/i, "")
    .replace(/\s*h\s*/gi, "h"); // "14 h 30" -> "14h30"
}

/**
 * Extract the start and end times of an event from its free-text time.
 *
 * The text is normalized, interpreted as local time in the Europe/Paris zone
 * on the given date, then converted to UTC.
 *
 * @param {string|null|undefined} timeStr - Raw time text taken from the agenda.
 * @param {string} dateISO - Date of the event as an ISO string.
 * @returns {{startTime: (string|null), endTime: (string|null)}} UTC times
 *   formatted as "HH:mm:ss.SSS'Z'". `endTime` is only set for a
 *   "de … à …" range. A value is null when it is missing or cannot be parsed.
 */
export function getStartAndEndTimes(timeStr, dateISO) {
  const normalizedTime = normalizeTime(timeStr);
  if (!normalizedTime) {
    return { startTime: null, endTime: null };
  }
  const rangeMatch = normalizedTime.match(/^de (?<start>\d{1,2}h\d{2}) à (?<end>\d{1,2}h\d{2})$/i);

  // Convert a local "H'h'mm" value on `dateISO` into a UTC time-of-day string.
  const toUtcTimeOnly = (value) => {
    if (!value) return null;
    const time = DateTime.fromFormat(value, "H'h'mm", { zone: FR_TZ });
    if (!time.isValid) return null;
    const local = DateTime.fromISO(dateISO, { zone: FR_TZ }).set({
      hour: time.hour,
      minute: time.minute,
      second: 0,
      millisecond: 0,
    });
    if (!local.isValid) return null;
    return local.toUTC().toFormat("HH:mm:ss.SSS'Z'");
  };

  if (rangeMatch?.groups) {
    const { start, end } = rangeMatch.groups;
    return {
      startTime: toUtcTimeOnly(start),
      endTime: toUtcTimeOnly(end),
    };
  }
  return {
    startTime: toUtcTimeOnly(normalizedTime),
    endTime: null,
  };
}

/**
 * Build the list of agenda events found in an agenda HTML document.
 *
 * @param {Document} document - Parsed agenda page.
 * @param {string} fileName - File name without extension, parsed with
 *   `ID_DATE_FORMAT` to obtain the date shared by every event of the page.
 * @returns {object[]} One record per `.evt` element that has an identifier.
 */
function transformAgenda(document, fileName) {
  const agendaEvents = [];
  const eventElements = document.querySelectorAll(".evt");
  const seanceElements = Array.from(eventElements).filter(eventIsSeance);
  // The date only depends on the file name, so it is computed once for all events.
  const date = DateTime.fromFormat(fileName, ID_DATE_FORMAT).toFormat(STANDARD_DATE_FORMAT);

  for (const eventElement of eventElements) {
    // The identifier is the `name` attribute of the element just before the
    // event; events without one are skipped.
    const id = eventElement.previousElementSibling?.getAttribute("name") || null;
    if (!id) {
      continue;
    }
    const type = getEventType(eventElement.classList);
    const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
    const { startTime, endTime } = getStartAndEndTimes(timeOriginal, date);
    const titre = eventElement.querySelector(".titre")?.textContent?.trim() || "";
    const organe = eventElement.querySelector(".organe")?.textContent?.trim() || null;
    const objet = eventElement.querySelector(".objet")?.textContent?.trim()?.replace(/^- /, "") || null;
    const lieu = eventElement.querySelector(".lieu")?.textContent || null;
    const videoElement = eventElement.querySelector(".video");
    const urlDossierSenat = getUrlDossierSenat(eventElement.querySelectorAll(".lien a"));

    agendaEvents.push({
      id,
      type,
      date,
      startTime,
      endTime,
      timeOriginal,
      titre,
      organe,
      objet,
      lieu,
      captationVideo: videoElement !== null,
      urlDossierSenat,
      quantieme: eventIsSeance(eventElement) ? getQuantieme(eventElement, seanceElements) : null,
    });
  }
  return agendaEvents;
}

/**
 * Parse an agenda HTML file into a list of agenda events.
 *
 * Errors are logged to the console rather than thrown.
 *
 * @param {string} htmlFilePath - Path of the agenda HTML file; its base name
 *   (without extension) is used as the date of the events.
 * @returns {Promise<object[]|null>} The events of the file, or null when the
 *   file could not be read or parsed.
 */
export async function parseAgendaFromFile(htmlFilePath) {
  try {
    const dom = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
    const { document } = dom.window;
    const fileName = path.parse(htmlFilePath).name;
    return transformAgenda(document, fileName);
  } catch (error) {
    console.error(`Could not parse agenda file ${htmlFilePath} with error ${error}`);
  }
  return null;
}