UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

109 lines (108 loc) 4.37 kB
import { JSDOM } from "jsdom";
import { DateTime } from "luxon";
import path from "path";
import { ID_DATE_FORMAT, STANDARD_DATE_FORMAT } from "../scripts/datautil";

const FR_TZ = "Europe/Paris";

// Luxon format of a normalized time such as "9h30" or "14h00".
const TIME_FORMAT = "H'h'mm";

// Normalized time range, e.g. "de 9h30 à 11h00". Hours accept one or two
// digits so that the pattern agrees with the "H" token of TIME_FORMAT.
const TIME_RANGE_REGEX = /^de (?<startTime>\d{1,2}h\d{2}) à (?<endTime>\d{1,2}h\d{2})$/i;

// Maps the "evt-*" CSS class of an event element to its human-readable type.
const EVENT_TYPE_LABELS = new Map([
  ["evt-seance", "Séance publique"],
  ["evt-instanz", "Commissions"],
  ["evt-cemi", "Mission de contrôle"],
  ["evt-deleg", "Offices et délégations"],
  ["evt-bureau", "Instances décisionnelles"],
]);

/**
 * Derive the event type from the CSS classes of an event element.
 *
 * @param {Iterable<string>} eventClasses - Class names of the event element.
 * @returns {string|null} Label of the first "evt-*" class, or null when there
 *   is no such class or it is not a known one.
 */
function getEventType(eventClasses) {
  const typeClass = [...eventClasses].find((className) => className.startsWith("evt-"));
  return EVENT_TYPE_LABELS.get(typeClass) ?? null;
}

/**
 * Find the URL of the legislative file among the links of an event.
 *
 * @param {Iterable<Element>} lienElements - Candidate link elements.
 * @returns {string|null} `href` of the first link whose text contains
 *   "dossier législatif", or null when no link matches.
 */
function getUrlDossierSenat(lienElements) {
  const urlElement = [...lienElements].find((lienElement) =>
    lienElement.textContent?.includes("dossier législatif"),
  );
  return urlElement ? urlElement.getAttribute("href") : null;
}

/**
 * Normalize time string to become a simple start time ("H'h'mm") or a duration ("'de 'H'h'mm' à 'H'h'mm").
 *
 * @param {string|null|undefined} timeStr - Raw time text of an event.
 * @returns {string|undefined} Normalized text, or undefined when `timeStr` is
 *   null or undefined.
 */
function normalizeTime(timeStr) {
  return timeStr
    // Element text may be padded with whitespace, which would defeat every
    // "^"-anchored replacement below.
    ?.trim()
    .replace(/^À l'issue de l'espace réservé .* et au plus tard\s/i, "") // Must be processed first
    .replace(/^(?:le )?matin/i, "10h00") // We chose "matin" to mean 10h00
    .replace(/^(?:l')?après-midi/i, "16h00") // We chose "après-midi" to mean 16h00
    .replace(/^(?:le )?soir/i, "20h00") // We chose "soir" to mean 20h00
    .replace(/^(?:la )?nuit/i, "22h00") // We chose "nuit" to mean 22h00
    .replace(/^à\s/ig, "")
    .replace(/heures/ig, "h00")
    .replace(/\set.*/i, "")
    .replace(/,.*/, "")
    .replace(/\s\(hors hémicycle\)/i, "")
    .replace(/\s*h\s*/ig, "h");
}

/**
 * Convert a normalized "H'h'mm" time into an ISO 8601 time in the French zone.
 *
 * @param {string} timeStr - Normalized time, e.g. "9h30".
 * @param {DateTime|null} eventDate - Day of the event, if known.
 * @returns {string|null} ISO time with offset, or null when `timeStr` does not
 *   match the expected format.
 */
function toIsoTime(timeStr, eventDate) {
  const time = DateTime.fromFormat(timeStr, TIME_FORMAT, { zone: FR_TZ });
  if (!time.isValid) {
    return null;
  }
  if (!eventDate?.isValid) {
    return time.toISOTime();
  }
  // Luxon fills the missing date with the current day, so without this the
  // UTC offset would follow the DST state of the day the script runs instead
  // of the day of the event.
  const { year, month, day } = eventDate;
  return time.set({ year, month, day }).toISOTime();
}

/**
 * Extract start and end times from the raw time text of an event.
 *
 * @param {string|null|undefined} timeStr - Raw time text of an event.
 * @param {DateTime|null} [eventDate=null] - Day of the event, used to compute
 *   the UTC offset; when missing or invalid the current day is used.
 * @returns {{startTime: string|null, endTime: string|null}} ISO times; a field
 *   is null when it is absent or could not be parsed.
 */
function getStartAndEndTimes(timeStr, eventDate = null) {
  const normalizedTime = normalizeTime(timeStr);
  if (!normalizedTime) {
    return { startTime: null, endTime: null };
  }
  const timeMatches = normalizedTime.match(TIME_RANGE_REGEX);
  if (timeMatches?.groups) {
    const { startTime, endTime } = timeMatches.groups;
    return {
      startTime: toIsoTime(startTime, eventDate),
      endTime: toIsoTime(endTime, eventDate),
    };
  }
  return {
    startTime: toIsoTime(normalizedTime, eventDate),
    endTime: null,
  };
}

/**
 * Build the list of agenda events found in a parsed agenda page.
 *
 * @param {Document} document - DOM of the agenda page.
 * @param {string} fileName - File name without extension, parsed with
 *   ID_DATE_FORMAT to obtain the date shared by all events of the page.
 * @returns {Array<object>} One record per ".evt" element that has an id.
 */
function transformAgenda(document, fileName) {
  // The date only depends on the file name: compute it once for all events.
  const fileDate = DateTime.fromFormat(fileName, ID_DATE_FORMAT);
  const date = fileDate.toFormat(STANDARD_DATE_FORMAT);

  const agendaEvents = [];
  for (const eventElement of document.querySelectorAll(".evt")) {
    // The id is carried by the `name` attribute of the preceding sibling
    // element; events without one are skipped.
    const id = eventElement.previousElementSibling?.getAttribute("name") || null;
    if (!id) {
      continue;
    }
    const type = getEventType(eventElement.classList);
    const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
    const { startTime, endTime } = getStartAndEndTimes(timeOriginal, fileDate);
    const titre = eventElement.querySelector(".titre")?.textContent?.trim() || null;
    const organe = eventElement.querySelector(".organe")?.textContent?.trim() || null;
    const objet = eventElement.querySelector(".objet")?.textContent
      ?.trim()
      .replace(/^- /, "") || null;
    const lieu = eventElement.querySelector(".lieu")?.textContent || null;
    const url_dossier_senat = getUrlDossierSenat(eventElement.querySelectorAll(".lien a"));
    agendaEvents.push({
      id,
      type,
      date,
      startTime,
      endTime,
      timeOriginal,
      titre,
      organe,
      objet,
      lieu,
      url_dossier_senat,
    });
  }
  return agendaEvents;
}

/**
 * Parse an agenda HTML file into a list of agenda events.
 *
 * @param {string} htmlFilePath - Path of the HTML file; its base name (without
 *   extension) is used as the date of the events.
 * @returns {Promise<Array<object>|null>} The events, or null when the file
 *   could not be read or parsed (the error is logged, not thrown).
 */
export async function parseAgendaFromFile(htmlFilePath) {
  try {
    const { document } = (await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" })).window;
    const fileName = path.parse(htmlFilePath).name;
    return transformAgenda(document, fileName);
  } catch (error) {
    console.error(`Could not parse agenda from ${htmlFilePath} with error ${error}`);
  }
  return null;
}