@tricoteuses/senat
Version:
Handle French Sénat's open data
109 lines (108 loc) • 4.37 kB
JavaScript
import { JSDOM } from "jsdom";
import { DateTime } from "luxon";
import path from "path";
import { ID_DATE_FORMAT, STANDARD_DATE_FORMAT } from "../scripts/datautil";
const FR_TZ = "Europe/Paris";
/**
 * Translate the CSS classes of an agenda event element into an event type label.
 *
 * Only the first class name starting with "evt-" is looked at; later "evt-*"
 * classes are ignored even when the first one is not a known type.
 *
 * @param {Iterable<string>} eventClasses - Class names carried by the event element.
 * @returns {string|null} The French label of the event type, or null when unknown.
 */
function getEventType(eventClasses) {
  const labelsByClass = new Map([
    ["evt-seance", "Séance publique"],
    ["evt-instanz", "Commissions"],
    ["evt-cemi", "Mission de contrôle"],
    ["evt-deleg", "Offices et délégations"],
    ["evt-bureau", "Instances décisionnelles"],
  ]);
  for (const candidate of eventClasses) {
    if (candidate.startsWith("evt-")) {
      // The first "evt-*" class decides the outcome, recognised or not.
      return labelsByClass.get(candidate) ?? null;
    }
  }
  return null;
}
/**
 * Pick the URL of the legislative file among the links of an agenda event.
 *
 * The link is recognised by its text containing "dossier législatif"; the first
 * matching link wins.
 *
 * @param {Iterable<Element>} lienElements - Anchor elements attached to the event.
 * @returns {string|null} The "href" attribute of the matching link, or null when
 *   no link matches (or the matching link has no "href").
 */
function getUrlDossierSenat(lienElements) {
  for (const anchor of lienElements) {
    const label = anchor.textContent;
    if (label?.includes("dossier législatif")) {
      return anchor.getAttribute("href");
    }
  }
  return null;
}
/**
 * Normalize time string to become a simple start time ("H'h'mm") or a duration ("'de 'H'h'mm' à 'H'h'mm").
 *
 * This is a best-effort textual clean-up: inputs that do not follow one of the
 * handled wordings come out only partially rewritten.
 *
 * Surrounding whitespace is removed before and after the rewrites, because most
 * patterns below are anchored on the start of the string and would otherwise
 * silently not apply to text surrounded by spaces or newlines.
 *
 * @param {string|null|undefined} timeStr - Raw time label of an agenda event.
 * @returns {string|undefined} The normalized label, or undefined when `timeStr` is null/undefined.
 */
function normalizeTime(timeStr) {
  // A single `?.` is enough: a nullish `timeStr` short-circuits the whole chain.
  return timeStr
    ?.trim()
    .replace(/^À l'issue de l'espace réservé .* et au plus tard\s/i, "") // Must be processed first
    .replace(/^(?:le )?matin/i, "10h00") // We chose "matin" to mean 10h00
    .replace(/^(?:l')?après-midi/i, "16h00") // We chose "après-midi" to mean 16h00
    .replace(/^(?:le )?soir/i, "20h00") // We chose "soir" to mean 20h00
    .replace(/^(?:la )?nuit/i, "22h00") // We chose "nuit" to mean 22h00
    .replace(/^à\s/ig, "")
    .replace(/heures/ig, "h00")
    .replace(/\set.*/i, "") // Keep only what precedes the first " et…"
    .replace(/,.*/, "") // Keep only what precedes the first comma
    .replace(/\s\(hors hémicycle\)/i, "")
    .replace(/\s*h\s*/ig, "h") // "14 h 30" -> "14h30"
    .trim(); // Cutting at a comma / "(hors hémicycle)" can leave stray whitespace behind
}
/**
 * Extract a start time and an optional end time from the time label of an agenda event.
 *
 * The label is first normalized with `normalizeTime`. A "de <start> à <end>"
 * label yields both times; anything else is parsed as a single start time.
 *
 * NOTE(review): the times are parsed without a date, so the UTC offset of the
 * ISO strings presumably follows the daylight-saving state of Europe/Paris on
 * the day the code runs rather than on the day of the event — confirm this is
 * acceptable for consumers.
 *
 * @param {string|null|undefined} timeStr - Raw time label of the event.
 * @returns {{startTime: (string|null), endTime: (string|null)}} ISO 8601 times;
 *   a value is null when missing or, presumably, when Luxon cannot parse it.
 */
function getStartAndEndTimes(timeStr) {
  const normalizedTime = normalizeTime(timeStr);
  const toIsoTime = (value) => DateTime.fromFormat(value, "H'h'mm", { zone: FR_TZ }).toISOTime();
  // Hours may have one or two digits (the "H" token accepts both), and the
  // separators may be any whitespace, including non-breaking spaces.
  const rangeMatch = normalizedTime
    ?.match(/^de\s+(?<startTime>\d{1,2}h\d{2})\s+à\s+(?<endTime>\d{1,2}h\d{2})$/i);
  if (rangeMatch?.groups) {
    // Both groups are mandatory in the pattern, so they are always set here.
    const { startTime, endTime } = rangeMatch.groups;
    return {
      startTime: toIsoTime(startTime),
      endTime: toIsoTime(endTime),
    };
  }
  return {
    startTime: normalizedTime ? toIsoTime(normalizedTime) : null,
    endTime: null,
  };
}
/**
 * Build the list of agenda events found in an agenda HTML document.
 *
 * Every element with class "evt" is an event candidate. Its identifier is the
 * "name" attribute of the element immediately preceding it; candidates without
 * such an identifier are skipped.
 *
 * @param {Document} document - Parsed agenda page.
 * @param {string} fileName - File name without extension; parsed with
 *   ID_DATE_FORMAT to obtain the date shared by all events of the page.
 * @returns {Array<object>} One record per event, with the keys id, type, date,
 *   startTime, endTime, timeOriginal, titre, organe, objet, lieu and url_dossier_senat.
 */
function transformAgenda(document, fileName) {
  // The date only depends on the file name: compute it once, not once per event.
  const date = DateTime.fromFormat(fileName, ID_DATE_FORMAT).toFormat(STANDARD_DATE_FORMAT);
  // Trimmed text of the first descendant matching `selector`, or null when absent or empty.
  const trimmedText = (element, selector) => element.querySelector(selector)?.textContent?.trim() || null;
  const agendaEvents = [];
  for (const eventElement of document.querySelectorAll(".evt")) {
    const id = eventElement.previousElementSibling?.getAttribute("name") || null;
    if (!id) {
      continue;
    }
    // Kept verbatim (untrimmed) so that the source wording stays available.
    const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
    const { startTime, endTime } = getStartAndEndTimes(timeOriginal);
    agendaEvents.push({
      id,
      type: getEventType(eventElement.classList),
      date,
      startTime,
      endTime,
      timeOriginal,
      titre: trimmedText(eventElement, ".titre"),
      organe: trimmedText(eventElement, ".organe"),
      // Drop the leading "- " that may prefix the subject.
      objet: trimmedText(eventElement, ".objet")?.replace(/^- /, "") || null,
      // Trimmed like the other text fields.
      lieu: trimmedText(eventElement, ".lieu"),
      url_dossier_senat: getUrlDossierSenat(eventElement.querySelectorAll(".lien a")),
    });
  }
  return agendaEvents;
}
/**
 * Parse an agenda HTML file into a list of agenda events.
 *
 * The file name (without extension) is handed to `transformAgenda`, which
 * derives the date of the events from it.
 *
 * Failures are deliberately not propagated: they are logged with the path of
 * the offending file and null is returned.
 *
 * @param {string} htmlFilePath - Path of the agenda HTML file.
 * @returns {Promise<Array<object>|null>} The events of the file, or null when
 *   the file could not be loaded or transformed.
 */
export async function parseAgendaFromFile(htmlFilePath) {
  try {
    const dom = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
    const fileName = path.parse(htmlFilePath).name;
    return transformAgenda(dom.window.document, fileName);
  }
  catch (error) {
    // Name the file so that a failure in a batch run can be traced back to its source.
    console.error(`Could not parse agenda file ${htmlFilePath} with error ${error}`);
    return null;
  }
}