@tricoteuses/senat
Version:
Handle French Sénat's open data
149 lines (148 loc) • 5.73 kB
JavaScript
import { JSDOM } from "jsdom";
import { DateTime } from "luxon";
import path from "path";
import { ID_DATE_FORMAT, STANDARD_DATE_FORMAT } from "../scripts/datautil";
// IANA time zone in which agenda clock times are interpreted before being converted to UTC.
const FR_TZ = "Europe/Paris";
/**
 * Tell whether an agenda event element is a "séance" event.
 *
 * @param {Element} eventElement - Element exposing a `classList`.
 * @returns {boolean} `true` when the element carries the `evt-seance` class.
 */
function eventIsSeance(eventElement) {
  const { classList } = eventElement;
  return classList.contains("evt-seance");
}
// Human-readable event type for each recognised "evt-*" CSS class.
const EVENT_TYPE_BY_CLASS = new Map([
  ["evt-seance", "Séance publique"],
  ["evt-instanz", "Commissions"],
  ["evt-cemi", "Mission de contrôle"],
  ["evt-deleg", "Offices et délégations"],
  ["evt-bureau", "Instances décisionnelles"],
]);

/**
 * Derive the event type label from an element's CSS classes.
 *
 * Only the first class starting with "evt-" is considered; later ones are ignored.
 *
 * @param {Iterable<string>} eventClasses - Class names of the event element.
 * @returns {string|null} The label for that class, or `null` when no class starts
 *   with "evt-" or the first such class is not a recognised one.
 */
function getEventType(eventClasses) {
  for (const className of eventClasses) {
    if (className.startsWith("evt-")) {
      return EVENT_TYPE_BY_CLASS.get(className) ?? null;
    }
  }
  return null;
}
/**
 * Find the link pointing to the legislative file among an event's links.
 *
 * @param {Iterable<Element>} lienElements - Candidate link elements.
 * @returns {string|null} The `href` attribute of the first link whose text contains
 *   "dossier législatif", or `null` when there is no such link.
 */
function getUrlDossierSenat(lienElements) {
  for (const link of lienElements) {
    if (link.textContent?.includes("dossier législatif")) {
      return link.getAttribute("href");
    }
  }
  return null;
}
/**
 * Give the ordinal label of a séance among the séances of the same page.
 *
 * @param {*} eventElement - The séance to locate.
 * @param {Array} seancesElements - All séances, in order.
 * @returns {string} "Unique" when it is the only séance, "Première" to "Cinquième"
 *   for positions 0 to 4, and "Non défini" when it is not found or is beyond the fifth.
 */
function getQuantieme(eventElement, seancesElements) {
  const position = seancesElements.indexOf(eventElement);
  if (position === 0 && seancesElements.length === 1) {
    return "Unique";
  }
  const ordinals = ["Première", "Deuxième", "Troisième", "Quatrième", "Cinquième"];
  // A position of -1 (not found) or past the table yields undefined, hence the fallback.
  return ordinals[position] ?? "Non défini";
}
/**
 * Normalize time string to become a simple start time ("H'h'mm") or a duration ("'de 'H'h'mm' à 'H'h'mm").
 *
 * Surrounding whitespace is stripped both before and after the rewrites: the patterns
 * below are anchored with `^`, and removing a trailing ", …" part can leave a dangling
 * space, either of which would otherwise make the result unparseable.
 *
 * @param {string|null|undefined} timeStr - Raw time text from the agenda.
 * @returns {string|undefined} The normalized text, or `undefined` when `timeStr` is nullish.
 */
function normalizeTime(timeStr) {
  if (timeStr == null) {
    return undefined;
  }
  return timeStr
    .trim()
    .replace(/^À l'issue de l'espace réservé .* et au plus tard\s/i, "") // Must be processed first
    .replace(/^(?:le )?matin/i, "10h00") // We chose "matin" to mean 10h00
    .replace(/^(?:l')?après-midi/i, "16h00") // We chose "après-midi" to mean 16h00
    .replace(/^(?:le )?soir/i, "20h00") // We chose "soir" to mean 20h00
    .replace(/^(?:la )?nuit/i, "22h00") // We chose "nuit" to mean 22h00
    .replace(/^à\s/gi, "")
    .replace(/heures/gi, "h00")
    .replace(/\set.*/i, "") // Keep only what precedes " et …"
    .replace(/,.*/, "") // Drop everything from the first comma
    .replace(/\s\(hors hémicycle\)/i, "")
    .replace(/\s*h\s*/gi, "h") // "14 h 30" -> "14h30"
    .trim();
}
/**
 * Extract the start and end times of an event from its free-text time.
 *
 * The text is first normalized. A "de H'h'mm à H'h'mm" result gives both a start and
 * an end; anything else is treated as a single start time. Each clock time is read as
 * Europe/Paris local time on `dateISO` and converted to UTC.
 *
 * @param {string|null|undefined} timeStr - Raw time text.
 * @param {string} dateISO - ISO date of the event, parsed with `DateTime.fromISO`.
 * @returns {{startTime: string|null, endTime: string|null}} UTC times formatted as
 *   "HH:mm:ss.SSS'Z'"; a value is `null` when it is absent or cannot be parsed.
 */
export function getStartAndEndTimes(timeStr, dateISO) {
  const cleaned = normalizeTime(timeStr);
  if (!cleaned) {
    return { startTime: null, endTime: null };
  }

  // Converts a "H'h'mm" clock time on `dateISO` (Paris time) into a UTC time-of-day string.
  const convert = (clock) => {
    if (!clock) {
      return null;
    }
    const parsed = DateTime.fromFormat(clock, "H'h'mm", { zone: FR_TZ });
    if (!parsed.isValid) {
      return null;
    }
    const { hour, minute } = parsed;
    const parisMoment = DateTime.fromISO(dateISO, { zone: FR_TZ }).set({
      hour,
      minute,
      second: 0,
      millisecond: 0,
    });
    return parisMoment.isValid ? parisMoment.toUTC().toFormat("HH:mm:ss.SSS'Z'") : null;
  };

  const bounds = /^de (?<start>\d{1,2}h\d{2}) à (?<end>\d{1,2}h\d{2})$/i.exec(cleaned)?.groups;
  if (bounds) {
    return {
      startTime: convert(bounds.start),
      endTime: convert(bounds.end),
    };
  }
  return { startTime: convert(cleaned), endTime: null };
}
/**
 * Build the list of agenda events found in a parsed agenda page.
 *
 * Every `.evt` element is an event. Its id is the `name` attribute of the element
 * immediately preceding it; events without such an id are skipped.
 *
 * @param {Document} document - Parsed agenda page.
 * @param {string} fileName - File name without extension, parsed as a date with `ID_DATE_FORMAT`.
 * @returns {Array<object>} One record per event that has an id.
 */
function transformAgenda(document, fileName) {
  const eventElements = document.querySelectorAll(".evt");
  // Built from every séance element on the page, including those later skipped for lacking an id.
  const seanceElements = Array.from(eventElements).filter((eventElement) => eventIsSeance(eventElement));
  // The date depends only on the file name, so compute it once instead of once per event.
  const date = DateTime.fromFormat(fileName, ID_DATE_FORMAT).toFormat(STANDARD_DATE_FORMAT);
  const agendaEvents = [];
  for (const eventElement of eventElements) {
    const id = eventElement.previousElementSibling?.getAttribute("name") || null;
    if (!id) {
      continue;
    }
    const type = getEventType(eventElement.classList);
    // Kept verbatim in the output alongside the parsed start and end times.
    const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
    const { startTime, endTime } = getStartAndEndTimes(timeOriginal, date);
    const titre = eventElement.querySelector(".titre")?.textContent?.trim() || "";
    const organe = eventElement.querySelector(".organe")?.textContent?.trim() || null;
    const objet = eventElement.querySelector(".objet")?.textContent?.trim()?.replace(/^- /, "") || null;
    const lieu = eventElement.querySelector(".lieu")?.textContent || null;
    const videoElement = eventElement.querySelector(".video");
    const urlDossierSenat = getUrlDossierSenat(eventElement.querySelectorAll(".lien a"));
    agendaEvents.push({
      id,
      type,
      date,
      startTime,
      endTime,
      timeOriginal,
      titre,
      organe,
      objet,
      lieu,
      captationVideo: videoElement !== null,
      urlDossierSenat,
      quantieme: eventIsSeance(eventElement) ? getQuantieme(eventElement, seanceElements) : null,
    });
  }
  return agendaEvents;
}
/**
 * Parse an agenda HTML file into a list of agenda events.
 *
 * The file name without its extension is passed to `transformAgenda`, which reads
 * the agenda date from it.
 *
 * @param {string} htmlFilePath - Path of the HTML file to parse.
 * @returns {Promise<Array<object>|null>} The agenda events, or `null` when loading or
 *   transforming the file throws (the error is logged, not rethrown).
 */
export async function parseAgendaFromFile(htmlFilePath) {
  try {
    const dom = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
    const fileName = path.parse(htmlFilePath).name;
    return transformAgenda(dom.window.document, fileName);
  } catch (error) {
    // Name the agenda and the offending file so failures can be traced in batch runs.
    console.error(`Could not parse agenda file ${htmlFilePath} with error ${error}`);
    return null;
  }
}