// @tricoteuses/senat
// Version:
// Handle French Sénat's open data
// 211 lines (210 loc) • 7.19 kB
// JavaScript
import { DateTime } from "luxon";
import { buildOdj } from "./reunion_odj_building";
import { norm } from "./string_cleaning";
// IANA zone name handed to luxon as the `zone` option when parsing dates.
const PARIS = "Europe/Paris";
// Lower-case French articles and prepositions that are skipped when building
// organe initials (each word is lower-cased before being looked up here).
const STOPWORDS = new Set(
  "de du des la le les l d et en au aux pour sur sous à a".split(" "),
);
/**
 * Build a reunion record from a single agenda event.
 *
 * Start/end times come from deriveTimesForEvent(), falling back to the
 * event's own startTime/endTime and finally to null.
 *
 * @param {object} e - Agenda event (reads date, type, organe, startTime,
 *   endTime, captationVideo, titre, objet, lieu).
 * @param {*} dossierBySenatUrl - Passed through unchanged to buildOdj().
 * @param {string} uid - Identifier stored on the reunion.
 * @returns {object} The reunion record; chambre is always "SN".
 */
function toReunion(e, dossierBySenatUrl, uid) {
  const normalizedDate = norm(e.date) ?? e.date;
  const derived = deriveTimesForEvent(e);
  const reunion = {
    uid,
    chambre: "SN",
    date: normalizedDate,
    type: e.type || "",
    organe: e.organe || undefined,
    startTime: derived.startISO ?? e.startTime ?? null,
    endTime: derived.endISO ?? e.endTime ?? null,
    // Only a literal `true` counts as "video capture available".
    captationVideo: e.captationVideo === true,
    titre: e.titre,
    objet: e.objet || "",
    events: [e], // TODO remove
    odj: buildOdj([e], dossierBySenatUrl),
    lieu: e.lieu || undefined,
  };
  return reunion;
}
/**
 * Convert agenda events into reunions grouped by uid suffix bucket.
 *
 * Each event is classified from its `type` label; events whose type cannot be
 * classified are skipped with a warning. Every bucket is then sorted by
 * date + start time, then by organe, then by titre.
 *
 * @param {Array<object>|null|undefined} events - Agenda events.
 * @param {*} dossierBySenatUrl - Passed through to toReunion().
 * @returns {{IDS: object[], IDC: object[], IDM: object[], IDO: object[], IDI: object[]}}
 *   All five buckets are always present, possibly empty.
 */
export function buildReunionsByBucket(events, dossierBySenatUrl) {
  const out = { IDS: [], IDC: [], IDM: [], IDO: [], IDI: [] };
  if (!events?.length) {
    return out;
  }
  for (const e of events) {
    const kind = classifyAgendaType(e?.type);
    if (!kind) {
      // Warn BEFORE skipping: the warning used to sit after `continue` and
      // was therefore never emitted.
      console.warn("Can't determine type of reunion");
      continue;
    }
    const bucket = typeToSuffixStrict(kind);
    const uid = makeReunionUid(e.date, kind, e.id, e.organe ?? null);
    out[bucket].push(toReunion(e, dossierBySenatUrl, uid));
  }
  // Sort key: reunions without a start time are placed at the very end of
  // their day.
  const toMillis = (reunion) =>
    DateTime.fromISO(`${reunion.date}T${reunion.startTime || "23:59:59.999+02:00"}`, { zone: PARIS }).toMillis();
  // Sort each bucket (date + time, unknown times last), breaking ties on
  // organe and then titre.
  for (const k of Object.keys(out)) {
    out[k].sort(
      (a, b) =>
        toMillis(a) - toMillis(b) ||
        (a.organe || "").localeCompare(b.organe || "") ||
        (a.titre || "").localeCompare(b.titre || ""),
    );
  }
  return out;
}
/**
 * Trim a string and strip its diacritics.
 *
 * The text is decomposed with NFKD and the combining marks in
 * U+0300-U+036F are then removed. Falsy input yields "".
 *
 * @param {string|null|undefined} s
 * @returns {string}
 */
function normalizeNoAccents(s) {
  const trimmed = (s || "").trim();
  const decomposed = trimmed.normalize("NFKD");
  return decomposed.replace(/[\u0300-\u036f]/g, "");
}
/**
 * Map an agenda type label to an internal kind code.
 *
 * The label is accent-stripped and lower-cased, then tested against the
 * rules below in order; the first match wins.
 *
 * @param {string|null|undefined} typeLabel - Label as found on the event.
 * @returns {"SP"|"COM"|"MC"|"OD"|"ID"|null} Kind code, or null when no rule
 *   matches.
 */
function classifyAgendaType(typeLabel) {
  const s = normalizeNoAccents(typeLabel || "").toLowerCase();
  if (/\bseance\b.*\bpublique\b/.test(s)) {
    return "SP";
  }
  if (/\bcommissions\b/.test(s)) {
    return "COM";
  }
  if (/\bmission\b.*\bcontrole\b/.test(s)) {
    return "MC";
  }
  if (/\boffices\b|\bdelegations\b/.test(s)) {
    return "OD";
  }
  // Both alternatives need a real leading word boundary. The previous
  // pattern, /\instances\b|\decisionelles\b/, had lost the "b" of `\b`:
  // `\i` is just "i" and `\d` is the digit class, so the second alternative
  // could only match "<digit>ecisionelles". `n?` accepts both the original
  // single-n spelling and the correct French "decisionnelles".
  if (/\binstances\b|\bdecisionn?elles\b/.test(s)) {
    return "ID";
  }
  return null;
}
/**
 * Translate a kind code into the suffix used in reunion uids and buckets.
 *
 * @param {string} kind - One of "SP", "COM", "MC", "OD", "ID".
 * @returns {string|undefined} "IDS", "IDC", "IDM", "IDO" or "IDI";
 *   undefined for any other value.
 */
function typeToSuffixStrict(kind) {
  // A Map (rather than a plain object) so that inherited property names such
  // as "constructor" are not treated as known kinds.
  const suffixByKind = new Map([
    ["SP", "IDS"],
    ["COM", "IDC"],
    ["MC", "IDM"],
    ["OD", "IDO"],
    ["ID", "IDI"],
  ]);
  return suffixByKind.get(kind);
}
/**
 * Build a short upper-case code from an organe name.
 *
 * The name is accent-stripped, apostrophes and every other
 * non-alphanumeric character become spaces, stopwords are dropped, and the
 * first two characters of each remaining word are concatenated in upper case.
 *
 * @param {string|null|undefined} input - Organe name.
 * @param {number} [maxLen=8] - Maximum length of the returned code.
 * @returns {string} The code, or "" when the input is empty or contains no
 *   usable word.
 */
function organeInitials(input, maxLen = 8) {
  if (!input) {
    return "";
  }
  const withoutAccents = normalizeNoAccents(input);
  const withoutApostrophes = withoutAccents.replace(/['’]/g, " ");
  const alphanumericOnly = withoutApostrophes.replace(/[^A-Za-z0-9\s]/g, " ");
  const cleaned = alphanumericOnly.replace(/\s+/g, " ").trim();
  if (cleaned === "") {
    return "";
  }
  return cleaned
    .split(" ")
    .filter((word) => word !== "" && !STOPWORDS.has(word.toLowerCase()))
    // Keep the first two characters of each significant word.
    .map((word) => word.slice(0, 2))
    .filter((prefix) => /[A-Za-z0-9]/.test(prefix))
    .map((prefix) => prefix.toUpperCase())
    .join("")
    .slice(0, maxLen);
}
/**
 * Assemble a reunion uid: "RUSN" + date digits + type suffix + organe
 * initials + agenda event id.
 *
 * @param {string|null|undefined} dateISO - Date handed to formatYYYYMMDD();
 *   "00000000" is used when it is falsy.
 * @param {string} kind - Kind code, translated by typeToSuffixStrict().
 * @param {*} agendaEventId - Appended as-is at the end of the uid.
 * @param {string|null|undefined} organe - Organe name; ignored when the
 *   suffix is "IDS".
 * @returns {string}
 */
export function makeReunionUid(dateISO, kind, agendaEventId, organe) {
  let ymd = "00000000";
  if (dateISO) {
    ymd = formatYYYYMMDD(dateISO);
  }
  const suffix = typeToSuffixStrict(kind);
  let org = "";
  if (organe && suffix !== "IDS") {
    org = organeInitials(organe);
  }
  return `RUSN${ymd}${suffix}${org}${agendaEventId}`;
}
/**
 * Concatenate the first three "-"-separated parts of a date string,
 * e.g. "2024-03-05" -> "20240305".
 *
 * No validation is performed: a missing part is rendered as "undefined" and
 * parts beyond the third are ignored.
 *
 * @param {string} dateYYYYMMDD
 * @returns {string}
 */
export function formatYYYYMMDD(dateYYYYMMDD) {
  const parts = dateYYYYMMDD.split("-");
  return `${parts[0]}${parts[1]}${parts[2]}`;
}
/**
 * Extract hours/minutes from French text like "à 10 h 30", "de 10 h à 12 h".
 *
 * The text is normalised (non-breaking spaces to spaces, whitespace runs
 * collapsed, lower-cased, trimmed), then three patterns are tried in order
 * and the first match wins:
 *   1. range  "de 10 h 30 à 12 heures", "de 10h30 à 12h", "de 9 h à 11 h 15"
 *   2. at     "a 10h30", "a 9 heures"
 *   3. bare   "10 h 30", "15h", "9 heures"
 * Hours and minutes go through clampHour()/clampMinute() and are formatted by
 * toIsoTime(); a missing minute group counts as 0.
 *
 * NOTE(review): `\b` only recognises ASCII word characters, so in pattern 2
 * `\bà` cannot match at the start of the text or after a space; inputs such
 * as "à 10 h 30" are therefore resolved by pattern 3, not pattern 2.
 *
 * @param {string|null|undefined} timeOriginal
 * @returns {{start: string|null, end: string|null}} `end` is only set by the
 *   range pattern; both are null when nothing matches.
 */
function parseTimeOriginalFR(timeOriginal) {
  if (!timeOriginal) {
    return { start: null, end: null };
  }
  const txt = timeOriginal
    .replace(/\u00A0/g, " ") // nbsp -> regular space
    .replace(/\s+/g, " ") // collapse whitespace runs
    .toLowerCase()
    .trim();

  // Turn a pair of captured groups (hour, optional minute) into a time string.
  const timeOf = (hourGroup, minuteGroup) =>
    toIsoTime(clampHour(+hourGroup), clampMinute(minuteGroup ? +minuteGroup : 0));

  // 1) "de <time> à <time>"
  const rangePattern = /\bde\s+(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?\s*à\s*(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?/i;
  const range = txt.match(rangePattern);
  if (range) {
    return { start: timeOf(range[1], range[2]), end: timeOf(range[3], range[4]) };
  }

  // 2) "a|à <time>", then 3) a bare "<time>" without "à" / "de ... à ...".
  const singleTimePatterns = [
    /\b(?:a|à)\s*(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?/i,
    /\b(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?\b/,
  ];
  for (const pattern of singleTimePatterns) {
    const hit = txt.match(pattern);
    if (hit) {
      return { start: timeOf(hit[1], hit[2]), end: null };
    }
  }
  return { start: null, end: null };
}
/**
 * Restrict an hour value to the 0-23 range.
 *
 * @param {number} h
 * @returns {number} 0 when h is below 0, 23 when above 23, otherwise h.
 */
function clampHour(h) {
  const atLeastZero = Math.max(h, 0);
  return Math.min(atLeastZero, 23);
}
/**
 * Restrict a minute value to the 0-59 range.
 *
 * @param {number} m
 * @returns {number} 0 when m is below 0, 59 when above 59, otherwise m.
 */
function clampMinute(m) {
  const atLeastZero = Math.max(m, 0);
  return Math.min(atLeastZero, 59);
}
/**
 * Format an hour/minute pair as "HH:MM:00.000+02:00".
 *
 * NOTE(review): the UTC offset is hard-coded to +02:00 whatever the date;
 * confirm this is intended for dates when Paris is on +01:00.
 *
 * @param {number} h - Hour, left-padded to two digits.
 * @param {number} m - Minute, left-padded to two digits.
 * @returns {string}
 */
function toIsoTime(h, m) {
  const hhmm = [h, m].map((value) => String(value).padStart(2, "0")).join(":");
  return `${hhmm}:00.000+02:00`;
}
/**
 * Work out an event's start and end times.
 *
 * The event's own startTime/endTime take priority; when one is null or
 * undefined, the value parsed from `ev.timeOriginal` is used instead.
 *
 * @param {object} ev - Event (reads startTime, endTime, timeOriginal).
 * @returns {{startISO: string|null, endISO: string|null}}
 */
export function deriveTimesForEvent(ev) {
  const { startTime, endTime } = ev;
  const parsed = parseTimeOriginalFR(ev.timeOriginal);
  return {
    startISO: startTime ?? parsed.start ?? null,
    endISO: endTime ?? parsed.end ?? null,
  };
}
/**
 * Collect the linked entries of a summary ("sommaire").
 *
 * Every in-page anchor (`href` starting with "#") found under
 * `cri:tm5`, `cri:tm3`, `p.tm5` or `p.tm3` yields one block, provided it has
 * non-empty text and a known start index.
 *
 * @param {Function} $ - Selector function returning a collection with
 *   each/attr/text/closest (presumably cheerio or jQuery — confirm).
 * @param {{get: Function}} idx - Lookup from element to its index; the anchor
 *   itself is tried first, then its closest enclosing `p`.
 * @returns {Array<{text: string, startIndex: *, targetId: string|null}>}
 */
export function extractSommaireBlocks($, idx) {
  // Summary lines that carry a link.
  const linkSelector = "cri\\:tm5 a[href^='#'], cri\\:tm3 a[href^='#'], p.tm5 a[href^='#'], p.tm3 a[href^='#']";
  const blocks = [];
  $(linkSelector).each((_, a) => {
    const link = $(a);
    const href = link.attr("href") || "";
    const text = norm(link.text() || "");
    const startIndex = idx.get(a) ?? idx.get(link.closest("p")[0]) ?? null;
    if (text && startIndex != null) {
      const targetId = href.startsWith("#") ? href.slice(1) : null;
      blocks.push({ text, startIndex, targetId });
    }
  });
  return blocks;
}
/**
 * Parse an ISO string into a luxon DateTime.
 *
 * The string is parsed with both `setZone: true` and `zone: PARIS`
 * (NOTE(review): presumably an offset present in the string wins over the
 * Paris zone — confirm against the luxon documentation).
 *
 * @param {string|null|undefined} iso
 * @returns {DateTime|null} The parsed value, or null when the input is falsy
 *   or luxon reports it as invalid.
 */
export function parseISO(iso) {
  if (iso) {
    const parsed = DateTime.fromISO(iso, { setZone: true, zone: PARIS });
    if (parsed.isValid) {
      return parsed;
    }
  }
  return null;
}