UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

211 lines (210 loc) 7.19 kB
import { DateTime } from "luxon";
import { buildOdj } from "./reunion_odj_building";
import { norm } from "./string_cleaning";

/** IANA zone used when parsing agenda dates and times. */
const PARIS = "Europe/Paris";

/** Lower-cased words ignored when building the initials of an organ name. */
const STOPWORDS = new Set([
  "de",
  "du",
  "des",
  "la",
  "le",
  "les",
  "l",
  "d",
  "et",
  "en",
  "au",
  "aux",
  "pour",
  "sur",
  "sous",
  "à",
  "a",
]);

/** Time substituted for a missing start time when sorting reunions. */
const UNKNOWN_START_TIME = "23:59:59.999+02:00";

/**
 * Convert a single agenda event into a reunion record.
 *
 * @param {object} e - Agenda event (reads date, type, organe, startTime,
 *   endTime, timeOriginal, captationVideo, titre, objet, lieu).
 * @param {*} dossierBySenatUrl - Passed through unchanged to `buildOdj`.
 * @param {string} uid - Identifier to assign to the reunion.
 * @returns {object} The reunion record.
 */
function toReunion(e, dossierBySenatUrl, uid) {
  const date = norm(e.date) ?? e.date;
  const { startISO, endISO } = deriveTimesForEvent(e);
  const startTime = startISO ?? e.startTime ?? null;
  const endTime = endISO ?? e.endTime ?? null;
  return {
    uid,
    chambre: "SN",
    date,
    type: e.type || "",
    organe: e.organe || undefined,
    startTime,
    endTime,
    captationVideo: e.captationVideo === true,
    titre: e.titre,
    objet: e.objet || "",
    events: [e], // TODO remove
    odj: buildOdj([e], dossierBySenatUrl),
    lieu: e.lieu || undefined,
  };
}

/**
 * Millisecond timestamp used to order reunions. A reunion without a start
 * time is given `UNKNOWN_START_TIME`, so it sorts at the end of its day.
 *
 * @param {{date: string, startTime: (string|null)}} reunion
 * @returns {number} Epoch milliseconds (NaN when the date/time cannot be parsed).
 */
function reunionSortMillis(reunion) {
  const iso = `${reunion.date}T${reunion.startTime || UNKNOWN_START_TIME}`;
  return DateTime.fromISO(iso, { zone: PARIS }).toMillis();
}

/**
 * Group agenda events into reunions, one array per bucket suffix
 * (IDS, IDC, IDM, IDO, IDI), each sorted by date/time, then organ, then title.
 *
 * Events whose type cannot be classified are skipped with a warning.
 *
 * @param {Array<object>|null|undefined} events - Agenda events.
 * @param {*} dossierBySenatUrl - Passed through unchanged to `buildOdj`.
 * @returns {{IDS: object[], IDC: object[], IDM: object[], IDO: object[], IDI: object[]}}
 */
export function buildReunionsByBucket(events, dossierBySenatUrl) {
  const out = { IDS: [], IDC: [], IDM: [], IDO: [], IDI: [] };
  if (!events?.length) return out;

  for (const e of events) {
    const kind = classifyAgendaType(e?.type);
    if (!kind) {
      // Warn BEFORE skipping: the original order (`continue` first) made the
      // warning unreachable, so dropped events went unnoticed.
      console.warn("Can't determine type of reunion");
      continue;
    }
    const bucket = typeToSuffixStrict(kind);
    const uid = makeReunionUid(e.date, kind, e.id, e.organe ?? null);
    out[bucket].push(toReunion(e, dossierBySenatUrl, uid));
  }

  // Sort each bucket by date + time (missing start times go to the end of
  // their day), breaking ties on organ and then on title.
  for (const reunions of Object.values(out)) {
    reunions.sort(
      (a, b) =>
        reunionSortMillis(a) - reunionSortMillis(b) ||
        (a.organe || "").localeCompare(b.organe || "") ||
        (a.titre || "").localeCompare(b.titre || ""),
    );
  }
  return out;
}

/**
 * Trim a string and strip combining diacritical marks (NFKD decomposition).
 *
 * @param {string|null|undefined} s
 * @returns {string} The accent-free string ("" for falsy input).
 */
function normalizeNoAccents(s) {
  return (s || "")
    .trim()
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "");
}

/**
 * Map an agenda type label to an internal kind code.
 *
 * @param {string|null|undefined} typeLabel - Label as found in the agenda.
 * @returns {("SP"|"COM"|"MC"|"OD"|"ID"|null)} The kind, or null when no rule matches.
 */
function classifyAgendaType(typeLabel) {
  const s = normalizeNoAccents(typeLabel || "").toLowerCase();
  if (/\bseance\b.*\bpublique\b/.test(s)) return "SP";
  if (/\bcommissions\b/.test(s)) return "COM";
  if (/\bmission\b.*\bcontrole\b/.test(s)) return "MC";
  if (/\boffices\b|\bdelegations\b/.test(s)) return "OD";
  // The original pattern used `\i` and `\d` where `\bi` and `\bd` were meant:
  // `\d` is the digit class, so the second alternative could never match.
  // `n?` accepts both the original spelling and "decisionnelles".
  if (/\binstances\b|\bdecisionn?elles\b/.test(s)) return "ID";
  return null;
}

/**
 * Map a kind code to its bucket / uid suffix.
 *
 * @param {string} kind - One of "SP", "COM", "MC", "OD", "ID".
 * @returns {(string|undefined)} The suffix; undefined for any other value.
 */
function typeToSuffixStrict(kind) {
  switch (kind) {
    case "SP":
      return "IDS";
    case "COM":
      return "IDC";
    case "MC":
      return "IDM";
    case "OD":
      return "IDO";
    case "ID":
      return "IDI";
  }
}

/**
 * Build a short upper-case code from an organ name: the first two characters
 * of every word that is not a stopword, concatenated and truncated.
 *
 * @param {string|null|undefined} input - Organ name.
 * @param {number} [maxLen=8] - Maximum length of the returned code.
 * @returns {string} The code ("" when nothing usable remains).
 */
function organeInitials(input, maxLen = 8) {
  if (!input) return "";
  const clean = normalizeNoAccents(input)
    .replace(/['’]/g, " ")
    .replace(/[^A-Za-z0-9\s]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (!clean) return "";

  const letters = [];
  for (const raw of clean.split(" ")) {
    const w = raw.toLowerCase();
    if (!w) continue;
    if (STOPWORDS.has(w)) continue;
    // Keep the first two characters when they contain an alphanumeric.
    const two = raw.slice(0, 2);
    if (/[A-Za-z0-9]/.test(two)) letters.push(two.toUpperCase());
  }
  return letters.join("").slice(0, maxLen);
}

/**
 * Build a reunion uid: "RUSN" + YYYYMMDD + suffix + organ initials + event id.
 * Organ initials are omitted for the "IDS" suffix.
 *
 * @param {string|null|undefined} dateISO - Date as "YYYY-MM-DD"; "00000000" is used when falsy.
 * @param {string} kind - Kind code accepted by `typeToSuffixStrict`.
 * @param {*} agendaEventId - Event identifier, interpolated as-is.
 * @param {string|null|undefined} organe - Organ name.
 * @returns {string} The uid.
 */
export function makeReunionUid(dateISO, kind, agendaEventId, organe) {
  const ymd = dateISO ? formatYYYYMMDD(dateISO) : "00000000";
  const suffix = typeToSuffixStrict(kind);
  const org = organe && suffix !== "IDS" ? organeInitials(organe) : "";
  return `RUSN${ymd}${suffix}${org}${agendaEventId}`;
}

/**
 * Remove the dashes from a "YYYY-MM-DD" date.
 *
 * @param {string} dateYYYYMMDD - Date whose first three "-"-separated parts are used.
 * @returns {string} The three parts concatenated.
 */
export function formatYYYYMMDD(dateYYYYMMDD) {
  const [y, m, d] = dateYYYYMMDD.split("-");
  return `${y}${m}${d}`;
}

/**
 * Extract hours/minutes from French text like "à 10 h 30", "de 10 h à 12 h", etc.
 *
 * @param {string|null|undefined} timeOriginal - Free-text time.
 * @returns {{start: (string|null), end: (string|null)}} Times as produced by
 *   `toIsoTime`, or null when not found.
 */
function parseTimeOriginalFR(timeOriginal) {
  if (!timeOriginal) return { start: null, end: null };
  const txt = (timeOriginal || "")
    .replace(/\u00A0/g, " ") // nbsp → space
    .replace(/\s+/g, " ") // collapse runs of whitespace
    .toLowerCase()
    .trim();

  // 1) "de 10 h 30 à 12 heures", "de 10h30 à 12h", "de 9 h à 11 h 15", etc.
  const reRange =
    /\bde\s+(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?\s*à\s*(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?/i;
  const mRange = txt.match(reRange);
  if (mRange) {
    const h1 = clampHour(+mRange[1]);
    const m1 = clampMinute(mRange[2] ? +mRange[2] : 0);
    const h2 = clampHour(+mRange[3]);
    const m2 = clampMinute(mRange[4] ? +mRange[4] : 0);
    return { start: toIsoTime(h1, m1), end: toIsoTime(h2, m2) };
  }

  // 2) "à 10 h 30", "à 10h", "A 10h30", "A 9 heures", etc.
  // NOTE(review): without the `u` flag "à" is not a word character, so `\b`
  // cannot match right before it at string start or after a space; such
  // inputs fall through to pattern 3 below.
  const reAt = /\b(?:a|à)\s*(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?/i;
  const mAt = txt.match(reAt);
  if (mAt) {
    const h = clampHour(+mAt[1]);
    const m = clampMinute(mAt[2] ? +mAt[2] : 0);
    return { start: toIsoTime(h, m), end: null };
  }

  // 3) "10 h 30", "15h", "9 heures" without "à" / "de ... à ...".
  const reBare = /\b(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?\b/;
  const mBare = txt.match(reBare);
  if (mBare) {
    const h = clampHour(+mBare[1]);
    const m = clampMinute(mBare[2] ? +mBare[2] : 0);
    return { start: toIsoTime(h, m), end: null };
  }

  return { start: null, end: null };
}

/**
 * Clamp an hour to the range 0..23.
 *
 * @param {number} h
 * @returns {number}
 */
function clampHour(h) {
  return Math.max(0, Math.min(23, h));
}

/**
 * Clamp a minute to the range 0..59.
 *
 * @param {number} m
 * @returns {number}
 */
function clampMinute(m) {
  return Math.max(0, Math.min(59, m));
}

/**
 * Format hours and minutes as "HH:MM:00.000+02:00".
 *
 * NOTE(review): the "+02:00" offset is fixed regardless of the date — confirm
 * this is intended for dates outside daylight-saving time.
 *
 * @param {number} h - Hour.
 * @param {number} m - Minute.
 * @returns {string}
 */
function toIsoTime(h, m) {
  return `${String(h).padStart(2, "0")}:${String(m).padStart(2, "0")}:00.000+02:00`;
}

/**
 * Resolve the start/end times of an event: the event's own `startTime` /
 * `endTime` win; otherwise the values parsed from `timeOriginal` are used.
 *
 * @param {object} ev - Event (reads startTime, endTime, timeOriginal).
 * @returns {{startISO: (string|null), endISO: (string|null)}}
 */
export function deriveTimesForEvent(ev) {
  const directStart = ev.startTime ?? null;
  const directEnd = ev.endTime ?? null;
  const fromText = parseTimeOriginalFR(ev.timeOriginal);
  const startISO = directStart ?? fromText.start ?? null;
  const endISO = directEnd ?? fromText.end ?? null;
  return { startISO, endISO };
}

/**
 * Collect the table-of-contents entries that link to an in-page anchor.
 *
 * @param {Function} $ - Selector function (presumably a cheerio instance — confirm with caller).
 * @param {{get: Function}} idx - Lookup from element to index; queried with the
 *   link itself, then with its closest `<p>`.
 * @returns {Array<{text: string, startIndex: *, targetId: (string|null)}>}
 *   Entries that have both non-empty text and a known index.
 */
export function extractSommaireBlocks($, idx) {
  const blocks = [];
  // Table-of-contents lines that contain a link.
  $("cri\\:tm5 a[href^='#'], cri\\:tm3 a[href^='#'], p.tm5 a[href^='#'], p.tm3 a[href^='#']").each((_, a) => {
    const href = $(a).attr("href") || "";
    const targetId = href.startsWith("#") ? href.slice(1) : null;
    const text = norm($(a).text() || "");
    const startIndex = idx.get(a) ?? idx.get($(a).closest("p")[0]) ?? null;
    if (!text || startIndex == null) return;
    blocks.push({ text, startIndex, targetId });
  });
  return blocks;
}

/**
 * Parse an ISO 8601 string with luxon, using the Paris zone as default.
 *
 * @param {string|null|undefined} iso
 * @returns {(DateTime|null)} The parsed DateTime, or null when the input is
 *   falsy or invalid.
 */
export function parseISO(iso) {
  if (!iso) return null;
  const dt = DateTime.fromISO(iso, { setZone: true, zone: PARIS });
  return dt.isValid ? dt : null;
}