UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

266 lines (265 loc) 9.38 kB
import path from "path";
import * as cheerio from "cheerio";
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
import fs from "fs-extra";
import { sessionStartYearFromDate } from "../model/seance";
import { frDateToISO, hourShortToStartTime } from "./date";
import { normalizeSpaces } from "./string_cleaning";

/**
 * French hour words (accent-stripped, lower-case) mapped to their numeric value.
 * Both hyphenated and space-separated spellings are listed.
 */
const HOUR_WORDS = new Map([
  ["zero", 0],
  ["une", 1],
  ["un", 1],
  ["deux", 2],
  ["trois", 3],
  ["quatre", 4],
  ["cinq", 5],
  ["six", 6],
  ["sept", 7],
  ["huit", 8],
  ["neuf", 9],
  ["dix", 10],
  ["onze", 11],
  ["douze", 12],
  ["treize", 13],
  ["quatorze", 14],
  ["quinze", 15],
  ["seize", 16],
  ["dix-sept", 17],
  ["dix sept", 17],
  ["dix-huit", 18],
  ["dix huit", 18],
  ["dix-neuf", 19],
  ["dix neuf", 19],
  ["vingt", 20],
  ["vingt et une", 21],
  ["vingt-et-une", 21],
  ["vingt et un", 21],
  ["vingt-et-un", 21],
  ["vingt-deux", 22],
  ["vingt deux", 22],
  ["vingt-trois", 23],
  ["vingt trois", 23],
  ["vingt-quatre", 24],
  ["vingt quatre", 24],
]);

/** French minute words (multiples of five) mapped to their numeric value. */
const MINUTE_WORDS = new Map([
  ["cinq", 5],
  ["dix", 10],
  ["quinze", 15],
  ["vingt", 20],
  ["vingt-cinq", 25],
  ["vingt cinq", 25],
  ["trente", 30],
  ["trente-cinq", 35],
  ["trente cinq", 35],
  ["quarante", 40],
  ["quarante-cinq", 45],
  ["quarante cinq", 45],
  ["cinquante", 50],
  ["cinquante-cinq", 55],
  ["cinquante cinq", 55],
]);

/**
 * Converts a French clock expression such as "quinze heures trente",
 * "15 heures 30" or "dix-sept heures moins le quart" into an "HHMM" string.
 *
 * The input is lower-cased and stripped of diacritics before matching.
 * The numeric form ("15 heures 30") is tried first and returns immediately,
 * so qualifiers such as "et demie" are only honoured for the spelled-out form.
 * In the spelled-out form, the text preceding "heure(s)" must be exactly an
 * hour word; any other leading text makes the lookup fail.
 *
 * @param {string | null | undefined} input - Raw French time expression.
 * @returns {string | undefined} Four-digit "HHMM" string, or undefined when
 *   no hour could be recognised.
 */
function parseFrenchClockToHHMM(input) {
  const s = (input || "")
    .toLowerCase()
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .trim();
  if (!s) return undefined;

  // Numeric form: "15 heures", "15 heures 30".
  const digitMatch = s.match(/(\d{1,2})\s*heures?(?:\s*(\d{1,2}))?/);
  if (digitMatch) {
    const h = Math.min(24, Math.max(0, Number.parseInt(digitMatch[1], 10)));
    const m = digitMatch[2]
      ? Math.min(59, Math.max(0, Number.parseInt(digitMatch[2], 10)))
      : 0;
    return `${String(h).padStart(2, "0")}${String(m).padStart(2, "0")}`;
  }

  // Spelled-out form: "<hour word> heure(s) [<minute qualifier>]".
  const hourWordMatch = s.match(/([a-z\- ]+?)\s*heures?/);
  if (!hourWordMatch) return undefined;
  const hourWord = hourWordMatch[1].trim();
  // Second lookup collapses repeated spaces ("dix  sept" -> "dix sept").
  let hour = HOUR_WORDS.get(hourWord) ?? HOUR_WORDS.get(hourWord.replace(/\s+/g, " "));
  if (hour == null) return undefined;

  let minutes = 0;
  if (/\bet (demie|demi)\b/.test(s)) {
    minutes = 30;
  } else if (/\bet quart\b/.test(s)) {
    minutes = 15;
  } else if (/\bmoins le quart\b/.test(s)) {
    // "X heures moins le quart" is a quarter to X: previous hour, 45 minutes.
    hour = (hour + 23) % 24;
    minutes = 45;
  } else {
    const minWordMatch = s.match(/heures?\s+([a-z\- ]+?)(?:[).,;]|$)/);
    if (minWordMatch) {
      const parsedMinutes = MINUTE_WORDS.get(minWordMatch[1].trim());
      if (parsedMinutes != null) minutes = parsedMinutes;
    }
  }
  return `${String(hour).padStart(2, "0")}${String(minutes).padStart(2, "0")}`;
}

/**
 * Reads the "semaine du <jour> <mois> <année>" date from the page's og:title
 * meta tag, falling back to the <title> element.
 *
 * @param {import("cheerio").CheerioAPI} $ - Loaded cheerio document.
 * @returns {string | undefined} Result of frDateToISO on the matched date
 *   text, or undefined when the title carries no "semaine du" date.
 */
function extractWeekStartFromHead($) {
  const og = $('meta[property="og:title"]').attr("content") || $("title").text();
  // The month name must be matched with Unicode letters: without the `u` flag
  // `\w` is ASCII-only, so "février", "août" and "décembre" would never match.
  // `\p{M}` also accepts decomposed accents (base letter + combining mark).
  // The optional "er" tolerates the French ordinal "1er"; whether frDateToISO
  // parses it is not visible here (TODO confirm), but extractDaysAndOpenings
  // already hands it unfiltered heading text and checks for a falsy result.
  const m = (og ?? "")
    .toLowerCase()
    .match(/semaine du\s+(\d{1,2}(?:er)?\s+[\p{L}\p{M}]+\s+\d{4})/iu);
  if (m) return frDateToISO(m[1]);
  return undefined;
}

/**
 * Derives a commission label from a page title.
 *
 * The match is run on the lower-cased title, so the detected label is
 * lower-case apart from its first letter, and a leading "des"/"de" after
 * "commission" is dropped from it.
 *
 * @param {string | null | undefined} s - Page title.
 * @returns {{ organeTitleRaw: string | undefined, organeDetected: string | undefined }}
 *   The trimmed title and the detected label; both undefined for an empty title.
 */
function detectOrganeFromTitle(s) {
  const t = (s ?? "").trim();
  if (!t) return { organeTitleRaw: undefined, organeDetected: undefined };

  const lower = t.toLowerCase();
  const m = lower.match(/commission(?:\s+des|\s+de|)\s+([^:]+)$/i);
  let organeDetected;
  if (m && m[1]) {
    organeDetected = ("Commission " + m[1])
      .replace(/\s+/g, " ")
      .replace(/\s+:? comptes? rendus?$/i, "")
      .trim();
    organeDetected = organeDetected[0].toUpperCase() + organeDetected.slice(1);
  }
  return { organeTitleRaw: t, organeDetected };
}

/**
 * Collects the day sections of a page: every <h2> whose text starts with a
 * French weekday name, plus the opening time announced between that heading
 * and the next <h2> sibling ("La réunion est ouverte à 9 h 30").
 *
 * @param {import("cheerio").CheerioAPI} $ - Loaded cheerio document.
 * @returns {Array<{ date: string, openTime: string | undefined, h2Index: number }>}
 *   One entry per dated heading; `openTime` is "HH:MM" when found and
 *   `h2Index` is the heading's position among all <h2> elements.
 */
function extractDaysAndOpenings($) {
  const days = [];
  const h2s = $("h2").toArray();
  for (let i = 0; i < h2s.length; i++) {
    const h = h2s[i];
    const txt = normalizeSpaces($(h).text());
    const m = txt.match(/^(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+?)$/i);
    if (!m) continue;
    const iso = frDateToISO(m[1]);
    if (!iso) continue;

    // Walk the following siblings until the next <h2>, looking for the opening time.
    let openTime;
    let cur = $(h).next();
    while (cur.length && cur[0].tagName !== "h2") {
      const t = normalizeSpaces(cur.text());
      const mt = t.match(/La réunion est ouverte à\s+(\d{1,2})\s*h(?:\s*(\d{2}))?/i);
      if (mt) {
        openTime = `${mt[1].padStart(2, "0")}:${(mt[2] ?? "00").padStart(2, "0")}`;
        break;
      }
      cur = cur.next();
    }
    days.push({ date: iso, openTime, h2Index: i });
  }
  return days;
}

/**
 * Returns the first `<a name="...">` anchor name made of 3 to 6 upper-case
 * ASCII letters.
 *
 * @param {import("cheerio").CheerioAPI} $ - Loaded cheerio document.
 * @returns {string | undefined} The anchor name, or undefined when none matches.
 */
function extractOrganeCode($) {
  const names = $("a[name]")
    .toArray()
    .map((a) => ($(a).attr("name") || "").trim());
  return names.find((n) => /^[A-Z]{3,6}$/.test(n));
}

/**
 * Parses the metadata of a commission page from its HTML.
 *
 * The title comes from `h1.page-title`, falling back to og:title, then
 * <title>. When the head carries no week start, the date of the first day
 * section is used instead.
 *
 * @param {string} html - Page HTML.
 * @param {string} [sourceFileName] - Name of the file the HTML came from.
 * @returns {{
 *   sourceFile: string | null,
 *   organeTitleRaw: string | null,
 *   organeDetected: string | null,
 *   organeCode: string | null,
 *   weekStart: string | null,
 *   days: Array<{ date: string, openTime: string | undefined, h2Index: number }>
 * }} Extracted metadata; missing scalar fields are null.
 */
export function parseCommissionMetadataFromHtml(html, sourceFileName) {
  const $ = cheerio.load(html);
  const h1 = $("h1.page-title").first().text().trim() || undefined;
  const headTitle =
    $('meta[property="og:title"]').attr("content") || $("title").text() || undefined;
  const { organeTitleRaw, organeDetected } = detectOrganeFromTitle(h1 || headTitle);

  let weekStart = extractWeekStartFromHead($);
  const days = extractDaysAndOpenings($);
  if (!weekStart && days.length > 0) weekStart = days[0].date;

  const organeCode = extractOrganeCode($);
  return {
    sourceFile: sourceFileName ?? null,
    organeTitleRaw: organeTitleRaw ?? null,
    organeDetected: organeDetected ?? null,
    organeCode: organeCode ?? null,
    weekStart: weekStart ?? null,
    days, // [{date, openTime?, h2Index}]
  };
}

/**
 * Tells whether a parsed JSON value is an object with string `uid` and `date` fields.
 *
 * @param {unknown} o - Value to check.
 * @returns {boolean} True when `o` has the minimal grouped-reunion shape.
 */
function isGroupedReunion(o) {
  return (
    typeof o === "object" &&
    o !== null &&
    typeof o.uid === "string" &&
    typeof o.date === "string"
  );
}

/**
 * Loads the agenda group files of one date: every `RUSN<yyyymmdd>IDC*.json`
 * file in the session's transformed agenda folder whose `uid` carries the
 * same prefix.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Date used in the file-name and uid prefix.
 * @param {string | number} session - Session sub-folder name.
 * @returns {Promise<object[]>} Parsed group objects; empty when the folder
 *   does not exist.
 */
export async function loadAgendaForDate(dataDir, yyyymmdd, session) {
  const baseDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session));
  if (!(await fs.pathExists(baseDir))) return [];

  const prefix = `RUSN${yyyymmdd}IDC`;
  const files = (await fs.readdir(baseDir)).filter(
    (f) => f.startsWith(prefix) && f.toLowerCase().endsWith(".json"),
  );

  const out = [];
  for (const f of files) {
    const p = path.join(baseDir, f);
    try {
      const raw = await fs.readFile(p, "utf8");
      const obj = JSON.parse(raw);
      if (!isGroupedReunion(obj)) {
        continue;
      }
      if (!obj.uid.startsWith(prefix)) {
        continue;
      }
      out.push(obj);
    } catch {
      // Best effort: an unreadable or malformed file is skipped.
    }
  }
  return out;
}

/**
 * Links a commission report (CR) to its agenda group file, creating the
 * group when no readable file exists yet, and writes the result to disk.
 *
 * The group uid is `opts.groupUid` when provided, otherwise `opts.crUid`
 * with a leading "CRC" replaced by "RU". An existing file that cannot be
 * read is logged and recreated.
 *
 * @param {object} opts
 * @param {string} opts.dataDir - Root data directory.
 * @param {string} opts.dateISO - Meeting date; also used to derive the session folder.
 * @param {string} [opts.organeDetected] - Commission label; defaults to "Commission".
 * @param {string} [opts.hourShort] - Short hour, passed to hourShortToStartTime.
 * @param {string} opts.crUid - Uid of the report to link.
 * @param {string} [opts.titreGuess] - Title used for both `titre` and `objet` of a new group.
 * @param {string} [opts.groupUid] - Explicit group uid overriding the computed one.
 * @returns {Promise<{ uid: string, filePath: string, created: boolean, updated: boolean }>}
 *   The group uid, the written file path, and whether the group was created
 *   or an existing one updated.
 */
export async function linkCRtoCommissionGroup(opts) {
  const { dataDir, dateISO, organeDetected, hourShort, crUid, titreGuess, groupUid } = opts;
  const computedUid = crUid.replace(/^CRC/, "RU");
  const uid = groupUid ?? computedUid;
  const session = sessionStartYearFromDate(new Date(dateISO));
  const groupedDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session));
  await fs.ensureDir(groupedDir);
  const filePath = path.join(groupedDir, `${uid}.json`);

  let group = null;
  let created = false;
  let updated = false;
  try {
    if (await fs.pathExists(filePath)) {
      group = await fs.readJSON(filePath);
    }
  } catch (e) {
    console.warn(`[AGENDA][COM] Unreadable JSON → ${filePath} (${e?.message}) → will recreate`);
  }

  if (!group) {
    group = {
      uid,
      chambre: "SN",
      date: dateISO,
      type: "Commission",
      startTime: hourShortToStartTime(hourShort),
      endTime: null,
      captationVideo: false,
      titre: titreGuess ?? "",
      organe: organeDetected ?? "Commission",
      objet: titreGuess ?? "",
      events: [],
      compteRenduRefUid: crUid,
    };
    created = true;
    console.log(`[AGENDA][COM] Created new group uid=${uid} for CR uid=${crUid}`);
  } else {
    group.compteRenduRefUid = crUid;
    updated = true;
    console.log(`[AGENDA][COM] Updated group uid=${uid} for CR uid=${crUid}`);
  }

  // CR link
  // Enrich from the CR when empty (currently disabled):
  // const sommaire = cr?.metadonnees?.sommaire as Sommaire | undefined;
  // if (sommaire) {
  //   const { titre: dTitre, objet: dObjet } = deriveTitreObjetFromSommaire(sommaire, undefined);
  //   if (!group.titre && dTitre) group.titre = dTitre;
  //   if ((!group.objet || !group.objet.trim()) && dObjet) group.objet = dObjet;
  // } else if (!group.titre && titreGuess) {
  //   group.titre = titreGuess;
  // }
  await fs.writeJSON(filePath, group, { spaces: 2 });
  return { uid, filePath, created, updated };
}