// @tricoteuses/senat
// Version:
// Handle French Sénat's open data
// 266 lines (265 loc) • 9.38 kB
// JavaScript
import path from "path";
import * as cheerio from "cheerio";
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
import fs from "fs-extra";
import { sessionStartYearFromDate } from "../model/seance";
import { frDateToISO, hourShortToStartTime } from "./date";
import { normalizeSpaces } from "./string_cleaning";
// French number words (diacritics already stripped) that can name an hour, 0–24.
// Both hyphenated and space-separated spellings are listed.
const FRENCH_HOUR_WORDS = new Map([
  ["zero", 0],
  ["une", 1],
  ["un", 1],
  ["deux", 2],
  ["trois", 3],
  ["quatre", 4],
  ["cinq", 5],
  ["six", 6],
  ["sept", 7],
  ["huit", 8],
  ["neuf", 9],
  ["dix", 10],
  ["onze", 11],
  ["douze", 12],
  ["treize", 13],
  ["quatorze", 14],
  ["quinze", 15],
  ["seize", 16],
  ["dix-sept", 17],
  ["dix sept", 17],
  ["dix-huit", 18],
  ["dix huit", 18],
  ["dix-neuf", 19],
  ["dix neuf", 19],
  ["vingt", 20],
  ["vingt et une", 21],
  ["vingt-et-une", 21],
  ["vingt et un", 21],
  ["vingt-et-un", 21],
  ["vingt-deux", 22],
  ["vingt deux", 22],
  ["vingt-trois", 23],
  ["vingt trois", 23],
  ["vingt-quatre", 24],
  ["vingt quatre", 24],
]);

// French number words accepted for the minutes part (multiples of five only).
const FRENCH_MINUTE_WORDS = new Map([
  ["cinq", 5],
  ["dix", 10],
  ["quinze", 15],
  ["vingt", 20],
  ["vingt-cinq", 25],
  ["vingt cinq", 25],
  ["trente", 30],
  ["trente-cinq", 35],
  ["trente cinq", 35],
  ["quarante", 40],
  ["quarante-cinq", 45],
  ["quarante cinq", 45],
  ["cinquante", 50],
  ["cinquante-cinq", 55],
  ["cinquante cinq", 55],
]);

/**
 * Convert a French clock phrase such as "quinze heures trente", "15 heures 30"
 * or "dix-sept heures moins le quart" to a four-digit "HHMM" string.
 *
 * The input is lower-cased and stripped of diacritics before matching. The hour
 * may be written in digits or in words; the minutes may be digits, a word from
 * FRENCH_MINUTE_WORDS, "et demie"/"et demi", "et quart" or "moins le quart".
 * These modifiers are honoured for digit hours too ("17 heures moins le quart"
 * gives "1645"). Unrecognised minutes fall back to "00".
 *
 * A spelled-out hour is only recognised when no other letters precede it in the
 * same run of letters/spaces/hyphens (e.g. "a quinze heures" is not recognised).
 *
 * @param {string} [input] Text containing the clock phrase.
 * @returns {string|undefined} "HHMM", or undefined when the input is empty or
 *   no hour can be identified.
 */
function parseFrenchClockToHHMM(input) {
  const s = (input || "")
    .toLowerCase()
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .trim();
  if (!s) {
    return undefined;
  }

  const toHHMM = (h, m) => `${String(h).padStart(2, "0")}${String(m).padStart(2, "0")}`;

  let hour;
  // Text in which minute modifiers are searched.
  let modifierScope = s;

  const digitMatch = s.match(/(\d{1,2})\s*heures?(?:\s*(\d{1,2}))?/);
  if (digitMatch) {
    hour = Math.min(24, Math.max(0, Number.parseInt(digitMatch[1], 10)));
    if (digitMatch[2]) {
      const digitMinutes = Math.min(59, Math.max(0, Number.parseInt(digitMatch[2], 10)));
      return toHHMM(hour, digitMinutes);
    }
    // No digit minutes: keep going so that "et demie", "moins le quart" or a
    // spelled-out minute after a digit hour is not silently dropped. Only look
    // from the matched hour onwards so earlier text cannot contribute a modifier.
    modifierScope = s.slice(digitMatch.index);
  } else {
    const hourWordMatch = s.match(/([a-z\- ]+?)\s*heures?/);
    if (!hourWordMatch) {
      return undefined;
    }
    // Collapse repeated spaces so "dix  sept" still hits the "dix sept" key.
    hour = FRENCH_HOUR_WORDS.get(hourWordMatch[1].trim().replace(/\s+/g, " "));
    if (hour == null) {
      return undefined;
    }
  }

  let minutes = 0;
  if (/\bet (demie|demi)\b/.test(modifierScope)) {
    minutes = 30;
  } else if (/\bet quart\b/.test(modifierScope)) {
    minutes = 15;
  } else if (/\bmoins le quart\b/.test(modifierScope)) {
    // "X heures moins le quart" is a quarter to X: previous hour, wrapping at 0.
    hour = (hour + 23) % 24;
    minutes = 45;
  } else {
    const minWordMatch = modifierScope.match(/heures?\s+([a-z\- ]+?)(?:[).,;]|$)/);
    const wordMinutes = minWordMatch ? FRENCH_MINUTE_WORDS.get(minWordMatch[1].trim()) : undefined;
    if (wordMinutes != null) {
      minutes = wordMinutes;
    }
  }
  return toHHMM(hour, minutes);
}
/**
 * Read the week start date from the page head.
 *
 * Looks for "semaine du <day> <month> <year>" in the og:title meta content, or
 * in the <title> text when the meta content is missing/empty, and hands the
 * lower-cased "<day> <month> <year>" to frDateToISO.
 *
 * The month token accepts accented letters (JS `\w` is ASCII-only, so
 * "février", "août" and "décembre" would otherwise never match) and an ordinal
 * "1er" is accepted, with the "er" suffix removed before conversion.
 *
 * @param {Function} $ Cheerio-style selector function for the loaded document.
 * @returns {*} Whatever frDateToISO returns for the matched date, or undefined
 *   when no "semaine du …" date is present.
 */
function extractWeekStartFromHead($) {
  const og = $('meta[property="og:title"]').attr("content") || $("title").text();
  const m = (og ?? "")
    .toLowerCase()
    .match(/semaine du\s+(\d{1,2})(?:er)?\s+([\p{L}\p{M}\w]+)\s+(\d{4})/u);
  if (!m) {
    return undefined;
  }
  const [, day, month, year] = m;
  return frDateToISO(`${day} ${month} ${year}`);
}
/**
 * Derive a commission label from a page title.
 *
 * The title is lower-cased, a trailing "comptes rendus" label (optionally
 * introduced by ":") is removed, and the text following "commission",
 * "commission de" or "commission des" up to the end of the title becomes the
 * label "Commission <rest>". The linking "de"/"des" is not part of the label
 * (e.g. "Commission des finances" gives "Commission finances").
 * NOTE(review): dropping "de"/"des" is kept from the existing behaviour —
 * confirm it is what consumers of organeDetected expect.
 *
 * The suffix is stripped before matching because the label pattern rejects any
 * ":"; stripping afterwards could never see a ": comptes rendus" suffix.
 *
 * @param {string|null|undefined} s Raw title.
 * @returns {{organeTitleRaw: (string|undefined), organeDetected: (string|undefined)}}
 *   The trimmed title and the derived label; both undefined for an empty title,
 *   organeDetected undefined when no commission name is found.
 */
function detectOrganeFromTitle(s) {
  const t = (s ?? "").trim();
  if (!t) {
    return { organeTitleRaw: undefined, organeDetected: undefined };
  }
  const candidate = t
    .toLowerCase()
    .replace(/\s+/g, " ")
    .replace(/\s*:?\s*comptes? rendus?$/, "");
  const m = candidate.match(/commission(?:\s+des|\s+de|)\s+([^:]+)$/);
  const organeDetected = m ? `Commission ${m[1]}`.trim() : undefined;
  return { organeTitleRaw: t, organeDetected };
}
/**
 * List the meeting days announced by the document's h2 headings.
 *
 * An h2 counts as a day heading when its text starts with a French weekday
 * name and frDateToISO can convert the rest of the heading. For each such
 * heading, the siblings that follow it (up to the next h2) are scanned for
 * "La réunion est ouverte à …" to get the opening time.
 *
 * The hour marker may be the abbreviation "h" ("9 h 30", "9h30") or the
 * spelled-out "heure(s)" ("9 heures 30"); with the abbreviation-only pattern
 * the minutes of the spelled-out form were lost.
 *
 * @param {Function} $ Cheerio-style selector function for the loaded document.
 * @returns {Array<{date: *, openTime: (string|undefined), h2Index: number}>}
 *   One entry per day heading, in document order. `date` is the value returned
 *   by frDateToISO, `openTime` is "HH:MM" or undefined when no opening sentence
 *   is found, `h2Index` is the heading's position among all h2 elements.
 */
function extractDaysAndOpenings($) {
  const dayHeadingPattern = /^(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+?)$/i;
  const openingPattern = /La réunion est ouverte à\s+(\d{1,2})\s*h(?:eures?)?(?:\s*(\d{2}))?/i;
  const days = [];
  for (const [h2Index, heading] of $("h2").toArray().entries()) {
    const headingMatch = normalizeSpaces($(heading).text()).match(dayHeadingPattern);
    if (!headingMatch) {
      continue;
    }
    const iso = frDateToISO(headingMatch[1]);
    if (!iso) {
      continue;
    }
    let openTime;
    // Walk the following siblings; the next h2 starts another section.
    for (let cur = $(heading).next(); cur.length && cur[0].tagName !== "h2"; cur = cur.next()) {
      const opening = normalizeSpaces(cur.text()).match(openingPattern);
      if (opening) {
        openTime = `${opening[1].padStart(2, "0")}:${(opening[2] ?? "00").padStart(2, "0")}`;
        break;
      }
    }
    days.push({ date: iso, openTime, h2Index });
  }
  return days;
}
/**
 * Find the first named anchor whose name looks like an organe code.
 *
 * Every `<a name="…">` is inspected in document order; the first trimmed name
 * made of 3 to 6 uppercase ASCII letters is returned.
 *
 * @param {Function} $ Cheerio-style selector function for the loaded document.
 * @returns {string|undefined} The matching anchor name, or undefined if none.
 */
function extractOrganeCode($) {
  const codePattern = /^[A-Z]{3,6}$/;
  for (const anchor of $("a[name]").toArray()) {
    const anchorName = ($(anchor).attr("name") || "").trim();
    if (codePattern.test(anchorName)) {
      return anchorName;
    }
  }
  return undefined;
}
/**
 * Extract commission metadata from the HTML of a compte-rendu page.
 *
 * The organe is derived from the `h1.page-title` text or, when that is empty,
 * from the og:title meta content / <title> text. The week start comes from the
 * page head and falls back to the date of the first detected day.
 *
 * @param {string} html Page markup, passed to cheerio.load.
 * @param {string} [sourceFileName] Name reported back as `sourceFile`.
 * @returns {{sourceFile: *, organeTitleRaw: *, organeDetected: *, organeCode: *, weekStart: *, days: Array}}
 *   Missing scalar values are reported as null; `days` holds the entries
 *   produced by extractDaysAndOpenings ({date, openTime?, h2Index}).
 */
export function parseCommissionMetadataFromHtml(html, sourceFileName) {
  const $ = cheerio.load(html);

  const pageHeading = $("h1.page-title").first().text().trim();
  const fallbackTitle = $('meta[property="og:title"]').attr("content") || $("title").text() || undefined;
  const organe = detectOrganeFromTitle(pageHeading || fallbackTitle);

  const headWeekStart = extractWeekStartFromHead($);
  const days = extractDaysAndOpenings($);
  // Without a week in the head, the first listed day stands in for it.
  const weekStart = !headWeekStart && days.length > 0 ? days[0].date : headWeekStart;

  const organeCode = extractOrganeCode($);

  return {
    sourceFile: sourceFileName ?? null,
    organeTitleRaw: organe.organeTitleRaw ?? null,
    organeDetected: organe.organeDetected ?? null,
    organeCode: organeCode ?? null,
    weekStart: weekStart ?? null,
    days,
  };
}
/**
 * Tell whether a parsed JSON value has the minimal shape of a grouped reunion:
 * a non-null object with string `uid` and `date` properties.
 *
 * Always returns a real boolean (a falsy input such as null is reported as
 * false rather than being returned as-is).
 *
 * @param {*} o Value to check.
 * @returns {boolean} true when `o` is an object with string `uid` and `date`.
 */
function isGroupedReunion(o) {
  return typeof o === "object" && o !== null && typeof o.uid === "string" && typeof o.date === "string";
}
/**
 * Load the grouped reunions stored for one date.
 *
 * Reads `<dataDir>/<AGENDA_FOLDER>/<DATA_TRANSFORMED_FOLDER>/<session>` and
 * parses every `.json` file whose name starts with `RUSN<yyyymmdd>IDC`. A file
 * is kept only when its content passes isGroupedReunion and its `uid` carries
 * the same prefix.
 *
 * Loading stays best-effort: a file that cannot be read or parsed is skipped,
 * but a warning is logged so corrupt data does not go unnoticed.
 *
 * @param {string} dataDir Root data directory.
 * @param {string} yyyymmdd Date used in the `RUSN<yyyymmdd>IDC` prefix.
 * @param {string|number} session Session folder name (converted with String()).
 * @returns {Promise<Array<object>>} Parsed reunions in directory-listing order;
 *   an empty array when the session folder does not exist.
 */
export async function loadAgendaForDate(dataDir, yyyymmdd, session) {
  const baseDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session));
  if (!(await fs.pathExists(baseDir))) {
    return [];
  }
  const uidPrefix = `RUSN${yyyymmdd}IDC`;
  const fileNames = (await fs.readdir(baseDir)).filter(
    (f) => f.startsWith(uidPrefix) && f.toLowerCase().endsWith(".json"),
  );
  // The files are independent, so read them concurrently; Promise.all keeps
  // the results in the same order as the directory listing.
  const loaded = await Promise.all(
    fileNames.map(async (fileName) => {
      const filePath = path.join(baseDir, fileName);
      try {
        const obj = JSON.parse(await fs.readFile(filePath, "utf8"));
        return isGroupedReunion(obj) && obj.uid.startsWith(uidPrefix) ? obj : null;
      } catch (e) {
        console.warn(`[AGENDA][COM] Skipping unreadable agenda file → ${filePath} (${e?.message})`);
        return null;
      }
    }),
  );
  return loaded.filter((obj) => obj !== null);
}
/**
 * Attach a commission compte rendu (CR) to its agenda group file.
 *
 * The group lives at
 * `<dataDir>/<AGENDA_FOLDER>/<DATA_TRANSFORMED_FOLDER>/<session>/<uid>.json`,
 * where `session` is sessionStartYearFromDate(new Date(dateISO)) and `uid` is
 * `groupUid` when given, otherwise `crUid` with a leading "CRC" replaced by "RU".
 * An existing readable group only gets its `compteRenduRefUid` overwritten; a
 * missing or unreadable one is (re)created from the supplied hints. The file is
 * written back in both cases.
 *
 * @param {object} opts
 * @param {string} opts.dataDir Root data directory.
 * @param {string} opts.dateISO Meeting date, also used to compute the session.
 * @param {string} [opts.organeDetected] Organe label for a new group (default "Commission").
 * @param {*} [opts.hourShort] Passed to hourShortToStartTime for a new group.
 * @param {string} opts.crUid Uid of the compte rendu.
 * @param {string} [opts.titreGuess] Title/object for a new group (default "").
 * @param {string} [opts.groupUid] Explicit group uid overriding the computed one.
 * @returns {Promise<{uid: string, filePath: string, created: boolean, updated: boolean}>}
 */
export async function linkCRtoCommissionGroup(opts) {
  const { dataDir, dateISO, organeDetected, hourShort, crUid, titreGuess, groupUid } = opts;

  // Computed up front (even when groupUid is supplied), as crUid is required.
  const computedUid = crUid.replace(/^CRC/, "RU");
  const uid = groupUid ?? computedUid;

  const sessionYear = sessionStartYearFromDate(new Date(dateISO));
  const targetDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(sessionYear));
  await fs.ensureDir(targetDir);
  const filePath = path.join(targetDir, `${uid}.json`);

  let existing = null;
  try {
    const alreadyThere = await fs.pathExists(filePath);
    if (alreadyThere) {
      existing = await fs.readJSON(filePath);
    }
  } catch (e) {
    console.warn(`[AGENDA][COM] Unreadable JSON → ${filePath} (${e?.message}) → will recreate`);
  }

  const created = !existing;
  let group;
  if (created) {
    const fallbackTitle = titreGuess ?? "";
    group = {
      uid,
      chambre: "SN",
      date: dateISO,
      type: "Commission",
      startTime: hourShortToStartTime(hourShort),
      endTime: null,
      captationVideo: false,
      titre: fallbackTitle,
      organe: organeDetected ?? "Commission",
      objet: fallbackTitle,
      events: [],
      compteRenduRefUid: crUid,
    };
    console.log(`[AGENDA][COM] Created new group uid=${uid} for CR uid=${crUid}`);
  } else {
    group = existing;
    // Link to the CR; the rest of the stored group is left untouched.
    group.compteRenduRefUid = crUid;
    console.log(`[AGENDA][COM] Updated group uid=${uid} for CR uid=${crUid}`);
  }

  // Note: filling an empty titre/objet from the CR's sommaire is not done here.
  await fs.writeJSON(filePath, group, { spaces: 2 });
  return { uid, filePath, created, updated: !created };
}