UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

377 lines (376 loc) 17.1 kB
import fs, { ensureDir } from "fs-extra";
import assert from "assert";
import path from "path";
import * as cheerio from "cheerio";
import { COMMISSION_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
import { loadAgendaForDate, parseCommissionMetadataFromHtml, linkCRtoCommissionGroup } from "../utils/cr_spliting";
import { cleanTitle, extractDayH3Sections, parseCommissionCRSectionFromDom } from "../model/commission";
import commandLineArgs from "command-line-args";
import { commonOptions } from "./shared/cli_helpers";
import { sessionStartYearFromDate } from "../model/seance";
import { getSessionsFromStart } from "../types/sessions";
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
import { jaccard, jaccardTokenSim } from "../utils/scoring";
import * as git from "../git.js";

/**
 * Error thrown when a commission report page answers with a non-OK status
 * other than 404.
 */
class CommissionCRDownloadError extends Error {
  /**
   * @param {string} message - Failure detail (the HTTP status, as a string, in this file).
   * @param {string} url - URL that was being downloaded.
   */
  constructor(message, url) {
    super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
    this.name = "CommissionCRDownloadError";
    this.url = url;
  }
}

/** Default number of parallel download workers. */
const DEFAULT_CONCURRENCY = 6;
/** Default pause, in milliseconds, each worker takes after every job. */
const DEFAULT_POLITENESS_MS = 150;

const optionsDefinitions = [
  ...commonOptions,
  { name: "concurrency", type: Number, defaultValue: DEFAULT_CONCURRENCY, help: "Max parallel downloads" },
  { name: "politenessMs", type: Number, defaultValue: DEFAULT_POLITENESS_MS, help: "Delay per worker (ms)" },
  {
    help: "parse and convert comptes-rendus des débats into JSON",
    name: "parseDebats",
    type: Boolean,
  },
];
const options = commandLineArgs(optionsDefinitions);

/**
 * Hub pages (current + archives) listing the weekly reports of each commission.
 * The keys are also used as directory names under the "original" data folder
 * and as the commission label when matching agenda events.
 */
const COMMISSION_HUBS = {
  "Commission des affaires étrangères": [
    "https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres.html",
    "https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres_archives.html",
  ],
  "Commission des affaires économiques": [
    "https://www.senat.fr/compte-rendu-commissions/economie.html",
    "https://www.senat.fr/compte-rendu-commissions/economie_archives.html",
  ],
  "Commission de l'amenagement du territoire et du développement durable": [
    "https://www.senat.fr/compte-rendu-commissions/developpement-durable.html",
    "https://www.senat.fr/compte-rendu-commissions/developpement-durable_archives.html",
  ],
  "Commission de la culture": [
    "https://www.senat.fr/compte-rendu-commissions/culture.html",
    "https://www.senat.fr/compte-rendu-commissions/culture_archives.html",
  ],
  "Commission des finances": [
    "https://www.senat.fr/compte-rendu-commissions/finances.html",
    "https://www.senat.fr/compte-rendu-commissions/finances_archives.html",
  ],
  "Commission des lois": [
    "https://www.senat.fr/compte-rendu-commissions/lois.html",
    "https://www.senat.fr/compte-rendu-commissions/lois_archives.html",
  ],
  "Commission des affaires sociales": [
    "https://www.senat.fr/compte-rendu-commissions/affaires-sociales.html",
    "https://www.senat.fr/compte-rendu-commissions/affaires-sociales_archives.html",
  ],
  "Commission des affaires européennes": [
    "https://www.senat.fr/compte-rendu-commissions/affaires-europeennes.html",
    "https://www.senat.fr/compte-rendu-commissions/affaires-europeennes_archives.html",
  ],
};

/** Matches a weekly report URL; captures the 8-digit date folder and the page slug. */
const WEEKLY_CR_URL_RE = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;

/** Tokens ignored when comparing commission names. */
const ORGANE_STOPWORDS = new Set(["commission", "des", "de", "du", "d", "la", "le", "les", "et"]);

// Scoring parameters used to match a report section with an agenda event.
/** Beyond this start-time gap (minutes) the time score is 0. */
const MAX_TIME_DELTA_MIN = 120;
/** Minimum commission-name similarity for an agenda event to be considered. */
const ORGANE_GATE = 0.55;
/** Minimum title similarity for an agenda event to be considered. */
const TITLE_GATE = 0.2;
/** Weights of the commission, time and title scores in the total score. */
const W_ORG = 0.4;
const W_TIM = 0.4;
const W_TIT = 0.2;

/**
 * Builds a local-time Date from a compact "YYYYMMDD" string.
 * @param {string} yyyymmdd
 * @returns {Date}
 */
function dateFromCompact(yyyymmdd) {
  return new Date(Number(yyyymmdd.slice(0, 4)), Number(yyyymmdd.slice(4, 6)) - 1, Number(yyyymmdd.slice(6, 8)));
}

/**
 * Builds a local-time Date from a "YYYY-MM-DD" string.
 * @param {string} iso
 * @returns {Date}
 */
function dateFromIso(iso) {
  return new Date(Number(iso.slice(0, 4)), Number(iso.slice(5, 7)) - 1, Number(iso.slice(8, 10)));
}

/**
 * Fetches a hub page and returns the distinct weekly-report links it contains,
 * as absolute URLs, in document order.
 * @param {string} hubUrl - URL of the hub page.
 * @returns {Promise<string[]>} Absolute URLs; empty when the hub answers with a non-OK status.
 */
async function harvestWeeklyLinksFromHub(hubUrl) {
  const res = await fetchWithRetry(hubUrl);
  if (!res.ok) {
    // Keep the best-effort behaviour (no links) but make the failure visible.
    console.warn(`[COM-CR][hub-fail] ${hubUrl}: HTTP ${res.status}`);
    return [];
  }
  const html = await res.text();
  const $ = cheerio.load(html);
  const links = new Set();
  $("a[href]").each((_, a) => {
    const href = ($(a).attr("href") || "").trim();
    if (WEEKLY_CR_URL_RE.test(href)) {
      links.add(href.startsWith("http") ? href : new URL(href, hubUrl).toString());
    }
  });
  return Array.from(links);
}

/**
 * Crawls every hub of COMMISSION_HUBS and lists the weekly report pages whose
 * session is not older than `fromSession`.
 * @param {number} fromSession - First session (start year) to keep.
 * @returns {Promise<Array<{url: string, yyyymmdd: string, commissionKey: string}>>}
 *   Entries sorted by date, without duplicates per commission.
 */
async function discoverCommissionWeeklyPages(fromSession) {
  const results = [];
  // A link may be listed on both the current hub and its archive page; keeping
  // it once avoids two workers downloading into the same file.
  const seen = new Set();
  for (const [commissionKey, hubs] of Object.entries(COMMISSION_HUBS)) {
    for (const hubUrl of hubs) {
      try {
        const links = await harvestWeeklyLinksFromHub(hubUrl);
        for (const url of links) {
          const m = url.match(WEEKLY_CR_URL_RE);
          if (!m) continue;
          const yyyymmdd = m[1];
          // Same helper as the one that picks the storage directory, so the
          // filter and the on-disk layout always agree on the session.
          const session = sessionStartYearFromDate(dateFromCompact(yyyymmdd));
          if (session < fromSession) continue;
          const dedupeKey = `${commissionKey}|${url}`;
          if (seen.has(dedupeKey)) continue;
          seen.add(dedupeKey);
          results.push({ url, yyyymmdd, commissionKey });
        }
      } catch (e) {
        // Best effort: a failing hub must not abort the whole discovery.
        console.warn(`[COM-CR][hub-fail] ${hubUrl}: ${e?.message ?? e}`);
      }
    }
  }
  return results.sort((a, b) => a.yyyymmdd.localeCompare(b.yyyymmdd));
}

/**
 * Commits and pushes the dataset when `options.commit` is set.
 * @param {string} datasetDir - Directory handed to `git.commitAndPush`.
 * @param {object} options - CLI options; reads `commit` and `remote`.
 * @returns {number} 10 when no commit was requested, otherwise the code returned
 *   by `git.commitAndPush` (0: some data changed, 10: no modification).
 */
function commitAndPushGit(datasetDir, options) {
  if (!options.commit) return 10;
  return git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
}

/**
 * Converts "HH:MM" to "HHMM".
 * @param {string | null | undefined} hhmm
 * @returns {string | null} null when the input is empty or not exactly "HH:MM".
 */
function toHourShort(hhmm) {
  if (!hhmm) return null;
  const m = hhmm.match(/^(\d{2}):(\d{2})$/);
  return m ? `${m[1]}${m[2]}` : null;
}

/**
 * Converts "HH:MM" to a number of minutes; unparsable parts count as 0.
 * @param {string} hhmm
 * @returns {number}
 */
function timeToMinutes(hhmm) {
  const [h, m] = hhmm.split(":").map((n) => Number.parseInt(n, 10));
  return (h || 0) * 60 + (m || 0);
}

/**
 * Downloads a URL.
 * @param {string} url
 * @returns {Promise<Buffer | null>} The body, or null when the server answers 404.
 * @throws {CommissionCRDownloadError} On any other non-OK status.
 */
async function tryDownload(url) {
  const res = await fetch(url, { redirect: "follow" });
  if (res.status === 404) return null;
  if (!res.ok) throw new CommissionCRDownloadError(String(res.status), url);
  const ab = await res.arrayBuffer();
  return Buffer.from(ab);
}

/**
 * Normalizes a commission label: lower case, diacritics removed, "&" turned
 * into " et ", other punctuation turned into spaces, whitespace collapsed.
 * @param {string} s
 * @returns {string}
 */
function normOrgane(s) {
  return s
    .toLowerCase()
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/&/g, " et ")
    .replace(/[^a-z0-9\s-]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Splits a label into its significant tokens (at least 3 characters, not a stop word).
 * @param {string} s
 * @returns {Set<string>}
 */
function toTokens(s) {
  return new Set(
    normOrgane(s)
      .split(/\s+/)
      .filter((t) => t.length >= 3 && !ORGANE_STOPWORDS.has(t)),
  );
}

/**
 * Lists the distinct normalized labels of an agenda event that may name its commission.
 * @param {object} h - Agenda event; reads `organeSlug`, `organeKey`, `organe` and `titre`.
 * @returns {string[]}
 */
function reunionOrganeCandidates(h) {
  const labels = [h.organeSlug, h.organeKey, h.organe, h.titre].filter(Boolean);
  return Array.from(new Set(labels.map(normOrgane)));
}

/**
 * Best `jaccard` score between the commission key tokens and any candidate label of the event.
 * @param {object} h - Agenda event.
 * @param {string} commissionKey
 * @returns {number} 0 when the event has no candidate label.
 */
function organeSimilarity(h, commissionKey) {
  const keyTokens = toTokens(commissionKey.replace(/-/g, " "));
  let best = 0;
  for (const candidate of reunionOrganeCandidates(h)) {
    best = Math.max(best, jaccard(keyTokens, toTokens(candidate)));
  }
  return best;
}

/**
 * Scores how close the event start time is to `openHHMM`.
 * @param {object} h - Agenda event; reads `startTime` ("HH:MM").
 * @param {string | null} openHHMM - Reference time ("HH:MM").
 * @param {number} maxDeltaMin - Gap (minutes) beyond which the score is 0.
 * @returns {number} 1 for identical times, decreasing linearly to 0 at `maxDeltaMin`;
 *   0 when either time is missing.
 */
function timeProximityScore(h, openHHMM, maxDeltaMin) {
  if (!openHHMM) return 0;
  const hhmm = h.startTime ?? null;
  if (!hhmm) return 0;
  const delta = Math.abs(timeToMinutes(hhmm) - timeToMinutes(openHHMM));
  if (delta > maxDeltaMin) return 0;
  // delta === 0 here when maxDeltaMin is 0; avoid the 0 / 0 division.
  if (maxDeltaMin === 0) return 1;
  return 1 - delta / maxDeltaMin;
}

/**
 * Best `jaccardTokenSim` score between the section title and the event title or object.
 * @param {object} reunion - Agenda event; reads `titre` and `objet`.
 * @param {string} sectionTitle
 * @returns {number} 0 when the section title is missing or blank.
 */
function titleSimilarity(reunion, sectionTitle) {
  if (!sectionTitle?.trim()) return 0;
  const byTitle = jaccardTokenSim(reunion.titre ?? "", sectionTitle);
  const byObject = jaccardTokenSim(reunion.objet ?? "", sectionTitle);
  return Math.max(byTitle, byObject);
}

/**
 * Picks the agenda event that best matches a report section.
 * Events below ORGANE_GATE or TITLE_GATE are discarded; the remaining ones are
 * ranked by the weighted sum of the commission, time and title scores.
 * @param {object[]} hits - Agenda events of the day.
 * @param {string} commissionKey
 * @param {object} section - Report section; reads `time` and `title`.
 * @param {object} day - Report day; reads `openTime`.
 * @returns {{best: object | null, reason: string}} `reason` names the highest
 *   individual score of the winner, or "fallback-none" when nothing matched.
 */
function pickAgendaEvent(hits, commissionKey, section, day) {
  const referenceTime = section.time ?? day.openTime ?? null;
  const ranked = hits
    .map((h) => {
      const sOrg = organeSimilarity(h, commissionKey);
      const sTim = timeProximityScore(h, referenceTime, MAX_TIME_DELTA_MIN);
      const sTit = titleSimilarity(h, section.title);
      return { h, sOrg, sTim, sTit, total: W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit };
    })
    .filter((x) => x.sOrg >= ORGANE_GATE && x.sTit >= TITLE_GATE)
    .sort((a, b) => b.total - a.total);
  const top = ranked[0];
  if (!top) return { best: null, reason: "fallback-none" };
  let reason = "time";
  if (top.sTit >= Math.max(top.sOrg, top.sTim)) reason = "title";
  else if (top.sOrg >= top.sTim) reason = "organe";
  return { best: top.h, reason };
}

/**
 * Downloads every job with a pool of workers sharing one queue.
 * Existing files are skipped; failures are logged and do not stop the pool.
 * @param {Array<{url: string, outPath: string, yyyymmdd: string}>} jobs
 * @param {number} concurrency - Requested number of workers.
 * @param {number} politenessMs - Pause after each job, per worker; no pause unless > 0.
 * @returns {Promise<{completed: number, saved: number, skipped: number, notFound: number}>}
 */
async function downloadJobs(jobs, concurrency, politenessMs) {
  const queue = [...jobs];
  const stats = { completed: 0, saved: 0, skipped: 0, notFound: 0 };
  // Math.max(1, NaN) is NaN, which would create no worker at all: fall back to
  // the default when the option is not a finite number.
  const workerCount = Number.isFinite(concurrency) ? Math.max(1, Math.floor(concurrency)) : DEFAULT_CONCURRENCY;
  const runWorker = async () => {
    for (let job = queue.shift(); job; job = queue.shift()) {
      const { url, outPath, yyyymmdd } = job;
      try {
        if (await fs.pathExists(outPath)) {
          stats.skipped++;
        } else {
          const buf = await tryDownload(url);
          if (!buf) {
            stats.notFound++;
            console.warn(`[COM-CR][404] ${url} → week=${yyyymmdd}`);
          } else {
            await fs.writeFile(outPath, buf);
            stats.saved++;
          }
        }
      } catch (e) {
        console.error(`[COM-CR][err] ${url}: ${e?.message || e}`);
      } finally {
        stats.completed++;
        if (politenessMs > 0) await new Promise((r) => setTimeout(r, politenessMs));
      }
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return stats;
}

/**
 * Transforms every section of one day of a report page: writes one JSON file
 * per parsed section and links it to the agenda.
 * @param {object} ctx
 * @param {object} ctx.$ - cheerio document of the report page.
 * @param {object} ctx.day - Day metadata; reads `date` ("YYYY-MM-DD") and `openTime`.
 * @param {string} ctx.fileName - Report file name (for logs).
 * @param {string} ctx.htmlPath - Report file path.
 * @param {string} ctx.commissionKey
 * @param {string} ctx.dataDir
 * @param {string} ctx.transformedRoot - Root of the transformed data.
 * @returns {Promise<{total: number, linked: number}>} Sections written / sections linked.
 */
async function transformDay({ $, day, fileName, htmlPath, commissionKey, dataDir, transformedRoot }) {
  const counts = { total: 0, linked: 0 };
  const yyyymmdd = day.date.replace(/-/g, "");
  const daySession = sessionStartYearFromDate(dateFromIso(day.date));
  const hits = (await loadAgendaForDate(dataDir, yyyymmdd, daySession)) ?? [];
  console.log(`[COM-CR][TRANSFORM] ${fileName}: ${hits.length} agenda events on ${day.date} :`);
  const sections = extractDayH3Sections($, day.date);
  if (sections.length === 0) {
    console.warn(`[COM-CR][TRANSFORM] no sections found for ${fileName} on ${day.date}, skipping.`);
    return counts;
  }
  const hourShort = toHourShort(day.openTime) ?? "NA";
  const transformedSessionDir = path.join(transformedRoot, String(daySession));
  for (const [sIdx, sec] of sections.entries()) {
    const { best, reason } = pickAgendaEvent(hits, commissionKey, sec, day);
    const cr = parseCommissionCRSectionFromDom($, htmlPath, {
      dateISO: day.date,
      hourShort,
      organe: commissionKey,
      section: sec,
      matched: best ?? undefined,
    });
    if (!cr) {
      console.warn(`[COM-CR][TRANSFORM] parse failed for section#${sIdx} ${path.basename(htmlPath)} → ${best ? best.uid : "NO-GROUP"} (${commissionKey})`);
      continue;
    }
    const fileUid = cr.uid;
    // Created lazily so that no empty session directory is left behind.
    fs.ensureDirSync(transformedSessionDir);
    await fs.writeJSON(path.join(transformedSessionDir, `${fileUid}.json`), cr, { spaces: 2 });
    const titreGuess = cleanTitle(sec.title) || `Commission du ${day.date}`;
    const up = await linkCRtoCommissionGroup({
      dataDir,
      dateISO: day.date,
      organeDetected: commissionKey,
      hourShort,
      crUid: fileUid,
      titreGuess,
      groupUid: best ? best.uid : undefined,
    });
    counts.total++;
    if (up.created || up.updated) {
      counts.linked++;
    } else {
      console.warn(`[COM-CR][AGENDA][WARN] CR ${fileUid} (section#${sIdx}) not linked (reason=${reason})`);
    }
  }
  return counts;
}

/**
 * Downloads the weekly commission report pages, then transforms each report
 * section into a JSON file linked to the agenda, and finally commits the
 * dataset when requested.
 * @param {object} [options] - CLI options; reads `dataDir`, `fromSession`,
 *   `concurrency`, `politenessMs`, `keepDir`, `silent`, `commit` and `remote`.
 * @returns {Promise<number>} The code returned by `commitAndPushGit`.
 */
async function retrieveCommissionCRs(options = {}) {
  const dataDir = options["dataDir"];
  const fromSession = Number(options["fromSession"]);
  const concurrency = Number(options["concurrency"] ?? DEFAULT_CONCURRENCY);
  const politenessMs = Number(options["politenessMs"] ?? DEFAULT_POLITENESS_MS);
  const commissionsRootDir = path.join(dataDir, COMMISSION_FOLDER);
  const originalRoot = path.join(commissionsRootDir, DATA_ORIGINAL_FOLDER);
  const transformedRoot = path.join(commissionsRootDir, DATA_TRANSFORMED_FOLDER);

  // fs-extra's ensureDir returns a promise. ensureAndClearDir is awaited as
  // well: harmless if it is synchronous, required if it is not.
  if (options["keepDir"]) {
    await ensureDir(originalRoot);
  } else {
    await ensureAndClearDir(originalRoot);
  }

  // --- Download phase ---
  const discovered = await discoverCommissionWeeklyPages(fromSession);
  console.log(`[COM-CR][discover] ${discovered.length} links (>= session ${fromSession})`);
  const jobs = discovered.map(({ url, yyyymmdd, commissionKey }) => {
    const session = sessionStartYearFromDate(dateFromCompact(yyyymmdd));
    const dir = path.join(originalRoot, String(session), commissionKey);
    fs.ensureDirSync(dir);
    const slug = url.replace(/^.*\/(\d{8})\/([^\/]+)\.html$/i, "$2");
    const outPath = path.join(dir, `${yyyymmdd}.${slug}.html`);
    return { url, outPath, yyyymmdd, commissionKey };
  });
  console.log(`[COM-CR] Downloading ${jobs.length} links → ${path.relative(process.cwd(), originalRoot)}`);
  const { completed, saved, skipped, notFound } = await downloadJobs(jobs, concurrency, politenessMs);
  console.log(`[COM-CR] done: saved=${saved} | skipped=${skipped} | 404=${notFound} | total=${completed}`);

  // --- Transform phase ---
  if (options["keepDir"]) {
    await ensureDir(transformedRoot);
  } else {
    await ensureAndClearDir(transformedRoot);
  }
  const sessions = getSessionsFromStart(options["fromSession"]);
  for (const session of sessions) {
    const originalSessionDir = path.join(originalRoot, String(session));
    if (!(await fs.pathExists(originalSessionDir))) continue;
    // Directory names are the COMMISSION_HUBS keys used by the download phase.
    const commissionDirs = (await fs.readdir(originalSessionDir, { withFileTypes: true }))
      .filter((d) => d.isDirectory())
      .map((d) => d.name);
    for (const commissionKey of commissionDirs) {
      const commissionDir = path.join(originalSessionDir, commissionKey);
      const htmlFiles = (await fs.readdir(commissionDir)).filter((f) => /\.html?$/i.test(f)).sort();
      let totalFiles = 0;
      let linkedFiles = 0;
      for (const f of htmlFiles) {
        const htmlPath = path.join(commissionDir, f);
        let meta;
        let raw = "";
        try {
          raw = await fs.readFile(htmlPath, "utf8");
          meta = parseCommissionMetadataFromHtml(raw, f);
        } catch (e) {
          console.warn(`[COM-CR][PRE][${session}] Cannot read/parse ${f}:`, e);
          continue;
        }
        if (!meta?.days?.length) continue;
        const $ = cheerio.load(raw, { xmlMode: false });
        for (const day of meta.days) {
          const { total, linked } = await transformDay({
            $,
            day,
            fileName: f,
            htmlPath,
            commissionKey,
            dataDir,
            transformedRoot,
          });
          totalFiles += total;
          linkedFiles += linked;
        }
      }
      if (!options["silent"]) {
        console.log(`[COM-CR][SESSION ${session}][${commissionKey}] Processed ${totalFiles} CR files, linked to agenda: ${linkedFiles}`);
      }
    }
  }
  return commitAndPushGit(commissionsRootDir, options);
}

/**
 * CLI entry point: checks that a data directory was given, then runs the
 * retrieval while timing it.
 */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  console.time("CRI processing time");
  await retrieveCommissionCRs(options);
  console.timeEnd("CRI processing time");
}

main()
  .then(() => process.exit(0))
  .catch((error) => {
    console.error(error);
    process.exit(1);
  });