UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

360 lines (359 loc) 14.7 kB
/**
 * Must be run after the retrieve_agenda.ts script!
 * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
 * - extracts XML files, distributes them by session/year
 */
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs, { ensureDirSync } from "fs-extra";
import path from "path";
import StreamZip from "node-stream-zip";
import * as cheerio from "cheerio";
import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
import { commonOptions } from "./shared/cli_helpers";
import { parseCompteRenduIntervalFromFile, sessionStartYearFromDate } from "../model/seance";
import { extractSommaireBlocks, makeReunionUid } from "../utils/reunion_parsing";
import { getSessionsFromStart } from "../types/sessions";
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
import { isNoiseBlock, scoreSommaireBlockForEvent } from "../utils/scoring";
import { parseYYYYMMDD } from "../utils/date";
import * as git from "../git.js";

const optionsDefinitions = [
  ...commonOptions,
  {
    help: "parse and convert comptes-rendus des débats into JSON",
    name: "parseDebats",
    type: Boolean,
  },
];
const options = commandLineArgs(optionsDefinitions);

const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";

/** Error raised when the CRI archive cannot be retrieved. */
class CompteRenduError extends Error {
  /**
   * @param {string} message - Short failure description (here: the HTTP status).
   * @param {string} url - URL that was being retrieved.
   */
  constructor(message, url) {
    super(`An error occurred while retrieving ${url}: ${message}`);
    this.name = "CompteRenduError";
  }
}

/**
 * Downloads the global CRI zip and writes it to `zipPath`.
 *
 * A 404 response is treated as non-fatal: a warning is printed and nothing is
 * written. Any other non-OK status throws.
 *
 * @param {string} zipPath - Destination path of the archive.
 * @returns {Promise<void>}
 * @throws {CompteRenduError} On a non-OK, non-404 HTTP response.
 */
async function downloadCriZip(zipPath) {
  if (!options["silent"]) console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
  const response = await fetchWithRetry(CRI_ZIP_URL);
  if (!response.ok) {
    if (response.status === 404) {
      console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
      return;
    }
    throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
  }
  const buf = Buffer.from(await response.arrayBuffer());
  await fs.writeFile(zipPath, buf);
  if (!options["silent"]) {
    const mb = (buf.length / (1024 * 1024)).toFixed(1);
    console.log(`[CRI] Downloaded ${mb} MB → ${zipPath}`);
  }
}

/**
 * Extracts every `dYYYYMMDD.xml` entry of the archive into
 * `<originalRoot>/<session>/`, where the session is derived from the date in
 * the file name via `sessionStartYearFromDate`.
 *
 * @param {string} zipPath - Path of the CRI archive.
 * @param {string} originalRoot - Root folder receiving the per-session folders.
 * @returns {Promise<number>} Number of XML files extracted.
 */
async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
  const zip = new StreamZip.async({ file: zipPath });
  let count = 0;
  try {
    const entries = await zip.entries();
    for (const entryName of Object.keys(entries)) {
      if (!entryName.toLowerCase().endsWith(".xml")) continue;
      // ex: d20231005.xml
      const base = path.basename(entryName);
      const m = base.match(/^d(\d{8})\.xml$/i);
      if (!m) continue;
      const dt = parseYYYYMMDD(m[1]);
      if (!dt) continue;
      const session = sessionStartYearFromDate(dt);
      const destDir = path.join(originalRoot, String(session));
      await fs.ensureDir(destDir);
      await zip.extract(entryName, path.join(destDir, base));
      count++;
    }
  } finally {
    // Always release the archive handle, even when an extraction fails.
    await zip.close();
  }
  return count;
}

/**
 * Converts a `YYYYMMDD` string into `YYYY-MM-DD`.
 *
 * @param {string} yyyymmdd
 * @returns {string}
 */
function toIsoDate(yyyymmdd) {
  return `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
}

/**
 * Removes the `*.xml` files found directly in each existing session folder,
 * so that a fresh extraction does not mix with files from a previous run.
 *
 * @param {string} originalRoot - Root folder holding the per-session folders.
 * @param {Iterable<number|string>} sessions - Sessions to clean.
 * @returns {Promise<void>}
 */
async function removeSessionXmlFiles(originalRoot, sessions) {
  for (const session of sessions) {
    const dir = path.join(originalRoot, String(session));
    if (!(await fs.pathExists(dir))) continue;
    for (const f of await fs.readdir(dir)) {
      if (/\.xml$/i.test(f)) await fs.remove(path.join(dir, f));
    }
  }
}

/**
 * Re-links the already transformed comptes-rendus of a given day
 * (`CRSSN<date>E<eventId>.json`) into their agenda reunion files.
 *
 * @param {string} dataDir
 * @param {number|string} session
 * @param {string} transformedSessionDir - Folder holding the transformed JSON files.
 * @param {string} yyyymmdd
 * @returns {Promise<boolean>} `true` when at least one transformed file exists
 *   for that day (the caller then skips re-parsing the XML).
 */
async function relinkExistingCompteRendus(dataDir, session, transformedSessionDir, yyyymmdd) {
  const files = await fs.readdir(transformedSessionDir);
  const dayFiles = files.filter((fn) => fn.startsWith(`CRSSN${yyyymmdd}E`) && fn.endsWith(".json"));
  for (const fn of dayFiles) {
    const match = fn.match(/^CRSSN(\d{8})E(.+)\.json$/);
    const eventId = match?.[2];
    if (!eventId) continue;
    const crPath = path.join(transformedSessionDir, fn);
    try {
      const cr = await fs.readJSON(crPath);
      await linkCriEventIntoAgenda(dataDir, yyyymmdd, eventId, cr.uid, cr, session);
    } catch (e) {
      console.warn(`[CR] [${session}] Could not relink existing CR into a reunion for ${yyyymmdd} event=${eventId}:`, e);
    }
  }
  return dayFiles.length > 0;
}

/**
 * Splits one daily CRI XML file into per-agenda-event segments, writes each
 * segment as JSON and links it into the matching agenda reunion file.
 * Every failure mode is reported with a warning and skips the file/segment.
 *
 * @param {string} dataDir
 * @param {number|string} session
 * @param {string} originalSessionDir - Folder containing the XML file.
 * @param {string} transformedSessionDir - Folder receiving the JSON output (must exist).
 * @param {string} f - XML file name, shaped like `dYYYYMMDD.xml`.
 * @returns {Promise<void>}
 */
async function splitAndLinkSeance(dataDir, session, originalSessionDir, transformedSessionDir, f) {
  const yyyymmdd = f.slice(1, 9);
  const xmlPath = path.join(originalSessionDir, f);

  // === Load the day's "séance publique" events from the grouped agendas ===
  const dayEvents = await loadAgendaSpEventsForDate(dataDir, yyyymmdd, session);
  if (dayEvents.length === 0) {
    console.warn(`[CRI] [${session}] No agenda SP events found for ${yyyymmdd} → skip split/link`);
    return;
  }

  // === Read the XML and build a DOM index (element → document-order position) ===
  let $;
  let order;
  let idx;
  try {
    const raw = await fs.readFile(xmlPath, "utf8");
    $ = cheerio.load(raw, { xml: false });
    order = $("body *").toArray();
    idx = new Map(order.map((el, i) => [el, i]));
  } catch (e) {
    console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
    return;
  }

  // === Extract the table of contents and match its blocks to agenda events ===
  const blocks = extractSommaireBlocks($, idx);
  const intervals = buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents);
  if (!intervals.length) {
    console.warn(`[CRI] [${session}] No confident split intervals for ${yyyymmdd} → skip`);
    return;
  }

  // === Parse / write / link each per-event segment ===
  for (const iv of intervals) {
    const outName = `CRSSN${yyyymmdd}E${iv.agendaEventId}.json`;
    const outPath = path.join(transformedSessionDir, outName);
    const cr = await parseCompteRenduIntervalFromFile(xmlPath, iv.startIndex, iv.endIndex, iv.agendaEventId);
    if (!cr) {
      console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} event=${iv.agendaEventId} → skip`);
      continue;
    }
    await fs.writeJSON(outPath, cr, { spaces: 2 });
    try {
      await linkCriEventIntoAgenda(dataDir, yyyymmdd, iv.agendaEventId, cr.uid, cr, session);
    } catch (e) {
      console.warn(`[CR] [${session}] Could not link CR into agenda for ${yyyymmdd} event=${iv.agendaEventId}:`, e);
    }
  }
}

/**
 * Downloads the CRI archive, distributes its XML files by session and, when
 * `options.parseDebats` is set, converts them to JSON and links them into the
 * agenda reunion files.
 *
 * Options read here: `keepDir`, `fromSession`, `parseDebats`, `only-recent`
 * (a number of days), plus `commit` / `remote` for the final git step.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - Parsed command-line options.
 * @returns {Promise<void>}
 */
export async function retrieveCriXmlDump(dataDir, options = {}) {
  const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
  ensureDirSync(root);
  const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
  const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
  for (const dir of [originalRoot, transformedRoot]) {
    if (options["keepDir"]) {
      fs.ensureDirSync(dir);
    } else {
      ensureAndClearDir(dir);
    }
  }
  const sessions = getSessionsFromStart(options["fromSession"]);

  // 1) Download the global ZIP + distribute its XML files by session
  const zipPath = path.join(dataDir, "cri.zip");
  console.log("[CRI] Downloading global CRI zip…");
  await downloadCriZip(zipPath);
  if (await fs.pathExists(zipPath)) {
    console.log("[CRI] Extracting + distributing XMLs by session…");
    await removeSessionXmlFiles(originalRoot, sessions);
    const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
    if (n === 0) {
      console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
    } else {
      console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
    }
  } else {
    // The download reported "not found" and no earlier archive is on disk:
    // keep whatever XML files already exist instead of deleting them and
    // then failing to open a missing archive.
    console.warn(`[CRI] No CRI zip available at ${zipPath} → skipping extraction.`);
  }

  if (!options["parseDebats"]) {
    console.log("[CRI] parseDebats not requested → done.");
    return;
  }

  // 2) Parse each session's XML files
  for (const session of sessions) {
    const originalSessionDir = path.join(originalRoot, String(session));
    if (!(await fs.pathExists(originalSessionDir))) {
      continue;
    }
    const xmlFiles = (await fs.readdir(originalSessionDir)).filter((f) => /^d\d{8}\.xml$/i.test(f)).sort();
    const transformedSessionDir = path.join(transformedRoot, String(session));
    await fs.ensureDir(transformedSessionDir);

    // With --only-recent, days older than the cutoff that already have
    // transformed output are only re-linked, not re-parsed.
    const recentDays = options["only-recent"];
    const cutoff = recentDays ? Date.now() - recentDays * 24 * 3600 * 1000 : null;

    for (const f of xmlFiles) {
      if (cutoff !== null) {
        const yyyymmdd = f.slice(1, 9);
        const seanceTs = Date.parse(toIsoDate(yyyymmdd));
        if (seanceTs < cutoff && (await relinkExistingCompteRendus(dataDir, session, transformedSessionDir, yyyymmdd))) {
          continue;
        }
      }
      await splitAndLinkSeance(dataDir, session, originalSessionDir, transformedSessionDir, f);
    }
  }

  const debatsDir = path.join(dataDir, COMPTES_RENDUS_FOLDER);
  commitAndPushGit(debatsDir, options);
}

/**
 * Commits and pushes the dataset when `options.commit` is set.
 *
 * @param {string} datasetDir - Directory passed to `git.commitAndPush`.
 * @param {object} options - Reads `commit` and `remote`.
 * @returns {number} 10 when no commit was requested; otherwise the code
 *   returned by `git.commitAndPush` (per the original inline note:
 *   0 = some data changed, 10 = no modification — other values presumably
 *   signal an error; confirm against `../git.js`).
 */
function commitAndPushGit(datasetDir, options) {
  if (!options.commit) return 10;
  return git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
}

/**
 * Records the compte-rendu UID on the agenda reunion file of a "séance
 * publique" event (`compteRenduRefUid`). Warns and does nothing when the
 * reunion file is missing or unreadable.
 *
 * @param {string} dataDir
 * @param {string} yyyymmdd
 * @param {string} agendaEventId
 * @param {string} crUid - UID of the compte-rendu to reference.
 * @param {object} cr - Compte-rendu object (currently unused here).
 * @param {number|string} session
 * @returns {Promise<void>}
 */
async function linkCriEventIntoAgenda(dataDir, yyyymmdd, agendaEventId, crUid, cr, session) {
  const agendaDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  fs.ensureDirSync(agendaDir);
  const agendaUid = makeReunionUid(toIsoDate(yyyymmdd), "SP", agendaEventId, null);
  const agendaPath = path.join(agendaDir, `${agendaUid}.json`);

  let agenda = null;
  if (await fs.pathExists(agendaPath)) {
    try {
      agenda = await fs.readJSON(agendaPath);
    } catch (e) {
      console.warn(`[CR] unreadable reunion JSON → ${agendaPath} (${e})`);
      agenda = null;
    }
  }
  if (!agenda) {
    console.warn(`[CR] Missing reunion file for SP event=${agendaEventId}: ${agendaPath}`);
    return;
  }

  agenda.compteRenduRefUid = crUid;
  await fs.writeJSON(agendaPath, agenda, { spaces: 2 });
  console.log(`[CR] Linked CR ${crUid} → ${path.basename(agendaPath)} (event=${agendaEventId})`);
}

/**
 * Matches table-of-contents blocks to agenda events and turns the confident
 * matches into contiguous `[startIndex, endIndex]` intervals over `order`.
 *
 * A block is kept only when its best event score is at least `MIN_SCORE` and
 * beats the runner-up by at least `MIN_GAP`.
 *
 * @param {Function} $ - Cheerio root of the loaded document.
 * @param {Map<object, number>} idx - Element → position in `order`.
 * @param {object[]} order - Elements of `body` in document order.
 * @param {object[]} blocks - Blocks exposing `text`, `targetId`, `startIndex`.
 * @param {object[]} dayEvents - Agenda events of the day (each has an `id`).
 * @returns {{agendaEventId: *, startIndex: number, endIndex: number, score: number}[]}
 */
function buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents) {
  const MIN_SCORE = 0.65;
  const MIN_GAP = 0.08;

  const firstIntervenant = $("div.intervenant").first()[0];
  const firstIntervenantIdx = firstIntervenant ? (idx.get(firstIntervenant) ?? null) : null;

  const pivots = [];
  for (const b of blocks) {
    if (isNoiseBlock(b.text)) continue;

    // Track the best and second-best scoring events for this block.
    let best = null;
    let second = 0;
    for (const ev of dayEvents) {
      const s = scoreSommaireBlockForEvent(b.text, ev);
      if (!best || s > best.score) {
        second = best?.score ?? second;
        best = { ev, score: s };
      } else if (s > second) {
        second = s;
      }
    }
    if (!best) continue;

    const resolved = resolveTargetIndex($, idx, b.targetId);
    const contentStartIndex = resolved ?? b.startIndex;
    // Drop blocks whose anchor could not be resolved and whose fallback
    // position lies before the first speaker element.
    if (firstIntervenantIdx != null && contentStartIndex < firstIntervenantIdx && resolved == null) {
      continue;
    }
    if (best.score < MIN_SCORE) continue;
    if (best.score - second < MIN_GAP) continue;

    pivots.push({
      agendaEventId: best.ev.id,
      startIndex: contentStartIndex,
      score: best.score,
    });
  }
  if (pivots.length === 0) return [];

  // De-duplicate per event (keep the earliest startIndex)
  const byEvent = new Map();
  for (const p of pivots.sort((a, b) => a.startIndex - b.startIndex)) {
    if (!byEvent.has(p.agendaEventId)) {
      byEvent.set(p.agendaEventId, {
        startIndex: p.startIndex,
        score: p.score,
      });
    }
  }
  const sorted = Array.from(byEvent.entries())
    .map(([agendaEventId, v]) => ({
      agendaEventId,
      startIndex: v.startIndex,
      score: v.score,
    }))
    .sort((a, b) => a.startIndex - b.startIndex);

  // Build the intervals: each one ends right before the next one starts,
  // the last one runs to the end of the document.
  const intervals = [];
  for (let i = 0; i < sorted.length; i++) {
    const cur = sorted[i];
    const next = sorted[i + 1];
    const endIndex = next ? next.startIndex - 1 : order.length - 1;
    intervals.push({
      agendaEventId: cur.agendaEventId,
      startIndex: cur.startIndex,
      endIndex,
      score: cur.score,
    });
  }
  return intervals;
}

/**
 * Loads, from the transformed agenda files of a session, the first event of
 * every `RUSN<date>IDS*.json` file whose type is "Séance publique".
 * Unreadable files are reported with a warning and skipped.
 *
 * @param {string} dataDir
 * @param {string} yyyymmdd
 * @param {number|string} session
 * @returns {Promise<object[]>} Matching events (empty when the folder is missing).
 */
async function loadAgendaSpEventsForDate(dataDir, yyyymmdd, session) {
  const agendasDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  if (!(await fs.pathExists(agendasDir))) return [];

  const files = (await fs.readdir(agendasDir)).filter((fn) => fn.startsWith(`RUSN${yyyymmdd}IDS`) && fn.endsWith(".json"));
  const events = [];
  for (const fn of files) {
    try {
      const g = await fs.readJSON(path.join(agendasDir, fn));
      const e = g?.events?.[0];
      if (e && e.type === "Séance publique") events.push(e);
    } catch (err) {
      // Best effort: one broken agenda file must not abort the whole day.
      console.warn(`[CRI] [${session}] Cannot read agenda file ${fn}:`, err);
    }
  }
  return events;
}

/**
 * Escapes backslashes and double quotes so the value can be embedded in a
 * double-quoted CSS attribute selector.
 *
 * @param {string} s
 * @returns {string}
 */
function cssEscapeIdent(s) {
  return s.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
}

/**
 * Resolves an anchor id to the document-order index of the element carrying
 * it as `id` (preferred) or `name`.
 *
 * @param {Function} $ - Cheerio root of the loaded document.
 * @param {Map<object, number>} idx - Element → document-order index.
 * @param {string|null|undefined} targetId
 * @returns {number|null} The index, or `null` when nothing matches.
 */
function resolveTargetIndex($, idx, targetId) {
  if (!targetId) return null;
  const safe = cssEscapeIdent(targetId);
  const el = $(`[id="${safe}"]`)[0] || $(`[name="${safe}"]`)[0];
  if (!el) return null;
  const i = idx.get(el);
  return i == null ? null : i;
}

/** Script entry point: validates the CLI arguments and runs the retrieval. */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  console.time("CRI processing time");
  await retrieveCriXmlDump(dataDir, options);
  console.timeEnd("CRI processing time");
}

main()
  .then(() => process.exit(0))
  .catch((error) => {
    console.error(error);
    process.exit(1);
  });