@tricoteuses/senat
Version:
Handle French Sénat's open data
360 lines (359 loc) • 14.7 kB
JavaScript
/**
* Needs to be run after the retrieve_agenda.ts script!
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
* - extracts XML files, distributes them by session/year
*/
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs, { ensureDirSync } from "fs-extra";
import path from "path";
import StreamZip from "node-stream-zip";
import * as cheerio from "cheerio";
import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
import { commonOptions } from "./shared/cli_helpers";
import { parseCompteRenduIntervalFromFile, sessionStartYearFromDate } from "../model/seance";
import { extractSommaireBlocks, makeReunionUid } from "../utils/reunion_parsing";
import { getSessionsFromStart } from "../types/sessions";
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
import { isNoiseBlock, scoreSommaireBlockForEvent } from "../utils/scoring";
import { parseYYYYMMDD } from "../utils/date";
import * as git from "../git.js";
// Location of the ZIP of comptes-rendus des débats (CRI) on data.senat.fr.
const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
// Flag specific to this script; it is appended to the options shared by all scripts.
const parseDebatsOption = {
  name: "parseDebats",
  type: Boolean,
  help: "parse and convert comptes-rendus des débats into JSON",
};
const optionsDefinitions = [...commonOptions, parseDebatsOption];
// Command line is parsed once, when the module is loaded.
const options = commandLineArgs(optionsDefinitions);
/**
 * Error raised when a compte-rendu resource cannot be retrieved.
 *
 * The message embeds both the URL and the underlying reason; the URL is also
 * exposed as a property so callers can inspect it without parsing the message.
 */
class CompteRenduError extends Error {
  /**
   * @param {string} message - Reason of the failure (e.g. an HTTP status code).
   * @param {string} url - URL that was being retrieved.
   */
  constructor(message, url) {
    super(`An error occurred while retrieving ${url}: ${message}`);
    // Without this, stack traces and logs show a generic "Error".
    this.name = "CompteRenduError";
    this.url = url;
  }
}
/**
 * Fetches the CRI archive from CRI_ZIP_URL and writes it to `zipPath`.
 *
 * A 404 response is only reported with a warning and nothing is written;
 * any other non-OK response raises a CompteRenduError carrying the status.
 *
 * @param {string} zipPath - Destination file for the downloaded archive.
 * @returns {Promise<void>}
 */
async function downloadCriZip(zipPath) {
  const verbose = !options["silent"];
  if (verbose) {
    console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
  }

  const response = await fetchWithRetry(CRI_ZIP_URL);
  if (!response.ok && response.status === 404) {
    console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
    return;
  }
  if (!response.ok) {
    throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
  }

  // The whole archive is held in memory before being written out.
  const payload = Buffer.from(await response.arrayBuffer());
  await fs.writeFile(zipPath, payload);

  if (verbose) {
    const sizeInMb = (payload.length / (1024 * 1024)).toFixed(1);
    console.log(`[CRI] Downloaded ${sizeInMb} MB → ${zipPath}`);
  }
}
/**
 * Extracts every `dYYYYMMDD.xml` entry of the archive into
 * `<originalRoot>/<session>/`, where the session is derived from the date
 * embedded in the file name.
 *
 * Entries that are not XML, do not follow the `dYYYYMMDD.xml` naming, or
 * carry a date that `parseYYYYMMDD` rejects are ignored.
 *
 * @param {string} zipPath - Path of the downloaded CRI archive.
 * @param {string} originalRoot - Root folder receiving the per-session folders.
 * @returns {Promise<number>} Number of XML files extracted.
 */
async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
  const zip = new StreamZip.async({ file: zipPath });
  let count = 0;
  try {
    const entries = await zip.entries();
    for (const entryName of Object.keys(entries)) {
      if (!entryName.toLowerCase().endsWith(".xml")) {
        continue;
      }
      // ex: d20231005.xml
      const base = path.basename(entryName);
      const m = base.match(/^d(\d{8})\.xml$/i);
      if (!m) {
        continue;
      }
      const dt = parseYYYYMMDD(m[1]);
      if (!dt) {
        continue;
      }
      const session = sessionStartYearFromDate(dt);
      const destDir = path.join(originalRoot, String(session));
      await fs.ensureDir(destDir);
      await zip.extract(entryName, path.join(destDir, base));
      count++;
    }
  } catch (err) {
    // Release the archive handle even on failure; a secondary close error
    // must not hide the one that actually interrupted the extraction.
    await zip.close().catch(() => { });
    throw err;
  }
  await zip.close();
  return count;
}
/**
 * Downloads the global CRI archive, redistributes its XML files into
 * per-session folders and, when requested, splits each sitting day into one
 * JSON compte-rendu per agenda "SP" event and links it to the matching
 * reunion file.
 *
 * Options read here: `keepDir` (do not wipe the original/transformed
 * folders), `fromSession` (first session to handle), `parseDebats` (run the
 * parsing stage), `only-recent` (number of days; older sitting days that
 * already have JSON output are only re-linked). `commit` and `remote` are
 * forwarded to commitAndPushGit.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - Parsed command line options.
 * @returns {Promise<void>}
 * @throws {CompteRenduError} When no archive is available on disk after the download step.
 */
export async function retrieveCriXmlDump(dataDir, options = {}) {
  const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
  ensureDirSync(root);
  const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
  const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
  if (options["keepDir"]) {
    fs.ensureDirSync(originalRoot);
    fs.ensureDirSync(transformedRoot);
  }
  else {
    // NOTE(review): called without `await`; presumably synchronous — confirm.
    ensureAndClearDir(originalRoot);
    ensureAndClearDir(transformedRoot);
  }
  const sessions = getSessionsFromStart(options["fromSession"]);

  // 1) Download the global ZIP + distribute its content by session
  const zipPath = path.join(dataDir, "cri.zip");
  console.log("[CRI] Downloading global CRI zip…");
  await downloadCriZip(zipPath);
  // downloadCriZip only warns on a 404. Fail here, before the cleanup below,
  // so that previously extracted XML files are not deleted when there is no
  // archive to replace them with.
  if (!(await fs.pathExists(zipPath))) {
    throw new CompteRenduError("no archive available on disk after download", CRI_ZIP_URL);
  }

  console.log("[CRI] Extracting + distributing XMLs by session…");
  // Remove stale XML files of the handled sessions before re-extracting.
  for (const session of sessions) {
    const dir = path.join(originalRoot, String(session));
    if (await fs.pathExists(dir)) {
      for (const f of await fs.readdir(dir)) {
        if (/\.xml$/i.test(f)) {
          await fs.remove(path.join(dir, f));
        }
      }
    }
  }
  const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
  if (n === 0) {
    console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
  }
  else {
    console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
  }
  if (!options["parseDebats"]) {
    console.log("[CRI] parseDebats not requested → done.");
    return;
  }

  // 2) Parse each sitting day, one JSON per agenda event
  const onlyRecentDays = options["only-recent"];
  for (const session of sessions) {
    const originalSessionDir = path.join(originalRoot, String(session));
    if (!(await fs.pathExists(originalSessionDir))) {
      continue;
    }
    const xmlFiles = (await fs.readdir(originalSessionDir)).filter((f) => /^d\d{8}\.xml$/i.test(f)).sort();
    const transformedSessionDir = path.join(transformedRoot, String(session));
    await fs.ensureDir(transformedSessionDir);
    // Sitting days older than the cutoff are not re-parsed when output exists.
    const cutoff = onlyRecentDays ? Date.now() - onlyRecentDays * 24 * 3600 * 1000 : null;
    for (const f of xmlFiles) {
      const yyyymmdd = f.slice(1, 9);
      const xmlPath = path.join(originalSessionDir, f);

      // === ONLY-RECENT
      if (cutoff !== null) {
        const seanceTs = Date.parse(`${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`);
        if (seanceTs < cutoff) {
          const relinked = await relinkExistingCompteRendus(dataDir, transformedSessionDir, yyyymmdd, session);
          if (relinked) {
            continue;
          }
          // No JSON exists yet for this old day: fall through and parse it.
        }
      }

      // === Load the day's SP events from the grouped agendas
      const dayEvents = await loadAgendaSpEventsForDate(dataDir, yyyymmdd, session);
      if (dayEvents.length === 0) {
        console.warn(`[CRI] [${session}] No agenda SP events found for ${yyyymmdd} → skip split/link`);
        continue;
      }

      // === Read the XML + build the DOM index (element → document order)
      let $;
      let order;
      let idx;
      try {
        const raw = await fs.readFile(xmlPath, "utf8");
        $ = cheerio.load(raw, { xml: false });
        order = $("body *").toArray();
        idx = new Map(order.map((el, i) => [el, i]));
      }
      catch (e) {
        console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
        continue;
      }

      // === Extract the table of contents + match it to agenda events
      const blocks = extractSommaireBlocks($, idx);
      const intervals = buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents);
      if (!intervals.length) {
        console.warn(`[CRI] [${session}] No confident split intervals for ${yyyymmdd} → skip`);
        continue;
      }

      // === Parse / write / link each per-event segment
      for (const iv of intervals) {
        const outName = `CRSSN${yyyymmdd}E${iv.agendaEventId}.json`;
        const outPath = path.join(transformedSessionDir, outName);
        const cr = await parseCompteRenduIntervalFromFile(xmlPath, iv.startIndex, iv.endIndex, iv.agendaEventId);
        if (!cr) {
          console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} event=${iv.agendaEventId} → skip`);
          continue;
        }
        await fs.writeJSON(outPath, cr, { spaces: 2 });
        try {
          await linkCriEventIntoAgenda(dataDir, yyyymmdd, iv.agendaEventId, cr.uid, cr, session);
        }
        catch (e) {
          console.warn(`[CR] [${session}] Could not link CR into agenda for ${yyyymmdd} event=${iv.agendaEventId}:`, e);
        }
      }
    }
  }
  commitAndPushGit(root, options);
}

/**
 * Re-links the already generated compte-rendu JSON files of one sitting day
 * to their reunion files, without re-parsing the XML.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} transformedSessionDir - Folder holding the session's JSON output.
 * @param {string} yyyymmdd - Sitting day.
 * @param {number|string} session - Session the day belongs to.
 * @returns {Promise<boolean>} True when at least one JSON file exists for that day.
 */
async function relinkExistingCompteRendus(dataDir, transformedSessionDir, yyyymmdd, session) {
  const files = await fs.readdir(transformedSessionDir);
  const dayFiles = files.filter((fn) => fn.startsWith(`CRSSN${yyyymmdd}E`) && fn.endsWith(".json"));
  for (const fn of dayFiles) {
    const eventId = fn.match(/^CRSSN(\d{8})E(.+)\.json$/)?.[2];
    if (!eventId) {
      continue;
    }
    const crPath = path.join(transformedSessionDir, fn);
    try {
      const cr = await fs.readJSON(crPath);
      await linkCriEventIntoAgenda(dataDir, yyyymmdd, eventId, cr.uid, cr, session);
    }
    catch (e) {
      console.warn(`[CR] [${session}] Could not relink existing CR into a reunion for ${yyyymmdd} event=${eventId}:`, e);
    }
  }
  return dayFiles.length > 0;
}
/**
 * Commits and pushes the dataset when `options.commit` is set.
 *
 * The status used to be computed and then dropped; it is now returned so a
 * caller can act on it. Ignoring the return value keeps the former behavior.
 *
 * @param {string} datasetDir - Directory of the dataset repository.
 * @param {object} options - Parsed options; reads `commit` and `remote`.
 * @returns {number} 10 when no commit was requested, otherwise the code
 *   returned by `git.commitAndPush` (per the original inline note:
 *   0 = some data changed, 10 = no modification — TODO confirm other values).
 */
function commitAndPushGit(datasetDir, options) {
  const NO_MODIFICATION = 10;
  if (!options.commit) {
    return NO_MODIFICATION;
  }
  return git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
}
/**
 * Records the compte-rendu UID on the reunion file of the matching "SP"
 * agenda event (`compteRenduRefUid`) and rewrites that file.
 *
 * When the reunion file is absent or unreadable, a warning is logged and
 * nothing is written.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Sitting day.
 * @param {string} agendaEventId - Identifier of the agenda event.
 * @param {string} crUid - UID of the compte-rendu to reference.
 * @param {object} cr - Compte-rendu object (not read by this function).
 * @param {number|string} session - Session the day belongs to.
 * @returns {Promise<void>}
 */
async function linkCriEventIntoAgenda(dataDir, yyyymmdd, agendaEventId, crUid, cr, session) {
  const reunionsDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  fs.ensureDirSync(reunionsDir);

  const isoDate = [yyyymmdd.slice(0, 4), yyyymmdd.slice(4, 6), yyyymmdd.slice(6, 8)].join("-");
  const reunionUid = makeReunionUid(isoDate, "SP", agendaEventId, null);
  const reunionPath = path.join(reunionsDir, `${reunionUid}.json`);

  let reunion = null;
  if (await fs.pathExists(reunionPath)) {
    try {
      reunion = await fs.readJSON(reunionPath);
    }
    catch (e) {
      // `reunion` stays null, so the "missing" warning below is emitted too.
      console.warn(`[CR] unreadable reunion JSON → ${reunionPath} (${e})`);
    }
  }
  if (!reunion) {
    console.warn(`[CR] Missing reunion file for SP event=${agendaEventId}: ${reunionPath}`);
    return;
  }

  reunion.compteRenduRefUid = crUid;
  await fs.writeJSON(reunionPath, reunion, { spaces: 2 });
  console.log(`[CR] Linked CR ${crUid} → ${path.basename(reunionPath)} (event=${agendaEventId})`);
}
/**
 * Turns the table-of-contents blocks of a sitting day into one DOM interval
 * per agenda event.
 *
 * Each non-noise block is scored against every event; it becomes a start
 * marker for its best event only when that score reaches MIN_SCORE and beats
 * the runner-up by at least MIN_GAP. Only the earliest marker of each event
 * is kept, and every interval runs up to the element preceding the next
 * marker (or to the last element of `order`).
 *
 * @param {Function} $ - Cheerio root of the document.
 * @param {Map} idx - Maps a DOM element to its position in `order`.
 * @param {Array} order - Elements of the document, in document order.
 * @param {Array} blocks - Blocks exposing `text`, `startIndex` and `targetId`.
 * @param {Array} dayEvents - Agenda events of the day (their `id` is used).
 * @returns {Array<{agendaEventId: *, startIndex: number, endIndex: number, score: number}>}
 */
function buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents) {
  const MIN_SCORE = 0.65;
  const MIN_GAP = 0.08;

  const leadSpeaker = $("div.intervenant").first()[0];
  const leadSpeakerPos = leadSpeaker ? (idx.get(leadSpeaker) ?? null) : null;

  const markers = [];
  for (const block of blocks) {
    if (isNoiseBlock(block.text)) {
      continue;
    }

    // Track the best score and the runner-up across all events.
    let top = null;
    let runnerUp = 0;
    for (const event of dayEvents) {
      const score = scoreSommaireBlockForEvent(block.text, event);
      if (top === null || score > top.score) {
        runnerUp = top?.score ?? runnerUp;
        top = { ev: event, score };
      }
      else if (score > runnerUp) {
        runnerUp = score;
      }
    }
    if (top === null) {
      continue;
    }

    const anchoredPos = resolveTargetIndex($, idx, block.targetId);
    const startsAt = anchoredPos ?? block.startIndex;
    // A block without a resolved anchor that sits before the first speaker
    // is discarded.
    const isUnanchoredPreamble = anchoredPos == null && leadSpeakerPos != null && startsAt < leadSpeakerPos;
    if (isUnanchoredPreamble) {
      continue;
    }

    const isTooWeak = top.score < MIN_SCORE;
    const isAmbiguous = top.score - runnerUp < MIN_GAP;
    if (isTooWeak || isAmbiguous) {
      continue;
    }

    markers.push({ agendaEventId: top.ev.id, startIndex: startsAt, score: top.score });
  }

  // One marker per event: the earliest one wins.
  markers.sort((left, right) => left.startIndex - right.startIndex);
  const seenEvents = new Set();
  const starts = markers.filter(({ agendaEventId }) => {
    if (seenEvents.has(agendaEventId)) {
      return false;
    }
    seenEvents.add(agendaEventId);
    return true;
  });

  // Each interval stops right before the following marker.
  const lastPos = order.length - 1;
  return starts.map((marker, position) => {
    const following = starts[position + 1];
    return {
      agendaEventId: marker.agendaEventId,
      startIndex: marker.startIndex,
      endIndex: following ? following.startIndex - 1 : lastPos,
      score: marker.score,
    };
  });
}
/**
 * Collects the "Séance publique" events of a sitting day from the session's
 * transformed agenda files (`RUSN<yyyymmdd>IDS*.json`).
 *
 * Only the first event of each file is inspected. An unreadable file is
 * reported with a warning and skipped, so one corrupted file does not hide
 * the other events of the day.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Sitting day.
 * @param {number|string} session - Session the day belongs to.
 * @returns {Promise<Array<object>>} Matching events; empty when the session folder does not exist.
 */
async function loadAgendaSpEventsForDate(dataDir, yyyymmdd, session) {
  const agendasDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  if (!(await fs.pathExists(agendasDir))) {
    return [];
  }
  const prefix = `RUSN${yyyymmdd}IDS`;
  const files = (await fs.readdir(agendasDir)).filter((fn) => fn.startsWith(prefix) && fn.endsWith(".json"));
  const events = [];
  for (const fn of files) {
    try {
      const g = await fs.readJSON(path.join(agendasDir, fn));
      const e = g?.events?.[0];
      if (e && e.type === "Séance publique") {
        events.push(e);
      }
    }
    catch (err) {
      // Best effort: keep going, but leave a trace instead of failing silently.
      console.warn(`[CRI] [${session}] Unreadable agenda file ${fn} → ignored:`, err);
    }
  }
  return events;
}
/**
 * Escapes backslashes and double quotes so the value can be embedded in a
 * double-quoted attribute selector such as `[id="…"]`.
 *
 * @param {string} s - Raw attribute value.
 * @returns {string} The value with `\` and `"` each prefixed by a backslash.
 */
function cssEscapeIdent(s) {
  // Single pass: every backslash or double quote gets a leading backslash.
  return s.replace(/[\\"]/g, (ch) => `\\${ch}`);
}
/**
 * Finds the position of the element referenced by `targetId`.
 *
 * The element is looked up by `id` first, then by `name`.
 *
 * @param {Function} $ - Cheerio root of the document.
 * @param {Map} idx - Maps a DOM element to its position.
 * @param {string} [targetId] - Anchor to resolve.
 * @returns {number|null} The position, or null when `targetId` is empty,
 *   no element matches, or the matched element is not in `idx`.
 */
function resolveTargetIndex($, idx, targetId) {
  if (!targetId) {
    return null;
  }
  const quoted = cssEscapeIdent(targetId);
  for (const attribute of ["id", "name"]) {
    const hit = $(`[${attribute}="${quoted}"]`)[0];
    if (hit) {
      return idx.get(hit) ?? null;
    }
  }
  return null;
}
/**
 * Script entry point: checks that a data directory was given on the command
 * line, then runs the CRI retrieval while timing it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");

  const timerLabel = "CRI processing time";
  console.time(timerLabel);
  await retrieveCriXmlDump(dataDir, options);
  console.timeEnd(timerLabel);
}
// Run the script and turn its outcome into the process exit code:
// 0 on success, 1 (after printing the error) on failure.
const exitOnSuccess = () => process.exit(0);
const exitOnFailure = (error) => {
  console.error(error);
  process.exit(1);
};
main().then(exitOnSuccess).catch(exitOnFailure);