@tricoteuses/senat
Version:
Handle French Sénat's open data
377 lines (376 loc) • 17.1 kB
JavaScript
import fs, { ensureDir } from "fs-extra";
import assert from "assert";
import path from "path";
import * as cheerio from "cheerio";
import { COMMISSION_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
import { loadAgendaForDate, parseCommissionMetadataFromHtml, linkCRtoCommissionGroup } from "../utils/cr_spliting";
import { cleanTitle, extractDayH3Sections, parseCommissionCRSectionFromDom } from "../model/commission";
import commandLineArgs from "command-line-args";
import { commonOptions } from "./shared/cli_helpers";
import { sessionStartYearFromDate } from "../model/seance";
import { getSessionsFromStart } from "../types/sessions";
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
import { jaccard, jaccardTokenSim } from "../utils/scoring";
import * as git from "../git.js";
/**
 * Error raised when a commission compte-rendu page cannot be downloaded.
 *
 * The message format is unchanged; the instance additionally carries a
 * proper `name` (so logs and stack traces do not show a bare "Error")
 * and the failing `url` for programmatic inspection.
 */
class CommissionCRDownloadError extends Error {
  /**
   * @param {string} message - Failure detail (the caller in this file passes the HTTP status as a string).
   * @param {string} url - URL whose retrieval failed.
   */
  constructor(message, url) {
    super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
    this.name = "CommissionCRDownloadError";
    this.url = url;
  }
}
// Options specific to this script; they are appended after the shared CLI options.
const commissionCrOptions = [
  { name: "concurrency", type: Number, defaultValue: 6, help: "Max parallel downloads" },
  { name: "politenessMs", type: Number, defaultValue: 150, help: "Delay per worker (ms)" },
  { name: "parseDebats", type: Boolean, help: "parse and convert comptes-rendus des débats into JSON" },
];
// Full definition list handed to the command-line parser.
const optionsDefinitions = [...commonOptions, ...commissionCrOptions];
// Parsed command-line options, read by the rest of the script.
const options = commandLineArgs(optionsDefinitions);
/**
 * Hub pages to crawl for each commission, keyed by commission label.
 * Each entry lists the current hub page followed by its archive page.
 * The keys are also used verbatim as directory names when pages are saved,
 * so their spelling must stay stable.
 */
const COMMISSION_HUBS = Object.fromEntries([
  [
    "Commission des affaires étrangères",
    [
      "https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres.html",
      "https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres_archives.html",
    ],
  ],
  [
    "Commission des affaires économiques",
    [
      "https://www.senat.fr/compte-rendu-commissions/economie.html",
      "https://www.senat.fr/compte-rendu-commissions/economie_archives.html",
    ],
  ],
  [
    "Commission de l'amenagement du territoire et du développement durable",
    [
      "https://www.senat.fr/compte-rendu-commissions/developpement-durable.html",
      "https://www.senat.fr/compte-rendu-commissions/developpement-durable_archives.html",
    ],
  ],
  [
    "Commission de la culture",
    [
      "https://www.senat.fr/compte-rendu-commissions/culture.html",
      "https://www.senat.fr/compte-rendu-commissions/culture_archives.html",
    ],
  ],
  [
    "Commission des finances",
    [
      "https://www.senat.fr/compte-rendu-commissions/finances.html",
      "https://www.senat.fr/compte-rendu-commissions/finances_archives.html",
    ],
  ],
  [
    "Commission des lois",
    [
      "https://www.senat.fr/compte-rendu-commissions/lois.html",
      "https://www.senat.fr/compte-rendu-commissions/lois_archives.html",
    ],
  ],
  [
    "Commission des affaires sociales",
    [
      "https://www.senat.fr/compte-rendu-commissions/affaires-sociales.html",
      "https://www.senat.fr/compte-rendu-commissions/affaires-sociales_archives.html",
    ],
  ],
  [
    "Commission des affaires européennes",
    [
      "https://www.senat.fr/compte-rendu-commissions/affaires-europeennes.html",
      "https://www.senat.fr/compte-rendu-commissions/affaires-europeennes_archives.html",
    ],
  ],
]);
/**
 * Fetches one commission hub page and collects the weekly compte-rendu links it contains.
 *
 * Only anchors whose href ends with `/compte-rendu-commissions/<8 digits>/<slug>.html`
 * are kept. Hrefs that do not start with "http" are resolved against `hubUrl`.
 *
 * @param {string} hubUrl - URL of the hub page to scan.
 * @returns {Promise<string[]>} Deduplicated link URLs in the order they were encountered;
 *   an empty array when the hub response is not OK.
 */
async function harvestWeeklyLinksFromHub(hubUrl) {
  const res = await fetchWithRetry(hubUrl);
  if (!res.ok) {
    // A non-OK hub used to be skipped silently, which hid missing data; report it
    // with the same tag the caller uses for thrown hub failures.
    // NOTE(review): assumes fetchWithRetry resolves to a fetch-style Response exposing `status`.
    console.warn(`[COM-CR][hub-fail] ${hubUrl} → HTTP ${res.status}`);
    return [];
  }
  const html = await res.text();
  const $ = cheerio.load(html);
  const weeklyLinkPattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
  // A Set both deduplicates and preserves first-seen order.
  const links = new Set();
  $("a[href]").each((_, anchor) => {
    const href = ($(anchor).attr("href") || "").trim();
    if (!weeklyLinkPattern.test(href)) {
      return;
    }
    links.add(href.startsWith("http") ? href : new URL(href, hubUrl).toString());
  });
  return [...links];
}
/**
 * Crawls every hub listed in COMMISSION_HUBS and returns the weekly pages whose
 * session year is not below `fromSession`.
 *
 * The session year is derived from the date in the URL: months 10-12 belong to
 * the session starting that year, earlier months to the one starting the year before.
 * A hub that throws is logged and skipped; the remaining hubs are still crawled.
 *
 * @param {number} fromSession - Earliest session start year to keep.
 * @returns {Promise<Array<{url: string, yyyymmdd: string, commissionKey: string}>>}
 *   Matching pages sorted by their `yyyymmdd` date string.
 */
async function discoverCommissionWeeklyPages(fromSession) {
  const weeklyUrlPattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
  const pages = [];
  for (const [commissionKey, hubUrls] of Object.entries(COMMISSION_HUBS)) {
    for (const hubUrl of hubUrls) {
      try {
        const weeklyUrls = await harvestWeeklyLinksFromHub(hubUrl);
        for (const url of weeklyUrls) {
          const match = url.match(weeklyUrlPattern);
          if (!match) {
            continue;
          }
          const [, yyyymmdd] = match;
          const year = Number(yyyymmdd.slice(0, 4));
          const month = Number(yyyymmdd.slice(4, 6));
          // Pages dated before October count towards the session that began the previous year.
          const session = year - (month >= 10 ? 0 : 1);
          if (session < fromSession) {
            continue;
          }
          pages.push({ url, yyyymmdd, commissionKey });
        }
      } catch (e) {
        console.warn(`[COM-CR][hub-fail] ${hubUrl} → ${e?.message ?? e}`);
      }
    }
  }
  pages.sort((left, right) => left.yyyymmdd.localeCompare(right.yyyymmdd));
  return pages;
}
/**
 * Commits and pushes the dataset directory when the `commit` option is set.
 *
 * The previous version computed an exit code and then dropped it; the code is
 * now returned so callers can act on it (existing callers that ignore the
 * return value are unaffected).
 *
 * @param {string} datasetDir - Directory handed to `git.commitAndPush`.
 * @param {object} options - Parsed CLI options; reads `commit` and `remote`.
 * @returns {number} 10 when committing is disabled; otherwise whatever
 *   `git.commitAndPush` returned (per the original note: 0 = some data changed,
 *   10 = no modification — other values are presumably error codes, to be
 *   confirmed against the git helper).
 */
function commitAndPushGit(datasetDir, options) {
  const NO_MODIFICATION = 10;
  if (!options.commit) {
    return NO_MODIFICATION;
  }
  // With a single commit attempt, the original merge logic always ended up
  // equal to the helper's own result, so it is returned directly.
  return git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
}
/**
 * Converts an "HH:MM" time string into its compact "HHMM" form.
 *
 * @param {string|null|undefined} hhmm - Time string; must be exactly two digits, a colon, two digits.
 * @returns {string|null} The four-digit string, or null when the input is empty or malformed.
 */
function toHourShort(hhmm) {
  if (!hhmm) {
    return null;
  }
  const parts = hhmm.match(/^(\d{2}):(\d{2})$/);
  if (!parts) {
    return null;
  }
  const [, hours, minutes] = parts;
  return `${hours}${minutes}`;
}
/**
 * Converts an "H:M"-style time string into a number of minutes.
 * Missing or non-numeric hour/minute parts count as 0.
 *
 * @param {string} hhmm - Colon-separated time string.
 * @returns {number} hours * 60 + minutes.
 */
function timeToMinutes(hhmm) {
  const fields = hhmm.split(":");
  const hours = parseInt(fields[0], 10) || 0;
  const minutes = parseInt(fields[1], 10) || 0;
  return hours * 60 + minutes;
}
/**
 * Downloads a URL (following redirects) and returns its body as a Buffer.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<Buffer|null>} The response body, or null when the server answers 404.
 * @throws {CommissionCRDownloadError} For any other non-OK response; the message carries the status code.
 */
async function tryDownload(url) {
  const response = await fetch(url, { redirect: "follow" });
  // A missing page is an expected outcome, reported as null rather than as an error.
  if (response.status === 404) {
    return null;
  }
  if (response.ok) {
    const body = await response.arrayBuffer();
    return Buffer.from(body);
  }
  throw new CommissionCRDownloadError(String(response.status), url);
}
/**
 * Normalizes an organ/commission label for comparison: lower-cases it, strips
 * diacritics, spells "&" as " et ", turns every character other than a-z, 0-9,
 * whitespace and "-" into a space, then collapses and trims whitespace.
 *
 * @param {string} s - Raw label.
 * @returns {string} Normalized label.
 */
function normOrgane(s) {
  // NFD splits accented letters into base letter + combining mark, which is then dropped.
  const unaccented = s.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  const rewrites = [
    [/&/g, " et "],
    [/[^a-z0-9\s-]/g, " "],
    [/\s+/g, " "],
  ];
  let text = unaccented;
  for (const [pattern, replacement] of rewrites) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
/**
 * Splits a label into its set of significant tokens: the label is normalized
 * with normOrgane, split on whitespace, and tokens shorter than three
 * characters or listed as stop words are discarded.
 *
 * @param {string} s - Raw label.
 * @returns {Set<string>} Distinct significant tokens.
 */
function toTokens(s) {
  const stopWords = ["commission", "des", "de", "du", "d", "la", "le", "les", "et"];
  const tokens = new Set();
  for (const word of normOrgane(s).split(/\s+/)) {
    if (word.length < 3 || stopWords.includes(word)) {
      continue;
    }
    tokens.add(word);
  }
  return tokens;
}
/**
 * Lists the distinct normalized labels under which a meeting's organ may appear.
 * Reads `organeSlug`, `organeKey`, `organe` and `titre` from the meeting,
 * ignoring the empty ones.
 *
 * @param {object} h - Agenda meeting record.
 * @returns {string[]} Unique normalized labels, in field order.
 */
function reunionOrganeCandidates(h) {
  const rawLabels = [h.organeSlug, h.organeKey, h.organe, h.titre];
  const normalized = new Set();
  for (const label of rawLabels) {
    if (label) {
      normalized.add(normOrgane(label));
    }
  }
  return [...normalized];
}
/**
 * Scores how well a meeting's organ matches a commission key: the best
 * `jaccard` score between the key's tokens (hyphens treated as spaces) and the
 * tokens of each candidate label of the meeting. Returns 0 when the meeting
 * has no candidate label.
 *
 * @param {object} h - Agenda meeting record.
 * @param {string} commissionKey - Commission label or slug.
 * @returns {number} Highest similarity found.
 */
function organeSimilarity(h, commissionKey) {
  const keyTokens = toTokens(commissionKey.replace(/-/g, " "));
  return reunionOrganeCandidates(h)
    .map((label) => toTokens(label))
    .reduce((highest, labelTokens) => Math.max(highest, jaccard(keyTokens, labelTokens)), 0);
}
/**
 * Scores how close a meeting's start time is to a reference opening time.
 *
 * @param {object} h - Agenda meeting record; reads `startTime`.
 * @param {string|null} openHHMM - Reference time.
 * @param {number} maxDeltaMin - Gap (in minutes) beyond which the score is 0.
 * @returns {number} 1 for identical times, decreasing linearly to 0 at
 *   `maxDeltaMin`; 0 when either time is missing or the gap is larger.
 */
function timeProximityScore(h, openHHMM, maxDeltaMin) {
  if (!openHHMM) {
    return 0;
  }
  const startTime = h.startTime ?? null;
  if (!startTime) {
    return 0;
  }
  const gapMinutes = Math.abs(timeToMinutes(startTime) - timeToMinutes(openHHMM));
  return gapMinutes > maxDeltaMin ? 0 : 1 - gapMinutes / maxDeltaMin;
}
/**
 * Scores how well a section title matches a meeting, taking the better of the
 * `jaccardTokenSim` scores against the meeting's `titre` and its `objet`.
 *
 * @param {object} reunion - Agenda meeting record; missing `titre`/`objet` count as "".
 * @param {string} sectionTitle - Title of the compte-rendu section.
 * @returns {number} The higher of the two scores; 0 when the section title is blank.
 */
function titleSimilarity(reunion, sectionTitle) {
  const meetingTexts = [reunion.titre ?? "", reunion.objet ?? ""];
  if (sectionTitle.trim() === "") {
    return 0;
  }
  const scores = meetingTexts.map((text) => jaccardTokenSim(text, sectionTitle));
  return Math.max(...scores);
}
/**
 * Downloads the weekly commission compte-rendu (CR) pages, converts every day
 * section they contain into a JSON CR, links each CR to an agenda group, and
 * finally hands the commission data directory to commitAndPushGit.
 *
 * Changes compared with the previous version:
 *  - directory preparation is awaited instead of leaving the fs-extra
 *    `ensureDir` promise floating;
 *  - download jobs are deduplicated by output path, so a page listed on both a
 *    hub and its archive page is no longer fetched twice by racing workers;
 *  - loop-invariant scoring constants and the commission root path are computed once.
 *
 * @param {object} [options] - Parsed CLI options. Reads `dataDir`, `fromSession`,
 *   `concurrency` (default 6), `politenessMs` (default 150), `keepDir` and
 *   `silent`; the whole object is also passed to commitAndPushGit.
 * @returns {Promise<void>}
 */
async function retrieveCommissionCRs(options = {}) {
  const dataDir = options["dataDir"];
  const fromSession = Number(options["fromSession"]);
  const concurrency = Number(options["concurrency"] ?? 6);
  const politenessMs = Number(options["politenessMs"] ?? 150);
  const commissionsRootDir = path.join(dataDir, COMMISSION_FOLDER);
  const originalRoot = path.join(commissionsRootDir, DATA_ORIGINAL_FOLDER);
  const transformedRoot = path.join(commissionsRootDir, DATA_TRANSFORMED_FOLDER);

  // Matching parameters for pairing a CR section with an agenda event.
  // A candidate must reach both gates; the weights combine the three scores
  // (organ, time, title) into the total used for ranking.
  const MAX_TIME_DELTA_MIN = 120;
  const ORGANE_GATE = 0.55;
  const TITLE_GATE = 0.2;
  const W_ORG = 0.4;
  const W_TIM = 0.4;
  const W_TIT = 0.2;

  // ---- Step 1: download the original HTML pages ----
  // fs-extra's ensureDir returns a promise when called without a callback, so it
  // must be awaited. Awaiting ensureAndClearDir is harmless if that helper is synchronous.
  if (options["keepDir"]) {
    await ensureDir(originalRoot);
  } else {
    await ensureAndClearDir(originalRoot);
  }
  const discovered = await discoverCommissionWeeklyPages(fromSession);
  console.log(`[COM-CR][discover] ${discovered.length} links (>= session ${fromSession})`);
  const jobs = [];
  const plannedOutPaths = new Set();
  for (const { url, yyyymmdd, commissionKey } of discovered) {
    const d = new Date(Number(yyyymmdd.slice(0, 4)), Number(yyyymmdd.slice(4, 6)) - 1, Number(yyyymmdd.slice(6, 8)));
    const session = sessionStartYearFromDate(d);
    const dir = path.join(originalRoot, String(session), commissionKey);
    const slug = url.replace(/^.*\/(\d{8})\/([^\/]+)\.html$/i, "$2");
    const outPath = path.join(dir, `${yyyymmdd}.${slug}.html`);
    // The same page can be discovered through several hubs of one commission;
    // keep a single job per output file so two workers never write the same path.
    if (plannedOutPaths.has(outPath)) {
      continue;
    }
    plannedOutPaths.add(outPath);
    fs.ensureDirSync(dir);
    jobs.push({ url, outPath, yyyymmdd, commissionKey });
  }
  console.log(`[COM-CR] Downloading ${jobs.length} links → ${path.relative(process.cwd(), originalRoot)}`);
  let completed = 0;
  let saved = 0;
  let skipped = 0;
  let notFound = 0;
  // Each worker repeatedly takes the next job from the shared queue until it is empty.
  const workers = Array.from({ length: Math.max(1, concurrency) }, async () => {
    while (true) {
      const job = jobs.shift();
      if (!job) {
        break;
      }
      const { url, outPath, yyyymmdd } = job;
      try {
        if (await fs.pathExists(outPath)) {
          skipped++;
        } else {
          const buf = await tryDownload(url);
          if (!buf) {
            notFound++;
            console.warn(`[COM-CR][404] ${url} → week=${yyyymmdd}`);
          } else {
            await fs.writeFile(outPath, buf);
            saved++;
          }
        }
      } catch (e) {
        // A failed download is logged and the worker moves on to the next job.
        console.error(`[COM-CR][err] ${url} → ${e?.message || e}`);
      } finally {
        completed++;
        if (politenessMs > 0) {
          await new Promise((r) => setTimeout(r, politenessMs));
        }
      }
    }
  });
  await Promise.all(workers);
  console.log(`[COM-CR] done: saved=${saved} | skipped=${skipped} | 404=${notFound} | total=${completed}`);

  // ---- Step 2: transform the downloaded pages into JSON CRs ----
  const sessions = getSessionsFromStart(options["fromSession"]);
  if (options["keepDir"]) {
    await ensureDir(transformedRoot);
  } else {
    await ensureAndClearDir(transformedRoot);
  }
  for (const session of sessions) {
    const originalSessionDir = path.join(originalRoot, String(session));
    if (!(await fs.pathExists(originalSessionDir))) {
      continue;
    }
    // Sub-directories are named after the commission keys used by the download step.
    const commissionDirs = (await fs.readdir(originalSessionDir, { withFileTypes: true }))
      .filter((d) => d.isDirectory())
      .map((d) => d.name);
    for (const commissionKey of commissionDirs) {
      const commissionDir = path.join(originalSessionDir, commissionKey);
      const htmlFiles = (await fs.readdir(commissionDir)).filter((f) => /\.html?$/i.test(f)).sort();
      // Despite the name, totalFiles counts written CR sections, not HTML files.
      let totalFiles = 0;
      let linkedFiles = 0;
      for (const f of htmlFiles) {
        const htmlPath = path.join(commissionDir, f);
        let meta;
        let raw = "";
        try {
          raw = await fs.readFile(htmlPath, "utf8");
          meta = parseCommissionMetadataFromHtml(raw, f);
        } catch (e) {
          console.warn(`[COM-CR][PRE][${session}] Cannot read/parse ${f}:`, e);
          continue;
        }
        if (!meta?.days?.length) {
          continue;
        }
        const $ = cheerio.load(raw, { xmlMode: false });
        for (const day of meta.days) {
          const yyyymmdd = day.date.replace(/-/g, "");
          const dt = new Date(Number(day.date.slice(0, 4)), Number(day.date.slice(5, 7)) - 1, Number(day.date.slice(8, 10)));
          // The session is recomputed per day and decides where the JSON is written.
          const daySession = sessionStartYearFromDate(dt);
          const hits = await loadAgendaForDate(dataDir, yyyymmdd, daySession);
          console.log(`[COM-CR][TRANSFORM] ${f} → ${hits.length} agenda events on ${day.date} :`);
          const sections = extractDayH3Sections($, day.date);
          if (sections.length === 0) {
            console.warn(`[COM-CR][TRANSFORM] no sections found for ${f} on ${day.date}, skipping.`);
            continue;
          }
          for (const [sIdx, sec] of sections.entries()) {
            let best = null;
            let reason = "fallback-none";
            if (hits.length) {
              const scored = hits
                .map((h) => {
                  const sOrg = organeSimilarity(h, commissionKey); // 0..1
                  const sTim = timeProximityScore(h, sec.time ?? day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
                  const sTit = titleSimilarity(h, sec.title); // 0..1
                  const total = W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit;
                  return { h, sOrg, sTim, sTit, total };
                })
                .filter((x) => x.sOrg >= ORGANE_GATE && x.sTit >= TITLE_GATE)
                .sort((a, b) => b.total - a.total);
              if (scored[0]) {
                best = scored[0].h;
                // Record which component score was the highest for the chosen event.
                reason =
                  scored[0].sTit >= Math.max(scored[0].sOrg, scored[0].sTim)
                    ? "title"
                    : scored[0].sOrg >= scored[0].sTim
                      ? "organe"
                      : "time";
              }
            }
            const hourShort = toHourShort(day.openTime) ?? "NA";
            const cr = parseCommissionCRSectionFromDom($, htmlPath, {
              dateISO: day.date,
              hourShort,
              organe: commissionKey,
              section: sec,
              matched: best ?? undefined,
            });
            if (!cr) {
              console.warn(`[COM-CR][TRANSFORM] parse failed for section#${sIdx} ${path.basename(htmlPath)} → ${best ? best.uid : "NO-GROUP"} (${commissionKey})`);
              continue;
            }
            const fileUid = cr.uid;
            const transformedSessionDir = path.join(transformedRoot, String(daySession));
            fs.ensureDirSync(transformedSessionDir);
            const outPath = path.join(transformedSessionDir, `${fileUid}.json`);
            await fs.writeJSON(outPath, cr, { spaces: 2 });
            const titreGuess = cleanTitle(sec.title) || `Commission du ${day.date}`;
            const up = await linkCRtoCommissionGroup({
              dataDir,
              dateISO: day.date,
              organeDetected: commissionKey,
              hourShort,
              crUid: fileUid,
              titreGuess,
              groupUid: best ? best.uid : undefined,
            });
            totalFiles++;
            if (up.created || up.updated) {
              linkedFiles++;
            } else {
              console.warn(`[COM-CR][AGENDA][WARN] CR ${fileUid} (section#${sIdx}) not linked (reason=${reason})`);
            }
          }
        }
      }
      if (!options["silent"]) {
        console.log(`[COM-CR][SESSION ${session}][${commissionKey}] Processed ${totalFiles} CR files, linked to agenda: ${linkedFiles}`);
      }
    }
  }

  // ---- Step 3: optionally commit and push the commission data directory ----
  commitAndPushGit(commissionsRootDir, options);
}
/**
 * Script entry point: checks that a data directory was supplied on the command
 * line, then runs the commission CR retrieval while timing it on the console.
 *
 * @returns {Promise<void>}
 * @throws {AssertionError} When the `dataDir` option is missing.
 */
async function main() {
  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");
  const timerLabel = "CRI processing time";
  console.time(timerLabel);
  await retrieveCommissionCRs(options);
  console.timeEnd(timerLabel);
}
// Run the script and turn its outcome into the process exit status:
// 0 on success, 1 (after printing the error) on failure.
const exitWithSuccess = () => process.exit(0);
const exitWithFailure = (error) => {
  console.error(error);
  process.exit(1);
};
main().then(exitWithSuccess).catch(exitWithFailure);