UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

133 lines (132 loc) 5.54 kB
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import { DateTime } from "luxon";
import path from "path";
import { AGENDA_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
import { parseAgendaFromFile } from "../model/agenda";
import { getSessionsFromStart } from "../types/sessions";
import { ID_DATE_FORMAT } from "./datautil";
import { commonOptions } from "./shared/cli_helpers";
import { fetchWithRetry } from "./shared/util";
import { buildReunionsByBucket } from "../utils/reunion_parsing";
import { buildSenatDossierIndex } from "../utils/reunion_odj_building";

// Command-line options: the shared ones plus the agenda-specific --parseAgenda flag.
const optionsDefinitions = [
  ...commonOptions,
  {
    help: "parse and convert agenda events into JSON",
    name: "parseAgenda",
    type: Boolean,
  },
];
const options = commandLineArgs(optionsDefinitions);

const SENAT_GLOBAL_AGENDA_URL_ROOT = "https://www.senat.fr/aglae/Global";
// Date format used in the remote agenda file name (agl<ddMMyyyy>.html).
const EVENT_DATE_FORMAT = "ddMMyyyy";
// Agendas further than this many days in the future are not downloaded.
const MAX_DAYS_AHEAD = 15;
// Keys of the object returned by buildReunionsByBucket whose groups are written to disk.
// NOTE(review): "IDS" was labelled "SP" and the others "NON-SP" in the original code —
// presumably public sitting vs. other meetings; confirm against buildReunionsByBucket.
const BUCKET_SUFFIXES = ["IDS", "IDC", "IDM", "IDO", "IDI"];

/**
 * Error raised when an agenda page cannot be retrieved.
 */
class AgendaError extends Error {
  /**
   * @param {string} message - Failure detail (the HTTP status when thrown by downloadAgenda).
   * @param {string} agendaName - Agenda identifier, formatted with EVENT_DATE_FORMAT.
   */
  constructor(message, agendaName) {
    super(`An error occurred while retrieving Agenda ${agendaName}: ${message}`);
    this.name = "AgendaError";
  }
}

/**
 * Downloads the daily agenda pages of every given session and, when the
 * "parseAgenda" option is set, converts each downloaded page to JSON.
 *
 * A failure on one day is logged and does not stop the processing of the others.
 *
 * @param {object} options - Parsed CLI options ("dataDir", "parseAgenda" and "keepDir" are read here).
 * @param {Array} sessions - Sessions to process; each value is used as the starting year
 *   of a 1 October – 30 September date range (assumes numeric years — TODO confirm
 *   against getSessionsFromStart).
 * @returns {Promise<void>}
 */
async function retrieveAgendas(options, sessions) {
  console.log(`[AGENDA] Retrieving agendas for sessions ${sessions.join(", ")}`);

  const agendaRootDir = path.join(options["dataDir"], AGENDA_FOLDER);
  fs.ensureDirSync(agendaRootDir);
  const originalAgendaDir = path.join(agendaRootDir, DATA_ORIGINAL_FOLDER);
  fs.ensureDirSync(originalAgendaDir);
  const transformedAgendaDir = path.join(agendaRootDir, DATA_TRANSFORMED_FOLDER);
  if (options["parseAgenda"]) {
    fs.ensureDirSync(transformedAgendaDir);
  }

  const dossierIndex = buildSenatDossierIndex(options);

  // Don't download agendas more than MAX_DAYS_AHEAD days in the future.
  const latestAllowedDate = new Date();
  latestAllowedDate.setDate(latestAllowedDate.getDate() + MAX_DAYS_AHEAD);

  for (const session of sessions) {
    const originalAgendaSessionDir = path.join(originalAgendaDir, `${session}`);
    fs.ensureDirSync(originalAgendaSessionDir);
    if (!options["keepDir"]) {
      fs.emptyDirSync(originalAgendaSessionDir);
    }

    const transformedAgendaSessionDir = path.join(transformedAgendaDir, `${session}`);
    if (options["parseAgenda"]) {
      fs.ensureDirSync(transformedAgendaSessionDir);
      if (!options["keepDir"]) {
        fs.emptyDirSync(transformedAgendaSessionDir);
      }
    }

    // Months are 0-indexed: the range runs from 1 October to 30 September of the next year.
    const sessionEnd = new Date(session + 1, 8, 30);
    const lastDay = sessionEnd < latestAllowedDate ? sessionEnd : latestAllowedDate;

    for (const date = new Date(session, 9, 1); date <= lastDay; date.setDate(date.getDate() + 1)) {
      const day = DateTime.fromJSDate(date);
      const agendaName = day.toFormat(EVENT_DATE_FORMAT);
      const agendaFileName = day.toFormat(ID_DATE_FORMAT);
      const agendaPath = path.join(originalAgendaSessionDir, `${agendaFileName}.html`);
      try {
        await downloadAgenda(agendaName, agendaPath);
        // A missing agenda (404) or an empty response leaves no file behind; only parse
        // when a file is actually on disk (freshly downloaded, or kept from a previous
        // run when "keepDir" is set).
        if (options["parseAgenda"] && fs.pathExistsSync(agendaPath)) {
          await parseAgenda(transformedAgendaSessionDir, agendaFileName, agendaPath, dossierIndex);
        }
      } catch (error) {
        console.error(error);
      }
    }
  }
}

/**
 * Downloads one agenda page and writes its raw bytes to disk.
 *
 * @param {string} agendaName - Date part of the remote file name, formatted with EVENT_DATE_FORMAT.
 * @param {string} agendaPath - Destination path of the HTML file.
 * @returns {Promise<boolean>} true when a file was written; false when the agenda
 *   does not exist (HTTP 404) or the response body is empty.
 * @throws {AgendaError} When the server answers with a non-OK status other than 404.
 */
async function downloadAgenda(agendaName, agendaPath) {
  const agendaUrl = `${SENAT_GLOBAL_AGENDA_URL_ROOT}/agl${agendaName}.html`;
  if (!options["silent"]) {
    console.log(`Downloading Agenda ${agendaUrl}…`);
  }

  const response = await fetchWithRetry(agendaUrl);
  if (!response.ok) {
    if (response.status === 404) {
      console.warn(`Agenda ${agendaUrl} not found`);
      return false;
    }
    throw new AgendaError(String(response.status), agendaName);
  }

  const agendaContent = await response.arrayBuffer();
  // An ArrayBuffer is always truthy, so emptiness has to be tested through its size.
  if (agendaContent.byteLength === 0) {
    return false;
  }
  fs.writeFileSync(agendaPath, Buffer.from(agendaContent));
  return true;
}

/**
 * Writes each group to its own "<uid>.json" file.
 *
 * @param {string} dir - Output directory.
 * @param {Array<{uid: string}>} groups - Groups to serialise; each one must expose a `uid`.
 */
function writeGroupsAsFiles(dir, groups) {
  for (const group of groups) {
    const outPath = path.join(dir, `${group.uid}.json`);
    fs.writeJSONSync(outPath, group, { spaces: 2 });
  }
}

/**
 * Parses a downloaded agenda page and writes the result as JSON: one flat file
 * holding every parsed event, plus one file per group returned by
 * buildReunionsByBucket.
 *
 * Nothing is written when the parser returns no event.
 *
 * @param {string} transformedAgendaSessionDir - Output directory for the session.
 * @param {string} agendaFileName - Base name (without extension) of the flat JSON file.
 * @param {string} agendaPath - Path of the HTML file to parse.
 * @param {object} dossierBySenatUrl - Index passed through to buildReunionsByBucket.
 * @returns {Promise<void>}
 */
async function parseAgenda(transformedAgendaSessionDir, agendaFileName, agendaPath, dossierBySenatUrl) {
  if (!options["silent"]) {
    console.log(`Parsing Agenda ${agendaPath}…`);
  }

  const parsedAgendaEvents = await parseAgendaFromFile(agendaPath);
  if (!parsedAgendaEvents?.length) {
    return;
  }

  const flatPath = path.join(transformedAgendaSessionDir, `${agendaFileName}.json`);
  fs.writeJSONSync(flatPath, parsedAgendaEvents, { spaces: 2 });

  const byBucket = buildReunionsByBucket(parsedAgendaEvents, dossierBySenatUrl);
  // Writing an empty bucket is a no-op, so no length check is needed.
  for (const suffix of BUCKET_SUFFIXES) {
    writeGroupsAsFiles(transformedAgendaSessionDir, byBucket[suffix]);
  }
}

/**
 * Script entry point: validates the CLI arguments, resolves the sessions to
 * process and retrieves their agendas, timing the whole run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  const sessions = getSessionsFromStart(options["fromSession"]);

  console.time("agenda processing time");
  try {
    await retrieveAgendas(options, sessions);
  } finally {
    // Report the elapsed time even when the retrieval fails.
    console.timeEnd("agenda processing time");
  }
}

main()
  .then(() => process.exit(0))
  .catch((error) => {
    console.error(error);
    process.exit(1);
  });