UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

107 lines (106 loc) 4.4 kB
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import { COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDebats } from "../loaders";
import { parseCompteRenduFromFile } from "../model/compte_rendu";
import { getSessionsFromStart } from "../types/sessions";
import { commonOptions } from "./shared/cli_helpers";
import { ensureAndClearDir } from "./shared/util";

// Command-line options: the shared options plus the script-specific
// `--parseDebats` flag.
// NOTE(review): `dataDir`, `fromSession` and `silent` are read below but not
// declared here — presumably they come from `commonOptions`; confirm.
const optionsDefinitions = [
  ...commonOptions,
  {
    help: "parse and convert comptes-rendus des débats into JSON",
    name: "parseDebats",
    type: Boolean,
  },
];
const options = commandLineArgs(optionsDefinitions);

// Base URL under which the debate pages are fetched.
const SENAT_COMPTE_RENDU_URL_ROOT = "https://www.senat.fr/seances";

/**
 * Error raised when a compte-rendu could not be downloaded.
 */
class CompteRenduError extends Error {
  /**
   * @param {string} message - Failure detail (the HTTP status code as a string in this file).
   * @param {string} compteRenduUrl - URL of the compte-rendu that failed.
   */
  constructor(message, compteRenduUrl) {
    super(`An error occurred while retrieving Compte-Rendu ${compteRenduUrl}: ${message}`);
    // Make the error identifiable in logs instead of showing up as a plain "Error".
    this.name = "CompteRenduError";
  }
}

/**
 * Downloads the compte-rendu of every debate of the given sessions and, when
 * the `parseDebats` option is set, converts each downloaded file to JSON.
 *
 * The comptes-rendus root directory is cleared first. A failure on one debate
 * is logged and does not stop the processing of the others.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions to process; each one is used as a directory name.
 * @returns {Promise<void>}
 */
async function retrieveComptesRendus(dataDir, sessions) {
  const comptesRendusRootDir = path.join(dataDir, COMPTES_RENDUS_FOLDER);
  ensureAndClearDir(comptesRendusRootDir);

  const originalComptesRendusDir = path.join(comptesRendusRootDir, DATA_ORIGINAL_FOLDER);
  fs.ensureDirSync(originalComptesRendusDir);

  const transformedComptesRendusDir = path.join(comptesRendusRootDir, DATA_TRANSFORMED_FOLDER);
  if (options["parseDebats"]) {
    fs.ensureDirSync(transformedComptesRendusDir);
  }

  for (const session of sessions) {
    const originalComptesRendusSessionDir = path.join(originalComptesRendusDir, `${session}`);
    fs.ensureDirSync(originalComptesRendusSessionDir);

    const transformedComptesRendusSessionDir = path.join(transformedComptesRendusDir, `${session}`);
    if (options["parseDebats"]) {
      fs.ensureDirSync(transformedComptesRendusSessionDir);
    }

    for (const { item: debat } of iterLoadSenatDebats(dataDir, session)) {
      if (!debat.url) {
        continue;
      }
      try {
        // The page to fetch sits in the same directory as the debate URL and
        // is named `s<id>_mono.html`.
        const debatMonoUrl = `${path.parse(debat.url).dir}/s${debat.id}_mono.html`;
        const compteRenduPath = path.join(originalComptesRendusSessionDir, `${debat.id}.html`);

        const isDownloaded = await downloadCompteRendu(debatMonoUrl, compteRenduPath);

        // Only parse what was actually written: after a 404 no file exists at
        // compteRenduPath (the directory was cleared above).
        if (isDownloaded && options["parseDebats"]) {
          await parseCompteRendu(transformedComptesRendusSessionDir, compteRenduPath, debat);
        }
      } catch (error) {
        // Best effort: report the failure and carry on with the next debate.
        console.error(error);
      }
    }
  }
}

/**
 * Fetches one compte-rendu and writes the raw response body to disk.
 *
 * @param {string} debatUrl - Path of the page, appended to SENAT_COMPTE_RENDU_URL_ROOT.
 * @param {string} compteRenduPath - Destination file path.
 * @returns {Promise<boolean>} `true` when the file was written, `false` when
 *   nothing was written (HTTP 404 or no response body).
 * @throws {CompteRenduError} When the response status is an error other than 404.
 */
async function downloadCompteRendu(debatUrl, compteRenduPath) {
  const compteRenduUrl = `${SENAT_COMPTE_RENDU_URL_ROOT}/${debatUrl}`;
  if (!options["silent"]) {
    console.log(`Downloading Compte-Rendu ${compteRenduUrl}…`);
  }

  const response = await fetch(compteRenduUrl);
  if (!response.ok) {
    if (response.status !== 404) {
      throw new CompteRenduError(String(response.status), compteRenduUrl);
    }
    // A missing page is only warned about, not treated as an error.
    console.warn(`Compte-Rendu ${compteRenduUrl} not found`);
    return false;
  }

  const compteRenduContent = await response.arrayBuffer();
  if (!compteRenduContent) {
    return false;
  }

  fs.writeFileSync(compteRenduPath, Buffer.from(compteRenduContent));
  return true;
}

/**
 * Parses a downloaded compte-rendu and writes the result as
 * `<original file name>.json` in the transformed session directory.
 * Nothing is written when the parser returns a falsy value.
 *
 * @param {string} transformedComptesRendusSessionDir - Output directory for the JSON file.
 * @param {string} compteRenduPath - Path of the downloaded HTML file.
 * @param {object} debat - Debate record, passed through to the parser.
 * @returns {Promise<void>}
 */
async function parseCompteRendu(transformedComptesRendusSessionDir, compteRenduPath, debat) {
  if (!options["silent"]) {
    console.log(`Parsing compte-rendu ${compteRenduPath}…`);
  }

  const parsedCompteRendu = await parseCompteRenduFromFile(compteRenduPath, debat);
  if (!parsedCompteRendu) {
    return;
  }

  const parsedFilePath = path.parse(compteRenduPath);
  fs.writeJSONSync(
    path.join(transformedComptesRendusSessionDir, `${parsedFilePath.name}.json`),
    parsedCompteRendu,
    { spaces: 2 },
  );
}

/**
 * Script entry point: checks the data directory option, computes the sessions
 * to process and runs the retrieval, printing the elapsed time unless the
 * `silent` option is set.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");

  const sessions = getSessionsFromStart(options["fromSession"]);

  console.time("comptes-rendus processing time");
  await retrieveComptesRendus(dataDir, sessions);
  if (!options["silent"]) {
    console.timeEnd("comptes-rendus processing time");
  }
}

main()
  .then(() => process.exit(0))
  .catch((error) => {
    console.log(error);
    process.exit(1);
  });