UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

354 lines (353 loc) 15.8 kB
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import pLimit from "p-limit";
import * as git from "../git";
import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
import {
  DATA_ORIGINAL_FOLDER,
  DATA_TRANSFORMED_FOLDER,
  DOCUMENT_METADATA_FILE,
  DOSLEG_DOSSIERS_FOLDER,
  SCRUTINS_FOLDER,
  RAPPORT_FOLDER,
  SENS_CIRCONSCRIPTIONS_FOLDER,
  SENS_ORGANISMES_FOLDER,
  SENS_SENATEURS_FOLDER,
  TEXTE_FOLDER,
  ENRICHED_TEXTE_FOLDER,
} from "../loaders";
import {
  findAllAmendements,
  findAllCirconscriptions,
  findAllDebats,
  findAllDossiers,
  findAllScrutins,
  findAllOrganismes,
  findAllQuestions,
  findAllSens,
  findAllTextes,
  findAllRapports,
} from "../model";
import { processRapport, processTexte } from "./retrieve_documents";
import { buildActesLegislatifs } from "../model/dosleg";
import { UNDEFINED_SESSION } from "../types/sessions";
import { getSessionFromDate, getSessionFromSignet } from "./datautil";
import { commonOptions } from "./shared/cli_helpers";
import { ensureAndClearDir } from "./shared/util";

let exitCode = 10; // 0: some data changed, 10: no modification

const optionsDefinitions = [...commonOptions];
const options = commandLineArgs(optionsDefinitions);

const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
const SENAT_TEXTE_BASE_URL = "https://www.senat.fr/leg/";
const SENAT_EXPOSE_DES_MOTIFS_BASE_URL = "https://www.senat.fr/leg/exposes-des-motifs/";
const SENAT_RAPPORT_BASE_URL = "https://www.senat.fr/rap/";

/**
 * Commits and pushes a dataset directory when the `commit` option is set,
 * and folds the code returned by `git.commitAndPush` into the module-level
 * `exitCode`.
 *
 * The merge rule is: a pending 10 is replaced by any other code; a pending 0
 * is replaced only by a code that is neither 0 nor 10; any other pending
 * code is kept.
 *
 * @param {string} datasetDir - Directory handed to `git.commitAndPush`.
 * @param {object} options - Parsed CLI options (`commit`, `remote` are read).
 */
function commitAndPushGit(datasetDir, options) {
  if (options.commit) {
    const errorCode = git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
    if ((exitCode === 10 && errorCode !== 10) || (exitCode === 0 && errorCode !== 0 && errorCode !== 10)) {
      exitCode = errorCode;
    }
  }
}

/**
 * Entry point: converts every dataset enabled through the `categories`
 * option, committing each one after its conversion.
 *
 * A failure in one dataset is logged and does not prevent the remaining
 * datasets from being processed.
 * NOTE(review): a caught conversion error leaves `exitCode` untouched, so the
 * process may still exit with 10 or 0 after a failure — confirm this is the
 * intended contract for callers.
 *
 * @returns {Promise<void>}
 */
async function convertData() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  const enabledDatasets = getEnabledDatasets(options["categories"]);

  console.time("data transformation time");

  if (enabledDatasets & EnabledDatasets.Ameli) {
    try {
      await convertDatasetAmeli(dataDir, options);
      const ameliDir = path.join(dataDir, datasets.ameli.database);
      commitAndPushGit(ameliDir, options);
    } catch (error) {
      console.error("Error converting Ameli dataset:", error);
    }
  }

  if (enabledDatasets & EnabledDatasets.Debats) {
    try {
      await convertDatasetDebats(dataDir, options);
      const debatsDir = path.join(dataDir, datasets.debats.database);
      commitAndPushGit(debatsDir, options);
    } catch (error) {
      console.error("Error converting Debats dataset:", error);
    }
  }

  if (enabledDatasets & EnabledDatasets.DosLeg) {
    try {
      await convertDatasetDosLeg(dataDir, options);
      const doslegDir = path.join(dataDir, datasets.dosleg.database);
      commitAndPushGit(doslegDir, options);
    } catch (error) {
      console.error("Error converting DosLeg dataset:", error);
    }
    // Scrutins are converted under the DosLeg flag but live in their own folder.
    try {
      await convertDatasetScrutins(dataDir, options);
      const scrutinsDir = path.join(dataDir, SCRUTINS_FOLDER);
      commitAndPushGit(scrutinsDir, options);
    } catch (error) {
      console.error("Error converting Scrutins dataset:", error);
    }
  }

  if (enabledDatasets & EnabledDatasets.Questions) {
    try {
      await convertDatasetQuestions(dataDir, options);
      const questionsDir = path.join(dataDir, datasets.questions.database);
      commitAndPushGit(questionsDir, options);
    } catch (error) {
      console.error("Error converting Questions dataset:", error);
    }
  }

  if (enabledDatasets & EnabledDatasets.Sens) {
    try {
      await convertDatasetSens(dataDir, options);
      const sensDir = path.join(dataDir, datasets.sens.database);
      commitAndPushGit(sensDir, options);
    } catch (error) {
      console.error("Error converting Sens dataset:", error);
    }
  }

  if (!options["silent"]) {
    console.timeEnd("data transformation time");
  }
}

/**
 * Writes one JSON file per amendement under
 * `<dataDir>/<ameli database>/<session>/<signet>/<numero>.json`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `verbose`,
 *   `keepDir`, `fromSession` are read).
 * @returns {Promise<void>}
 */
async function convertDatasetAmeli(dataDir, options) {
  const dataset = datasets.ameli;
  if (!options["silent"]) {
    console.log(`Converting database ${dataset.database} data into files…`);
  }

  const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
  if (!options.keepDir) {
    ensureAndClearDir(ameliReorganizedRootDir);
  }

  for await (const amendement of findAllAmendements(options["fromSession"])) {
    if (options["verbose"]) {
      console.log(`Converting ${amendement["numero"]} file…`);
    }
    // Test the raw value before stringifying: String(undefined) and
    // String(null) are the truthy strings "undefined"/"null", which would
    // bypass the fallback and create folders with those literal names.
    const rawSession = amendement["session"];
    const session = rawSession == null || rawSession === "" ? UNDEFINED_SESSION : rawSession;
    const signetDossierLegislatif =
      amendement["signet_dossier_legislatif"] ||
      `${amendement["nature_texte"]}-${amendement["numero_texte"]}`.toLowerCase();
    const amendementFileName = `${amendement["numero"]}.json`;
    const filePath = path.join(ameliReorganizedRootDir, String(session), signetDossierLegislatif, amendementFileName);
    await fs.outputJSON(filePath, amendement, { spaces: 2 });
  }
}

/**
 * Writes one JSON file per debat under
 * `<dataDir>/<debats database>/<session>/<id>.json`, where the session is
 * derived from the debat's `date_seance`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `verbose`,
 *   `keepDir`, `fromSession` are read).
 * @returns {Promise<void>}
 */
async function convertDatasetDebats(dataDir, options) {
  const dataset = datasets.debats;
  if (!options["silent"]) {
    console.log(`Converting database ${dataset.database} data into files…`);
  }

  const debatsReorganizedRootDir = path.join(dataDir, dataset.database);
  if (!options.keepDir) {
    ensureAndClearDir(debatsReorganizedRootDir);
  }

  for await (const debat of findAllDebats()) {
    if (options["verbose"]) {
      console.log(`Converting ${debat.id} file…`);
    }
    const session = getSessionFromDate(debat.date_seance);
    // Skip debats older than the requested starting session.
    if (options["fromSession"] && session < options["fromSession"]) {
      continue;
    }
    const debatFileName = `${debat.id}.json`;
    const filePath = path.join(debatsReorganizedRootDir, String(session), debatFileName);
    await fs.outputJSON(filePath, debat, { spaces: 2 });
  }
}

/**
 * Writes one JSON file per dossier législatif (augmented with its
 * `actes_legislatifs`) under
 * `<dataDir>/<dosleg database>/<dossiers folder>/<session>/<signet>.json`,
 * then converts textes and rapports.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `verbose`,
 *   `keepDir`, `fromSession` are read here; the whole object is forwarded to
 *   `convertTextes` and `convertRapports`).
 * @returns {Promise<void>}
 */
async function convertDatasetDosLeg(dataDir, options) {
  const dataset = datasets.dosleg;
  if (!options["silent"]) {
    console.log(`Converting database ${dataset.database} data into files…`);
  }

  const doslegReorganizedRootDir = path.join(dataDir, dataset.database);
  const dossiersReorganizedDir = path.join(doslegReorganizedRootDir, DOSLEG_DOSSIERS_FOLDER);
  if (!options.keepDir) {
    ensureAndClearDir(doslegReorganizedRootDir);
    ensureAndClearDir(dossiersReorganizedDir);
  }

  for await (const dossier of findAllDossiers()) {
    if (options["verbose"]) {
      console.log(`Converting ${dossier["signet"]} file…`);
    }
    const session = getSessionFromSignet(dossier["signet"]) || UNDEFINED_SESSION;
    if (options["fromSession"] && session < options["fromSession"]) {
      continue;
    }
    const dossierReorganizedDir = path.join(dossiersReorganizedDir, String(session));
    const actesBrutsNormalises = buildActesLegislatifs(dossier);
    const dossierWithActes = {
      ...dossier,
      actes_legislatifs: actesBrutsNormalises,
    };
    const dossierFile = `${dossier["signet"]}.json`;
    await fs.outputJSON(path.join(dossierReorganizedDir, dossierFile), dossierWithActes, { spaces: 2 });
  }

  await convertTextes(dataDir, options);
  await convertRapports(dataDir, options);
}

/**
 * Writes one JSON file per scrutin under
 * `<dataDir>/<scrutins folder>/<session>/<numero>.json`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `verbose`,
 *   `keepDir`, `fromSession` are read).
 * @returns {Promise<void>}
 */
async function convertDatasetScrutins(dataDir, options) {
  const dataset = datasets.dosleg;
  if (!options["silent"]) {
    console.log(`Converting database scrutins (${dataset.database}) data into files…`);
  }

  const scrutinsReorganizedDir = path.join(dataDir, SCRUTINS_FOLDER);
  if (!options.keepDir) {
    ensureAndClearDir(scrutinsReorganizedDir);
  }

  for await (const scrutin of findAllScrutins(options["fromSession"])) {
    if (options["verbose"]) {
      console.log(`Converting ${scrutin["numero"]} file…`);
    }
    const session = scrutin["session"] || UNDEFINED_SESSION;
    const scrutinReorganizedDir = path.join(scrutinsReorganizedDir, String(session));
    const scrutinFileName = `${scrutin["numero"]}.json`;
    await fs.outputJSON(path.join(scrutinReorganizedDir, scrutinFileName), scrutin, {
      spaces: 2,
    });
  }
}

/**
 * Writes one JSON file per question under
 * `<dataDir>/<questions database>/<legislature>/<reference>.json`, with at
 * most 10 writes in flight at a time.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `verbose`,
 *   `keepDir` are read).
 * @returns {Promise<void>} Resolves once every queued write has completed.
 */
async function convertDatasetQuestions(dataDir, options) {
  const dataset = datasets.questions;
  if (!options["silent"]) {
    console.log(`Converting database ${dataset.database} data into files…`);
  }

  const questionsReorganizedRootDir = path.join(dataDir, dataset.database);
  if (!options.keepDir) {
    ensureAndClearDir(questionsReorganizedRootDir);
  }

  const limit = pLimit(10);
  const tasks = [];
  for await (const question of findAllQuestions()) {
    tasks.push(
      limit(async () => {
        if (options["verbose"]) {
          console.log(`Converting ${question["reference"]} file…`);
        }
        // Questions without a legislature are filed under folder "0".
        const legislature = question["legislature"] ? question["legislature"] : 0;
        const questionFileName = `${question["reference"]}.json`;
        await fs.outputJSON(path.join(questionsReorganizedRootDir, String(legislature), questionFileName), question, {
          spaces: 2,
        });
      }),
    );
  }
  await Promise.all(tasks);
}

/**
 * For every texte that has a `url`, writes the texte record and a metadata
 * file (document URLs) into
 * `<dataDir>/<texte folder>/<original folder>/<session>/<name>/`, and, when
 * `options.fetchDocuments` is set, hands the metadata to `processTexte`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `fromSession`,
 *   `fetchDocuments` are read; the whole object is forwarded to
 *   `processTexte`).
 * @returns {Promise<void>}
 */
async function convertTextes(dataDir, options) {
  const originalTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
  const transformedTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
  const enrichedTextesDir = path.join(dataDir, ENRICHED_TEXTE_FOLDER);

  if (!options["silent"]) {
    console.log("Converting database textes data into files…");
  }

  for await (const texte of findAllTextes()) {
    const session = texte["session"] ?? UNDEFINED_SESSION;
    if (options["fromSession"] && session < options["fromSession"]) {
      continue;
    }
    if (!texte["url"]) {
      continue;
    }

    // The document name is the URL's file name without its extension.
    const texteName = path.parse(texte["url"]).name;
    const texteDir = path.join(originalTextesDir, `${session}`, texteName);

    // oritxtcod = 1 corresponds to "Texte de loi déposé au Sénat"
    const hasExposeDesMotifs = texte["origine"] === "déposé au Sénat" || texte["origine"] === "transmis au Sénat";

    const metadata = {
      name: texteName,
      session: texte["session"],
      date: texte["date"],
      url_expose_des_motifs: hasExposeDesMotifs
        ? new URL(`${texteName}-expose.html`, SENAT_EXPOSE_DES_MOTIFS_BASE_URL)
        : undefined,
      url_xml: new URL(`${texteName}.akn.xml`, SENAT_TEXTE_XML_BASE_URL),
      url_html: new URL(`${texteName}.html`, SENAT_TEXTE_BASE_URL),
      url_pdf: new URL(`${texteName}.pdf`, SENAT_TEXTE_BASE_URL),
    };

    await fs.outputJSON(path.join(texteDir, `${texteName}.json`), texte, {
      spaces: 2,
    });
    await fs.outputJSON(path.join(texteDir, DOCUMENT_METADATA_FILE), metadata, {
      spaces: 2,
    });

    if (options.fetchDocuments) {
      await processTexte(metadata, originalTextesDir, transformedTextesDir, enrichedTextesDir, options);
    }
  }
}

/**
 * For every rapport that has a `url`, writes the rapport record and a
 * metadata file (HTML and PDF URLs) into
 * `<dataDir>/<rapport folder>/<original folder>/<session>/<name>/`, and, when
 * `options.fetchDocuments` is set, hands the metadata to `processRapport`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `fromSession`,
 *   `fetchDocuments` are read; the whole object is forwarded to
 *   `processRapport`).
 * @returns {Promise<void>}
 */
async function convertRapports(dataDir, options) {
  const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);

  if (!options["silent"]) {
    console.log("Converting database rapports data into files…");
  }

  for await (const rapport of findAllRapports()) {
    const session = rapport["session"] ?? UNDEFINED_SESSION;
    if (options["fromSession"] && session < options["fromSession"]) {
      continue;
    }
    if (!rapport["url"]) {
      continue;
    }

    const parsedRapportUrl = path.parse(rapport["url"]);
    const rapportName = parsedRapportUrl.name;
    const rapportDir = path.join(originalRapportsDir, `${session}`, rapportName);

    // Both document URLs keep the rapport's directory and swap the file name:
    // "<name>_mono.html" for the HTML version, "<name>1.pdf" for the PDF.
    const rapportHtmlUrlBase = `${rapportName}_mono.html`;
    const rapportHtmlUrl = path.format({
      dir: parsedRapportUrl.dir,
      base: rapportHtmlUrlBase,
    });
    const rapportPdfUrlBase = `${rapportName}1.pdf`;
    const rapportPdfUrl = path.format({
      dir: parsedRapportUrl.dir,
      base: rapportPdfUrlBase,
    });

    const metadata = {
      name: rapportName,
      session: rapport["session"],
      date: rapport["date"],
      url_html: new URL(rapportHtmlUrl, SENAT_RAPPORT_BASE_URL),
      url_pdf: new URL(rapportPdfUrl, SENAT_RAPPORT_BASE_URL),
    };

    await fs.outputJSON(path.join(rapportDir, `${rapportName}.json`), rapport, {
      spaces: 2,
    });
    await fs.outputJSON(path.join(rapportDir, DOCUMENT_METADATA_FILE), metadata, {
      spaces: 2,
    });

    if (options.fetchDocuments) {
      await processRapport(metadata, originalRapportsDir, options);
    }
  }
}

/**
 * Writes the Sens dataset as JSON files under `<dataDir>/<sens database>/`:
 * one file per sénateur (`<matricule>.json`), per circonscription
 * (`<identifiant>.json`) and per organisme (`<type_code>/<code>.json`), each
 * in its dedicated sub-folder.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} options - Parsed CLI options (`silent`, `verbose`,
 *   `keepDir` are read).
 * @returns {Promise<void>}
 */
async function convertDatasetSens(dataDir, options) {
  const dataset = datasets.sens;
  if (!options["silent"]) {
    console.log(`Converting database ${dataset.database} data into files…`);
  }

  const sensReorganizedRootDir = path.join(dataDir, dataset.database);
  const senateursReorganizedDir = path.join(sensReorganizedRootDir, SENS_SENATEURS_FOLDER);
  const circonscriptionsReorganizedDir = path.join(sensReorganizedRootDir, SENS_CIRCONSCRIPTIONS_FOLDER);
  const organismesReorganizedDir = path.join(sensReorganizedRootDir, SENS_ORGANISMES_FOLDER);
  if (!options.keepDir) {
    ensureAndClearDir(sensReorganizedRootDir);
    ensureAndClearDir(senateursReorganizedDir);
    ensureAndClearDir(circonscriptionsReorganizedDir);
    ensureAndClearDir(organismesReorganizedDir);
  }

  for await (const sen of findAllSens()) {
    if (options["verbose"]) {
      console.log(`Converting ${sen["matricule"]} file…`);
    }
    const senFileName = `${sen["matricule"]}.json`;
    await fs.outputJSON(path.join(senateursReorganizedDir, senFileName), sen, {
      spaces: 2,
    });
  }

  for await (const circonscription of findAllCirconscriptions()) {
    if (options["verbose"]) {
      console.log(`Converting ${circonscription["identifiant"]} file…`);
    }
    const circonscriptionFileName = `${circonscription["identifiant"]}.json`;
    await fs.outputJSON(path.join(circonscriptionsReorganizedDir, circonscriptionFileName), circonscription, {
      spaces: 2,
    });
  }

  for await (const organisme of findAllOrganismes()) {
    if (options["verbose"]) {
      console.log(`Converting ${organisme["code"]} file…`);
    }
    const organismeFileName = `${organisme["code"]}.json`;
    // Organismes are grouped in one sub-folder per `type_code`.
    await fs.outputJSON(path.join(organismesReorganizedDir, organisme["type_code"], organismeFileName), organisme, {
      spaces: 2,
    });
  }
}

// Run the conversion and exit with the accumulated code; an unexpected
// rejection is reported and mapped to exit code 1.
convertData()
  .then(() => process.exit(exitCode))
  .catch((error) => {
    console.log(error);
    process.exit(1);
  });