@tricoteuses/senat
Version:
Handle French Sénat's open data
354 lines (353 loc) • 15.8 kB
JavaScript
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import pLimit from "p-limit";
import * as git from "../git";
import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, ENRICHED_TEXTE_FOLDER, } from "../loaders";
import { findAllAmendements, findAllCirconscriptions, findAllDebats, findAllDossiers, findAllScrutins, findAllOrganismes, findAllQuestions, findAllSens, findAllTextes, findAllRapports, } from "../model";
import { processRapport, processTexte } from "./retrieve_documents";
import { buildActesLegislatifs } from "../model/dosleg";
import { UNDEFINED_SESSION } from "../types/sessions";
import { getSessionFromDate, getSessionFromSignet } from "./datautil";
import { commonOptions } from "./shared/cli_helpers";
import { ensureAndClearDir } from "./shared/util";
// Status code handed to process.exit() once the conversion is over.
// It starts at 10 and is only ever updated by commitAndPushGit(), from the
// code returned by git.commitAndPush().
let exitCode = 10; // 0: some data changed, 10: no modification
// Command-line options: only the options shared by all scripts are accepted.
const optionsDefinitions = [...commonOptions];
const options = commandLineArgs(optionsDefinitions);
// Base URLs against which the document links stored in the metadata files
// are resolved (see convertTextes() and convertRapports()).
const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
const SENAT_TEXTE_BASE_URL = "https://www.senat.fr/leg/";
const SENAT_EXPOSE_DES_MOTIFS_BASE_URL = "https://www.senat.fr/leg/exposes-des-motifs/";
const SENAT_RAPPORT_BASE_URL = "https://www.senat.fr/rap/";
/**
 * Commits and pushes `datasetDir` when the `commit` option is set, then folds
 * the returned status into the module-level `exitCode`.
 *
 * Folding rules (10 = no modification, 0 = some data changed):
 * - while `exitCode` is still 10, any status other than 10 replaces it;
 * - while `exitCode` is 0, only a status that is neither 0 nor 10 replaces it;
 * - any other `exitCode` is kept as is.
 *
 * @param {string} datasetDir Directory passed to `git.commitAndPush`.
 * @param {object} options Reads `commit` and `remote`.
 */
function commitAndPushGit(datasetDir, options) {
  if (!options.commit) {
    return;
  }
  const status = git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
  const nothingChangedYet = exitCode === 10;
  const somethingChanged = exitCode === 0;
  const statusIsOther = status !== 0 && status !== 10;
  const mustReplace = (nothingChangedYet && status !== 10) || (somethingChanged && statusIsOther);
  if (mustReplace) {
    exitCode = status;
  }
}
/**
 * Runs the converter of every enabled dataset and, after each successful
 * conversion, hands the produced directory to `commitAndPushGit`.
 *
 * Reads the module-level `options` (`dataDir`, `categories`, `silent`; the
 * whole object is also forwarded to the converters). Every step runs inside
 * its own try/catch: a failing dataset is reported with `console.error` and
 * the following steps still run.
 *
 * The elapsed-time timer is only started when it will also be stopped, i.e.
 * when the `silent` option is not set.
 *
 * @returns {Promise<void>}
 * @throws {AssertionError} When the `dataDir` option is missing.
 */
async function convertData() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  const enabledDatasets = getEnabledDatasets(options["categories"]);
  const timerLabel = "data transformation time";
  const isSilent = Boolean(options["silent"]);
  if (!isSilent) {
    console.time(timerLabel);
  }

  // Ordered list of conversion steps. `directory` is a thunk so that it is
  // evaluated after the conversion and inside the step's try/catch.
  const steps = [
    {
      flag: EnabledDatasets.Ameli,
      label: "Ameli",
      convert: convertDatasetAmeli,
      directory: () => path.join(dataDir, datasets.ameli.database),
    },
    {
      flag: EnabledDatasets.Debats,
      label: "Debats",
      convert: convertDatasetDebats,
      directory: () => path.join(dataDir, datasets.debats.database),
    },
    {
      flag: EnabledDatasets.DosLeg,
      label: "DosLeg",
      convert: convertDatasetDosLeg,
      directory: () => path.join(dataDir, datasets.dosleg.database),
    },
    {
      // Scrutins have no flag of their own: they follow the DosLeg flag.
      flag: EnabledDatasets.DosLeg,
      label: "Scrutins",
      convert: convertDatasetScrutins,
      directory: () => path.join(dataDir, SCRUTINS_FOLDER),
    },
    {
      flag: EnabledDatasets.Questions,
      label: "Questions",
      convert: convertDatasetQuestions,
      directory: () => path.join(dataDir, datasets.questions.database),
    },
    {
      flag: EnabledDatasets.Sens,
      label: "Sens",
      convert: convertDatasetSens,
      directory: () => path.join(dataDir, datasets.sens.database),
    },
  ];

  for (const step of steps) {
    if (!(enabledDatasets & step.flag)) {
      continue;
    }
    try {
      await step.convert(dataDir, options);
      commitAndPushGit(step.directory(), options);
    } catch (error) {
      console.error(`Error converting ${step.label} dataset:`, error);
    }
  }

  if (!isSilent) {
    console.timeEnd(timerLabel);
  }
}
/**
 * Writes every amendement yielded by `findAllAmendements` as a JSON file
 * under `<dataDir>/<ameli database>/<session>/<signet>/<numero>.json`.
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `keepDir`, `fromSession` and
 *   `verbose`. Unless `keepDir` is set the dataset directory is passed to
 *   `ensureAndClearDir` first.
 * @returns {Promise<void>}
 */
async function convertDatasetAmeli(dataDir, options) {
  const dataset = datasets.ameli;
  if (!options["silent"]) {
    console.log(`Converting database ${dataset.database} data into files…`);
  }
  const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
  if (!options.keepDir) {
    ensureAndClearDir(ameliReorganizedRootDir);
  }
  for await (const amendement of findAllAmendements(options["fromSession"])) {
    if (options["verbose"]) {
      console.log(`Converting ${amendement["numero"]} file…`);
    }
    // Apply the fallback BEFORE stringifying: String(undefined) is the truthy
    // string "undefined", so a fallback placed after String() never triggers.
    const session = String(amendement["session"] ?? UNDEFINED_SESSION);
    // When the amendement carries no signet, derive a folder name from the
    // nature and number of the text it amends.
    const signetDossierLegislatif =
      amendement["signet_dossier_legislatif"] ||
      `${amendement["nature_texte"]}-${amendement["numero_texte"]}`.toLowerCase();
    const amendementFileName = `${amendement["numero"]}.json`;
    const filePath = path.join(ameliReorganizedRootDir, session, signetDossierLegislatif, amendementFileName);
    await fs.outputJSON(filePath, amendement, { spaces: 2 });
  }
}
/**
 * Writes every debat yielded by `findAllDebats` as
 * `<dataDir>/<debats database>/<session>/<id>.json`, where the session is
 * computed by `getSessionFromDate` from the debat's `date_seance`.
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `keepDir`, `verbose` and
 *   `fromSession`; debats whose session is lower than `fromSession` are
 *   skipped (after the verbose message has been printed).
 * @returns {Promise<void>}
 */
async function convertDatasetDebats(dataDir, options) {
  const { database } = datasets.debats;
  if (!options["silent"]) {
    console.log(`Converting database ${database} data into files…`);
  }
  const rootDir = path.join(dataDir, database);
  if (!options.keepDir) {
    ensureAndClearDir(rootDir);
  }
  const fromSession = options["fromSession"];
  for await (const debat of findAllDebats()) {
    if (options["verbose"]) {
      console.log(`Converting ${debat.id} file…`);
    }
    const session = getSessionFromDate(debat.date_seance);
    const isRequested = !(fromSession && session < fromSession);
    if (isRequested) {
      const target = path.join(rootDir, String(session), `${debat.id}.json`);
      await fs.outputJSON(target, debat, { spaces: 2 });
    }
  }
}
/**
 * Writes every dossier yielded by `findAllDossiers`, completed with an
 * `actes_legislatifs` property built by `buildActesLegislatifs`, as
 * `<dataDir>/<dosleg database>/<dossiers folder>/<session>/<signet>.json`,
 * then delegates to `convertTextes` and `convertRapports`.
 *
 * The session comes from `getSessionFromSignet`, falling back to
 * `UNDEFINED_SESSION` when that helper returns a falsy value.
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `keepDir`, `verbose` and
 *   `fromSession`; the object is also forwarded to the textes and rapports
 *   converters.
 * @returns {Promise<void>}
 */
async function convertDatasetDosLeg(dataDir, options) {
  const { database } = datasets.dosleg;
  if (!options["silent"]) {
    console.log(`Converting database ${database} data into files…`);
  }
  const rootDir = path.join(dataDir, database);
  const dossiersDir = path.join(rootDir, DOSLEG_DOSSIERS_FOLDER);
  if (!options.keepDir) {
    // Root first, then the dossiers sub-folder.
    for (const dir of [rootDir, dossiersDir]) {
      ensureAndClearDir(dir);
    }
  }
  for await (const dossier of findAllDossiers()) {
    const signet = dossier["signet"];
    if (options["verbose"]) {
      console.log(`Converting ${signet} file…`);
    }
    const session = getSessionFromSignet(signet) || UNDEFINED_SESSION;
    const fromSession = options["fromSession"];
    if (fromSession && session < fromSession) {
      continue;
    }
    const enrichedDossier = {
      ...dossier,
      actes_legislatifs: buildActesLegislatifs(dossier),
    };
    const target = path.join(dossiersDir, String(session), `${signet}.json`);
    await fs.outputJSON(target, enrichedDossier, { spaces: 2 });
  }
  await convertTextes(dataDir, options);
  await convertRapports(dataDir, options);
}
/**
 * Writes every scrutin yielded by `findAllScrutins(options.fromSession)` as
 * `<dataDir>/<scrutins folder>/<session>/<numero>.json`. A scrutin with a
 * falsy `session` is filed under `UNDEFINED_SESSION`.
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `keepDir`, `fromSession` and
 *   `verbose`.
 * @returns {Promise<void>}
 */
async function convertDatasetScrutins(dataDir, options) {
  if (!options["silent"]) {
    // Scrutins are announced with the name of the dosleg database.
    console.log(`Converting database scrutins (${datasets.dosleg.database}) data into files…`);
  }
  const scrutinsDir = path.join(dataDir, SCRUTINS_FOLDER);
  if (!options.keepDir) {
    ensureAndClearDir(scrutinsDir);
  }
  for await (const scrutin of findAllScrutins(options["fromSession"])) {
    const numero = scrutin["numero"];
    if (options["verbose"]) {
      console.log(`Converting ${numero} file…`);
    }
    const sessionFolder = String(scrutin["session"] || UNDEFINED_SESSION);
    const target = path.join(scrutinsDir, sessionFolder, `${numero}.json`);
    await fs.outputJSON(target, scrutin, { spaces: 2 });
  }
}
/**
 * Writes every question yielded by `findAllQuestions` as
 * `<dataDir>/<questions database>/<legislature>/<reference>.json`, using 0
 * as the folder name when `legislature` is falsy.
 *
 * Writes are scheduled through a `pLimit(10)` limiter and awaited together
 * once the whole iterator has been consumed.
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `keepDir` and `verbose`.
 * @returns {Promise<void>}
 */
async function convertDatasetQuestions(dataDir, options) {
  const { database } = datasets.questions;
  if (!options["silent"]) {
    console.log(`Converting database ${database} data into files…`);
  }
  const rootDir = path.join(dataDir, database);
  if (!options.keepDir) {
    ensureAndClearDir(rootDir);
  }

  // Writes a single question; invoked by the limiter when a slot is free.
  const writeQuestion = async (question) => {
    if (options["verbose"]) {
      console.log(`Converting ${question["reference"]} file…`);
    }
    const legislature = question["legislature"] || 0;
    const target = path.join(rootDir, String(legislature), `${question["reference"]}.json`);
    await fs.outputJSON(target, question, { spaces: 2 });
  };

  const schedule = pLimit(10);
  const pendingWrites = [];
  for await (const question of findAllQuestions()) {
    pendingWrites.push(schedule(() => writeQuestion(question)));
  }
  await Promise.all(pendingWrites);
}
/**
 * For every texte yielded by `findAllTextes` that has a `url`, writes two
 * files in `<dataDir>/<textes folder>/<original folder>/<session>/<name>/`:
 * the texte itself (`<name>.json`) and a metadata file listing the document
 * URLs derived from `<name>`, the file name of `url` without its extension.
 *
 * Textes without a `session` are filed under `UNDEFINED_SESSION`; the
 * metadata keeps the texte's own `session` value. When `fetchDocuments` is
 * set, the metadata is also handed to `processTexte`.
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `fromSession` and `fetchDocuments`;
 *   the object is forwarded to `processTexte`.
 * @returns {Promise<void>}
 */
async function convertTextes(dataDir, options) {
  const originalDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
  const transformedDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
  const enrichedDir = path.join(dataDir, ENRICHED_TEXTE_FOLDER);
  if (!options["silent"]) {
    console.log("Converting database textes data into files…");
  }
  // Only these two origins get an "exposé des motifs" link.
  const originsWithExpose = ["déposé au Sénat", "transmis au Sénat"];
  const jsonOptions = { spaces: 2 };
  for await (const texte of findAllTextes()) {
    const session = texte["session"] ?? UNDEFINED_SESSION;
    const isTooOld = options["fromSession"] && session < options["fromSession"];
    if (isTooOld || !texte["url"]) {
      continue;
    }
    const { name } = path.parse(texte["url"]);
    const texteDir = path.join(originalDir, `${session}`, name);
    let exposeUrl;
    if (originsWithExpose.includes(texte["origine"])) {
      exposeUrl = new URL(`${name}-expose.html`, SENAT_EXPOSE_DES_MOTIFS_BASE_URL);
    }
    const metadata = {
      name,
      session: texte["session"],
      date: texte["date"],
      url_expose_des_motifs: exposeUrl,
      url_xml: new URL(`${name}.akn.xml`, SENAT_TEXTE_XML_BASE_URL),
      url_html: new URL(`${name}.html`, SENAT_TEXTE_BASE_URL),
      url_pdf: new URL(`${name}.pdf`, SENAT_TEXTE_BASE_URL),
    };
    fs.outputJSONSync(path.join(texteDir, `${name}.json`), texte, jsonOptions);
    fs.outputJSONSync(path.join(texteDir, DOCUMENT_METADATA_FILE), metadata, jsonOptions);
    if (options.fetchDocuments) {
      await processTexte(metadata, originalDir, transformedDir, enrichedDir, options);
    }
  }
}
/**
 * For every rapport yielded by `findAllRapports` that has a `url`, writes two
 * files in `<dataDir>/<rapports folder>/<original folder>/<session>/<name>/`:
 * the rapport itself (`<name>.json`) and a metadata file.
 *
 * `<name>` is the file name of `url` without its extension. The metadata URLs
 * keep the directory part of `url`, swap the file name for `<name>_mono.html`
 * and `<name>1.pdf`, and are resolved against `SENAT_RAPPORT_BASE_URL`.
 * Rapports without a `session` are filed under `UNDEFINED_SESSION`. When
 * `fetchDocuments` is set, the metadata is also handed to `processRapport`.
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `fromSession` and `fetchDocuments`;
 *   the object is forwarded to `processRapport`.
 * @returns {Promise<void>}
 */
async function convertRapports(dataDir, options) {
  const originalDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);
  if (!options["silent"]) {
    console.log("Converting database rapports data into files…");
  }
  const jsonOptions = { spaces: 2 };
  for await (const rapport of findAllRapports()) {
    const session = rapport["session"] ?? UNDEFINED_SESSION;
    const isTooOld = options["fromSession"] && session < options["fromSession"];
    if (isTooOld || !rapport["url"]) {
      continue;
    }
    const { dir: urlDir, name } = path.parse(rapport["url"]);
    const rapportDir = path.join(originalDir, `${session}`, name);
    // Builds the absolute URL of a file located next to the rapport's own URL.
    const siblingUrl = (base) => new URL(path.format({ dir: urlDir, base }), SENAT_RAPPORT_BASE_URL);
    const metadata = {
      name,
      session: rapport["session"],
      date: rapport["date"],
      url_html: siblingUrl(`${name}_mono.html`),
      url_pdf: siblingUrl(`${name}1.pdf`),
    };
    fs.outputJSONSync(path.join(rapportDir, `${name}.json`), rapport, jsonOptions);
    fs.outputJSONSync(path.join(rapportDir, DOCUMENT_METADATA_FILE), metadata, jsonOptions);
    if (options.fetchDocuments) {
      await processRapport(metadata, originalDir, options);
    }
  }
}
/**
 * Writes the three kinds of records of the sens database as JSON files under
 * `<dataDir>/<sens database>/`:
 * - sénateurs: `<senateurs folder>/<matricule>.json`
 * - circonscriptions: `<circonscriptions folder>/<identifiant>.json`
 * - organismes: `<organismes folder>/<type_code>/<code>.json`
 *
 * @param {string} dataDir Root directory of the converted data.
 * @param {object} options Reads `silent`, `keepDir` and `verbose`. Unless
 *   `keepDir` is set, the root directory and the three sub-folders are each
 *   passed to `ensureAndClearDir` first.
 * @returns {Promise<void>}
 */
async function convertDatasetSens(dataDir, options) {
  const { database } = datasets.sens;
  if (!options["silent"]) {
    console.log(`Converting database ${database} data into files…`);
  }
  const rootDir = path.join(dataDir, database);
  const senateursDir = path.join(rootDir, SENS_SENATEURS_FOLDER);
  const circonscriptionsDir = path.join(rootDir, SENS_CIRCONSCRIPTIONS_FOLDER);
  const organismesDir = path.join(rootDir, SENS_ORGANISMES_FOLDER);
  if (!options.keepDir) {
    // Root first, then each sub-folder.
    [rootDir, senateursDir, circonscriptionsDir, organismesDir].forEach((dir) => {
      ensureAndClearDir(dir);
    });
  }

  // Prints the per-record progress message when the verbose option is set.
  const announce = (identifier) => {
    if (options["verbose"]) {
      console.log(`Converting ${identifier} file…`);
    }
  };
  // Writes one record with the indentation shared by all output files.
  const writeRecord = (file, record) => {
    fs.outputJSONSync(file, record, { spaces: 2 });
  };

  for await (const sen of findAllSens()) {
    announce(sen["matricule"]);
    writeRecord(path.join(senateursDir, `${sen["matricule"]}.json`), sen);
  }
  for await (const circonscription of findAllCirconscriptions()) {
    announce(circonscription["identifiant"]);
    writeRecord(path.join(circonscriptionsDir, `${circonscription["identifiant"]}.json`), circonscription);
  }
  for await (const organisme of findAllOrganismes()) {
    announce(organisme["code"]);
    writeRecord(path.join(organismesDir, organisme["type_code"], `${organisme["code"]}.json`), organisme);
  }
}
// Script entry point: run the conversion, then exit with the status folded by
// commitAndPushGit() (0: some data changed, 10: no modification). A rejection
// of convertData() is reported on stderr, like every other error of this
// script, and the process exits with status 1.
convertData()
  .then(() => process.exit(exitCode))
  .catch((error) => {
    console.error(error);
    process.exit(1);
  });