@tricoteuses/senat
Version:
Handle French Sénat's open data
107 lines (106 loc) • 4.4 kB
JavaScript
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import { COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDebats } from "../loaders";
import { parseCompteRenduFromFile } from "../model/compte_rendu";
import { getSessionsFromStart } from "../types/sessions";
import { commonOptions } from "./shared/cli_helpers";
import { ensureAndClearDir } from "./shared/util";
// Base URL of the Sénat comptes-rendus; the per-débat path is appended to it when downloading.
const SENAT_COMPTE_RENDU_URL_ROOT = "https://www.senat.fr/seances";

// Flag specific to this script, offered in addition to the shared CLI options.
const parseDebatsOption = {
  help: "parse and convert comptes-rendus des débats into JSON",
  name: "parseDebats",
  type: Boolean,
};

const optionsDefinitions = [...commonOptions, parseDebatsOption];

// Parsed once at module load; the functions of this script read their flags from this object.
const options = commandLineArgs(optionsDefinitions);
/**
 * Error raised when a compte-rendu could not be retrieved.
 *
 * The message embeds both the URL being retrieved and the failure detail.
 */
class CompteRenduError extends Error {
  /**
   * @param {string} message - Failure detail appended to the error message.
   * @param {string} compteRenduUrl - URL of the compte-rendu that was being retrieved.
   */
  constructor(message, compteRenduUrl) {
    super(`An error occurred while retrieving Compte-Rendu ${compteRenduUrl}: ${message}`);
    // Without an explicit name, the error is reported as a generic "Error".
    this.name = "CompteRenduError";
    // Keep the URL accessible to handlers so they do not have to parse the message.
    this.compteRenduUrl = compteRenduUrl;
  }
}
/**
 * Downloads the compte-rendu page of every débat of the given sessions and,
 * when the `parseDebats` option is set, converts each downloaded page to JSON.
 *
 * Original files go to `<dataDir>/<COMPTES_RENDUS_FOLDER>/<DATA_ORIGINAL_FOLDER>/<session>/<debat.id>.html`;
 * parsed output goes to the matching `<DATA_TRANSFORMED_FOLDER>/<session>` directory.
 * A failure on one débat is logged and does not interrupt the others.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions to process; each value is used as a sub-directory name.
 * @returns {Promise<void>}
 */
async function retrieveComptesRendus(dataDir, sessions) {
  const shouldParseDebats = Boolean(options["parseDebats"]);

  const comptesRendusRootDir = path.join(dataDir, COMPTES_RENDUS_FOLDER);
  // NOTE(review): presumably creates the directory and empties it — confirm in ./shared/util.
  ensureAndClearDir(comptesRendusRootDir);

  const originalComptesRendusDir = path.join(comptesRendusRootDir, DATA_ORIGINAL_FOLDER);
  fs.ensureDirSync(originalComptesRendusDir);

  const transformedComptesRendusDir = path.join(comptesRendusRootDir, DATA_TRANSFORMED_FOLDER);
  if (shouldParseDebats) {
    fs.ensureDirSync(transformedComptesRendusDir);
  }

  for (const session of sessions) {
    const originalComptesRendusSessionDir = path.join(originalComptesRendusDir, `${session}`);
    fs.ensureDirSync(originalComptesRendusSessionDir);

    const transformedComptesRendusSessionDir = path.join(transformedComptesRendusDir, `${session}`);
    if (shouldParseDebats) {
      fs.ensureDirSync(transformedComptesRendusSessionDir);
    }

    for (const { item: debat } of iterLoadSenatDebats(dataDir, session)) {
      if (!debat.url) {
        continue;
      }
      try {
        // The single-page ("mono") version sits next to the débat page, named after the débat id.
        const debatMonoUrl = `${path.parse(debat.url).dir}/s${debat.id}_mono.html`;
        const compteRenduPath = path.join(originalComptesRendusSessionDir, `${debat.id}.html`);
        await downloadCompteRendu(debatMonoUrl, compteRenduPath);
        // The download step can return without writing a file (e.g. page not found);
        // only parse when there is actually something on disk.
        if (shouldParseDebats && fs.existsSync(compteRenduPath)) {
          await parseCompteRendu(transformedComptesRendusSessionDir, compteRenduPath, debat);
        }
      } catch (error) {
        // Best effort: report the failure and carry on with the next débat.
        console.error(error);
      }
    }
  }
}
/**
 * Downloads one compte-rendu and writes its raw bytes to disk.
 *
 * Nothing is written when the page is not found (HTTP 404) or when the
 * response body is empty; both cases are reported with a warning.
 *
 * @param {string} debatUrl - Path of the compte-rendu, appended to SENAT_COMPTE_RENDU_URL_ROOT.
 * @param {string} compteRenduPath - Destination file path.
 * @returns {Promise<void>}
 * @throws {CompteRenduError} When the server answers with a non-OK status other than 404.
 */
async function downloadCompteRendu(debatUrl, compteRenduPath) {
  const compteRenduUrl = `${SENAT_COMPTE_RENDU_URL_ROOT}/${debatUrl}`;
  if (!options["silent"]) {
    console.log(`Downloading Compte-Rendu ${compteRenduUrl}…`);
  }

  const response = await fetch(compteRenduUrl);
  if (response.status === 404) {
    // A missing page is tolerated: warn and skip instead of failing.
    console.warn(`Compte-Rendu ${compteRenduUrl} not found`);
    return;
  }
  if (!response.ok) {
    throw new CompteRenduError(String(response.status), compteRenduUrl);
  }

  const compteRenduContent = await response.arrayBuffer();
  // An ArrayBuffer is always truthy, so emptiness has to be checked through its size.
  if (compteRenduContent.byteLength === 0) {
    console.warn(`Compte-Rendu ${compteRenduUrl} is empty`);
    return;
  }
  fs.writeFileSync(compteRenduPath, Buffer.from(compteRenduContent));
}
/**
 * Parses one downloaded compte-rendu file and stores the result as JSON.
 *
 * The output file takes the base name of the input file with a `.json`
 * extension. Nothing is written when the parser returns a falsy value.
 *
 * @param {string} transformedComptesRendusSessionDir - Directory receiving the JSON file.
 * @param {string} compteRenduPath - Path of the downloaded compte-rendu file.
 * @param {object} debat - Débat record, passed through to the parser.
 * @returns {Promise<void>}
 */
async function parseCompteRendu(transformedComptesRendusSessionDir, compteRenduPath, debat) {
  const isVerbose = !options["silent"];
  if (isVerbose) {
    console.log(`Parsing compte-rendu ${compteRenduPath}…`);
  }

  const compteRendu = await parseCompteRenduFromFile(compteRenduPath, debat);
  if (compteRendu) {
    const { name: baseName } = path.parse(compteRenduPath);
    const jsonPath = path.join(transformedComptesRendusSessionDir, `${baseName}.json`);
    fs.writeJSONSync(jsonPath, compteRendu, { spaces: 2 });
  }
}
/**
 * Script body: validates the CLI arguments, resolves the sessions to process
 * and retrieves their comptes-rendus, timing the whole run.
 *
 * The elapsed time is printed only when the `silent` option is not set.
 *
 * @returns {Promise<void>}
 * @throws {AssertionError} When no data directory was given on the command line.
 */
async function main() {
  const timerLabel = "comptes-rendus processing time";

  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");

  const sessions = getSessionsFromStart(options["fromSession"]);

  console.time(timerLabel);
  await retrieveComptesRendus(dataDir, sessions);
  if (options["silent"]) {
    return;
  }
  console.timeEnd(timerLabel);
}
// Script entry point: exit with status 0 on success and 1 on failure.
main()
  .then(() => process.exit(0))
  .catch((error) => {
    // Report the failure on stderr, like the per-débat errors, so that it does
    // not get mixed into the regular progress output on stdout.
    console.error(error);
    process.exit(1);
  });