@tricoteuses/senat
Version:
Handle French Sénat's open data
254 lines (253 loc) • 11.2 kB
JavaScript
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
import { commonOptions } from "./shared/cli_helpers";
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
// Command-line options accepted by this script: the shared `commonOptions`
// followed by the options that only make sense for document retrieval.
const optionsDefinitions = [
  ...commonOptions,
  {
    name: "parseDocuments",
    type: Boolean,
    help: "parse and convert documents into JSON (textes only for now, requires format xml)",
  },
  {
    name: "formats",
    alias: "F",
    type: String,
    multiple: true,
    help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
  },
  {
    name: "types",
    type: String,
    multiple: true,
    help: "types of documents to retrieve (textes/rapports); leave empty for all",
  },
  {
    name: "force",
    type: Boolean,
    help: "force retrieve all documents, even already retrieved ones",
  },
];

// Parsed once at module load; every function below reads its flags from here.
const options = commandLineArgs(optionsDefinitions);

// Single decoder reused to turn downloaded bytes into text before parsing.
const textDecoder = new TextDecoder("utf8");
/**
 * Downloads the textes listed by `iterLoadSenatDossiersLegislatifsTexteUrls`
 * for every given session and, when the `parseDocuments` option is set,
 * converts the XML version of each texte to JSON through `parseDocument`.
 *
 * Files are written under `<dataDir>/<TEXTE_FOLDER>/<DATA_ORIGINAL_FOLDER>/<session>/<name>/`.
 * Each requested format (xml, html, pdf) is handled independently: a file
 * already on disk is kept unless `force` is set, and a failed download of one
 * format does not prevent the other formats of the same texte from being
 * retrieved (previously a missing XML made the loop skip the HTML and PDF).
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable<*>} sessions - Sessions whose textes must be retrieved.
 * @returns {Promise<void>}
 */
async function retrieveTextes(dataDir, sessions) {
  const textesDir = path.join(dataDir, TEXTE_FOLDER);
  fs.ensureDirSync(textesDir);
  const originalTextesDir = path.join(textesDir, DATA_ORIGINAL_FOLDER);
  const transformedTextesDir = path.join(textesDir, DATA_TRANSFORMED_FOLDER);
  if (options["parseDocuments"]) {
    ensureAndClearDir(transformedTextesDir);
  }
  let retrievedTextesCount = 0;
  const texteUrlsNotFoundOrError = [];
  const texteUrlsParseError = [];

  // Retrieves one format of one texte. Resolves to the target path, the
  // downloaded bytes (null when the file on disk was reused or the download
  // failed) and whether a usable file is now present on disk.
  const retrieveTexteFormat = async (texteDir, texteName, extension, url) => {
    const textePath = path.join(texteDir, `${texteName}.${extension}`);
    if (!options["force"] && fs.existsSync(textePath)) {
      if (!options["silent"]) {
        console.info(`Already downloaded texte ${textePath}…`);
      }
      return { textePath, texteBuffer: null, isAvailable: true };
    }
    const texteBuffer = await downloadDocument(url.toString());
    if (!texteBuffer) {
      texteUrlsNotFoundOrError.push(url);
      return { textePath, texteBuffer: null, isAvailable: false };
    }
    fs.writeFileSync(textePath, Buffer.from(texteBuffer));
    retrievedTextesCount++;
    return { textePath, texteBuffer, isAvailable: true };
  };

  for (const session of sessions) {
    for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
      const texteDir = path.join(
        originalTextesDir,
        `${texteMetadata.session ?? UNDEFINED_SESSION}`,
        texteMetadata.name,
      );
      fs.ensureDirSync(texteDir);

      // NOTE: the exposé des motifs is fetched on every run, even without
      // `force`, because its content is handed to the parser below.
      let exposeDesMotifsContent = null;
      if (texteMetadata.url_expose_des_motifs) {
        exposeDesMotifsContent = await downloadExposeDesMotifs(
          texteDir,
          texteMetadata.name,
          String(texteMetadata.url_expose_des_motifs),
        );
      }

      if (isOptionEmptyOrHasValue(options["formats"], "xml")) {
        const { textePath, texteBuffer, isAvailable } = await retrieveTexteFormat(
          texteDir,
          texteMetadata.name,
          "xml",
          texteMetadata.url_xml,
        );
        // Only parse when an XML file exists (freshly downloaded or reused).
        if (isAvailable && options["parseDocuments"]) {
          const parsedTexte = await parseDocument(
            texteMetadata.session,
            transformedTextesDir,
            textePath,
            texteMetadata.name,
            texteBuffer,
            exposeDesMotifsContent,
          );
          if (!parsedTexte) {
            texteUrlsParseError.push(texteMetadata.url_xml);
          }
        }
      }
      if (isOptionEmptyOrHasValue(options["formats"], "html")) {
        await retrieveTexteFormat(texteDir, texteMetadata.name, "html", texteMetadata.url_html);
      }
      if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
        await retrieveTexteFormat(texteDir, texteMetadata.name, "pdf", texteMetadata.url_pdf);
      }
    }
  }

  if (options["verbose"]) {
    console.log(`${retrievedTextesCount} textes retrieved`);
    console.log(`${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`);
    if (options["parseDocuments"]) {
      console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`);
    }
  }
}
/**
 * Downloads the rapports listed by `iterLoadSenatDossiersLegislatifsRapportUrls`
 * for every given session, in the requested formats (html and/or pdf).
 *
 * Files are written under `<dataDir>/<RAPPORT_FOLDER>/<DATA_ORIGINAL_FOLDER>/<session>/<name>/`.
 * The two formats are handled independently: previously an already-downloaded
 * (or failed) HTML made the loop `continue` to the next rapport, so the PDF of
 * that rapport was never retrieved.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable<*>} sessions - Sessions whose rapports must be retrieved.
 * @returns {Promise<void>}
 */
async function retrieveRapports(dataDir, sessions) {
  const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
  fs.ensureDirSync(rapportsDir);
  const originalRapportsDir = path.join(rapportsDir, DATA_ORIGINAL_FOLDER);
  let retrievedRapportsCount = 0;
  const rapportUrlsNotFoundOrError = [];

  // Retrieves one format of one rapport: reuses the file on disk unless
  // `force` is set, otherwise downloads it and records failures.
  const retrieveRapportFormat = async (rapportDir, rapportName, extension, url) => {
    const rapportPath = path.join(rapportDir, `${rapportName}.${extension}`);
    if (!options["force"] && fs.existsSync(rapportPath)) {
      if (!options["silent"]) {
        console.info(`Already downloaded rapport ${rapportPath}…`);
      }
      return;
    }
    const rapportBuffer = await downloadDocument(url.toString());
    if (!rapportBuffer) {
      rapportUrlsNotFoundOrError.push(url);
      return;
    }
    fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
    retrievedRapportsCount++;
  };

  for (const session of sessions) {
    for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
      const rapportDir = path.join(
        originalRapportsDir,
        `${rapportMetadata.session ?? UNDEFINED_SESSION}`,
        rapportMetadata.name,
      );
      fs.ensureDirSync(rapportDir);
      if (isOptionEmptyOrHasValue(options["formats"], "html")) {
        await retrieveRapportFormat(rapportDir, rapportMetadata.name, "html", rapportMetadata.url_html);
      }
      if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
        await retrieveRapportFormat(rapportDir, rapportMetadata.name, "pdf", rapportMetadata.url_pdf);
      }
    }
  }

  if (options["verbose"]) {
    console.log(`${retrievedRapportsCount} rapports retrieved`);
    console.log(`${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`);
  }
}
/**
 * Fetches the exposé des motifs of a texte and stores it next to the texte as
 * `<texteName>-expose.html`.
 *
 * @param {string} texteDir - Directory holding the original files of the texte.
 * @param {string} texteName - Texte name, used as file name prefix.
 * @param {string} url - URL of the exposé des motifs.
 * @returns {Promise<ArrayBuffer|null>} What `downloadDocument` resolved to, or
 *   `null` when it yielded nothing (in which case no file is written).
 */
async function downloadExposeDesMotifs(texteDir, texteName, url) {
  const exposeBytes = await downloadDocument(url);
  if (exposeBytes) {
    const targetFile = path.join(texteDir, `${texteName}-expose.html`);
    fs.writeFileSync(targetFile, Buffer.from(exposeBytes));
    return exposeBytes;
  }
  return null;
}
/**
 * Downloads a document through `fetchWithRetry` and returns its raw bytes.
 *
 * This function never rejects: every failure (non-OK HTTP status, error thrown
 * by the fetch, error while reading the response body) is logged and reported
 * to the caller as `null`.
 *
 * @param {string} documentUrl - URL of the document to download.
 * @returns {Promise<ArrayBuffer|null>} The response body, or `null` on failure.
 */
async function downloadDocument(documentUrl) {
  if (!options["silent"]) {
    console.log(`Downloading document ${documentUrl}…`);
  }
  try {
    const response = await fetchWithRetry(documentUrl);
    if (!response.ok) {
      if (response.status === 404) {
        // This helper also serves rapports and exposés des motifs, so the
        // message names a generic "document" rather than a "texte".
        console.warn(`Document ${documentUrl} not found`);
      } else {
        console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
      }
      return null;
    }
    // `await` is required: returning the bare promise would let a failure
    // while reading the body bypass the `catch` below and reject the caller.
    return await response.arrayBuffer();
  } catch (error) {
    // Guard against non-Error values being thrown.
    console.error(error?.message ?? error);
    return null;
  }
}
/**
 * Parses one texte and writes the result as
 * `<transformedTextesDir>/<session>/<texteName>/<texteName>.json`.
 *
 * The XML comes from `texteBuffer` when one is given, otherwise it is read
 * from `textePath`. When `exposeDesMotifs` is provided it is decoded, parsed
 * and attached to the result as `exposeDesMotifs` before the JSON is written.
 *
 * @param {*} session - Session of the texte; `UNDEFINED_SESSION` is used in the
 *   output path when it is null or undefined.
 * @param {string} transformedTextesDir - Root directory of the JSON output.
 * @param {string} textePath - Path of the XML file on disk.
 * @param {string} texteName - Texte name, used for the directory and file name.
 * @param {ArrayBuffer|null} texteBuffer - Freshly downloaded XML bytes, if any.
 * @param {ArrayBuffer|null} [exposeDesMotifs=null] - Exposé des motifs bytes, if any.
 * @returns {Promise<object|null>} The parsed texte, or `null` when the parser
 *   yielded nothing (nothing is written in that case).
 */
async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null) {
  if (!options["silent"]) {
    console.log(`Parsing texte ${textePath}…`);
  }

  // Prefer the in-memory bytes; fall back to the file already on disk.
  const texte = texteBuffer
    ? parseTexte(textDecoder.decode(texteBuffer))
    : await parseTexteFromFile(textePath);
  if (!texte) {
    return null;
  }

  if (exposeDesMotifs) {
    if (!options["silent"]) {
      console.log("Parsing exposé des motifs…");
    }
    texte.exposeDesMotifs = parseExposeDesMotifs(textDecoder.decode(exposeDesMotifs));
  }

  const outputDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
  fs.ensureDirSync(outputDir);
  const outputFile = path.join(outputDir, `${texteName}.json`);
  fs.writeJSONSync(outputFile, texte, { spaces: 2 });
  return texte;
}
/**
 * Script entry point: checks that a data directory was given, resolves the
 * sessions to process from the `fromSession` option, then runs the retrieval
 * of each document type selected by the `types` option (textes first, then
 * rapports). The elapsed time is printed unless `silent` is set.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");
  const sessions = getSessionsFromStart(options["fromSession"]);
  const timerLabel = "documents processing time";
  console.time(timerLabel);

  // Ordered list of document types and the function retrieving each of them.
  const retrieversByType = [
    ["textes", retrieveTextes],
    ["rapports", retrieveRapports],
  ];
  for (const [documentType, retrieve] of retrieversByType) {
    if (isOptionEmptyOrHasValue(options["types"], documentType)) {
      await retrieve(dataDir, sessions);
    }
  }

  if (!options["silent"]) {
    console.timeEnd(timerLabel);
  }
}
// Run the script and turn its outcome into the process exit code:
// 0 on success, 1 (after logging the error) on failure.
const exitWithSuccess = () => process.exit(0);
const exitWithFailure = (error) => {
  console.log(error);
  process.exit(1);
};
main().then(exitWithSuccess).catch(exitWithFailure);