// @tricoteuses/senat
// Version:
// Handle French Sénat's open data
// 270 lines (269 loc) • 11.8 kB
// JavaScript
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import { DateTime } from "luxon";
import path from "path";
import { convertSenatXmlToHtml } from "../conversion_textes";
import * as git from "../git";
import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, ENRICHED_TEXTE_FOLDER, iterLoadSenatRapportUrls, iterLoadSenatTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../parsers/texte";
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
import { commonOptions } from "./shared/cli_helpers";
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
// Process exit status. It starts at "no modification" and is updated by
// commitAndPushGit() from the code returned by git.commitAndPush(); the
// entry point at the bottom of the file passes it to process.exit().
let exitCode = 10; // 0: some data changed, 10: no modification
// Command-line option definitions: the shared options from
// ./shared/cli_helpers followed by the options specific to this script.
const optionsDefinitions = [
...commonOptions,
{
// Restricts which file formats are downloaded; checked through
// isOptionEmptyOrHasValue() in processTexte() and processRapport().
alias: "F",
help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
multiple: true,
name: "formats",
type: String,
},
{
// Restricts which document families are processed; checked in main().
help: "types of documents to retrieve (textes/rapports); leave empty for all",
multiple: true,
name: "types",
type: String,
},
{
// When set, shouldDownload() always answers true.
help: "force retrieve all documents, even already retrieved ones",
name: "force",
type: Boolean,
},
];
// Parsed once, when the module is loaded.
const options = commandLineArgs(optionsDefinitions);
// Shared decoder used to turn downloaded buffers (XML, exposé HTML) into strings.
const textDecoder = new TextDecoder("utf8");
// Reference instant for isDocumentRecent(); captured once at module load.
const today = DateTime.now();
/**
 * Commit and push a dataset directory when the `commit` option is set, then
 * fold the returned code into the module-level `exitCode`.
 *
 * Folding rules:
 * - while `exitCode` is 10, any code other than 10 replaces it;
 * - while `exitCode` is 0, only a code that is neither 0 nor 10 replaces it;
 * - any other `exitCode` is kept as is.
 *
 * @param {string} datasetDir - Directory handed to `git.commitAndPush`.
 */
function commitAndPushGit(datasetDir) {
  if (!options["commit"]) {
    return;
  }
  const status = git.commitAndPush(datasetDir, "Nouvelle moisson", options["remote"]);
  let replacesExitCode = false;
  switch (exitCode) {
    case 10:
      replacesExitCode = status !== 10;
      break;
    case 0:
      replacesExitCode = status !== 0 && status !== 10;
      break;
    default:
      break;
  }
  if (replacesExitCode) {
    exitCode = status;
  }
}
/**
 * Tell whether a document date lies within `daysThreshold` days of `today`.
 *
 * @param {string} documentDate - Date passed to `DateTime.fromISO`; a falsy
 *   value yields `false`.
 * @param {number} daysThreshold - Maximum age in days (inclusive).
 * @returns {boolean} `false` for a missing or unparsable date, otherwise
 *   whether the difference `today - documentDate` in days is at most the threshold.
 */
function isDocumentRecent(documentDate, daysThreshold) {
  if (!documentDate) {
    return false;
  }
  const parsedDate = DateTime.fromISO(documentDate);
  if (!parsedDate.isValid) {
    return false;
  }
  const ageInDays = today.diff(parsedDate, "days").days;
  return ageInDays <= daysThreshold;
}
/**
 * Decide whether a document has to be (re)downloaded.
 *
 * @param {string} filePath - Local destination of the document.
 * @param {string} docDate - Document date, forwarded to `isDocumentRecent`.
 * @param {object} options - Reads `force` and `onlyRecent`.
 * @returns {boolean} `true` when `force` is set or the file is absent;
 *   otherwise the recency of the document when `onlyRecent` is defined,
 *   and `false` when it is not.
 */
function shouldDownload(filePath, docDate, options) {
  // The file system is only consulted when the download is not forced.
  const alreadyOnDisk = !options.force && fs.existsSync(filePath);
  if (!alreadyOnDisk) {
    return true;
  }
  return options.onlyRecent === undefined
    ? false
    : isDocumentRecent(docDate, options.onlyRecent);
}
/**
 * Fetch a document and return its body.
 *
 * @param {string} documentUrl - URL passed to `fetchWithRetry`.
 * @param {boolean} verbose - When truthy, logs the download and any
 *   non-OK HTTP status (404 as a warning, other statuses as errors).
 * @returns {Promise<ArrayBuffer|null>} The response body, or `null` when the
 *   response is not OK or when fetching or reading the body fails.
 */
async function downloadDocument(documentUrl, verbose) {
  if (verbose) {
    console.log(`Downloading document ${documentUrl}…`);
  }
  try {
    const response = await fetchWithRetry(documentUrl);
    if (!response.ok) {
      if (verbose) {
        if (response.status === 404) {
          console.warn(`Document ${documentUrl} not found`);
        } else {
          console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
        }
      }
      return null;
    }
    // `await` is required here: without it a failure while reading the body
    // rejects after the function has left the try block, bypassing the catch
    // below and surfacing in the caller instead of yielding null.
    return await response.arrayBuffer();
  } catch (error) {
    console.error(error?.message ?? error);
    return null;
  }
}
/**
 * Download one document to `destPath` unless `shouldDownload` says it can be skipped.
 *
 * @param {string} url - Remote location of the document.
 * @param {string} destPath - Local file the document is written to.
 * @param {string} docDate - Document date, forwarded to `shouldDownload`.
 * @param {object} options - Download options; `verbose` is read here.
 * @returns {Promise<{success: boolean, skipped: boolean, buffer: Buffer|null}>}
 *   `skipped` when nothing was downloaded, `success: false` when the download
 *   failed, otherwise the written content in `buffer`.
 */
async function processDocument(url, destPath, docDate, options) {
  if (shouldDownload(destPath, docDate, options)) {
    const payload = await downloadDocument(url, options.verbose);
    if (!payload) {
      return { success: false, skipped: false, buffer: null };
    }
    const buffer = Buffer.from(payload);
    await fs.outputFile(destPath, buffer);
    return { success: true, skipped: false, buffer };
  }
  if (options.verbose) {
    console.info(`Already downloaded ${destPath}…`);
  }
  return { success: true, skipped: true, buffer: null };
}
/**
 * Download the files of one texte (exposé des motifs, XML, HTML, PDF) and,
 * when `options.parseDocuments` is set, parse the XML to JSON and convert it
 * to enriched HTML.
 *
 * Files are stored under `<originalTextesDir>/<session>/<name>/`; the parsed
 * JSON goes under `transformedTextesDir` and the enriched HTML under
 * `enrichedTextesDir`, using the same `<session>/<name>/` layout.
 *
 * @param {object} texteMetadata - Reads `session`, `name`, `date`,
 *   `url_expose_des_motifs`, `url_xml`, `url_html` and `url_pdf`.
 * @param {string} originalTextesDir - Root directory of the downloaded files.
 * @param {string} transformedTextesDir - Root directory of the parsed JSON.
 * @param {string} enrichedTextesDir - Root directory of the enriched HTML.
 * @param {object} options - Reads `formats`, `parseDocuments` and `verbose`;
 *   the whole object is forwarded to `processDocument` and `parseDocument`.
 * @returns {Promise<void>}
 */
export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, enrichedTextesDir, options) {
  const sessionFolder = `${texteMetadata.session ?? UNDEFINED_SESSION}`;
  const texteDir = path.join(originalTextesDir, sessionFolder, texteMetadata.name);
  // Pre-compute whether the parsed JSON output already exists, to avoid re-parsing unchanged files
  const parsedJsonPath = path.join(transformedTextesDir, sessionFolder, texteMetadata.name, `${texteMetadata.name}.json`);
  const parsedOutputExists = options.parseDocuments ? await fs.pathExists(parsedJsonPath) : false;

  let exposePath = null;
  let exposeDesMotifsContent = null;
  if (texteMetadata.url_expose_des_motifs) {
    exposePath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
    const res = await processDocument(texteMetadata.url_expose_des_motifs.toString(), exposePath, texteMetadata.date, options);
    exposeDesMotifsContent = res.buffer ?? null;
  }

  const formats = [
    { type: "xml", url: texteMetadata.url_xml, isParseTarget: true },
    { type: "html", url: texteMetadata.url_html, isParseTarget: false },
    { type: "pdf", url: texteMetadata.url_pdf, isParseTarget: false },
  ];
  for (const format of formats) {
    if (!isOptionEmptyOrHasValue(options.formats, format.type)) {
      continue;
    }
    if (format.url == null || format.url.toString().includes("#")) {
      continue;
    }
    const destPath = path.join(texteDir, `${texteMetadata.name}.${format.type}`);
    const result = await processDocument(format.url.toString(), destPath, texteMetadata.date, options);

    // Specific logic: Parsing (Only applies to XML)
    if (!format.isParseTarget || !options.parseDocuments) {
      continue;
    }
    // Skip re-parsing if the XML was not newly downloaded AND the parsed output already exists
    const needsParsing = !result.skipped || !parsedOutputExists;
    const xmlAvailable = needsParsing && (result.buffer !== null || (await fs.pathExists(destPath)));
    if (!xmlAvailable) {
      if (options.verbose) {
        console.info(`Skipping parse for already processed texte ${texteMetadata.name}…`);
      }
      continue;
    }

    // Load the exposé from disk only now that a parse is certain. Doing it at
    // parse time (rather than deciding up front) keeps `expose_motifs` in the
    // rewritten JSON whenever a copy exists on disk, including when the
    // exposé download was skipped or failed during this run.
    if (exposeDesMotifsContent === null && exposePath !== null && (await fs.pathExists(exposePath))) {
      exposeDesMotifsContent = await fs.readFile(exposePath);
    }

    await parseDocument(texteMetadata.session, transformedTextesDir, destPath, texteMetadata.name, result.buffer, exposeDesMotifsContent, options);

    const texteXmlContent = result.buffer !== null
      ? textDecoder.decode(result.buffer)
      : await fs.readFile(destPath, "utf-8");
    try {
      await convertSenatXmlToHtml(texteXmlContent, path.join(enrichedTextesDir, sessionFolder, texteMetadata.name, `${texteMetadata.name}.html`));
    } catch (error) {
      // Best effort: a failed conversion must not stop the harvest.
      console.error(`Error converting ${texteMetadata.name} to HTML: ${error.message}`);
    }
  }
}
/**
 * Download the requested formats (HTML, PDF) of one rapport into
 * `<originalRapportsDir>/<session>/<name>/<name>.<format>`.
 *
 * @param {object} rapportMetadata - Reads `session`, `name`, `date`,
 *   `url_html` and `url_pdf`.
 * @param {string} originalRapportsDir - Root directory of the downloaded files.
 * @param {object} options - Reads `formats`; the whole object is forwarded to
 *   `processDocument`.
 * @returns {Promise<void>}
 */
export async function processRapport(rapportMetadata, originalRapportsDir, options) {
  const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
  const formats = [
    { type: "html", url: rapportMetadata.url_html },
    { type: "pdf", url: rapportMetadata.url_pdf },
  ];
  for (const format of formats) {
    if (!isOptionEmptyOrHasValue(options["formats"], format.type)) {
      continue;
    }
    // A rapport without this format has no URL: skip it rather than crash on
    // `.toString()`, as processTexte does for textes.
    if (format.url == null) {
      continue;
    }
    const destPath = path.join(rapportDir, `${rapportMetadata.name}.${format.type}`);
    await processDocument(format.url.toString(), destPath, rapportMetadata.date, options);
  }
}
/**
 * Process every texte of the given sessions, then commit the results.
 *
 * When documents are parsed and no `only-recent` window is set, the
 * transformed and enriched directories are emptied first.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions handed one by one to `iterLoadSenatTexteUrls`.
 * @returns {Promise<void>}
 */
async function processTextes(dataDir, sessions) {
  const textesRoot = path.join(dataDir, TEXTE_FOLDER);
  const originalRoot = path.join(textesRoot, DATA_ORIGINAL_FOLDER);
  const transformedRoot = path.join(textesRoot, DATA_TRANSFORMED_FOLDER);
  const enrichedRoot = path.join(dataDir, ENRICHED_TEXTE_FOLDER);

  const { force, silent, verbose, formats, parseDocuments, "only-recent": onlyRecent } = options;

  if (parseDocuments && onlyRecent === undefined) {
    for (const staleDir of [transformedRoot, enrichedRoot]) {
      ensureAndClearDir(staleDir);
    }
  }

  const downloadSettings = { force, silent, verbose, onlyRecent, formats, parseDocuments };
  for (const session of sessions) {
    for (const entry of iterLoadSenatTexteUrls(dataDir, session)) {
      await processTexte(entry.item, originalRoot, transformedRoot, enrichedRoot, downloadSettings);
    }
  }

  for (const repositoryDir of [textesRoot, enrichedRoot]) {
    commitAndPushGit(repositoryDir);
  }
}
/**
 * Process every rapport of the given sessions, then commit the rapports directory.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions handed one by one to `iterLoadSenatRapportUrls`.
 * @returns {Promise<void>}
 */
async function processRapports(dataDir, sessions) {
  const rapportsRoot = path.join(dataDir, RAPPORT_FOLDER);
  const originalRoot = path.join(rapportsRoot, DATA_ORIGINAL_FOLDER);

  const { force, silent, verbose, formats, "only-recent": onlyRecent } = options;
  const downloadSettings = { force, silent, verbose, onlyRecent, formats };

  for (const session of sessions) {
    for (const entry of iterLoadSenatRapportUrls(dataDir, session)) {
      await processRapport(entry.item, originalRoot, downloadSettings);
    }
  }
  commitAndPushGit(rapportsRoot);
}
/**
 * Parse one texte and write the result as
 * `<transformedTextesDir>/<session>/<texteName>/<texteName>.json`.
 *
 * @param {*} session - Session of the texte; `UNDEFINED_SESSION` is used when nullish.
 * @param {string} transformedTextesDir - Root directory of the parsed JSON.
 * @param {string} textePath - Path of the XML file, read when no buffer is given.
 * @param {string} texteName - Name of the texte, used for the output folder and file.
 * @param {*} texteBuffer - XML content; when falsy the file at `textePath` is parsed instead.
 * @param {*} [exposeDesMotifs=null] - HTML content of the exposé des motifs; when
 *   truthy it is parsed and stored as `expose_motifs`.
 * @param {object} [options={}] - Reads `verbose`.
 * @returns {Promise<object|null>} The parsed texte, or `null` when the XML file
 *   is missing or the parser returns nothing.
 */
async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null, options = {}) {
  const { verbose } = options;
  if (verbose) {
    console.log(`Parsing texte ${textePath}…`);
  }

  let parsedTexte;
  if (!texteBuffer) {
    const xmlOnDisk = await fs.pathExists(textePath);
    if (!xmlOnDisk) {
      if (verbose) {
        console.warn(`Skipping parse for missing XML file: ${textePath}`);
      }
      return null;
    }
    parsedTexte = await parseTexteFromFile(textePath);
  } else {
    parsedTexte = parseTexte(textDecoder.decode(texteBuffer));
  }
  if (!parsedTexte) {
    return null;
  }

  if (exposeDesMotifs) {
    if (verbose) {
      console.log("Parsing exposé des motifs…");
    }
    parsedTexte.expose_motifs = parseExposeDesMotifs(textDecoder.decode(exposeDesMotifs));
  }

  const outputFile = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName, `${texteName}.json`);
  await fs.outputJSON(outputFile, parsedTexte, { spaces: 2 });
  return parsedTexte;
}
/**
 * Script entry point: process the textes and/or rapports of every session
 * starting at the `fromSession` option, and report the elapsed time unless
 * the `silent` option is set.
 *
 * @returns {Promise<void>} Rejects when the `dataDir` option is missing.
 */
async function main() {
  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");
  const sessions = getSessionsFromStart(options["fromSession"]);

  const timerLabel = "documents processing time";
  console.time(timerLabel);

  // Document types in processing order, each with its handler.
  const pipelines = [
    ["textes", processTextes],
    ["rapports", processRapports],
  ];
  for (const [documentType, run] of pipelines) {
    if (isOptionEmptyOrHasValue(options["types"], documentType)) {
      await run(dataDir, sessions);
    }
  }

  if (!options["silent"]) {
    console.timeEnd(timerLabel);
  }
}
// Run main() only when this module is the script being executed; importing
// it for its exports (processTexte, processRapport) must stay side-effect free.
// `process.argv[1]` is undefined when Node runs without a script path
// (e.g. `node -e`, REPL), hence the optional chaining.
// NOTE(review): the check only matches the TypeScript source name, so the
// compiled `.js` file never runs main() when launched directly; confirm this is intended.
if (process.argv[1]?.endsWith("retrieve_documents.ts")) {
  main()
    .then(() => process.exit(exitCode))
    .catch((error) => {
      // Fatal errors go to stderr, like the other error reports in this file.
      console.error(error);
      process.exit(1);
    });
}