UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

254 lines (253 loc) 11.2 kB
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import {
  DATA_ORIGINAL_FOLDER,
  DATA_TRANSFORMED_FOLDER,
  iterLoadSenatDossiersLegislatifsRapportUrls,
  iterLoadSenatDossiersLegislatifsTexteUrls,
  RAPPORT_FOLDER,
  TEXTE_FOLDER,
} from "../loaders";
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
import { commonOptions } from "./shared/cli_helpers";
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";

// Command-line options specific to this script, appended to the shared ones.
// NOTE(review): `dataDir`, `fromSession`, `silent` and `verbose` are read below
// but not declared here; presumably they come from `commonOptions` — confirm.
const optionsDefinitions = [
  ...commonOptions,
  {
    help: "parse and convert documents into JSON (textes only for now, requires format xml)",
    name: "parseDocuments",
    type: Boolean,
  },
  {
    alias: "F",
    help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
    multiple: true,
    name: "formats",
    type: String,
  },
  {
    help: "types of documents to retrieve (textes/rapports); leave empty for all",
    multiple: true,
    name: "types",
    type: String,
  },
  {
    help: "force retrieve all documents, even already retrieved ones",
    name: "force",
    type: Boolean,
  },
];
const options = commandLineArgs(optionsDefinitions);
const textDecoder = new TextDecoder("utf8");

// Formats tried for each kind of document, in retrieval order.
const TEXTE_FORMATS = ["xml", "html", "pdf"];
const RAPPORT_FORMATS = ["html", "pdf"];

/**
 * Ensure one document file is present on disk, downloading it when needed.
 *
 * The download is skipped when the file already exists and `--force` is not set.
 *
 * @param {string} label - Kind of document, used only in the "already downloaded" log line.
 * @param {string} filePath - Destination path of the file.
 * @param {{ toString(): string }} url - Source URL; only `toString()` is used here.
 * @returns {Promise<{ status: "cached" | "downloaded" | "failed", buffer: ArrayBuffer | null }>}
 *   `buffer` is the downloaded content when `status` is "downloaded", otherwise `null`.
 */
async function fetchDocumentToFile(label, filePath, url) {
  if (!options["force"] && fs.existsSync(filePath)) {
    if (!options["silent"]) {
      console.info(`Already downloaded ${label} ${filePath}…`);
    }
    return { status: "cached", buffer: null };
  }
  const buffer = await downloadDocument(url.toString());
  if (!buffer) {
    return { status: "failed", buffer: null };
  }
  fs.writeFileSync(filePath, Buffer.from(buffer));
  return { status: "downloaded", buffer };
}

/**
 * Download the textes (and their exposé des motifs, when a URL is given) of the
 * given sessions into `<dataDir>/<TEXTE_FOLDER>/<DATA_ORIGINAL_FOLDER>`, and
 * optionally parse the XML version into JSON.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions passed one by one to the texte URL loader.
 * @returns {Promise<void>}
 */
async function retrieveTextes(dataDir, sessions) {
  const textesDir = path.join(dataDir, TEXTE_FOLDER);
  fs.ensureDirSync(textesDir);
  const originalTextesDir = path.join(textesDir, DATA_ORIGINAL_FOLDER);
  const transformedTextesDir = path.join(textesDir, DATA_TRANSFORMED_FOLDER);
  if (options["parseDocuments"]) {
    ensureAndClearDir(transformedTextesDir);
  }

  let retrievedTextesCount = 0;
  const texteUrlsNotFoundOrError = [];
  const texteUrlsParseError = [];

  for (const session of sessions) {
    for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
      const texteDir = path.join(
        originalTextesDir,
        `${texteMetadata.session ?? UNDEFINED_SESSION}`,
        texteMetadata.name,
      );
      fs.ensureDirSync(texteDir);

      let exposeDesMotifsContent = null;
      if (texteMetadata.url_expose_des_motifs) {
        exposeDesMotifsContent = await downloadExposeDesMotifs(
          texteDir,
          texteMetadata.name,
          String(texteMetadata.url_expose_des_motifs),
        );
      }

      for (const format of TEXTE_FORMATS) {
        if (!isOptionEmptyOrHasValue(options["formats"], format)) {
          continue;
        }
        const url = texteMetadata[`url_${format}`];
        const textePath = path.join(texteDir, `${texteMetadata.name}.${format}`);
        const { status, buffer } = await fetchDocumentToFile("texte", textePath, url);
        if (status === "failed") {
          texteUrlsNotFoundOrError.push(url);
          // NOTE(review): as before, a failed download skips the remaining
          // formats of this texte — confirm this is intended.
          break;
        }
        if (status === "downloaded") {
          retrievedTextesCount++;
        }
        // Only the XML version is parsed. When the file was already on disk,
        // `buffer` is null and parseDocument reads it from `textePath` instead.
        if (format === "xml" && options["parseDocuments"]) {
          const parsedTexte = await parseDocument(
            texteMetadata.session,
            transformedTextesDir,
            textePath,
            texteMetadata.name,
            buffer,
            exposeDesMotifsContent,
          );
          if (!parsedTexte) {
            texteUrlsParseError.push(url);
          }
        }
      }
    }
  }

  if (options["verbose"]) {
    console.log(`${retrievedTextesCount} textes retrieved`);
    console.log(
      `${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`,
    );
    if (options["parseDocuments"]) {
      console.log(
        `${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`,
      );
    }
  }
}

/**
 * Download the rapports of the given sessions into
 * `<dataDir>/<RAPPORT_FOLDER>/<DATA_ORIGINAL_FOLDER>`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions passed one by one to the rapport URL loader.
 * @returns {Promise<void>}
 */
async function retrieveRapports(dataDir, sessions) {
  const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
  fs.ensureDirSync(rapportsDir);
  const originalRapportsDir = path.join(rapportsDir, DATA_ORIGINAL_FOLDER);

  let retrievedRapportsCount = 0;
  const rapportUrlsNotFoundOrError = [];

  for (const session of sessions) {
    for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
      const rapportDir = path.join(
        originalRapportsDir,
        `${rapportMetadata.session ?? UNDEFINED_SESSION}`,
        rapportMetadata.name,
      );
      fs.ensureDirSync(rapportDir);

      for (const format of RAPPORT_FORMATS) {
        if (!isOptionEmptyOrHasValue(options["formats"], format)) {
          continue;
        }
        const url = rapportMetadata[`url_${format}`];
        const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.${format}`);
        // An already-present file only skips this format: the next format is
        // still checked, so an existing HTML no longer hides a missing PDF.
        const { status } = await fetchDocumentToFile("rapport", rapportPath, url);
        if (status === "failed") {
          rapportUrlsNotFoundOrError.push(url);
          // NOTE(review): as before, a failed download skips the remaining
          // formats of this rapport — confirm this is intended.
          break;
        }
        if (status === "downloaded") {
          retrievedRapportsCount++;
        }
      }
    }
  }

  if (options["verbose"]) {
    console.log(`${retrievedRapportsCount} rapports retrieved`);
    console.log(
      `${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`,
    );
  }
}

/**
 * Download an exposé des motifs and store it as `<texteName>-expose.html`
 * inside `texteDir`. The download is always attempted, whatever `--force` is.
 *
 * @param {string} texteDir - Directory of the texte the exposé belongs to.
 * @param {string} texteName - Name of the texte, used to build the file name.
 * @param {string} url - URL of the exposé des motifs.
 * @returns {Promise<ArrayBuffer | null>} The downloaded content, or `null` on failure.
 */
async function downloadExposeDesMotifs(texteDir, texteName, url) {
  const content = await downloadDocument(url);
  if (!content) {
    return null;
  }
  const exposeDesMotifsPath = path.join(texteDir, `${texteName}-expose.html`);
  fs.writeFileSync(exposeDesMotifsPath, Buffer.from(content));
  return content;
}

/**
 * Fetch a document and return its raw content.
 *
 * Failures are logged and reported as `null`; this function does not throw.
 *
 * @param {string} documentUrl - URL to fetch.
 * @returns {Promise<ArrayBuffer | null>} The response body, or `null` when the
 *   response is not OK or the request/body read fails.
 */
async function downloadDocument(documentUrl) {
  if (!options["silent"]) {
    console.log(`Downloading document ${documentUrl}…`);
  }
  try {
    const response = await fetchWithRetry(documentUrl);
    if (!response.ok) {
      if (response.status === 404) {
        console.warn(`Texte ${documentUrl} not found`);
      } else {
        console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
      }
      return null;
    }
    // Awaited inside the try so that a failure while reading the body is
    // caught below instead of rejecting in the caller.
    return await response.arrayBuffer();
  } catch (error) {
    console.error(error.message);
    return null;
  }
}

/**
 * Parse a texte (and its exposé des motifs, when given) and write the result as
 * `<transformedTextesDir>/<session>/<texteName>/<texteName>.json`.
 *
 * @param {*} session - Session of the texte; `UNDEFINED_SESSION` is used when nullish.
 * @param {string} transformedTextesDir - Root directory of the JSON output.
 * @param {string} textePath - Path of the XML file, read when `texteBuffer` is null.
 * @param {string} texteName - Name of the texte, used for the directory and file name.
 * @param {ArrayBuffer | null} texteBuffer - XML content just downloaded, if any.
 * @param {ArrayBuffer | null} [exposeDesMotifs=null] - HTML content of the exposé, if any.
 * @returns {Promise<object | null>} The parsed texte, or `null` when parsing yields nothing.
 */
async function parseDocument(
  session,
  transformedTextesDir,
  textePath,
  texteName,
  texteBuffer,
  exposeDesMotifs = null,
) {
  if (!options["silent"]) {
    console.log(`Parsing texte ${textePath}…`);
  }

  let parsedTexte;
  if (texteBuffer) {
    const texteXml = textDecoder.decode(texteBuffer);
    parsedTexte = parseTexte(texteXml);
  } else {
    parsedTexte = await parseTexteFromFile(textePath);
  }
  if (!parsedTexte) {
    return null;
  }

  if (exposeDesMotifs) {
    if (!options["silent"]) {
      console.log("Parsing exposé des motifs…");
    }
    const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifs);
    parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
  }

  const transformedTexteDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
  fs.ensureDirSync(transformedTexteDir);
  fs.writeJSONSync(path.join(transformedTexteDir, `${texteName}.json`), parsedTexte, { spaces: 2 });
  return parsedTexte;
}

/**
 * Entry point: retrieve the requested types of documents for every session
 * returned by `getSessionsFromStart(options["fromSession"])`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  const sessions = getSessionsFromStart(options["fromSession"]);

  console.time("documents processing time");
  if (isOptionEmptyOrHasValue(options["types"], "textes")) {
    await retrieveTextes(dataDir, sessions);
  }
  if (isOptionEmptyOrHasValue(options["types"], "rapports")) {
    await retrieveRapports(dataDir, sessions);
  }
  if (!options["silent"]) {
    console.timeEnd("documents processing time");
  }
}

main()
  .then(() => process.exit(0))
  .catch((error) => {
    console.log(error);
    process.exit(1);
  });