UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

219 lines (218 loc) 8.65 kB
import assert from "assert";
import { execSync } from "child_process";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
// import fetch from "node-fetch"
import path from "path";
// import stream from "stream"
import StreamZip from "node-stream-zip";
import readline from "readline";
// import util from "util"
import windows1252 from "windows-1252";
import config from "../config";
import { datasets, getChosenDatasets, getEnabledDatasets } from "../datasets";
import { commonOptions } from "./shared/cli_helpers";

// C1 control range (U+0080–U+009F): invalid as text in UTF-8 output, but in
// Windows-1252 these byte values map to printable characters — see the
// --repairEncoding step below.
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;

const optionsDefinitions = [
  ...commonOptions,
  {
    alias: "a",
    help: "all options: fetch, unzip, repair-encoding, import",
    name: "all",
    type: Boolean,
  },
  {
    alias: "c",
    help: "create TypeScript interfaces from databases schemas into src/raw_types_* directories",
    name: "schema",
    type: Boolean,
  },
  {
    alias: "e",
    help: "repair Windows CP 1252 encoding of SQL dumps",
    name: "repairEncoding",
    type: Boolean,
  },
  {
    alias: "f",
    help: "fetch datasets instead of retrieving them from files",
    name: "fetch",
    type: Boolean,
  },
  {
    alias: "i",
    help: "import SQL dumps into a freshly (re-)created database",
    name: "import",
    type: Boolean,
  },
  {
    alias: "S",
    help: "sudo psql commands with given user",
    name: "sudo",
    type: String,
  },
  {
    alias: "z",
    help: "unzip SQL files",
    name: "unzip",
    type: Boolean,
  },
];
const options = commandLineArgs(optionsDefinitions);
// const pipeline = util.promisify(stream.pipeline)

/**
 * Run the requested pipeline steps for a single Sénat open data dataset.
 *
 * Steps (each gated by its CLI flag, or all of them by --all):
 *   fetch           download the dataset ZIP with wget
 *   unzip           extract the SQL dump from the ZIP
 *   repairEncoding  replace stray Windows CP1252 bytes in the SQL dump
 *   import          load the dump into PostgreSQL via psql
 *   schema          generate TypeScript interfaces from the database schema
 *
 * @param {string} dataDir - directory holding the ZIPs and SQL dumps
 * @param {object} dataset - dataset descriptor (url, title, database, schema,
 *   optional repairZip / repairEncoding hooks — see ../datasets)
 */
async function retrieveDataset(dataDir, dataset) {
  const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
  const zipFilePath = path.join(dataDir, zipFilename);
  if (options["all"] || options["fetch"]) {
    // Fetch & save ZIP file.
    if (!options["silent"]) {
      console.log(`Loading ${dataset.title}: ${zipFilename}…`);
    }
    // Fetch fails with OpenSSL error: dh key too small.
    // (so does "curl").
    // const response = await fetch(dataset.url)
    // if (!response.ok) {
    //   console.error(response.status, response.statusText)
    //   console.error(await response.text())
    //   throw new Error(`Fetch failed: ${dataset.url}`)
    // }
    // await pipeline(response.body!, fs.createWriteStream(zipFilePath))
    fs.removeSync(zipFilePath);
    // NOTE(review): dataset.url is interpolated unquoted into a shell
    // command. URLs come from this project's own dataset registry, but
    // quote/escape them if that ever changes.
    execSync(`wget --quiet ${dataset.url}`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
      // stdio: ["ignore", "ignore", "pipe"],
    });
  }
  const sqlFilename = `${dataset.database}.sql`;
  const sqlFilePath = path.join(dataDir, sqlFilename);
  if (options["all"] || options["unzip"]) {
    if (!options["silent"]) {
      console.log(`Unzipping ${dataset.title}: ${zipFilename}…`);
    }
    fs.removeSync(sqlFilePath);
    const zip = new StreamZip({
      file: zipFilePath,
      storeEntries: true,
    });
    await new Promise((resolve, reject) => {
      // Fix: without an "error" handler, a missing or corrupt ZIP never
      // fires "ready" and this promise hangs forever.
      zip.on("error", reject);
      zip.on("ready", () => {
        zip.extract(null, dataDir, (err, _count) => {
          zip.close();
          if (err) {
            reject(err);
          } else {
            resolve(null);
          }
        });
      });
    });
    if (dataset.repairZip !== undefined) {
      if (!options["silent"]) {
        console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`);
      }
      dataset.repairZip(dataset, dataDir);
    }
  }
  if ((options["all"] || options["repairEncoding"]) && dataset.repairEncoding) {
    if (!options["silent"]) {
      console.log(`Repairing Windows CP1252 encoding of ${dataset.title}: ${sqlFilename}…`);
    }
    // Rewrite the dump line by line into a sibling file, decoding any C1-range
    // characters as Windows-1252, then atomically replace the original.
    const repairedSqlFilePath = sqlFilePath + ".repaired";
    const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
      encoding: "utf8",
    });
    const lineReader = readline.createInterface({
      input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
      crlfDelay: Infinity,
    });
    for await (const line of lineReader) {
      repairedSqlWriter.write(
        line.replace(badWindows1252CharacterRegex, (match) =>
          windows1252.decode(match, { mode: "fatal" }),
        ) + "\n",
      );
    }
    // Fix: wait until the writer has flushed everything to disk before
    // moving the file — otherwise fs.move can race with buffered writes
    // and leave a truncated SQL dump.
    await new Promise((resolve, reject) => {
      repairedSqlWriter.once("error", reject);
      repairedSqlWriter.end(resolve);
    });
    await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
  }
  if (options["all"] || options["import"] || options["schema"]) {
    if (!options["silent"]) {
      console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
    }
    execSync(
      `${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d ${dataset.database} -f ${sqlFilename}`,
      {
        cwd: dataDir,
        env: process.env,
        encoding: "utf-8",
        stdio: ["pipe", "ignore", "ignore"],
      },
    );
  }
  if (options["schema"]) {
    let definitionsDir = path.resolve("src", "raw_types_schemats");
    assert(fs.statSync(definitionsDir).isDirectory());
    if (!options["silent"]) {
      console.log(`Creating TypeScript definitions from schema of database ${dataset.database}…`);
    }
    const dbConnectionString = `postgres://${process.env["PGUSER"]}:${process.env["PGPASSWORD"]}@${process.env["PGHOST"]}:${process.env["PGPORT"]}/${dataset.database}`;
    let definitionFilePath = path.join(definitionsDir, `${dataset.database}.ts`);
    execSync(`npx schemats generate -c ${dbConnectionString} -s ${dataset.schema} -o ${definitionFilePath}`, {
      // cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
      // stdio: ["ignore", "ignore", "pipe"],
    });
    // Normalize line endings and strip the generation timestamp so the
    // generated file is stable under version control.
    const definition = fs.readFileSync(definitionFilePath, { encoding: "utf8" });
    const definitionRepaired = definition
      .replace(/\r\n/g, "\n")
      .replace(/AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/, "AUTO-GENERATED FILE");
    fs.writeFileSync(definitionFilePath, definitionRepaired);
    definitionsDir = path.resolve("src", "raw_types");
    definitionFilePath = path.join(definitionsDir, `${dataset.database}.ts`);
    execSync(`kysely-codegen --url ${dbConnectionString} --default-schema=${dataset.schema} --out-file=${definitionFilePath}`, {
      // cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
      // stdio: ["ignore", "ignore", "pipe"],
    });
  }
}

/**
 * Entry point: validate configuration, (re-)create every dataset's database,
 * then run the requested pipeline steps on each chosen dataset.
 *
 * Database credentials come from the PG* environment variables, falling back
 * to ../config (populated from TRICOTEUSES_SENAT_DB_* in .env).
 */
async function retrieveOpenData() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  process.env = {
    ...process.env,
    PGHOST: process.env["PGHOST"] || config.db.host,
    PGPORT: process.env["PGPORT"] || config.db.port,
    PGUSER: process.env["PGUSER"] || config.db.user,
    PGPASSWORD: process.env["PGPASSWORD"] || config.db.password,
  };
  assert(process.env["PGHOST"] && process.env["PGPORT"] && process.env["PGUSER"] && process.env["PGPASSWORD"], "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
  console.time("data extraction time");
  // Drop and recreate every dataset database, even those not chosen, so that
  // a subsequent --import always starts from a clean slate.
  for (const [, dataset] of Object.entries(datasets)) {
    execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "DROP DATABASE IF EXISTS ${dataset.database}"`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
    });
    execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "CREATE DATABASE ${dataset.database} WITH OWNER opendata"`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
    });
  }
  const enabledDatasets = getEnabledDatasets(options["categories"]);
  const chosenDatasets = getChosenDatasets(enabledDatasets);
  for (const dataset of chosenDatasets) {
    await retrieveDataset(dataDir, dataset);
  }
  if (!options["silent"]) {
    console.timeEnd("data extraction time");
  }
}

retrieveOpenData()
  .then(() => process.exit(0))
  .catch((error) => {
    // Fix: report failures on stderr, not stdout.
    console.error(error);
    process.exit(1);
  });