UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

219 lines (218 loc) 8.1 kB
import assert from "assert";
import { execSync } from "child_process";
import { once } from "events";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import readline from "readline";
import StreamZip from "node-stream-zip";
import windows1252 from "windows-1252";

import config from "../config";
import { getChosenFromEnabledDatasets } from "../datasets";

// C1 control characters (0x80–0x9F): their presence betrays a Windows-1252
// SQL dump that was read as if it were UTF-8/Latin-1. Each offending
// character is re-decoded individually in the repair-encoding step below.
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;

const optionsDefinitions = [
  {
    alias: "k",
    defaultValue: ["All"],
    help: "categories of datasets to reorganize",
    multiple: true,
    name: "categories",
    type: String,
  },
  {
    // NOTE(review): the code gates the "schema" step on the explicit -c flag
    // only (it writes into src/ and needs schemats), so --all deliberately
    // excludes it; the help text below reflects what the code actually does.
    alias: "a",
    help: "all options: fetch, unzip, repair-encoding, import",
    name: "all",
    type: Boolean,
  },
  {
    alias: "c",
    help: "create TypeScript interfaces from databases schemas into src/raw_types directory",
    name: "schema",
    type: Boolean,
  },
  {
    alias: "e",
    help: "repair Windows CP 1252 encoding of SQL dumps",
    name: "repair-encoding",
    type: Boolean,
  },
  {
    alias: "f",
    help: "fetch datasets instead of retrieving them from files",
    name: "fetch",
    type: Boolean,
  },
  {
    alias: "i",
    help: "import SQL dumps into a freshly (re-)created database",
    name: "import",
    type: Boolean,
  },
  {
    alias: "z",
    help: "unzip SQL files",
    name: "unzip",
    type: Boolean,
  },
  {
    alias: "s",
    help: "don't log anything",
    name: "silent",
    type: Boolean,
  },
  {
    defaultOption: true,
    help: "directory containing Sénat open data files",
    name: "dataDir",
    type: String,
  },
];

const options = commandLineArgs(optionsDefinitions);

/**
 * Quote a value for safe interpolation into a POSIX shell command line.
 *
 * @param {string} value - raw argument
 * @returns {string} single-quoted, shell-safe argument
 */
function shellQuote(value) {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

/**
 * Download, unzip, re-encode, import and/or generate TypeScript types for a
 * single Sénat dataset, depending on the command-line options.
 *
 * @param {string} dataDir - directory holding the downloaded/extracted files
 * @param {object} dataset - dataset descriptor; the code reads `url`, `title`,
 *   `database`, and the optional hooks `repairZip` and `repairEncoding`
 *   (presumably defined in ../datasets — confirm shape there).
 * @returns {Promise<void>}
 * @throws on download, extraction, encoding-repair or psql/schemats failures
 */
async function retrieveDataset(dataDir, dataset) {
  const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
  const zipFilePath = path.join(dataDir, zipFilename);

  if (options.all || options.fetch) {
    // Fetch & save ZIP file.
    if (!options.silent) {
      console.log(`Loading ${dataset.title}: ${zipFilename}…`);
    }
    // node-fetch fails against the Sénat server with an OpenSSL error
    // ("dh key too small"), and so does curl — hence the shell-out to wget.
    fs.removeSync(zipFilePath);
    // URL is quoted so shell metacharacters in the (config-provided) URL
    // cannot break or subvert the command.
    execSync(`wget --quiet ${shellQuote(dataset.url)}`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
    });
  }

  const sqlFilename = `${dataset.database}.sql`;
  const sqlFilePath = path.join(dataDir, sqlFilename);

  if (options.all || options.unzip) {
    if (!options.silent) {
      console.log(`Unzipping ${dataset.title}: ${zipFilename}…`);
    }
    fs.removeSync(sqlFilePath);
    const zip = new StreamZip({
      file: zipFilePath,
      storeEntries: true,
    });
    await new Promise((resolve, reject) => {
      // Without this handler a corrupt/unreadable archive would emit
      // "error" and leave the Promise pending forever.
      zip.on("error", reject);
      zip.on("ready", () => {
        zip.extract(null, dataDir, (err, _count) => {
          zip.close();
          if (err) {
            reject(err);
          } else {
            resolve(null);
          }
        });
      });
    });
    if (dataset.repairZip !== undefined) {
      if (!options.silent) {
        console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`);
      }
      dataset.repairZip(dataset, dataDir);
    }
  }

  if ((options.all || options["repair-encoding"]) && dataset.repairEncoding) {
    if (!options.silent) {
      console.log(
        `Repairing Windows CP1252 encoding of ${dataset.title}: ${sqlFilename}…`,
      );
    }
    const repairedSqlFilePath = sqlFilePath + ".repaired";
    const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
      encoding: "utf8",
    });
    const lineReader = readline.createInterface({
      input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
      crlfDelay: Infinity,
    });
    for await (const line of lineReader) {
      const repairedLine =
        line.replace(badWindows1252CharacterRegex, (match) =>
          windows1252.decode(match, { mode: "fatal" }),
        ) + "\n";
      // Honor backpressure so a multi-hundred-MB dump doesn't buffer
      // entirely in memory.
      if (!repairedSqlWriter.write(repairedLine)) {
        await once(repairedSqlWriter, "drain");
      }
    }
    // Wait for the stream to flush to disk before moving the file;
    // otherwise the move races with pending writes and can truncate it.
    await new Promise((resolve, reject) => {
      repairedSqlWriter.on("error", reject);
      repairedSqlWriter.end(resolve);
    });
    await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
  }

  if (options.all || options.import) {
    if (!options.silent) {
      console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
    }
    // dataset.database comes from the project's own datasets config,
    // not from user input.
    execSync(`psql -c "DROP DATABASE IF EXISTS ${dataset.database}"`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
    });
    execSync(
      `psql -c "CREATE DATABASE ${dataset.database} WITH OWNER opendata"`,
      {
        cwd: dataDir,
        env: process.env,
        encoding: "utf-8",
      },
    );
    execSync(`psql -f ${sqlFilename} ${dataset.database}`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
    });
  }

  if (options.schema) {
    const definitionsDir = path.resolve("src", "raw_types");
    assert(fs.statSync(definitionsDir).isDirectory());
    if (!options.silent) {
      console.log(
        `Creating TypeScript definitions from schema of database ${dataset.database}…`,
      );
    }
    const dbConnectionString = `postgres://${process.env.PGUSER}:${process.env.PGPASSWORD}@${process.env.PGHOST}:${process.env.PGPORT}/${dataset.database}`;
    const definitionFilePath = path.join(
      definitionsDir,
      `${dataset.database}.ts`,
    );
    execSync(
      `npx schemats generate -c ${dbConnectionString} -s public -o ${definitionFilePath}`,
      {
        env: process.env,
        encoding: "utf-8",
      },
    );
    // Normalize line endings and strip the generation timestamp so the
    // generated file is stable under version control.
    const definition = fs.readFileSync(definitionFilePath, {
      encoding: "utf8",
    });
    const definitionRepaired = definition
      .replace(/\r\n/g, "\n")
      .replace(
        /AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/,
        "AUTO-GENERATED FILE",
      );
    fs.writeFileSync(definitionFilePath, definitionRepaired);
  }
}

/**
 * Entry point: resolve the database configuration, pick the datasets matching
 * the requested categories, and process them sequentially.
 *
 * @returns {Promise<void>}
 * @throws when the data directory argument or database configuration is missing
 */
async function retrieveOpenData() {
  const dataDir = options.dataDir;
  assert(dataDir, "Missing argument: data directory");
  // Propagate config values to the PG* environment variables read by psql
  // and by the schemats connection string above.
  process.env = {
    ...process.env,
    PGHOST: process.env.PGHOST || config.db.host,
    PGPORT: process.env.PGPORT || config.db.port,
    PGUSER: process.env.PGUSER || config.db.user,
    PGPASSWORD: process.env.PGPASSWORD || config.db.password,
  };
  assert(
    process.env.PGHOST &&
      process.env.PGPORT &&
      process.env.PGUSER &&
      process.env.PGPASSWORD,
    "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file",
  );
  const chosenDatasets = getChosenFromEnabledDatasets(options.categories);
  // Datasets are processed one at a time: parallel wget/psql runs against
  // the same server and database cluster are not worth the contention.
  for (const dataset of chosenDatasets) {
    await retrieveDataset(dataDir, dataset);
  }
}

retrieveOpenData()
  .then(() => process.exit(0))
  .catch((error) => {
    console.log(error);
    process.exit(1);
  });