UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

259 lines (246 loc) 7.51 kB
import assert from "assert"
import { execSync } from "child_process"
import commandLineArgs from "command-line-args"
import fs from "fs-extra"
// import fetch from "node-fetch"
import path from "path"
// import stream from "stream"
import StreamZip from "node-stream-zip"
import readline from "readline"
// import util from "util"
import windows1252 from "windows-1252"

import config from "../config"
import { Dataset, getChosenFromEnabledDatasets } from '../datasets'

/**
 * Matches code points U+0080–U+009F. The repair-encoding step passes every
 * match through `windows1252.decode`.
 */
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g

/** Command-line options understood by this script (parsed by command-line-args). */
const optionsDefinitions = [
  {
    alias: 'k',
    defaultValue: ['All'],
    help: 'categories of datasets to reorganize',
    multiple: true,
    name: 'categories',
    type: String,
  },
  {
    alias: "a",
    help: "all options: fetch, unzip, repair-encoding, import, schema",
    name: "all",
    type: Boolean,
  },
  {
    alias: "c",
    help: "create TypeScript interfaces from databases schemas into src/raw_types directory",
    name: "schema",
    type: Boolean,
  },
  {
    alias: "e",
    help: "repair Windows CP 1252 encoding of SQL dumps",
    name: "repair-encoding",
    type: Boolean,
  },
  {
    alias: "f",
    help: "fetch datasets instead of retrieving them from files",
    name: "fetch",
    type: Boolean,
  },
  {
    alias: "i",
    help: "import SQL dumps into a freshly (re-)created database",
    name: "import",
    type: Boolean,
  },
  {
    alias: "z",
    help: "unzip SQL files",
    name: "unzip",
    type: Boolean,
  },
  {
    alias: "s",
    help: "don't log anything",
    name: "silent",
    type: Boolean,
  },
  {
    defaultOption: true,
    help: "directory containing Sénat open data files",
    name: "dataDir",
    type: String,
  },
]
const options = commandLineArgs(optionsDefinitions)

// const pipeline = util.promisify(stream.pipeline)

/** Minimal event-emitter surface needed by `waitForStreamEvent`. */
interface StreamEventSource {
  once(event: string, listener: (...args: unknown[]) => void): unknown
  removeListener(event: string, listener: (...args: unknown[]) => void): unknown
}

/** Prints a progress message unless the `silent` option was given. */
function log(message: string): void {
  if (!options.silent) {
    console.log(message)
  }
}

/**
 * Resolves the first time `source` emits `event`, or rejects if it emits
 * "error" first. Whichever listener did not fire is removed again.
 */
function waitForStreamEvent(
  source: StreamEventSource,
  event: string,
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const onEvent = (): void => {
      source.removeListener("error", onError)
      resolve()
    }
    const onError = (error: unknown): void => {
      source.removeListener(event, onEvent)
      reject(error)
    }
    source.once(event, onEvent)
    source.once("error", onError)
  })
}

/** Downloads the dataset ZIP archive into `dataDir` with wget, replacing any previous copy. */
function fetchZip(
  dataDir: string,
  dataset: Dataset,
  zipFilePath: string,
): void {
  // Fetch fails with OpenSSL error: dh key too small.
  // (so does "curl").
  // const response = await fetch(dataset.url)
  // if (!response.ok) {
  //   console.error(response.status, response.statusText)
  //   console.error(await response.text())
  //   throw new Error(`Fetch failed: ${dataset.url}`)
  // }
  // await pipeline(response.body!, fs.createWriteStream(zipFilePath))
  fs.removeSync(zipFilePath)
  execSync(`wget --quiet ${dataset.url}`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
    // stdio: ["ignore", "ignore", "pipe"],
  })
}

/**
 * Extracts every entry of the archive at `zipFilePath` into `dataDir`.
 *
 * Rejects when the archive cannot be opened or read (the "error" event) or
 * when extraction reports an error.
 */
async function extractZip(zipFilePath: string, dataDir: string): Promise<void> {
  const zip = new StreamZip({
    file: zipFilePath,
    storeEntries: true,
  })
  await new Promise<void>((resolve, reject) => {
    // Without this handler a missing or corrupt archive would leave the
    // promise pending forever (or crash on an unhandled "error" event).
    zip.on("error", reject)
    zip.on("ready", () => {
      zip.extract(null, dataDir, (err?: unknown, _count?: number) => {
        zip.close()
        if (err) {
          reject(err)
        } else {
          resolve()
        }
      })
    })
  })
}

/**
 * Rewrites the SQL dump at `sqlFilePath` line by line, replacing every
 * character matched by `badWindows1252CharacterRegex` with its
 * `windows1252.decode` result, then moves the repaired file over the original.
 */
async function repairSqlEncoding(sqlFilePath: string): Promise<void> {
  const repairedSqlFilePath = sqlFilePath + ".repaired"
  const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
    encoding: "utf8",
  })
  const lineReader = readline.createInterface({
    input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
    crlfDelay: Infinity,
  })
  for await (const line of lineReader) {
    const repairedLine = line.replace(badWindows1252CharacterRegex, (match) =>
      windows1252.decode(match, { mode: "fatal" }),
    )
    // write() returns false when the writer's buffer is full: wait for it to
    // drain so that a large dump is not accumulated in memory.
    if (!repairedSqlWriter.write(repairedLine + "\n")) {
      await waitForStreamEvent(repairedSqlWriter, "drain")
    }
  }
  // Wait until everything has been flushed before moving the file;
  // otherwise the move (or the final process.exit) could happen while data
  // is still buffered.
  const flushed = waitForStreamEvent(repairedSqlWriter, "finish")
  repairedSqlWriter.end()
  await flushed
  await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true })
}

/** Drops and re-creates the dataset database, then loads the SQL dump into it with psql. */
function importSqlDump(
  dataDir: string,
  dataset: Dataset,
  sqlFilename: string,
): void {
  execSync(`psql -c "DROP DATABASE IF EXISTS ${dataset.database}"`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
    // stdio: ["ignore", "ignore", "pipe"],
  })
  execSync(
    `psql -c "CREATE DATABASE ${dataset.database} WITH OWNER opendata"`,
    {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
      // stdio: ["ignore", "ignore", "pipe"],
    },
  )
  execSync(`psql -f ${sqlFilename} ${dataset.database}`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
    // stdio: ["ignore", "ignore", "pipe"],
  })
}

/**
 * Generates `src/raw_types/<database>.ts` from the database schema with
 * schemats, then normalizes the generated file (LF line endings, no
 * generation timestamp in the header).
 *
 * Throws if `src/raw_types` (resolved from the current working directory)
 * is not an existing directory.
 */
function generateSchemaDefinitions(dataset: Dataset): void {
  const definitionsDir = path.resolve("src", "raw_types")
  assert(fs.statSync(definitionsDir).isDirectory())
  log(
    `Creating TypeScript definitions from schema of database ${dataset.database}…`,
  )
  const dbConnectionString = `postgres://${process.env.PGUSER}:${process.env.PGPASSWORD}@${process.env.PGHOST}:${process.env.PGPORT}/${dataset.database}`
  const definitionFilePath = path.join(
    definitionsDir,
    `${dataset.database}.ts`,
  )
  execSync(
    `npx schemats generate -c ${dbConnectionString} -s public -o ${definitionFilePath}`,
    {
      // cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
      // stdio: ["ignore", "ignore", "pipe"],
    },
  )
  const definition = fs.readFileSync(definitionFilePath, { encoding: "utf8" })
  // Strip what changes from one run to the next (CRLF line endings and the
  // generation timestamp) from the generated file.
  const definitionRepaired = definition
    .replace(/\r\n/g, "\n")
    .replace(
      /AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/,
      "AUTO-GENERATED FILE",
    )
  fs.writeFileSync(definitionFilePath, definitionRepaired)
}

/**
 * Runs the steps selected on the command line for one dataset, in order:
 * fetch, unzip, repair-encoding, import, schema.
 *
 * @param dataDir Directory where the ZIP archive and the SQL dump are stored.
 * @param dataset Dataset to process.
 */
async function retrieveDataset(
  dataDir: string,
  dataset: Dataset,
): Promise<void> {
  const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1)
  const zipFilePath = path.join(dataDir, zipFilename)
  if (options.all || options.fetch) {
    // Fetch & save ZIP file.
    log(`Loading ${dataset.title}: ${zipFilename}…`)
    fetchZip(dataDir, dataset, zipFilePath)
  }

  const sqlFilename = `${dataset.database}.sql`
  const sqlFilePath = path.join(dataDir, sqlFilename)
  if (options.all || options.unzip) {
    log(`Unzipping ${dataset.title}: ${zipFilename}…`)
    fs.removeSync(sqlFilePath)
    await extractZip(zipFilePath, dataDir)
    if (dataset.repairZip !== undefined) {
      log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`)
      dataset.repairZip(dataset, dataDir)
    }
  }

  if ((options.all || options["repair-encoding"]) && dataset.repairEncoding) {
    log(
      `Repairing Windows CP1252 encoding of ${dataset.title}: ${sqlFilename}…`,
    )
    await repairSqlEncoding(sqlFilePath)
  }

  if (options.all || options.import) {
    log(`Importing ${dataset.title}: ${sqlFilename}…`)
    importSqlDump(dataDir, dataset, sqlFilename)
  }

  // NOTE(review): the help text of --all lists "schema", but this step only
  // runs when --schema is given explicitly. Confirm which one is intended.
  if (options.schema) {
    generateSchemaDefinitions(dataset)
  }
}

/**
 * Script entry point: checks the command-line and database configuration,
 * then processes the chosen datasets one after the other.
 *
 * Throws (assertion) when the data directory argument or any of the
 * PGHOST / PGPORT / PGUSER / PGPASSWORD settings is missing.
 */
async function retrieveOpenData(): Promise<void> {
  const dataDir = options.dataDir
  assert(dataDir, "Missing argument: data directory")

  // Environment variables take precedence over the values from the config
  // module; the result is handed to every child process through
  // `env: process.env`.
  process.env = {
    ...process.env,
    PGHOST: process.env.PGHOST || config.db.host,
    PGPORT: process.env.PGPORT || config.db.port,
    PGUSER: process.env.PGUSER || config.db.user,
    PGPASSWORD: process.env.PGPASSWORD || config.db.password
  }

  assert(process.env.PGHOST
    && process.env.PGPORT
    && process.env.PGUSER
    && process.env.PGPASSWORD,
    'Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file'
  )

  const chosenDatasets: Dataset[] = getChosenFromEnabledDatasets(options.categories)
  // await Promise.all(chosenDatasets.map(dataset => retrieveDataset(dataDir, dataset)))
  for (const dataset of chosenDatasets) {
    await retrieveDataset(dataDir, dataset)
  }
}

retrieveOpenData()
  .then(() => process.exit(0))
  .catch((error) => {
    console.log(error)
    process.exit(1)
  })