UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

317 lines (316 loc) 13.7 kB
import assert from "assert";
import { execSync } from "child_process";
import { once } from "events";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import StreamZip from "node-stream-zip";
import readline from "readline";
import * as windows1252 from "windows-1252";
import { pipeline } from "stream";
import { promisify } from "util";
import config from "../config";
import { getChosenDatasets, getEnabledDatasets } from "../datasets";
import { commonOptions } from "./shared/cli_helpers";

// Code points U+0080..U+009F: what bytes 0x80..0x9F become when a file is
// decoded as latin1. They are re-decoded through windows-1252 below.
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;

// Matches the word `public` when it is followed by a dot, a comma, a
// semicolon, whitespace or the end of the line.
const publicSchemaRegex = /\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g;

const optionsDefinitions = [
  ...commonOptions,
  {
    alias: "a",
    help: "all options: fetch, unzip, repair-encoding, import",
    name: "all",
    type: Boolean,
  },
  {
    alias: "c",
    help: "create TypeScript interfaces from databases schemas into src/raw_types_* directories",
    name: "schema",
    type: Boolean,
  },
  {
    alias: "e",
    help: "repair Windows CP 1252 encoding of SQL dumps",
    name: "repairEncoding",
    type: Boolean,
  },
  {
    alias: "f",
    help: "fetch datasets instead of retrieving them from files",
    name: "fetch",
    type: Boolean,
  },
  {
    alias: "i",
    help: "import SQL dumps into a freshly (re-)created database",
    name: "import",
    type: Boolean,
  },
  {
    alias: "S",
    help: "sudo psql commands with given user",
    name: "sudo",
    type: String,
  },
  {
    alias: "z",
    help: "unzip SQL files",
    name: "unzip",
    type: Boolean,
  },
];
const options = commandLineArgs(optionsDefinitions);
const streamPipeline = promisify(pipeline);

/**
 * Build the `sudo -u <user> ` prefix for shell commands.
 *
 * @param {object} opts - Parsed command-line options.
 * @returns {string} The prefix (with trailing space), or "" when no sudo user is set.
 */
function sudoPrefix(opts) {
  return opts["sudo"] ? `sudo -u ${opts["sudo"]} ` : "";
}

/**
 * Percent-encode a value for use in the userinfo part of a URL.
 * `encodeURIComponent` leaves `!'()*` untouched; they are encoded too so the
 * resulting connection string contains no quote or glob characters.
 *
 * @param {string | undefined} value - Raw credential.
 * @returns {string} Percent-encoded credential.
 */
function encodeCredential(value) {
  return encodeURIComponent(String(value)).replace(
    /[!'()*]/g,
    (char) => `%${char.charCodeAt(0).toString(16).toUpperCase()}`,
  );
}

/**
 * Build the connection URL of the `senat` database from the PG* environment
 * variables.
 *
 * @returns {string} A `postgres://user:password@host:port/senat` URL.
 */
function buildSenatConnectionString() {
  const { PGUSER, PGPASSWORD, PGHOST, PGPORT } = process.env;
  return `postgres://${encodeCredential(PGUSER)}:${encodeCredential(PGPASSWORD)}@${PGHOST}:${PGPORT}/senat`;
}

/**
 * Write one line (plus "\n") to a writable stream, waiting for "drain" when
 * the stream's internal buffer is full so large dumps are not held in memory.
 *
 * @param {import("stream").Writable} writer - Destination stream.
 * @param {string} line - Line content without its terminator.
 * @returns {Promise<void>}
 */
async function writeLine(writer, line) {
  if (!writer.write(`${line}\n`)) {
    await once(writer, "drain");
  }
}

/**
 * End a writable stream and wait until it emitted "finish".
 * Rejects if the stream emits "error" while waiting.
 *
 * @param {import("stream").Writable} writer - Stream to close.
 * @returns {Promise<void>}
 */
async function closeWriter(writer) {
  // Subscribe before calling end() so the event cannot be missed.
  const finished = once(writer, "finish");
  writer.end();
  await finished;
}

/**
 * Download `url` and stream the response body to the file `dest`.
 *
 * @param {string} url - URL to fetch.
 * @param {string} dest - Path of the file to write.
 * @returns {Promise<void>}
 * @throws {Error} When the HTTP response is not OK or has no body.
 */
async function downloadFile(url, dest) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Download failed ${response.status} ${response.statusText} for ${url}`);
  }
  if (!response.body) {
    throw new Error(`Download failed: empty response body for ${url}`);
  }
  await streamPipeline(response.body, fs.createWriteStream(dest));
}

/**
 * Replace the `public` schema name by `schema` everywhere outside
 * single-quoted strings of one line.
 *
 * The quote state is tracked per line only: a string literal spanning several
 * lines is not recognized as such.
 *
 * @param {string} line - One line of SQL.
 * @param {string} schema - Replacement schema name.
 * @returns {string} The rewritten line.
 */
function replacePublicOutsideStrings(line, schema) {
  // Splitting on a captured quote keeps the quotes as separate array items.
  const parts = line.split(/(')/);
  let inString = false;
  for (let i = 0; i < parts.length; i++) {
    if (parts[i] === "'") {
      inString = !inString;
    } else if (!inString) {
      // Function replacer: `$` sequences in the schema name stay literal.
      parts[i] = parts[i].replace(publicSchemaRegex, () => schema);
    }
  }
  return parts.join("");
}

/**
 * Copy a dataset database to the main Senat database (overwriting its contents).
 *
 * Rewrites `<dataDir>/<database>.sql` into `<dataDir>/<database>_schema_dump.sql`
 * (schema (re-)creation header, `public` replaced by the dataset schema,
 * client encoding switched to UTF8), then feeds that file to `psql -d senat`.
 * A psql failure is reported (unless silent) but does not reject.
 *
 * @param {object} dataset - Dataset description; `dataset.database` is used as schema name.
 * @param {string} dataDir - Directory containing the SQL dump.
 * @param {object} options - Parsed command-line options (`silent`, `sudo`).
 * @returns {Promise<void>}
 */
async function copyToSenat(dataset, dataDir, options) {
  if (!options["silent"]) {
    console.log(`Copying ${dataset.database} to Senat database...`);
  }
  const sqlFilePath = path.join(dataDir, `${dataset.database}.sql`);
  const schemaDumpFile = path.join(dataDir, `${dataset.database}_schema_dump.sql`);

  // Write the header and then stream the rest of the SQL file.
  const schemaSqlWriter = fs.createWriteStream(schemaDumpFile, { encoding: "utf8" });
  schemaSqlWriter.write(`DROP SCHEMA IF EXISTS ${dataset.database} CASCADE;\n`);
  schemaSqlWriter.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
  schemaSqlWriter.write(`GRANT USAGE ON SCHEMA ${dataset.database} TO ${config.db.user};\n`);
  schemaSqlWriter.write(`GRANT SELECT ON ALL TABLES IN SCHEMA ${dataset.database} TO ${config.db.user};\n`);
  schemaSqlWriter.write(
    `ALTER DEFAULT PRIVILEGES IN SCHEMA ${dataset.database} GRANT SELECT ON TABLES TO ${config.db.user};\n`,
  );

  const lineReader = readline.createInterface({
    input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
    crlfDelay: Infinity,
  });
  for await (const line of lineReader) {
    const newLine = replacePublicOutsideStrings(line, dataset.database)
      // The rewritten dump is written as UTF-8, so declare it as such.
      .replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
    await writeLine(schemaSqlWriter, newLine);
  }
  // psql must only read the file once it is completely flushed.
  await closeWriter(schemaSqlWriter);

  try {
    execSync(`${sudoPrefix(options)}psql --quiet -d senat -f ${schemaDumpFile}`, {
      env: process.env,
      encoding: "utf-8",
      stdio: ["ignore", "pipe", "pipe"],
    });
  } catch (error) {
    // Best effort: a failed import is reported but does not abort the run.
    if (!options["silent"]) {
      console.error(`Failed to import ${dataset.database} schema:`);
      if (error.stderr) {
        console.error(error.stderr);
      }
      if (error.stdout) {
        console.error(error.stdout);
      }
    }
  }
}

/**
 * Extract every entry of a ZIP archive into a directory.
 *
 * @param {string} zipFilePath - Path of the archive.
 * @param {string} destDir - Directory receiving the extracted files.
 * @returns {Promise<null>} Resolves once extraction is done; rejects on
 *   archive or extraction error.
 */
function extractZip(zipFilePath, destDir) {
  const zip = new StreamZip({
    file: zipFilePath,
    storeEntries: true,
  });
  return new Promise((resolve, reject) => {
    // Without this listener a missing or corrupt archive would leave the
    // promise pending forever.
    zip.on("error", reject);
    zip.on("ready", () => {
      zip.extract(null, destDir, (err, _count) => {
        zip.close();
        if (err) {
          reject(err);
        } else {
          resolve(null);
        }
      });
    });
  });
}

/**
 * Rewrite an SQL dump in place as UTF-8: the file is read as latin1 and the
 * characters U+0080..U+009F are decoded through windows-1252.
 *
 * @param {string} sqlFilePath - Path of the dump to repair.
 * @returns {Promise<void>}
 */
async function repairSqlEncoding(sqlFilePath) {
  const repairedSqlFilePath = sqlFilePath + ".repaired";
  const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
    encoding: "utf8",
  });
  // Read the file as latin1 (ISO-8859-1/CP1252) and write as UTF-8.
  const lineReader = readline.createInterface({
    input: fs.createReadStream(sqlFilePath, { encoding: "latin1" }),
    crlfDelay: Infinity,
  });
  for await (const line of lineReader) {
    const repairedLine = line.replace(badWindows1252CharacterRegex, (match) =>
      windows1252.decode(match, { mode: "fatal" }),
    );
    await writeLine(repairedSqlWriter, repairedLine);
  }
  // Wait for the flush before replacing the original, otherwise the next
  // step could read a truncated file.
  await closeWriter(repairedSqlWriter);
  await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
}

/**
 * Create the indexes declared in `dataset.indexes` (an object mapping a table
 * name to a list of `{ name, columns }`), one psql call per index.
 * A failing index is logged and does not stop the others.
 *
 * @param {object} dataset - Dataset description.
 */
function createDatasetIndexes(dataset) {
  const schema = dataset.database;
  for (const [table, indexes] of Object.entries(dataset.indexes)) {
    for (const index of indexes) {
      const indexName = index.name;
      const columns = index.columns.join(", ");
      const sql = `CREATE INDEX IF NOT EXISTS ${indexName} ON ${schema}.${table} (${columns});`;
      try {
        execSync(`${sudoPrefix(options)}psql --quiet -d senat -c "${sql}"`, {
          env: process.env,
          encoding: "utf-8",
          stdio: ["ignore", "ignore", "pipe"],
        });
        if (!options["silent"]) {
          console.log(`Created index: ${indexName} on ${schema}.${table} (${columns})`);
        }
      } catch (err) {
        console.error(`Failed to create index ${indexName} on ${schema}.${table}:`, err);
      }
    }
  }
}

/**
 * Generate the TypeScript definitions of one dataset schema with `schemats`
 * (into src/raw_types_schemats) and `kysely-codegen` (into src/raw_types).
 *
 * @param {object} dataset - Dataset description; `dataset.database` is the schema name.
 */
function generateDatasetTypes(dataset) {
  const schematsDir = path.resolve("src", "raw_types_schemats");
  assert(fs.statSync(schematsDir).isDirectory());
  if (!options["silent"]) {
    console.log(`Creating TypeScript definitions from schema '${dataset.database}' in database 'senat'…`);
  }
  const dbConnectionString = buildSenatConnectionString();

  const schematsFilePath = path.join(schematsDir, `${dataset.database}.ts`);
  execSync(`npx schemats generate -c ${dbConnectionString} -s ${dataset.database} -o ${schematsFilePath}`, {
    env: process.env,
    encoding: "utf-8",
  });
  // Normalize line endings and drop the generation timestamp from the header.
  const definition = fs.readFileSync(schematsFilePath, { encoding: "utf8" });
  const definitionRepaired = definition
    .replace(/\r\n/g, "\n")
    .replace(/AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/, "AUTO-GENERATED FILE");
  fs.writeFileSync(schematsFilePath, definitionRepaired);

  const kyselyFilePath = path.join(path.resolve("src", "raw_types"), `${dataset.database}.ts`);
  execSync(
    `npx kysely-codegen --url '${dbConnectionString}' --default-schema ${dataset.database} --include-pattern '${dataset.database}.*' --out-file ${kyselyFilePath}`,
    {
      env: process.env,
      encoding: "utf-8",
    },
  );
}

/**
 * Run the steps selected on the command line for one dataset: fetch, unzip,
 * repair encoding, import (with index creation) and schema generation.
 *
 * @param {string} dataDir - Working directory for ZIP and SQL files.
 * @param {object} dataset - Dataset description (`url`, `title`, `database`,
 *   optional `repairZip`, `repairEncoding`, `indexes`).
 * @returns {Promise<void>}
 */
async function retrieveDataset(dataDir, dataset) {
  const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
  const zipFilePath = path.join(dataDir, zipFilename);
  if (options["all"] || options["fetch"]) {
    // Fetch & save ZIP file.
    if (!options["silent"]) {
      console.log(`Loading ${dataset.title}: ${zipFilename}…`);
    }
    fs.removeSync(zipFilePath);
    await downloadFile(dataset.url, zipFilePath);
  }

  const sqlFilename = `${dataset.database}.sql`;
  const sqlFilePath = path.join(dataDir, sqlFilename);
  if (options["all"] || options["unzip"]) {
    if (!options["silent"]) {
      console.log(`Unzipping ${dataset.title}: ${zipFilename}…`);
    }
    fs.removeSync(sqlFilePath);
    await extractZip(zipFilePath, dataDir);
    if (dataset.repairZip !== undefined) {
      if (!options["silent"]) {
        console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`);
      }
      dataset.repairZip(dataset, dataDir);
    }
  }

  if ((options["all"] || options["repairEncoding"]) && dataset.repairEncoding) {
    if (!options["silent"]) {
      console.log(`Repairing Windows CP1252 encoding in ${dataset.title}: ${sqlFilename}…`);
    }
    await repairSqlEncoding(sqlFilePath);
  }

  if (options["all"] || options["import"] || options["schema"]) {
    if (!options["silent"]) {
      console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
    }
    await copyToSenat(dataset, dataDir, options);
    // Create indexes programmatically after import.
    if (dataset.indexes) {
      createDatasetIndexes(dataset);
    }
  }

  if (options["schema"]) {
    generateDatasetTypes(dataset);
  }
}

/**
 * Script entry point: set up the PG* environment, make sure the `opendata`
 * role and the `senat` database exist, process every chosen dataset and, with
 * `--schema`, generate the global kysely definitions.
 *
 * @returns {Promise<void>}
 */
async function retrieveOpenData() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  process.env = {
    ...process.env,
    PGHOST: process.env["PGHOST"] || config.db.host,
    PGPORT: process.env["PGPORT"] || String(config.db.port),
    PGDATABASE: process.env["PGDATABASE"] || config.db.name,
    PGUSER: process.env["PGUSER"] || config.db.user,
    PGPASSWORD: process.env["PGPASSWORD"] || config.db.password,
  };
  assert(
    process.env["PGHOST"] && process.env["PGPORT"] && process.env["PGUSER"] && process.env["PGPASSWORD"],
    "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file",
  );
  console.time("data extraction time");

  // Create role 'opendata' and database 'senat'; `|| true` ignores the
  // failure when they already exist.
  execSync(`${sudoPrefix(options)}psql --quiet -c "CREATE ROLE opendata" || true`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
  });
  execSync(`${sudoPrefix(options)}psql --quiet -c "CREATE DATABASE senat WITH OWNER opendata" || true`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
  });

  const enabledDatasets = getEnabledDatasets(options["categories"]);
  const chosenDatasets = getChosenDatasets(enabledDatasets);
  // Datasets are processed one after the other.
  for (const dataset of chosenDatasets) {
    await retrieveDataset(dataDir, dataset);
  }

  if (options["schema"]) {
    const dbConnectionString = buildSenatConnectionString();
    const definitionsDir = path.resolve("src", "raw_types");
    const definitionFilePath = path.join(definitionsDir, "senat.ts");
    execSync(`npx kysely-codegen --url '${dbConnectionString}' --out-file ${definitionFilePath}`, {
      env: process.env,
      encoding: "utf-8",
    });
  }
  if (!options["silent"]) {
    console.timeEnd("data extraction time");
  }
}

retrieveOpenData()
  .then(() => process.exit(0))
  .catch((error) => {
    console.error(error);
    process.exit(1);
  });