// @tricoteuses/senat — handle French Sénat's open data.
import assert from "assert";
import { execSync } from "child_process";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import path from "path";
import StreamZip from "node-stream-zip";
import readline from "readline";
import * as windows1252 from "windows-1252";
import { pipeline } from "stream";
import { promisify } from "util";
import config from "../config";
import { getChosenDatasets, getEnabledDatasets } from "../datasets";
import { commonOptions } from "./shared/cli_helpers";
// Matches the 0x80–0x9F code-point range: control characters in ISO-8859-1
// that almost never appear in legitimate text; their presence signals bytes
// that were really Windows-1252 and must be re-decoded (see the
// repair-encoding step in retrieveDataset).
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;
// CLI flags understood by this script, on top of the options shared by all
// tricoteuses commands (data directory, silent, categories…).
const optionsDefinitions = [
    ...commonOptions,
    {
        alias: "a",
        help: "all options: fetch, unzip, repair-encoding, import",
        name: "all",
        type: Boolean,
    },
    {
        alias: "c",
        help: "create TypeScript interfaces from databases schemas into src/raw_types_* directories",
        name: "schema",
        type: Boolean,
    },
    {
        alias: "e",
        help: "repair Windows CP 1252 encoding of SQL dumps",
        name: "repairEncoding",
        type: Boolean,
    },
    {
        alias: "f",
        help: "fetch datasets instead of retrieving them from files",
        name: "fetch",
        type: Boolean,
    },
    {
        alias: "i",
        help: "import SQL dumps into a freshly (re-)created database",
        name: "import",
        type: Boolean,
    },
    {
        alias: "S",
        help: "sudo psql commands with given user",
        name: "sudo",
        type: String,
    },
    {
        alias: "z",
        help: "unzip SQL files",
        name: "unzip",
        type: Boolean,
    },
];
// Parsed command-line options, read throughout this module.
const options = commandLineArgs(optionsDefinitions);
// Promise-returning stream.pipeline, used to stream downloads to disk.
const streamPipeline = promisify(pipeline);
/**
 * Download `url` and stream its body to the local file `dest`.
 *
 * @param {string} url - HTTP(S) URL to fetch.
 * @param {string} dest - Destination file path (overwritten if present).
 * @throws {Error} When the response status is not OK, or the response has no
 *   body to stream (fetch yields a null body for e.g. 204 responses).
 */
async function downloadFile(url, dest) {
    const response = await fetch(url);
    if (!response.ok) {
        throw new Error(`Download failed ${response.status} ${response.statusText} for ${url}`);
    }
    if (!response.body) {
        throw new Error(`Download failed: empty response body for ${url}`);
    }
    await streamPipeline(response.body, fs.createWriteStream(dest));
}
/**
 * Replace the schema name `public` with `schema` everywhere it appears
 * outside single-quoted SQL string literals, so that data values containing
 * the word "public" are left untouched. Only `public` followed by a dot,
 * a comma/semicolon, whitespace or end-of-line is rewritten.
 *
 * @param {string} line - One line of the SQL dump.
 * @param {string} schema - Target schema name.
 * @returns {string} The rewritten line.
 */
function replacePublicOutsideStrings(line, schema) {
    const parts = line.split(/(')/);
    let inString = false;
    for (let i = 0; i < parts.length; i++) {
        if (parts[i] === "'") {
            inString = !inString;
        }
        else if (!inString) {
            // Only replace outside of strings, including before comma
            parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
        }
    }
    return parts.join("");
}
/**
 * Copy a dataset database to the main Senat database (overwriting its contents).
 *
 * Streams `<database>.sql` from `dataDir`, retargets every reference to the
 * `public` schema onto a schema named after the dataset, rewrites the declared
 * client encoding to UTF-8, prefixes the dump with DROP/CREATE SCHEMA and
 * GRANT statements, then feeds the resulting `_schema_dump.sql` file to psql.
 *
 * Import failures are logged (unless --silent) but never abort the run: the
 * import is deliberately best-effort so one broken dataset does not stop the
 * others.
 *
 * @param {object} dataset - Dataset descriptor; `dataset.database` is both the
 *   SQL dump basename and the target schema name.
 * @param {string} dataDir - Directory containing the SQL dumps.
 * @param {object} options - Parsed CLI options (`silent`, `sudo`).
 */
async function copyToSenat(dataset, dataDir, options) {
    if (!options["silent"]) {
        console.log(`Copying ${dataset.database} to Senat database...`);
    }
    const sqlFilePath = path.join(dataDir, `${dataset.database}.sql`);
    const schemaDumpFile = path.join(dataDir, `${dataset.database}_schema_dump.sql`);
    // Write the header and then stream the rest of the SQL file.
    const schemaSqlWriter = fs.createWriteStream(schemaDumpFile, { encoding: "utf8" });
    // Recreate the schema from scratch and grant read access to the
    // application database user.
    schemaSqlWriter.write(`DROP SCHEMA IF EXISTS ${dataset.database} CASCADE;\n`);
    schemaSqlWriter.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
    schemaSqlWriter.write(`GRANT USAGE ON SCHEMA ${dataset.database} TO ${config.db.user};\n`);
    schemaSqlWriter.write(`GRANT SELECT ON ALL TABLES IN SCHEMA ${dataset.database} TO ${config.db.user};\n`);
    schemaSqlWriter.write(`ALTER DEFAULT PRIVILEGES IN SCHEMA ${dataset.database} GRANT SELECT ON TABLES TO ${config.db.user};\n`);
    const lineReader = readline.createInterface({
        input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
        crlfDelay: Infinity,
    });
    for await (const line of lineReader) {
        // Retarget the dump from the 'public' schema onto the dataset schema.
        let newLine = replacePublicOutsideStrings(line, dataset.database);
        // The dump is written out as UTF-8, so fix the declared encoding too.
        newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
        // Honor backpressure so huge dumps don't pile up in memory.
        if (!schemaSqlWriter.write(newLine + "\n")) {
            await new Promise((resolve) => schemaSqlWriter.once("drain", resolve));
        }
    }
    schemaSqlWriter.end();
    await new Promise((resolve, reject) => {
        schemaSqlWriter.on("finish", () => {
            try {
                // NOTE(review): dataset/config values interpolated into a shell
                // command are trusted project configuration, not user input.
                // The dump file path is quoted in case dataDir contains spaces.
                execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d senat -f "${schemaDumpFile}"`, {
                    env: process.env,
                    encoding: "utf-8",
                    stdio: ["ignore", "pipe", "pipe"],
                });
            }
            catch (error) {
                // Deliberately swallowed: log and carry on with other datasets.
                if (!options["silent"]) {
                    console.error(`Failed to import ${dataset.database} schema:`);
                    if (error.stderr) {
                        console.error(error.stderr);
                    }
                    if (error.stdout) {
                        console.error(error.stdout);
                    }
                }
            }
            resolve();
        });
        schemaSqlWriter.on("error", reject);
    });
}
/**
 * Run the requested pipeline steps for one dataset: fetch the ZIP, unzip the
 * SQL dump, repair its Windows-1252 encoding, import it into the `senat`
 * database (plus programmatic indexes) and optionally generate TypeScript
 * definitions from its schema. Each step is gated by the corresponding CLI
 * flag (or --all) read from the module-level `options`.
 *
 * @param {string} dataDir - Working directory for downloads and SQL dumps.
 * @param {object} dataset - Dataset descriptor (url, title, database, and
 *   optional repairZip / repairEncoding / indexes fields).
 */
async function retrieveDataset(dataDir, dataset) {
    const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
    const zipFilePath = path.join(dataDir, zipFilename);
    if (options["all"] || options["fetch"]) {
        // Fetch & save ZIP file.
        if (!options["silent"]) {
            console.log(`Loading ${dataset.title}: ${zipFilename}…`);
        }
        // NOTE: piping fetch(dataset.url) straight to disk used to fail with
        // an OpenSSL "dh key too small" error (so did curl); downloadFile
        // wraps the approach that works.
        fs.removeSync(zipFilePath);
        await downloadFile(dataset.url, zipFilePath);
    }
    const sqlFilename = `${dataset.database}.sql`;
    const sqlFilePath = path.join(dataDir, sqlFilename);
    if (options["all"] || options["unzip"]) {
        if (!options["silent"]) {
            console.log(`Unzipping ${dataset.title}: ${zipFilename}…`);
        }
        fs.removeSync(sqlFilePath);
        const zip = new StreamZip({
            file: zipFilePath,
            storeEntries: true,
        });
        await new Promise((resolve, reject) => {
            // Without an "error" handler, a missing or corrupt ZIP would
            // leave this promise pending forever.
            zip.on("error", reject);
            zip.on("ready", () => {
                zip.extract(null, dataDir, (err, _count) => {
                    zip.close();
                    if (err) {
                        reject(err);
                    }
                    else {
                        resolve(null);
                    }
                });
            });
        });
        if (dataset.repairZip !== undefined) {
            if (!options["silent"]) {
                console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`);
            }
            dataset.repairZip(dataset, dataDir);
        }
    }
    if ((options["all"] || options["repairEncoding"]) && dataset.repairEncoding) {
        if (!options["silent"]) {
            console.log(`Repairing Windows CP1252 encoding in ${dataset.title}: ${sqlFilename}…`);
        }
        const repairedSqlFilePath = sqlFilePath + ".repaired";
        const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
            encoding: "utf8",
        });
        // Read as latin1 so every byte maps to exactly one code point, then
        // re-decode the 0x80–0x9F range through Windows-1252 and write UTF-8.
        const lineReader = readline.createInterface({
            input: fs.createReadStream(sqlFilePath, { encoding: "latin1" }),
            crlfDelay: Infinity,
        });
        for await (const line of lineReader) {
            const repairedLine = line.replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }));
            // Honor backpressure so huge dumps don't pile up in memory.
            if (!repairedSqlWriter.write(repairedLine + "\n")) {
                await new Promise((resolve) => repairedSqlWriter.once("drain", resolve));
            }
        }
        // Wait until the writer has flushed everything to disk before
        // replacing the original file, otherwise the move could race with
        // pending writes and truncate the repaired dump.
        await new Promise((resolve, reject) => {
            repairedSqlWriter.on("finish", resolve);
            repairedSqlWriter.on("error", reject);
            repairedSqlWriter.end();
        });
        await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
    }
    if (options["all"] || options["import"] || options["schema"]) {
        if (!options["silent"]) {
            console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
        }
        await copyToSenat(dataset, dataDir, options);
        // Create indexes programmatically after import.
        if (dataset.indexes) {
            for (const [table, indexes] of Object.entries(dataset.indexes)) {
                for (const index of indexes) {
                    const indexName = index.name;
                    const columns = index.columns.join(", ");
                    const schema = dataset.database;
                    const sql = `CREATE INDEX IF NOT EXISTS ${indexName} ON ${schema}.${table} (${columns});`;
                    try {
                        execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d senat -c "${sql}"`, {
                            env: process.env,
                            encoding: "utf-8",
                            stdio: ["ignore", "ignore", "pipe"],
                        });
                        if (!options["silent"]) {
                            console.log(`Created index: ${indexName} on ${schema}.${table} (${columns})`);
                        }
                    }
                    catch (err) {
                        // Best-effort: a failed index must not abort the import.
                        console.error(`Failed to create index ${indexName} on ${schema}.${table}:`, err);
                    }
                }
            }
        }
    }
    if (options["schema"]) {
        // Generate two flavours of TypeScript definitions from the imported
        // schema: schemats interfaces and kysely-codegen types.
        let definitionsDir = path.resolve("src", "raw_types_schemats");
        assert(fs.statSync(definitionsDir).isDirectory());
        if (!options["silent"]) {
            console.log(`Creating TypeScript definitions from schema '${dataset.database}' in database 'senat'…`);
        }
        const dbConnectionString = `postgres://${process.env["PGUSER"]}:${process.env["PGPASSWORD"]}@${process.env["PGHOST"]}:${process.env["PGPORT"]}/senat`;
        let definitionFilePath = path.join(definitionsDir, `${dataset.database}.ts`);
        execSync(`npx schemats generate -c ${dbConnectionString} -s ${dataset.database} -o "${definitionFilePath}"`, {
            env: process.env,
            encoding: "utf-8",
        });
        // Normalize line endings and strip the generation timestamp so the
        // generated file is stable under version control.
        const definition = fs.readFileSync(definitionFilePath, { encoding: "utf8" });
        const definitionRepaired = definition
            .replace(/\r\n/g, "\n")
            .replace(/AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/, "AUTO-GENERATED FILE");
        fs.writeFileSync(definitionFilePath, definitionRepaired);
        definitionsDir = path.resolve("src", "raw_types");
        definitionFilePath = path.join(definitionsDir, `${dataset.database}.ts`);
        execSync(`npx kysely-codegen --url '${dbConnectionString}' --default-schema ${dataset.database} --include-pattern '${dataset.database}.*' --out-file "${definitionFilePath}"`, {
            env: process.env,
            encoding: "utf-8",
        });
    }
}
/**
 * Entry point: prepare the PostgreSQL environment (PG* variables, `opendata`
 * role, `senat` database), then run the requested pipeline steps for every
 * chosen dataset, optionally finishing with whole-database TypeScript
 * definitions.
 *
 * @throws {assert.AssertionError} When the data directory or the database
 *   configuration is missing.
 */
async function retrieveOpenData() {
    const dataDir = options["dataDir"];
    assert(dataDir, "Missing argument: data directory");
    // Fill in any missing PG* variables from the project configuration, so
    // the psql/npx child processes below inherit a complete environment.
    process.env = {
        ...process.env,
        PGHOST: process.env["PGHOST"] || config.db.host,
        PGPORT: process.env["PGPORT"] || String(config.db.port),
        PGDATABASE: process.env["PGDATABASE"] || config.db.name,
        PGUSER: process.env["PGUSER"] || config.db.user,
        PGPASSWORD: process.env["PGPASSWORD"] || config.db.password,
    };
    assert(process.env["PGHOST"] && process.env["PGPORT"] && process.env["PGUSER"] && process.env["PGPASSWORD"], "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
    // Keep the timer symmetric with the timeEnd call below: both are skipped
    // in silent mode.
    if (!options["silent"]) {
        console.time("data extraction time");
    }
    const sudoPrefix = options["sudo"] ? `sudo -u ${options["sudo"]} ` : "";
    // Create role 'opendata' and database 'senat' if they do not exist;
    // "|| true" swallows the error when they already do.
    execSync(`${sudoPrefix}psql --quiet -c "CREATE ROLE opendata" || true`, {
        cwd: dataDir,
        env: process.env,
        encoding: "utf-8",
    });
    execSync(`${sudoPrefix}psql --quiet -c "CREATE DATABASE senat WITH OWNER opendata" || true`, {
        cwd: dataDir,
        env: process.env,
        encoding: "utf-8",
    });
    const enabledDatasets = getEnabledDatasets(options["categories"]);
    const chosenDatasets = getChosenDatasets(enabledDatasets);
    for (const dataset of chosenDatasets) {
        await retrieveDataset(dataDir, dataset);
    }
    if (options["schema"]) {
        // NOTE(review): the password ends up in the npx command line, visible
        // in the process list; acceptable for a local dev tool, but worth
        // confirming for shared hosts.
        const dbConnectionString = `postgres://${process.env["PGUSER"]}:${process.env["PGPASSWORD"]}@${process.env["PGHOST"]}:${process.env["PGPORT"]}/senat`;
        const definitionsDir = path.resolve("src", "raw_types");
        const definitionFilePath = path.join(definitionsDir, "senat.ts");
        execSync(`npx kysely-codegen --url '${dbConnectionString}' --out-file ${definitionFilePath}`, {
            env: process.env,
            encoding: "utf-8",
        });
    }
    if (!options["silent"]) {
        console.timeEnd("data extraction time");
    }
}
// Run the CLI; exit non-zero on failure so shell scripts can detect errors.
retrieveOpenData()
    .then(() => process.exit(0))
    .catch((error) => {
    // Errors belong on stderr, not stdout.
    console.error(error);
    process.exit(1);
});