/*
 * @tricoteuses/senat
 * Handle French Sénat's open data.
 */
import assert from "assert";
import { execSync } from "child_process";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
// import fetch from "node-fetch"
import path from "path";
// import stream from "stream"
import StreamZip from "node-stream-zip";
import readline from "readline";
// import util from "util"
import windows1252 from "windows-1252";
import config from "../config";
import { datasets, getChosenDatasets, getEnabledDatasets } from "../datasets";
import { commonOptions } from "./shared/cli_helpers";
// Matches the C1 control range (U+0080–U+009F): code points that show up when
// Windows-1252 text was mis-decoded; each match is re-decoded through the
// windows-1252 codec in retrieveDataset's repair-encoding step.
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;

// Command-line flags understood by this script, in addition to the shared
// options (data directory, categories, silent, …) from cli_helpers.
const optionsDefinitions = [
  ...commonOptions,
  {
    alias: "a",
    help: "all options: fetch, unzip, repair-encoding, import",
    name: "all",
    type: Boolean,
  },
  {
    alias: "c",
    help: "create TypeScript interfaces from databases schemas into src/raw_types_* directories",
    name: "schema",
    type: Boolean,
  },
  {
    alias: "e",
    help: "repair Windows CP 1252 encoding of SQL dumps",
    name: "repairEncoding",
    type: Boolean,
  },
  {
    alias: "f",
    help: "fetch datasets instead of retrieving them from files",
    name: "fetch",
    type: Boolean,
  },
  {
    alias: "i",
    help: "import SQL dumps into a freshly (re-)created database",
    name: "import",
    type: Boolean,
  },
  {
    alias: "S",
    help: "sudo psql commands with given user",
    name: "sudo",
    type: String,
  },
  {
    alias: "z",
    help: "unzip SQL files",
    name: "unzip",
    type: Boolean,
  },
];

// Parsed command-line options, read throughout this module.
const options = commandLineArgs(optionsDefinitions);
// const pipeline = util.promisify(stream.pipeline)
/**
 * Run the requested processing steps (fetch, unzip, repair-encoding, import,
 * schema generation) for one dataset, driven by the module-level CLI `options`.
 *
 * @param {string} dataDir - directory holding the downloaded ZIP/SQL files
 * @param {object} dataset - descriptor from ../datasets (url, title, database,
 *   schema, and optional repairZip/repairEncoding hooks)
 */
async function retrieveDataset(dataDir, dataset) {
  const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
  const zipFilePath = path.join(dataDir, zipFilename);
  if (options["all"] || options["fetch"]) {
    // Fetch & save ZIP file.
    if (!options["silent"]) {
      console.log(`Loading ${dataset.title}: ${zipFilename}…`);
    }
    // node-fetch (and curl) fail against this server with the OpenSSL error
    // "dh key too small", so we shell out to wget instead.
    fs.removeSync(zipFilePath);
    execSync(`wget --quiet ${dataset.url}`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
    });
  }
  const sqlFilename = `${dataset.database}.sql`;
  const sqlFilePath = path.join(dataDir, sqlFilename);
  if (options["all"] || options["unzip"]) {
    if (!options["silent"]) {
      console.log(`Unzipping ${dataset.title}: ${zipFilename}…`);
    }
    fs.removeSync(sqlFilePath);
    const zip = new StreamZip({
      file: zipFilePath,
      storeEntries: true,
    });
    await new Promise((resolve, reject) => {
      // FIX: without this handler a missing or corrupt ZIP emitted an
      // unhandled "error" event and left this promise pending forever.
      zip.on("error", reject);
      zip.on("ready", () => {
        // Extract every entry of the archive into dataDir.
        zip.extract(null, dataDir, (err, _count) => {
          zip.close();
          if (err) {
            reject(err);
          } else {
            resolve(null);
          }
        });
      });
    });
    if (dataset.repairZip !== undefined) {
      if (!options["silent"]) {
        console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`);
      }
      dataset.repairZip(dataset, dataDir);
    }
  }
  if ((options["all"] || options["repairEncoding"]) && dataset.repairEncoding) {
    if (!options["silent"]) {
      console.log(`Repairing Windows CP1252 encoding of ${dataset.title}: ${sqlFilename}…`);
    }
    const repairedSqlFilePath = sqlFilePath + ".repaired";
    const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
      encoding: "utf8",
    });
    const lineReader = readline.createInterface({
      input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
      crlfDelay: Infinity,
    });
    for await (const line of lineReader) {
      const repairedLine =
        line.replace(badWindows1252CharacterRegex, (match) =>
          windows1252.decode(match, { mode: "fatal" })) + "\n";
      // FIX: honor backpressure so multi-megabyte dumps are not buffered
      // entirely in memory when the disk cannot keep up with the reader.
      if (!repairedSqlWriter.write(repairedLine)) {
        await new Promise((resolve) => repairedSqlWriter.once("drain", resolve));
      }
    }
    // FIX: wait until the writer has flushed everything to disk; the original
    // called end() without awaiting, so fs.move could race with pending
    // buffered writes and move a truncated file.
    await new Promise((resolve, reject) => {
      repairedSqlWriter.on("error", reject);
      repairedSqlWriter.end(resolve);
    });
    await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
  }
  if (options["all"] || options["import"] || options["schema"]) {
    // Schema generation needs the data imported first, hence "schema" here too.
    if (!options["silent"]) {
      console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
    }
    execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d ${dataset.database} -f ${sqlFilename}`, {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
      stdio: ["pipe", "ignore", "ignore"],
    });
  }
  if (options["schema"]) {
    // Generate TypeScript definitions twice: once with schemats and once with
    // kysely-codegen, into their respective src/raw_types* directories.
    let definitionsDir = path.resolve("src", "raw_types_schemats");
    assert(fs.statSync(definitionsDir).isDirectory());
    if (!options["silent"]) {
      console.log(`Creating TypeScript definitions from schema of database ${dataset.database}…`);
    }
    const dbConnectionString = `postgres://${process.env["PGUSER"]}:${process.env["PGPASSWORD"]}@${process.env["PGHOST"]}:${process.env["PGPORT"]}/${dataset.database}`;
    let definitionFilePath = path.join(definitionsDir, `${dataset.database}.ts`);
    execSync(`npx schemats generate -c ${dbConnectionString} -s ${dataset.schema} -o ${definitionFilePath}`, {
      env: process.env,
      encoding: "utf-8",
    });
    // Normalize line endings and strip the generation timestamp so that
    // regenerating the file does not produce spurious diffs.
    const definition = fs.readFileSync(definitionFilePath, { encoding: "utf8" });
    const definitionRepaired = definition
      .replace(/\r\n/g, "\n")
      .replace(/AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/, "AUTO-GENERATED FILE");
    fs.writeFileSync(definitionFilePath, definitionRepaired);
    definitionsDir = path.resolve("src", "raw_types");
    definitionFilePath = path.join(definitionsDir, `${dataset.database}.ts`);
    execSync(`kysely-codegen --url ${dbConnectionString} --default-schema=${dataset.schema} --out-file=${definitionFilePath}`, {
      env: process.env,
      encoding: "utf-8",
    });
  }
}
/**
 * Entry point: validate configuration, (re-)create the target databases when
 * an import is requested, then run the requested steps for each chosen dataset.
 *
 * @throws {assert.AssertionError} when the data directory or the PG*
 *   database configuration is missing
 */
async function retrieveOpenData() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  // Fill in PG* environment variables from config when not already set, so
  // the psql/schemats/kysely-codegen child processes inherit them.
  process.env = {
    ...process.env,
    PGHOST: process.env["PGHOST"] || config.db.host,
    PGPORT: process.env["PGPORT"] || config.db.port,
    PGUSER: process.env["PGUSER"] || config.db.user,
    PGPASSWORD: process.env["PGPASSWORD"] || config.db.password,
  };
  assert(process.env["PGHOST"] &&
    process.env["PGPORT"] &&
    process.env["PGUSER"] &&
    process.env["PGPASSWORD"], "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
  console.time("data extraction time");
  // FIX: only drop & recreate the databases when this run will actually
  // import data (or generate schemas, which requires an import). Previously
  // a plain --fetch/--unzip/--repair-encoding invocation destroyed every
  // existing database as a side effect.
  if (options["all"] || options["import"] || options["schema"]) {
    for (const dataset of Object.values(datasets)) {
      execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "DROP DATABASE IF EXISTS ${dataset.database}"`, {
        cwd: dataDir,
        env: process.env,
        encoding: "utf-8",
      });
      execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "CREATE DATABASE ${dataset.database} WITH OWNER opendata"`, {
        cwd: dataDir,
        env: process.env,
        encoding: "utf-8",
      });
    }
  }
  const enabledDatasets = getEnabledDatasets(options["categories"]);
  const chosenDatasets = getChosenDatasets(enabledDatasets);
  for (const dataset of chosenDatasets) {
    await retrieveDataset(dataDir, dataset);
  }
  if (!options["silent"]) {
    console.timeEnd("data extraction time");
  }
}
// Run the pipeline, then exit explicitly so lingering handles cannot keep the
// process alive.
retrieveOpenData()
  .then(() => process.exit(0))
  .catch((error) => {
    // FIX: report failures on stderr, not stdout, so they are not lost when
    // the script's output is piped or redirected.
    console.error(error);
    process.exit(1);
  });