// @tricoteuses/senat
// Handle French Sénat's open data.
// (JavaScript, 219 lines (218 loc) • 8.1 kB)
import assert from "assert";
import { execSync } from "child_process";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
// import fetch from "node-fetch"
import path from "path";
// import stream from "stream"
import StreamZip from "node-stream-zip";
import readline from "readline";
// import util from "util"
import windows1252 from "windows-1252";
import config from "../config";
import { getChosenFromEnabledDatasets } from '../datasets';
// Matches the C1 control range (U+0080–U+009F): characters that only appear
// when a Windows-1252 byte was mis-decoded as Latin-1/Unicode. Used by the
// repair-encoding step to locate and re-decode those characters.
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;
// Command-line interface, parsed by command-line-args.
// Quote style normalized to double quotes for consistency with the rest of
// the definitions (the "categories" entry previously used single quotes).
const optionsDefinitions = [
    {
        alias: "k",
        defaultValue: ["All"],
        help: "categories of datasets to reorganize",
        multiple: true,
        name: "categories",
        type: String,
    },
    {
        alias: "a",
        help: "all options: fetch, unzip, repair-encoding, import, schema",
        name: "all",
        type: Boolean,
    },
    {
        alias: "c",
        help: "create TypeScript interfaces from databases schemas into src/raw_types directory",
        name: "schema",
        type: Boolean,
    },
    {
        alias: "e",
        help: "repair Windows CP 1252 encoding of SQL dumps",
        name: "repair-encoding",
        type: Boolean,
    },
    {
        alias: "f",
        help: "fetch datasets instead of retrieving them from files",
        name: "fetch",
        type: Boolean,
    },
    {
        alias: "i",
        help: "import SQL dumps into a freshly (re-)created database",
        name: "import",
        type: Boolean,
    },
    {
        alias: "z",
        help: "unzip SQL files",
        name: "unzip",
        type: Boolean,
    },
    {
        alias: "s",
        help: "don't log anything",
        name: "silent",
        type: Boolean,
    },
    {
        // Positional argument: the directory holding the downloaded data.
        defaultOption: true,
        help: "directory containing Sénat open data files",
        name: "dataDir",
        type: String,
    },
];
// Parsed CLI options, shared by every function in this script.
const options = commandLineArgs(optionsDefinitions);
// const pipeline = util.promisify(stream.pipeline)
/**
 * Run the requested pipeline steps (fetch, unzip, repair-encoding, import,
 * schema) for a single dataset, driven by the module-level `options`.
 *
 * @param {string} dataDir - Directory where ZIP and SQL files live.
 * @param {object} dataset - Dataset descriptor with `url`, `title`,
 *   `database`, and optional `repairZip` / `repairEncoding` fields.
 * @returns {Promise<void>}
 */
async function retrieveDataset(dataDir, dataset) {
    const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
    const zipFilePath = path.join(dataDir, zipFilename);
    if (options.all || options.fetch) {
        // Fetch & save ZIP file.
        if (!options.silent) {
            console.log(`Loading ${dataset.title}: ${zipFilename}…`);
        }
        // Fetch fails with OpenSSL error: dh key too small.
        // (so does "curl").
        // const response = await fetch(dataset.url)
        // if (!response.ok) {
        //     console.error(response.status, response.statusText)
        //     console.error(await response.text())
        //     throw new Error(`Fetch failed: ${dataset.url}`)
        // }
        // await pipeline(response.body!, fs.createWriteStream(zipFilePath))
        fs.removeSync(zipFilePath);
        // NOTE(review): dataset.url is interpolated unquoted into a shell
        // command — safe only because URLs come from the project's own
        // dataset configuration, not from user input. Verify if that changes.
        execSync(`wget --quiet ${dataset.url}`, {
            cwd: dataDir,
            env: process.env,
            encoding: "utf-8",
            // stdio: ["ignore", "ignore", "pipe"],
        });
    }
    const sqlFilename = `${dataset.database}.sql`;
    const sqlFilePath = path.join(dataDir, sqlFilename);
    if (options.all || options.unzip) {
        if (!options.silent) {
            console.log(`Unzipping ${dataset.title}: ${zipFilename}…`);
        }
        fs.removeSync(sqlFilePath);
        const zip = new StreamZip({
            file: zipFilePath,
            storeEntries: true,
        });
        // node-stream-zip is callback-based; wrap extraction in a Promise.
        await new Promise((resolve, reject) => {
            zip.on("ready", () => {
                // `null` entry means "extract every entry" into dataDir.
                zip.extract(null, dataDir, (err, _count) => {
                    zip.close();
                    if (err) {
                        reject(err);
                    }
                    else {
                        resolve(null);
                    }
                });
            });
        });
        if (dataset.repairZip !== undefined) {
            // Some archives place the SQL file at an unexpected path; the
            // dataset provides a hook to fix that.
            if (!options.silent) {
                console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`);
            }
            dataset.repairZip(dataset, dataDir);
        }
    }
    if ((options.all || options["repair-encoding"]) && dataset.repairEncoding) {
        if (!options.silent) {
            console.log(`Repairing Windows CP1252 encoding of ${dataset.title}: ${sqlFilename}…`);
        }
        const repairedSqlFilePath = sqlFilePath + ".repaired";
        const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
            encoding: "utf8",
        });
        const lineReader = readline.createInterface({
            input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
            crlfDelay: Infinity,
        });
        for await (const line of lineReader) {
            // Re-decode each stray C1 character as Windows-1252 (e.g. U+0092
            // becomes a right single quotation mark).
            repairedSqlWriter.write(line.replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" })) + "\n");
        }
        // BUGFIX: wait for the write stream to flush and close before moving
        // the file; previously fs.move could run against a partially-written
        // file because end() was not awaited.
        await new Promise((resolve, reject) => {
            repairedSqlWriter.on("error", reject);
            repairedSqlWriter.end(resolve);
        });
        await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
    }
    if (options.all || options.import) {
        if (!options.silent) {
            console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
        }
        // Connection settings (PGHOST, PGUSER…) are passed via process.env,
        // which retrieveOpenData() populated from config beforehand.
        execSync(`psql -c "DROP DATABASE IF EXISTS ${dataset.database}"`, {
            cwd: dataDir,
            env: process.env,
            encoding: "utf-8",
            // stdio: ["ignore", "ignore", "pipe"],
        });
        execSync(`psql -c "CREATE DATABASE ${dataset.database} WITH OWNER opendata"`, {
            cwd: dataDir,
            env: process.env,
            encoding: "utf-8",
            // stdio: ["ignore", "ignore", "pipe"],
        });
        execSync(`psql -f ${sqlFilename} ${dataset.database}`, {
            cwd: dataDir,
            env: process.env,
            encoding: "utf-8",
            // stdio: ["ignore", "ignore", "pipe"],
        });
    }
    // BUGFIX: the --all help text promises the schema step, but this branch
    // previously checked only options.schema, so --all skipped it.
    if (options.all || options.schema) {
        const definitionsDir = path.resolve("src", "raw_types");
        assert(fs.statSync(definitionsDir).isDirectory());
        if (!options.silent) {
            console.log(`Creating TypeScript definitions from schema of database ${dataset.database}…`);
        }
        const dbConnectionString = `postgres://${process.env.PGUSER}:${process.env.PGPASSWORD}@${process.env.PGHOST}:${process.env.PGPORT}/${dataset.database}`;
        const definitionFilePath = path.join(definitionsDir, `${dataset.database}.ts`);
        execSync(`npx schemats generate -c ${dbConnectionString} -s public -o ${definitionFilePath}`, {
            // cwd: dataDir,
            env: process.env,
            encoding: "utf-8",
            // stdio: ["ignore", "ignore", "pipe"],
        });
        // Normalize line endings and strip the generation timestamp so the
        // generated file is stable under version control.
        const definition = fs.readFileSync(definitionFilePath, { encoding: "utf8" });
        const definitionRepaired = definition
            .replace(/\r\n/g, "\n")
            .replace(/AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/, "AUTO-GENERATED FILE");
        fs.writeFileSync(definitionFilePath, definitionRepaired);
    }
}
/**
 * Entry point: validate the CLI arguments, set up the Postgres environment
 * variables, then process every chosen dataset sequentially.
 *
 * @returns {Promise<void>}
 */
async function retrieveOpenData() {
    const { dataDir } = options;
    assert(dataDir, "Missing argument: data directory");
    // Default the libpq connection variables from the project configuration;
    // values already present in the environment take precedence.
    process.env = {
        ...process.env,
        PGHOST: process.env.PGHOST || config.db.host,
        PGPORT: process.env.PGPORT || config.db.port,
        PGUSER: process.env.PGUSER || config.db.user,
        PGPASSWORD: process.env.PGPASSWORD || config.db.password,
    };
    const { PGHOST, PGPORT, PGUSER, PGPASSWORD } = process.env;
    assert(PGHOST && PGPORT && PGUSER && PGPASSWORD, 'Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file');
    const chosenDatasets = getChosenFromEnabledDatasets(options.categories);
    // Datasets are handled one after the other rather than in parallel
    // (see the commented-out Promise.all variant in the original source).
    for (const dataset of chosenDatasets) {
        await retrieveDataset(dataDir, dataset);
    }
}
// Run the pipeline; exit 0 on success, 1 on failure.
retrieveOpenData()
    .then(() => process.exit(0))
    .catch((error) => {
        // BUGFIX: report failures on stderr (previously console.log sent
        // them to stdout, where shell pipelines would miss them).
        console.error(error);
        process.exit(1);
    });