// @tricoteuses/senat
// Version:
// Handle French Sénat's open data
// 259 lines (246 loc) • 7.51 kB
// text/typescript
import assert from "assert"
import { execSync } from "child_process"
import commandLineArgs from "command-line-args"
import fs from "fs-extra"
// import fetch from "node-fetch"
import path from "path"
// import stream from "stream"
import StreamZip from "node-stream-zip"
import readline from "readline"
// import util from "util"
import windows1252 from "windows-1252"
import config from "../config"
import {
Dataset,
getChosenFromEnabledDatasets
} from '../datasets'
// Matches C1 control characters (U+0080–U+009F): these only appear in the SQL
// dumps when CP-1252 bytes were mis-decoded, so each match needs re-decoding.
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g

// Command-line interface, parsed with command-line-args.
// Quoting normalized to double quotes for consistency with the rest of the file.
const optionsDefinitions = [
  {
    alias: "k",
    defaultValue: ["All"],
    help: "categories of datasets to reorganize",
    multiple: true,
    name: "categories",
    type: String,
  },
  {
    alias: "a",
    help: "all options: fetch, unzip, repair-encoding, import, schema",
    name: "all",
    type: Boolean,
  },
  {
    alias: "c",
    help:
      "create TypeScript interfaces from databases schemas into src/raw_types directory",
    name: "schema",
    type: Boolean,
  },
  {
    alias: "e",
    help: "repair Windows CP 1252 encoding of SQL dumps",
    name: "repair-encoding",
    type: Boolean,
  },
  {
    alias: "f",
    help: "fetch datasets instead of retrieving them from files",
    name: "fetch",
    type: Boolean,
  },
  {
    alias: "i",
    help: "import SQL dumps into a freshly (re-)created database",
    name: "import",
    type: Boolean,
  },
  {
    alias: "z",
    help: "unzip SQL files",
    name: "unzip",
    type: Boolean,
  },
  {
    alias: "s",
    help: "don't log anything",
    name: "silent",
    type: Boolean,
  },
  {
    // Bare positional argument (no flag needed).
    defaultOption: true,
    help: "directory containing Sénat open data files",
    name: "dataDir",
    type: String,
  },
]
const options = commandLineArgs(optionsDefinitions)
// const pipeline = util.promisify(stream.pipeline)
/**
 * Run the requested pipeline stages — fetch, unzip, repair-encoding, import,
 * schema — for a single Sénat open data dataset.
 *
 * Stage selection comes from the module-level `options` (or `--all`); each
 * stage is implemented by a private helper below.
 */
async function retrieveDataset(
  dataDir: string,
  dataset: Dataset,
): Promise<void> {
  const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1)
  const zipFilePath = path.join(dataDir, zipFilename)
  if (options.all || options.fetch) {
    fetchZipFile(dataDir, dataset, zipFilename, zipFilePath)
  }
  const sqlFilename = `${dataset.database}.sql`
  const sqlFilePath = path.join(dataDir, sqlFilename)
  if (options.all || options.unzip) {
    await unzipSqlFile(
      dataDir,
      dataset,
      zipFilename,
      zipFilePath,
      sqlFilename,
      sqlFilePath,
    )
  }
  if ((options.all || options["repair-encoding"]) && dataset.repairEncoding) {
    await repairSqlEncoding(dataset, sqlFilename, sqlFilePath)
  }
  if (options.all || options.import) {
    importSqlDump(dataDir, dataset, sqlFilename)
  }
  // Fixed: the "--all" help text lists "schema" among the stages it enables,
  // but the original condition only checked options.schema.
  if (options.all || options.schema) {
    generateSchemaDefinitions(dataset)
  }
}

// Download the dataset's ZIP archive into dataDir, replacing any previous
// copy. node-fetch (and curl) fail against the server with an OpenSSL
// "dh key too small" error, so we shell out to wget instead.
function fetchZipFile(
  dataDir: string,
  dataset: Dataset,
  zipFilename: string,
  zipFilePath: string,
): void {
  if (!options.silent) {
    console.log(`Loading ${dataset.title}: ${zipFilename}…`)
  }
  fs.removeSync(zipFilePath)
  execSync(`wget --quiet ${dataset.url}`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
  })
}

// Extract the SQL dump from the ZIP archive into dataDir, then apply the
// dataset's optional repairZip hook (fixes bad paths inside some archives).
async function unzipSqlFile(
  dataDir: string,
  dataset: Dataset,
  zipFilename: string,
  zipFilePath: string,
  sqlFilename: string,
  sqlFilePath: string,
): Promise<void> {
  if (!options.silent) {
    console.log(`Unzipping ${dataset.title}: ${zipFilename}…`)
  }
  fs.removeSync(sqlFilePath)
  const zip = new StreamZip({
    file: zipFilePath,
    storeEntries: true,
  })
  await new Promise((resolve, reject) => {
    zip.on("ready", () => {
      // null entry = extract the whole archive.
      zip.extract(null, dataDir, (err?: any, _count?: number) => {
        zip.close()
        if (err) {
          reject(err)
        } else {
          resolve(null)
        }
      })
    })
  })
  if (dataset.repairZip !== undefined) {
    if (!options.silent) {
      console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`)
    }
    dataset.repairZip(dataset, dataDir)
  }
}

// Rewrite the SQL dump line by line, re-decoding stray CP-1252 characters,
// then atomically replace the original file with the repaired copy.
async function repairSqlEncoding(
  dataset: Dataset,
  sqlFilename: string,
  sqlFilePath: string,
): Promise<void> {
  if (!options.silent) {
    console.log(
      `Repairing Windows CP1252 encoding of ${dataset.title}: ${sqlFilename}…`,
    )
  }
  const repairedSqlFilePath = sqlFilePath + ".repaired"
  const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
    encoding: "utf8",
  })
  const lineReader = readline.createInterface({
    input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
    crlfDelay: Infinity,
  })
  for await (const line of lineReader) {
    repairedSqlWriter.write(
      line.replace(badWindows1252CharacterRegex, (match) =>
        windows1252.decode(match, { mode: "fatal" }),
      ) + "\n",
    )
  }
  // Fixed: wait for the writer to flush ("finish") before moving the file.
  // The original called end() and moved immediately, so fs.move could race
  // the still-buffered writes and ship a truncated dump.
  await new Promise<void>((resolve, reject) => {
    repairedSqlWriter.once("error", reject)
    repairedSqlWriter.end(resolve)
  })
  await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true })
}

// Drop, recreate and repopulate the dataset's PostgreSQL database from the
// SQL dump. psql picks up connection settings from the PG* env variables.
function importSqlDump(
  dataDir: string,
  dataset: Dataset,
  sqlFilename: string,
): void {
  if (!options.silent) {
    console.log(`Importing ${dataset.title}: ${sqlFilename}…`)
  }
  execSync(`psql -c "DROP DATABASE IF EXISTS ${dataset.database}"`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
  })
  execSync(
    `psql -c "CREATE DATABASE ${dataset.database} WITH OWNER opendata"`,
    {
      cwd: dataDir,
      env: process.env,
      encoding: "utf-8",
    },
  )
  execSync(`psql -f ${sqlFilename} ${dataset.database}`, {
    cwd: dataDir,
    env: process.env,
    encoding: "utf-8",
  })
}

// Generate TypeScript interfaces from the database schema (via schemats) into
// src/raw_types, then normalize line endings and strip the timestamp so the
// generated file is stable under version control.
function generateSchemaDefinitions(dataset: Dataset): void {
  const definitionsDir = path.resolve("src", "raw_types")
  assert(fs.statSync(definitionsDir).isDirectory())
  if (!options.silent) {
    console.log(
      `Creating TypeScript definitions from schema of database ${dataset.database}…`,
    )
  }
  const dbConnectionString = `postgres://${process.env.PGUSER}:${process.env.PGPASSWORD}@${process.env.PGHOST}:${process.env.PGPORT}/${dataset.database}`
  const definitionFilePath = path.join(
    definitionsDir,
    `${dataset.database}.ts`,
  )
  execSync(
    `npx schemats generate -c ${dbConnectionString} -s public -o ${definitionFilePath}`,
    {
      env: process.env,
      encoding: "utf-8",
    },
  )
  const definition = fs.readFileSync(definitionFilePath, { encoding: "utf8" })
  const definitionRepaired = definition
    .replace(/\r\n/g, "\n")
    .replace(
      /AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/,
      "AUTO-GENERATED FILE",
    )
  fs.writeFileSync(definitionFilePath, definitionRepaired)
}
/**
 * Entry point: check the data directory argument, complete the PG* database
 * environment from config, then run every chosen dataset through the pipeline.
 */
async function retrieveOpenData(): Promise<void> {
  const dataDir = options.dataDir
  assert(dataDir, "Missing argument: data directory")
  // Fill in any missing PG* variables from config so that the psql/schemats
  // subprocesses inherit a complete database configuration.
  process.env = {
    ...process.env,
    PGHOST: process.env.PGHOST || config.db.host,
    PGPORT: process.env.PGPORT || config.db.port,
    PGUSER: process.env.PGUSER || config.db.user,
    PGPASSWORD: process.env.PGPASSWORD || config.db.password,
  }
  const { PGHOST, PGPORT, PGUSER, PGPASSWORD } = process.env
  assert(
    PGHOST && PGPORT && PGUSER && PGPASSWORD,
    "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file",
  )
  const chosenDatasets: Dataset[] = getChosenFromEnabledDatasets(
    options.categories,
  )
  // Deliberately sequential; a Promise.all variant was left disabled upstream.
  for (const dataset of chosenDatasets) {
    await retrieveDataset(dataDir, dataset)
  }
}
// Run the script: exit 0 on success, 1 on failure.
retrieveOpenData()
  .then(() => process.exit(0))
  .catch((error) => {
    // Fixed: report failures on stderr (console.error), not stdout.
    console.error(error)
    process.exit(1)
  })