// data-cli
// CLI for working with data packages
// 180 lines (158 loc) • 6 kB • JavaScript
// Packages
const fs = require('fs')
const path = require('path')
const url = require('url')
const mkdirp = require('mkdirp')
const minimist = require('minimist')
const {Dataset, File, isDataset, parseDatasetIdentifier} = require('data.js')
const {get, config} = require('datahub-client')
const unzip = require('unzip')
// Ours
const {customMarked} = require('../lib/utils/tools.js')
const wait = require('../lib/utils/output/wait')
const {handleError} = require('../lib/utils/error')
// Parse CLI flags: --format/--help(-h)/--debug; the positional arg is the identifier.
const argv = minimist(process.argv.slice(2), {
  string: ['get'],
  boolean: ['help', 'debug'],
  alias: {help: 'h'}
})

// Usage text lives in docs/get.md and is rendered through the shared marked setup.
const getMarkdown = fs.readFileSync(path.join(__dirname, '../docs/get.md'), 'utf8')
const help = () => console.log('\n' + customMarked(getMarkdown))

// With --help, or without an identifier to fetch, print usage and exit cleanly.
if (argv.help || !argv._[0]) {
  help()
  process.exit(0)
}

let identifier = argv._[0]
/**
 * Entry point: download the dataset or single file named by `identifier`
 * into the current directory, reporting elapsed time on success.
 * Exits the process with code 1 on any error (via handleError).
 */
const run = async () => {
  const stopSpinner = wait('Loading...')
  try {
    const start = new Date()
    let pathToSave
    const parsedIdentifier = await parseDatasetIdentifier(identifier)
    const itIsDataset = isDataset(identifier)
    // A GitHub identifier whose last path segment has no file extension is
    // assumed to point at a dataset repo rather than a single file.
    const githubDataset = parsedIdentifier.type === 'github' &&
      parsedIdentifier.name.slice((parsedIdentifier.name.lastIndexOf('.') - 1 >>> 0) + 2) === ''
    if (itIsDataset || githubDataset) {
      const dataset = await Dataset.load(identifier)
      const owner = dataset.identifier.owner || ''
      const name = dataset.identifier.name
      pathToSave = path.join(owner, name)
      if (!checkDestIsEmpty(owner, name)) {
        throw new Error(`${owner}/${name} is not empty!`)
      }
      /** usual dataset download */
      const allResources = await get(dataset)
      // Save all resources on disk in parallel.
      await Promise.all(allResources.map(resource => saveIt(owner, name, resource)))
    } else if (parsedIdentifier.type === 'datahub') {
      // Remove trailing slash:
      if (identifier.substr(-1) === '/' && identifier.length > 1) {
        identifier = identifier.slice(0, identifier.length - 1)
      }
      // We assume that if /r/ is in identifier then it's r link.
      if (identifier.includes('/r/')) {
        pathToSave = await saveFileFromUrl(identifier, argv.format)
      } else {
        // Guess owner and dataset name without loading a Dataset object,
        // so private datasets are handled as well:
        const idParts = identifier.split('/')
        const owner = idParts[idParts.length - 2]
        const name = idParts[idParts.length - 1]
        const token = config.get('token')
        pathToSave = path.join(owner, name)
        if (!checkDestIsEmpty(owner, name)) {
          throw new Error(`${owner}/${name} is not empty!`)
        }
        /** For datasets from the datahub we get zipped version and unzip it.
           - less traffic
           - zipped version has a fancy file structure
           #issue: https://github.com/datahq/datahub-qa/issues/86 */
        const zippedDatasetUrl = `https://datahub.io/${owner}/${name}/r/${name}_zip.zip?jwt=${token}`
        const archivePath = await saveFileFromUrl(zippedDatasetUrl, 'zip')
        // Unzip into the destination folder and WAIT for extraction to finish
        // before reporting success (the previous fire-and-forget pipe printed
        // the success message while entries were still being written, and
        // stream errors escaped this try/catch). unzip's Extract emits
        // 'close' when the archive is fully extracted.
        await new Promise((resolve, reject) => {
          fs.createReadStream(archivePath)
            .on('error', reject)
            .pipe(unzip.Extract({path: pathToSave}))
            .on('close', resolve)
            .on('error', reject)
        })
        // Remove the archive once all dataset files are extracted.
        fs.unlinkSync(archivePath)
      }
    } else { // If it is not a dataset - download the file
      if (parsedIdentifier.type === 'github' && !githubDataset) {
        identifier += `?raw=true`
      }
      pathToSave = await saveFileFromUrl(identifier, argv.format)
    }
    // Show time statistic & success message.
    stopSpinner()
    const end = new Date() - start
    console.log(`Time elapsed: ${(end / 1000).toFixed(2)} s`)
    console.log(`Dataset/file is saved in "${pathToSave}"`)
  } catch (err) {
    stopSpinner()
    if (argv.debug) {
      console.log('> [debug]\n' + err.stack)
    }
    await handleError(err)
    process.exit(1)
  }
}

run()
/**
 * Download a file from a URL and save it locally using the data.js 'File' object.
 * The file is written into the current directory as `${name}.${format}`,
 * both taken from the loaded file's descriptor.
 * Usage: const savedPath = await saveFileFromUrl(url, format)
 * @param fileUrl - URL to fetch the file from (named to avoid shadowing the `url` module)
 * @param format - csv, json, zip, etc.
 * @returns {Promise<string>} the path the file was saved to
 */
const saveFileFromUrl = async (fileUrl, format) => {
  const file = await File.load(fileUrl, {format})
  const destPath = [file.descriptor.name, file.descriptor.format].join('.')
  let stream
  try {
    stream = await file.stream()
  } catch (err) {
    // A bare 404 is ambiguous on datahub: private datasets answer
    // "Not Found" too, so broaden the message before reporting.
    if (err.message === 'Not Found') {
      err.message += ' or Forbidden.'
    }
    await handleError(err)
    process.exit(1)
  }
  // Resolve only once the data is fully flushed to disk; reject on stream
  // errors instead of leaving the promise pending forever (the original
  // async-executor Promise could never reject).
  return new Promise((resolve, reject) => {
    stream
      .on('error', reject)
      .pipe(fs.createWriteStream(destPath))
      .on('error', reject)
      .on('finish', () => resolve(destPath))
  })
}
/**
 * Save a single dataset resource under `owner/name/`, preserving its path
 * relative to the dataset root. Resolves once the file is fully written.
 * Resources without a `descriptor.path` resolve immediately as a no-op —
 * the original promise never settled in that case, hanging the
 * `Promise.all` in `run` forever.
 * @param owner - dataset owner (may be '')
 * @param name - dataset name
 * @param resource - data.js resource exposing `descriptor` and `stream()`
 * @returns {Promise}
 */
const saveIt = async (owner, name, resource) => {
  // We only can save if path is defined.
  if (!resource.descriptor.path) {
    return
  }
  const parsedPath = url.parse(resource.descriptor.path)
  let destPath
  if (parsedPath.protocol === 'http:' || parsedPath.protocol === 'https:') {
    // Remote resource: drop `https://host/owner/name/` (the first 5
    // slash-separated segments) to recover the dataset-relative path.
    const relativePath = resource.descriptor.path.split('/').slice(5).join('/')
    destPath = path.join(owner, name, relativePath)
  } else {
    destPath = path.join(owner, name, resource.descriptor.path)
  }
  mkdirp.sync(path.dirname(destPath))
  const stream = await resource.stream()
  // Reject on read/write errors rather than leaving the promise pending.
  return new Promise((resolve, reject) => {
    stream
      .on('error', reject)
      .pipe(fs.createWriteStream(destPath))
      .on('error', reject)
      .on('finish', () => resolve())
  })
}
// TODO: Move this somewhere to utils
/**
 * Report whether the destination directory `owner/name` is safe to write
 * into: it either does not exist yet or contains no entries.
 * @param owner - dataset owner segment of the destination path
 * @param name - dataset name segment of the destination path
 * @returns {boolean} true when the destination is absent or empty
 */
const checkDestIsEmpty = (owner, name) => {
  const dest = path.join(owner, name)
  if (!fs.existsSync(dest)) {
    return true
  }
  return fs.readdirSync(dest).length === 0
}
// Expose the destination check so other modules can reuse it.
module.exports = {
  checkDestIsEmpty
}