firestore-to-bigquery-export
Version:
NPM package for copying and converting Firestore data to BigQuery.
298 lines (262 loc) • 8.54 kB
JavaScript
/*!
* firestore-to-bigquery-export
*
* Copyright © 2019-2020 Johannes Berggren <johannes@berggren.co>
* MIT Licensed
*
*/
/**
* Module dependencies.
*
* @private
*/
const { BigQuery } = require('@google-cloud/bigquery'),
firebase = require('firebase-admin')
// Module-level state shared by the functions in this file.
// bigQuery:   replaced with a BigQuery client by setBigQueryConfig.
// currentRow: the row being assembled for the document currently processed
//             by copyToBigQuery; _formatProp writes flattened values into it.
// firestore:  replaced with a Firestore handle by setFirebaseConfig.
let bigQuery = {}
let currentRow = {}
let firestore = {}
/**
* Connecting to the given Firebase project.
*
* @param {JSON} serviceAccountFile
* @public
*/
exports.setFirebaseConfig = serviceAccountFile => {
firestore = firebase.initializeApp({
credential: firebase.credential.cert(serviceAccountFile)
}, 'firestore-to-bigquery-export-instance').firestore()
}
/**
* Connecting to the given BigQuery project.
*
* @param {JSON} serviceAccountFile
* @public
*/
exports.setBigQueryConfig = serviceAccountFile => {
bigQuery = new BigQuery({
projectId: serviceAccountFile.project_id,
credentials: serviceAccountFile
})
}
/**
* Runs through all documents in the given collection to ensure all properties are added to the schema.
* Generating schema. Creating a table with the created schema in the given dataset.
*
* @param {string} datasetID
* @param {string} collectionName
* @param {boolean} [verbose = false]
* @param {Array<string>} exclude
* @returns {Promise<BigQuery.Table>}
* @private
*/
exports.createBigQueryTable = async (datasetID, collectionName, verbose = false, exclude = []) => {
const index = {}
const options = {
schema: {
fields: [
{
name: 'doc_ID',
type: 'STRING',
mode: 'REQUIRED'
}
]
}
}
const snapshot = await firestore.collection(collectionName).get()
if (verbose) console.log('Creating schema and table ' + collectionName + '.')
snapshot.forEach(document => {
document = document.data()
Object.keys(document).forEach(propName => {
if (!exclude.includes(propName)) {
const schemaField = _getSchemaField(document[propName], propName)
if (schemaField !== undefined) {
if (!Object.prototype.hasOwnProperty.call(index, schemaField.name)) {
options.schema.fields.push(schemaField)
schemaField.index = options.schema.fields.length - 1
index[schemaField.name] = schemaField
}
else {
const currentValue = index[schemaField.name]
if (schemaField.type === 'FLOAT' && currentValue.type === 'INTEGER') {
options.schema.fields[currentValue.index] = schemaField
index[schemaField.name].type = 'FLOAT'
}
}
}
}
})
})
if (verbose) {
console.log('Completed schema generation for table ' + collectionName + ':')
options.schema.fields.forEach(o => {
console.log(o)
})
}
return bigQuery.dataset(datasetID).createTable(collectionName, options)
/**
* Determines schema field properties based on the given document property.
*
* @param {string||number||Array||Object} val
* @param {string} propName
* @param {string} parent
* @returns {Object||undefined}
* @private
*/
function _getSchemaField (val, propName, parent) {
const field = {
name: parent ? parent + '__' + propName : propName,
mode: '',
type: ''
}
if (val === null) {
field.type = 'STRING'
field.mode = 'NULLABLE'
return field
}
else if (typeof val === 'undefined') {
field.type = 'STRING'
field.mode = 'NULLABLE'
return field
}
else if (typeof val === 'string') {
field.type = 'STRING'
field.mode = 'NULLABLE'
return field
}
else if (typeof val === 'number' && !isNaN(val)) {
Number.isInteger(val) ? field.type = 'INTEGER' : field.type = 'FLOAT'
field.mode = 'NULLABLE'
return field
}
else if (typeof val === 'boolean') {
field.type = 'BOOL'
field.mode = 'NULLABLE'
return field
}
else if (Array.isArray(val)) {
for (let i = 0; i < val.length; i++) {
const schemaField = _getSchemaField(val[i], i, field.name)
if (schemaField !== undefined && !Object.prototype.hasOwnProperty.call(index, schemaField.name)) {
options.schema.fields.push(schemaField)
index[schemaField.name] = schemaField
}
}
return undefined
}
else if (typeof val === 'object' && Object.keys(val).length) {
Object.keys(val).forEach(subPropName => {
const schemaField = _getSchemaField(val[subPropName], subPropName, field.name)
if (schemaField !== undefined && !Object.prototype.hasOwnProperty.call(index, schemaField.name)) {
options.schema.fields.push(schemaField)
index[schemaField.name] = schemaField
}
})
return undefined
}
}
}
/**
* Runs through the given QuerySnapshot and converts and copies it to an array.
* Inserts the array into a BigQuery table with the given collectionName.
*
* @param {string} datasetID
* @param {string} collectionName
* @param {firebase.firestore.QuerySnapshot} snapshot
* @param {boolean} [verbose = false]
* @param {Number} [insertSize = 5000]
* @param {Array<string>} exclude
* @returns {Promise<Number>}
* @public
*/
exports.copyToBigQuery = async (datasetID, collectionName, snapshot, verbose = false, insertSize = 5000, exclude = []) => {
if (verbose) {
console.log('Copying ' + snapshot.docs.length + ' documents from collection ' + collectionName + ' to dataset ' + datasetID + '.')
console.log('Inserting ' + insertSize + ' documents at a time.')
}
let counter = 0
let rows = []
try {
for (let i = 0; i < snapshot.docs.length; i++) {
const data = snapshot.docs[i].data()
currentRow = {
doc_ID: snapshot.docs[i].id
}
Object.keys(data).forEach(propName => {
if (!exclude.includes(propName)) {
const formattedProp = _formatProp(data[propName], propName)
if (formattedProp !== undefined) currentRow[_formatName(propName)] = formattedProp
}
})
rows.push(currentRow)
counter++
if (rows.length === insertSize || i === snapshot.docs.length - 1) {
if (verbose) console.log('Inserting ' + rows.length + ' docs. ' + (snapshot.docs.length - i - 1) + ' docs left.')
await bigQuery.dataset(datasetID).table(collectionName).insert(rows)
rows = []
}
}
if (verbose) console.log('Successfully copied collection ' + collectionName + ' to BigQuery.')
return counter
}
catch (e) {
console.error(e)
if (e.errors.length) {
console.error(e.errors.length + ' errors.')
for (let z = 0; z < e.errors.length; z++) {
console.error(e.errors[z])
}
}
}
}
/**
 * Converts one Firestore property to a value suitable for a BigQuery row.
 *
 * Strings, booleans, null and numbers are returned as they are, except NaN,
 * which is skipped: the schema built by `_getSchemaField` has no column for
 * NaN values, so emitting one could put a field into the row that the table
 * does not have.
 *
 * Arrays and non-empty objects are flattened: every member is written
 * directly into the module-level `currentRow` under the name produced by
 * `_formatName`, and `undefined` is returned so the caller adds nothing
 * itself. Everything else (undefined, empty objects, ...) yields `undefined`.
 *
 * @param {string|number|boolean|Array|Object|null|undefined} val
 * @param {string|number} propName Property name, or the index for array members.
 * @param {string} [parent] Flattened name of the enclosing property, if any.
 * @returns {string|number|boolean|null|undefined}
 * @private
 */
function _formatProp (val, propName, parent) {
  if (typeof val === 'number') return Number.isNaN(val) ? undefined : val
  if (val === null || typeof val === 'string' || typeof val === 'boolean') return val

  const name = _formatName(propName, parent)

  if (Array.isArray(val)) {
    for (let i = 0; i < val.length; i++) {
      const formattedProp = _formatProp(val[i], i, name)
      if (formattedProp !== undefined) currentRow[_formatName(i, name)] = formattedProp
    }
  }
  else if (typeof val === 'object' && Object.keys(val).length) {
    Object.keys(val).forEach(subPropName => {
      const formattedProp = _formatProp(val[subPropName], subPropName, name)
      if (formattedProp !== undefined) currentRow[_formatName(subPropName, name)] = formattedProp
    })
  }

  return undefined
}
/**
 * Builds the flattened column name for a property.
 * Without a (truthy) parent the property name is returned untouched;
 * otherwise the result is the parent name, two underscores, and the
 * property name.
 *
 * @param {string|number} propName
 * @param {string} [parent = undefined]
 * @returns {string|number}
 * @private
 */
function _formatName (propName, parent = undefined) {
  if (!parent) return propName

  return parent + '__' + propName
}
/**
* Deletes all the given tables.
*
* @param {string} datasetID
* @param {String} tableName
* @param {boolean} [verbose = false]
* @returns {Promise<number>}
* @public
*/
exports.deleteBigQueryTable = (datasetID, tableName, verbose = false) => {
if (verbose) console.log('Deleting table ' + tableName + ' from dataset ' + datasetID + '.')
return bigQuery.dataset(datasetID).table(tableName).delete()
}