@cumulus/ingest
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.duplicateHandlingType = exports.generateUniqueGranuleId = exports.unversionFilename = exports.moveGranuleFile = exports.generateMoveFileParams = exports.getNameOfFile = exports.handleDuplicateFile = exports.moveGranuleFileWithVersioning = exports.listVersionedObjects = exports.renameS3FileWithTimestamp = void 0; const crypto = __importStar(require("crypto")); const moment_1 = __importDefault(require("moment")); const S3 = __importStar(require("@cumulus/aws-client/S3")); const services_1 = require("@cumulus/aws-client/services"); const log = __importStar(require("@cumulus/common/log")); const db_1 = require("@cumulus/db"); const errors = __importStar(require("@cumulus/errors")); const errors_1 = require("@cumulus/errors"); /** * rename s3 file with timestamp * * @param {string} bucket - bucket of the file * @param {string} key - s3 key of the file * @returns {Promise} promise that resolves when file is renamed */ async function renameS3FileWithTimestamp(bucket, key) { const formatString = 'YYYYMMDDTHHmmssSSS'; const timestamp = (await S3.headObject(bucket, key)).LastModified; if (!timestamp) { throw new Error(`s3://${bucket}/${key} does not have a LastModified property`); } let renamedKey = `${key}.v${moment_1.default.utc(timestamp).format(formatString)}`; // if the renamed file already exists, get a new name // eslint-disable-next-line no-await-in-loop while (await S3.s3ObjectExists({ Bucket: bucket, Key: renamedKey })) { renamedKey = `${key}.v${moment_1.default.utc(timestamp).add(1, 'milliseconds').format(formatString)}`; } log.debug(`renameS3FileWithTimestamp renaming ${bucket} ${key} to ${renamedKey}`); await S3.moveObject({ sourceBucket: bucket, sourceKey: key, destinationBucket: bucket, destinationKey: renamedKey, copyTags: true, }); } exports.renameS3FileWithTimestamp = renameS3FileWithTimestamp; /** * get all renamed s3 files for a given bucket and key * * @param {string} bucket - bucket of the file * @param {string} key - s3 key of the file * @returns {Array<Object>} returns renamed files */ async function listVersionedObjects(bucket, key) { const s3list = await S3.listS3ObjectsV2({ Bucket: bucket, Prefix: `${key}.v`, }); if (s3list) { return s3list.map(({ Key, Size }) => ({ Bucket: bucket, Key, size: Size, })); } return []; } exports.listVersionedObjects = listVersionedObjects; /** * Move granule file from one s3 
/**
 * Move a granule file from one s3 bucket & keypath to another,
 * creating a versioned copy of any file already existing at the target location
 * and returning an array of the moved file and all versioned filenames.
 *
 * @param {Object} source - source location
 * @param {string} source.Bucket - source bucket
 * @param {string} source.Key - source key
 * @param {Object} target - target location
 * @param {string} target.Bucket - target bucket
 * @param {string} target.Key - target key
 * @param {Object} sourceChecksumObject - source checksum information
 * @param {string} sourceChecksumObject.checksumType - checksum type, e.g. 'md5'
 * @param {string} sourceChecksumObject.checksum - checksum value
 * @param {string} ACL - an S3 [Canned ACL](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl)
 * @returns {Promise<Array>} returns a promise that resolves to a list of s3 version file objects.
 *
 * @private
 **/
async function moveGranuleFileWithVersioning(source, target, sourceChecksumObject = {}, ACL) {
    const { checksumType, checksum } = sourceChecksumObject;
    // compare the checksums of the existing file and the new file, and handle them accordingly
    const targetFileSum = await S3.calculateObjectHash({
        s3: (0, services_1.s3)(),
        algorithm: checksumType ?? 'CKSUM',
        bucket: target.Bucket,
        key: target.Key,
    });
    const sourceFileSum = checksum ?? await S3.calculateObjectHash({
        s3: (0, services_1.s3)(),
        algorithm: 'CKSUM',
        bucket: source.Bucket,
        key: source.Key,
    });
    // if the checksum of the existing file is the same as the new one, keep the existing file;
    // otherwise rename the existing file, so both files remain part of the granule
    if (targetFileSum === sourceFileSum) {
        await S3.deleteS3Object(source.Bucket, source.Key);
    }
    else {
        log.debug(`Renaming ${target.Key}...`);
        await renameS3FileWithTimestamp(target.Bucket, target.Key);
        await S3.moveObject({
            sourceBucket: source.Bucket,
            sourceKey: source.Key,
            destinationBucket: target.Bucket,
            destinationKey: target.Key,
            copyTags: true,
            ACL,
        });
    }
    // return renamed files
    return listVersionedObjects(target.Bucket, target.Key);
}
exports.moveGranuleFileWithVersioning = moveGranuleFileWithVersioning;
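/*
 * Sketch of the versioning flow implemented above, with hypothetical
 * locations and checksum values. If the staged file's checksum matches the
 * file already at the target, the staged copy is simply deleted; otherwise
 * the existing target is renamed with a timestamp suffix first, so both
 * versions survive:
 *
 *   const versionedFiles = await moveGranuleFileWithVersioning(
 *     { Bucket: 'staging-bucket', Key: 'stack/file.hdf' },   // source
 *     { Bucket: 'protected-bucket', Key: 'path/file.hdf' },  // target
 *     { checksumType: 'md5', checksum: 'abc123...' }         // optional; skips re-hashing the source
 *   );
 *   // => [{ Bucket: 'protected-bucket', Key: 'path/file.hdf.v20240101T000000000', size: ... }]
 */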
/**
 * Handle duplicate files in S3 syncs and moves.
 *
 * @param {Object} params - params object
 * @param {Object} params.source - source object: { Bucket, Key }
 * @param {Object} params.target - target object: { Bucket, Key }
 * @param {string} params.ACL - an S3 [Canned ACL](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl)
 * @param {string} params.duplicateHandling - duplicateHandling config string.
 * One of [`error`, `skip`, `replace`, `version`].
 * @param {Function} [params.checksumFunction] - optional function to verify source & target:
 * Called as `await checksumFunction(bucket, key);`, expected to return an array where:
 * array[0] - string - checksum type
 * array[1] - string - checksum value
 * For an example of partial application of expected values see `ingestFile` in this module.
 * @param {Function} [params.syncFileFunction] - optional function to sync a file from a non-s3 source.
 * Syncs to the temporary source location for the `version` case and to the target location
 * for the `replace` case.
 * Called as `await syncFileFunction(bucket, key);`, expected to create the file on S3.
 * For an example of a function prepared with partial application see `ingestFile` in this module.
 * @param {Function} [params.moveGranuleFileWithVersioningFunction] - optional -
 * override for moveGranuleFileWithVersioning. Defaults to this module's method.
 * @param {Object} [params.s3Object] - optional - replacement for the S3 import object,
 * intended for use in testing
 * @throws {DuplicateFile} DuplicateFile error in the `error` case.
 * @returns {Promise<Array<Object>>} list of file version S3 objects in the `version` case, otherwise empty.
 */
async function handleDuplicateFile(params) {
    const {
        ACL,
        checksumFunction,
        duplicateHandling,
        fileRemotePath,
        moveGranuleFileWithVersioningFunction = moveGranuleFileWithVersioning,
        s3Object = S3,
        source,
        sourceBucket,
        syncFileFunction,
        target,
    } = params;
    if (duplicateHandling === 'error') {
        // Have to throw DuplicateFile and not WorkflowError, because the latter
        // is not treated as a failure by the message adapter.
        throw new errors.DuplicateFile(`${target.Key} already exists in ${target.Bucket} bucket`);
    }
    else if (duplicateHandling === 'version') {
        // sync to staging location if required
        if (syncFileFunction) {
            if (!fileRemotePath) {
                throw new Error('fileRemotePath must be defined if syncFileFunction is provided');
            }
            await syncFileFunction({
                bucket: sourceBucket,
                destinationBucket: source.Bucket,
                destinationKey: source.Key,
                fileRemotePath,
            });
        }
        let sourceChecksumObject = {};
        if (checksumFunction) {
            // verify integrity
            const [checksumType, checksum] = await checksumFunction(source.Bucket, source.Key);
            sourceChecksumObject = { checksumType, checksum };
        }
        // return list of renamed files
        return moveGranuleFileWithVersioningFunction(source, target, sourceChecksumObject, ACL);
    }
    else if (duplicateHandling === 'replace') {
        if (syncFileFunction) {
            if (!fileRemotePath) {
                throw new Error('fileRemotePath must be defined if syncFileFunction is provided');
            }
            // sync directly to target location
            await syncFileFunction({
                destinationBucket: target.Bucket,
                destinationKey: target.Key,
                bucket: sourceBucket,
                fileRemotePath,
            });
        }
        else {
            await s3Object.moveObject({
                ACL,
                copyTags: true,
                destinationBucket: target.Bucket,
                destinationKey: target.Key,
                sourceBucket: source.Bucket,
                sourceKey: source.Key,
            });
        }
        // verify integrity after sync/move
        if (checksumFunction) await checksumFunction(target.Bucket, target.Key);
    }
    // other values (including `skip`) fall through and return an empty list
    return [];
}
exports.handleDuplicateFile = handleDuplicateFile;
/**
 * Get the name of the file from the following properties (in order of preference):
 * 1. fileName (e.g. 'granuleFileName.md')
 * 2. name (e.g. 'granuleFileName.md')
 * 3. key (e.g. 'stackname/filepath/granuleFileName.md')
 *
 * @param {File} file - file object with the above properties
 * @returns {string | undefined} - the file name as a string, or undefined
 */
const getNameOfFile = (file) => {
    const fileName = file.fileName ?? file.name;
    if (!fileName) {
        return file.key?.split('/').pop();
    }
    return fileName;
};
exports.getNameOfFile = getNameOfFile;
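/*
 * Dispatch sketch for handleDuplicateFile with hypothetical values. For the
 * pure s3-to-s3 case only `source`, `target`, and `duplicateHandling` are
 * needed; `version` or `replace` with a syncFileFunction also requires
 * fileRemotePath:
 *
 *   const versions = await handleDuplicateFile({
 *     source: { Bucket: 'staging-bucket', Key: 'stack/file.hdf' },
 *     target: { Bucket: 'protected-bucket', Key: 'path/file.hdf' },
 *     duplicateHandling: 'version',  // or 'error' | 'skip' | 'replace'
 *   });
 *
 * getNameOfFile prefers fileName, then name, then the last key segment:
 *
 *   getNameOfFile({ key: 'stackname/filepath/granuleFileName.md' });
 *   // => 'granuleFileName.md'
 */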
/**
 * For each source file, see if there is a destination and generate the source
 * and target for the file moves.
 *
 * @param {Array<Object>} sourceFiles - granule file objects
 * @param {Array<Object>} destinations - array of objects defining the destination of granule files
 * @returns {Array<Object>} - array containing the parameters for moving the file:
 * {
 *   source: { Bucket, Key },
 *   target: { Bucket, Key },
 *   file: file object
 * }
 */
function generateMoveFileParams(sourceFiles, destinations) {
    return sourceFiles.map((file) => {
        const fileName = (0, exports.getNameOfFile)(file);
        if (fileName === undefined) return { file };
        const destination = destinations.find((dest) => fileName.match(dest.regex));
        // if there's no match, we skip the file
        if (!destination) return { file };
        let source;
        if (file.bucket && file.key) {
            source = {
                Bucket: file.bucket,
                Key: file.key,
            };
        }
        else if (file.filename) {
            source = S3.parseS3Uri(file.filename);
        }
        else {
            throw new Error(`granule.generateMoveFileParams unable to determine location of file: ${JSON.stringify(file)}`);
        }
        const targetKey = destination.filepath
            ? `${destination.filepath}/${(0, exports.getNameOfFile)(file)}`
            : (0, exports.getNameOfFile)(file);
        if (targetKey === undefined) {
            return { file };
        }
        const target = {
            Bucket: destination.bucket,
            Key: targetKey,
        };
        return { source, target, file };
    });
}
exports.generateMoveFileParams = generateMoveFileParams;
/**
 * Moves a granule file and updates the datastore accordingly.
 *
 * @summary Moves a granule file record according to MoveFileParams and updates the database accordingly
 * @param {MoveFileParams} moveFileParam - parameter object describing the move operation
 * @param {FilePgModel} filesPgModel - FilePgModel instance
 * @param {Knex.Transaction | Knex} trx - Knex transaction or (optionally) Knex object
 * @param {number | undefined} postgresCumulusGranuleId - postgres internal granule id
 * @returns {Promise<Object>} - returns an object of type Omit<ApiFile, 'granuleId'>
 */
async function moveGranuleFile(moveFileParam, filesPgModel, trx, postgresCumulusGranuleId) {
    const { source, target, file } = moveFileParam;
    if (source && target) {
        log.debug('moveGranuleS3Object', source, target);
        if (!postgresCumulusGranuleId) {
            throw new Error('postgresCumulusGranuleId must be defined to move granule file if writeToPostgres is true');
        }
        const updatedPgRecords = await filesPgModel.update(trx, {
            granule_cumulus_id: postgresCumulusGranuleId,
            bucket: source.Bucket,
            key: source.Key,
        }, {
            bucket: target.Bucket,
            key: target.Key,
            file_name: (0, exports.getNameOfFile)(file),
        }, ['*']);
        if (updatedPgRecords.length !== 1) {
            throw new errors_1.RecordDoesNotExist('Attempted to update granule on move, but granule does not exist');
        }
        const updatedPgRecord = updatedPgRecords[0];
        await S3.moveObject({
            sourceBucket: source.Bucket,
            sourceKey: source.Key,
            destinationBucket: target.Bucket,
            destinationKey: target.Key,
            copyTags: true,
        });
        return {
            ...(0, db_1.translatePostgresFileToApiFile)(updatedPgRecord),
            bucket: target.Bucket,
            key: target.Key,
            fileName: (0, exports.getNameOfFile)({ key: target.Key }),
        };
    }
    if (!(file.bucket || file.key) && file.filename) {
        const parsed = S3.parseS3Uri(file.filename);
        file.bucket = parsed.Bucket;
        file.key = parsed.Key;
    }
    const postgresFileRecord = await filesPgModel.get(trx, {
        granule_cumulus_id: postgresCumulusGranuleId,
        bucket: file.bucket,
        key: file.key,
    });
    return (0, db_1.translatePostgresFileToApiFile)(postgresFileRecord);
}
exports.moveGranuleFile = moveGranuleFile;
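/*
 * Sketch of generateMoveFileParams with hypothetical files and destinations.
 * Each destination's `regex` is matched against the file name; unmatched
 * files come back as bare { file } entries with no source/target, which
 * callers then skip:
 *
 *   const moveParams = generateMoveFileParams(
 *     [{ bucket: 'old-bucket', key: 'stack/granule.hdf', fileName: 'granule.hdf' }],
 *     [{ regex: '.*\\.hdf$', bucket: 'new-bucket', filepath: 'new/path' }]
 *   );
 *   // => [{
 *   //   source: { Bucket: 'old-bucket', Key: 'stack/granule.hdf' },
 *   //   target: { Bucket: 'new-bucket', Key: 'new/path/granule.hdf' },
 *   //   file: { ... },
 *   // }]
 *
 * Each entry can then be passed to moveGranuleFile along with a FilePgModel
 * instance and a Knex transaction to move the object and its database record.
 */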
/**
 * Check whether the file name carries the timestamp suffix '.vYYYYMMDDTHHmmssSSS'.
 *
 * @param {string} filename - name of the file
 * @returns {boolean} whether the file has been renamed
 */
function isFileRenamed(filename) {
    const suffixRegex = '\\.v[0-9]{4}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])T(2[0-3]|[01][0-9])[0-5][0-9][0-5][0-9][0-9]{3}$';
    return (filename.match(suffixRegex) !== null);
}
/**
 * Returns the input filename, stripping off any versioned timestamp.
 *
 * @param {string} filename
 * @returns {string} - filename with timestamp removed
 */
function unversionFilename(filename) {
    return isFileRenamed(filename) ? filename.split('.').slice(0, -1).join('.') : filename;
}
exports.unversionFilename = unversionFilename;
/**
 * Generates a unique granule ID by appending a truncated MD5 hash derived from
 * the granule's collection ID (and, optionally, a high-resolution timestamp).
 *
 * @param id - an ID associated with the object to be hashed, likely the ID
 * assigned by the granule producer
 * @param collectionId - the api collection ID (name___version) associated with the granule
 * @param hashLength - the length of the hash to append to the granuleId
 * @param includeTimestampHashKey - whether the hash string should contain a timestamp
 * @returns - a unique granule ID in the format: granuleId_hash
 */
function generateUniqueGranuleId(id, collectionId, hashLength, includeTimestampHashKey) {
    // use MD5 to generate a truncated hash of the collection ID, optionally
    // salted with a high-resolution timestamp
    const hashStringWithTimestamp = `${collectionId}_${process.hrtime.bigint().toString()}`;
    const hashStringWithoutTimestamp = `${collectionId}`;
    const hashString = includeTimestampHashKey ? hashStringWithTimestamp : hashStringWithoutTimestamp;
    const hashBuffer = crypto.createHash('md5').update(hashString).digest();
    return `${id}_${hashBuffer.toString('base64url').replace(/_/g, '').slice(0, hashLength)}`;
}
exports.generateUniqueGranuleId = generateUniqueGranuleId;
/**
 * Returns a directive on how to act when duplicate files are encountered.
 *
 * @param {Object} event - lambda function event
 * @param {Object} event.config - the config object
 * @param {Object} event.config.collection - collection object
 * @returns {DuplicateHandling} - duplicate handling directive
 */
function duplicateHandlingType(event) {
    if (event?.cumulus_config?.cumulus_context?.forceDuplicateOverwrite) {
        return 'replace';
    }
    return event.config.duplicateHandling
        ?? event.config.collection.duplicateHandling
        ?? 'error';
}
exports.duplicateHandlingType = duplicateHandlingType;
//# sourceMappingURL=granule.js.map
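/*
 * Quick sketches for the helpers above, with hypothetical inputs:
 *
 *   unversionFilename('file.hdf.v20240101T120000000');  // => 'file.hdf'
 *   unversionFilename('file.hdf');                      // => 'file.hdf' (unchanged)
 *
 *   // Appends a truncated, base64url-encoded MD5 hash of the collection ID;
 *   // with includeTimestampHashKey=true the hash also varies per call.
 *   generateUniqueGranuleId('granule-001', 'MOD09GQ___006', 8, false);
 *   // => e.g. 'granule-001_a1B2c3D4' (deterministic for a given collection ID)
 *
 *   // forceDuplicateOverwrite in the workflow context wins over any config:
 *   duplicateHandlingType({
 *     cumulus_config: { cumulus_context: { forceDuplicateOverwrite: true } },
 *     config: { collection: { duplicateHandling: 'version' } },
 *   });
 *   // => 'replace'
 */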