// @cumulus/ingest - Ingest utilities (granule.js)
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.duplicateHandlingType = exports.generateUniqueGranuleId = exports.unversionFilename = exports.moveGranuleFile = exports.generateMoveFileParams = exports.getNameOfFile = exports.handleDuplicateFile = exports.moveGranuleFileWithVersioning = exports.listVersionedObjects = exports.renameS3FileWithTimestamp = void 0;
const crypto = __importStar(require("crypto"));
const moment_1 = __importDefault(require("moment"));
const S3 = __importStar(require("@cumulus/aws-client/S3"));
const services_1 = require("@cumulus/aws-client/services");
const log = __importStar(require("@cumulus/common/log"));
const db_1 = require("@cumulus/db");
const errors = __importStar(require("@cumulus/errors"));
const errors_1 = require("@cumulus/errors");
/**
 * Rename an S3 object by appending a timestamp suffix derived from its
 * LastModified time
 *
 * @param {string} bucket - bucket of the file
 * @param {string} key - s3 key of the file
 * @returns {Promise<void>} promise that resolves when the file is renamed
 */
async function renameS3FileWithTimestamp(bucket, key) {
const formatString = 'YYYYMMDDTHHmmssSSS';
const timestamp = (await S3.headObject(bucket, key)).LastModified;
if (!timestamp) {
throw new Error(`s3://${bucket}/${key} does not have a LastModified property`);
}
    const time = moment_1.default.utc(timestamp);
    let renamedKey = `${key}.v${time.format(formatString)}`;
    // if the renamed key already exists, advance the timestamp by 1 ms per
    // iteration until an unused key is found (moment's add() mutates `time`,
    // so each pass produces a new candidate key)
    // eslint-disable-next-line no-await-in-loop
    while (await S3.s3ObjectExists({ Bucket: bucket, Key: renamedKey })) {
        renamedKey = `${key}.v${time.add(1, 'milliseconds').format(formatString)}`;
    }
log.debug(`renameS3FileWithTimestamp renaming ${bucket} ${key} to ${renamedKey}`);
await S3.moveObject({
sourceBucket: bucket,
sourceKey: key,
destinationBucket: bucket,
destinationKey: renamedKey,
copyTags: true,
});
}
exports.renameS3FileWithTimestamp = renameS3FileWithTimestamp;
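// A minimal usage sketch (hypothetical bucket/key; assumes AWS credentials and
// an existing object at the source key):
//
//   await renameS3FileWithTimestamp('my-bucket', 'granules/file.hdf');
//   // the object now lives at e.g. s3://my-bucket/granules/file.hdf.v20240101T120000000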
/**
* get all renamed s3 files for a given bucket and key
*
* @param {string} bucket - bucket of the file
* @param {string} key - s3 key of the file
 * @returns {Promise<Array<Object>>} promise resolving to the renamed file objects
*/
async function listVersionedObjects(bucket, key) {
const s3list = await S3.listS3ObjectsV2({
Bucket: bucket,
Prefix: `${key}.v`,
});
if (s3list) {
return s3list.map(({ Key, Size }) => ({
Bucket: bucket,
Key,
size: Size,
}));
}
return [];
}
exports.listVersionedObjects = listVersionedObjects;
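// Example (hypothetical names): listing the timestamped versions that
// renameS3FileWithTimestamp produces for a key.
//
//   const versions = await listVersionedObjects('my-bucket', 'granules/file.hdf');
//   // => [{ Bucket: 'my-bucket', Key: 'granules/file.hdf.v20240101T120000000', size: 1024 }, ...]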
/**
* Move granule file from one s3 bucket & keypath to another,
* creating a versioned copy of any file already existing at the target location
* and returning an array of the moved file and all versioned filenames.
*
 * @param {Object} source - source location
 * @param {string} source.Bucket - source bucket
 * @param {string} source.Key - source key
 * @param {Object} target - target location
 * @param {string} target.Bucket - target bucket
 * @param {string} target.Key - target key
 * @param {Object} sourceChecksumObject - source checksum information
 * @param {string} sourceChecksumObject.checksumType - checksum type, e.g. 'md5'
 * @param {string} sourceChecksumObject.checksum - checksum value
 * @param {string} ACL - an S3 [Canned ACL](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl)
* @returns {Promise<Array>} returns a promise that resolves to a list of s3 version file objects.
*
* @private
**/
async function moveGranuleFileWithVersioning(source, target, sourceChecksumObject = {}, ACL) {
const { checksumType, checksum } = sourceChecksumObject;
// compare the checksum of the existing file and new file, and handle them accordingly
const targetFileSum = await S3.calculateObjectHash({
s3: (0, services_1.s3)(),
algorithm: checksumType ?? 'CKSUM',
bucket: target.Bucket,
key: target.Key,
});
const sourceFileSum = checksum ?? await S3.calculateObjectHash({
s3: (0, services_1.s3)(),
algorithm: 'CKSUM',
bucket: source.Bucket,
key: source.Key,
});
    // if the checksum of the existing file matches the new one, keep the existing
    // file; otherwise rename the existing file so both versions remain part of the granule
if (targetFileSum === sourceFileSum) {
await S3.deleteS3Object(source.Bucket, source.Key);
}
else {
log.debug(`Renaming ${target.Key}...`);
await renameS3FileWithTimestamp(target.Bucket, target.Key);
await S3.moveObject({
sourceBucket: source.Bucket,
sourceKey: source.Key,
destinationBucket: target.Bucket,
destinationKey: target.Key,
copyTags: true,
ACL,
});
}
// return renamed files
return listVersionedObjects(target.Bucket, target.Key);
}
exports.moveGranuleFileWithVersioning = moveGranuleFileWithVersioning;
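// Sketch of the versioning flow (hypothetical locations; ACL omitted). With
// matching checksums the staged copy is simply deleted; otherwise the existing
// target is renamed with a timestamp before the move.
//
//   const versions = await moveGranuleFileWithVersioning(
//     { Bucket: 'staging-bucket', Key: 'stage/file.hdf' },
//     { Bucket: 'protected-bucket', Key: 'granules/file.hdf' },
//     { checksumType: 'md5', checksum: 'abc123' }
//   );
//   // `versions` lists every timestamped copy remaining at the target key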
/**
* handle duplicate file in S3 syncs and moves
*
* @param {Object} params - params object
* @param {Object} params.source - source object: { Bucket, Key }
* @param {Object} params.target - target object: { Bucket, Key }
* @param {string} params.ACL - an S3 [Canned ACL](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl)
* @param {string} params.duplicateHandling - duplicateHandling config string
* One of [`error`, `skip`, `replace`, `version`].
 * @param {Function} [params.checksumFunction] - optional function to verify source & target:
 * Called as `await checksumFunction(bucket, key);`, expected to return an array where:
 * array[0] - string - checksum type
 * array[1] - string - checksum value
 * For an example of partial application supplying the expected values, see `ingestFile` in this module.
 * @param {Function} [params.syncFileFunction] - optional function to sync file from a non-s3 source.
 * Syncs to a temporary source location for the `version` case and to the target location for the `replace` case.
 * Called as `await syncFileFunction(bucket, key);`, expected to create the file on S3.
 * For an example of a function prepared with partial application, see `ingestFile` in this module.
* @param {Function} [params.moveGranuleFileWithVersioningFunction] - optional -
* override for moveGranuleFileWithVersioning. Defaults to local module method
* @param {Object} [params.s3Object] - optional - replacement for S3 import object,
* intended for use in testing
* @throws {DuplicateFile} DuplicateFile error in `error` case.
 * @returns {Promise<Array<Object>>} List of file version S3 objects in the `version` case, otherwise empty.
*/
async function handleDuplicateFile(params) {
const { ACL, checksumFunction, duplicateHandling, fileRemotePath, moveGranuleFileWithVersioningFunction = moveGranuleFileWithVersioning, s3Object = S3, source, sourceBucket, syncFileFunction, target, } = params;
if (duplicateHandling === 'error') {
// Have to throw DuplicateFile and not WorkflowError, because the latter
// is not treated as a failure by the message adapter.
throw new errors.DuplicateFile(`${target.Key} already exists in ${target.Bucket} bucket`);
}
else if (duplicateHandling === 'version') {
// sync to staging location if required
if (syncFileFunction) {
if (!fileRemotePath) {
throw new Error('fileRemotePath must be defined if syncFileFunction is provided');
}
await syncFileFunction({
bucket: sourceBucket,
destinationBucket: source.Bucket,
destinationKey: source.Key,
fileRemotePath,
});
}
let sourceChecksumObject = {};
if (checksumFunction) {
// verify integrity
const [checksumType, checksum] = await checksumFunction(source.Bucket, source.Key);
sourceChecksumObject = { checksumType, checksum };
}
// return list of renamed files
return moveGranuleFileWithVersioningFunction(source, target, sourceChecksumObject, ACL);
}
else if (duplicateHandling === 'replace') {
if (syncFileFunction) {
if (!fileRemotePath) {
throw new Error('fileRemotePath must be defined if syncFileFunction is provided');
}
// sync directly to target location
await syncFileFunction({
destinationBucket: target.Bucket,
destinationKey: target.Key,
bucket: sourceBucket,
fileRemotePath,
});
}
else {
await s3Object.moveObject({
ACL,
copyTags: true,
destinationBucket: target.Bucket,
destinationKey: target.Key,
sourceBucket: source.Bucket,
sourceKey: source.Key,
});
}
// verify integrity after sync/move
if (checksumFunction)
await checksumFunction(target.Bucket, target.Key);
}
    // other duplicateHandling values (including 'skip') fall through and return an empty list
return [];
}
exports.handleDuplicateFile = handleDuplicateFile;
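// Usage sketch for the `version` case (hypothetical buckets, keys, and
// md5OfObject helper; see the JSDoc above for the full contract):
//
//   const versionedFiles = await handleDuplicateFile({
//     source: { Bucket: 'staging-bucket', Key: 'stage/file.hdf' },
//     target: { Bucket: 'protected-bucket', Key: 'granules/file.hdf' },
//     duplicateHandling: 'version',
//     checksumFunction: async (bucket, key) => ['md5', await md5OfObject(bucket, key)],
//   });
//   // with duplicateHandling: 'error' the call would throw DuplicateFile instead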
/**
* Get the name of the file from the following properties (in order of preference):
 * 1. fileName (e.g. 'granuleFileName.md')
* 2. name (e.g. 'granuleFileName.md')
* 3. key (e.g. 'stackname/filepath/granuleFileName.md')
* @param {File} file - file object with the above properties
* @returns {string | undefined} - The file name as a string or undefined
*/
const getNameOfFile = (file) => {
const fileName = file.fileName ?? file.name;
if (!fileName) {
return file.key?.split('/').pop();
}
return fileName;
};
exports.getNameOfFile = getNameOfFile;
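// Examples of the preference order (all values hypothetical):
//
//   getNameOfFile({ fileName: 'a.md', name: 'b.md', key: 'stack/path/c.md' }); // => 'a.md'
//   getNameOfFile({ name: 'b.md', key: 'stack/path/c.md' });                   // => 'b.md'
//   getNameOfFile({ key: 'stack/path/c.md' });                                 // => 'c.md'
//   getNameOfFile({});                                                         // => undefined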
/**
* For each source file, see if there is a destination and generate the source
* and target for the file moves.
* @param {Array<Object>} sourceFiles - granule file objects
* @param {Array<Object>} destinations - array of objects defining the destination of granule files
* @returns {Array<Object>} - array containing the parameters for moving the file:
* {
* source: { Bucket, Key },
* target: { Bucket, Key },
* file: file object
* }
*/
function generateMoveFileParams(sourceFiles, destinations) {
return sourceFiles.map((file) => {
const fileName = (0, exports.getNameOfFile)(file);
if (fileName === undefined)
return { file };
const destination = destinations.find((dest) => fileName.match(dest.regex));
// if there's no match, we skip the file
if (!destination)
return { file };
let source;
if (file.bucket && file.key) {
source = {
Bucket: file.bucket,
Key: file.key,
};
}
else if (file.filename) {
source = S3.parseS3Uri(file.filename);
}
else {
throw new Error(`granule.generateMoveFileParams unable to determine location of file: ${JSON.stringify(file)}`);
}
const targetKey = destination.filepath
? `${destination.filepath}/${(0, exports.getNameOfFile)(file)}`
: (0, exports.getNameOfFile)(file);
if (targetKey === undefined) {
return { file };
}
const target = {
Bucket: destination.bucket,
Key: targetKey,
};
return { source, target, file };
});
}
exports.generateMoveFileParams = generateMoveFileParams;
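// Example pairing of files to destinations (hypothetical values): the first
// destination whose regex matches the file name wins.
//
//   generateMoveFileParams(
//     [{ bucket: 'src-bucket', key: 'path/file.hdf', fileName: 'file.hdf' }],
//     [{ regex: '.*\\.hdf$', bucket: 'dest-bucket', filepath: 'granules' }]
//   );
//   // => [{ source: { Bucket: 'src-bucket', Key: 'path/file.hdf' },
//   //        target: { Bucket: 'dest-bucket', Key: 'granules/file.hdf' },
//   //        file: { ... } }]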
/**
 * Moves a granule file and updates the datastore accordingly
 * @summary Moves a granule file per the given MoveFileParams and updates the Postgres file record to match
 * @param {MoveFileParams} moveFileParam - parameter object describing the move operation
 * @param {FilePgModel} filesPgModel - FilePgModel instance
 * @param {Knex.Transaction | Knex} trx - Knex transaction or (optionally) Knex object
 * @param {number | undefined} postgresCumulusGranuleId - postgres internal granule id
 * @returns {Promise<Object>} - returns an object of type Omit<ApiFile, 'granuleId'>
 */
async function moveGranuleFile(moveFileParam, filesPgModel, trx, postgresCumulusGranuleId) {
const { source, target, file } = moveFileParam;
if (source && target) {
log.debug('moveGranuleS3Object', source, target);
if (!postgresCumulusGranuleId) {
            throw new Error('postgresCumulusGranuleId must be defined to move a granule file');
}
const updatedPgRecords = await filesPgModel.update(trx, {
granule_cumulus_id: postgresCumulusGranuleId,
bucket: source.Bucket,
key: source.Key,
}, {
bucket: target.Bucket,
key: target.Key,
file_name: (0, exports.getNameOfFile)(file),
}, ['*']);
if (updatedPgRecords.length !== 1) {
throw new errors_1.RecordDoesNotExist('Attempted to update granule on move, but granule does not exist');
}
const updatedPgRecord = updatedPgRecords[0];
await S3.moveObject({
sourceBucket: source.Bucket,
sourceKey: source.Key,
destinationBucket: target.Bucket,
destinationKey: target.Key,
copyTags: true,
});
return {
...(0, db_1.translatePostgresFileToApiFile)(updatedPgRecord),
bucket: target.Bucket,
key: target.Key,
fileName: (0, exports.getNameOfFile)({ key: target.Key }),
};
}
if (!(file.bucket || file.key) && file.filename) {
const parsed = S3.parseS3Uri(file.filename);
file.bucket = parsed.Bucket;
file.key = parsed.Key;
}
const postgresFileRecord = await filesPgModel.get(trx, {
granule_cumulus_id: postgresCumulusGranuleId,
bucket: file.bucket,
key: file.key,
});
return (0, db_1.translatePostgresFileToApiFile)(postgresFileRecord);
}
exports.moveGranuleFile = moveGranuleFile;
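// Usage sketch inside a Knex transaction (hypothetical ids; assumes FilePgModel
// is imported from @cumulus/db and moveFileParam came from generateMoveFileParams):
//
//   await knex.transaction((trx) =>
//     moveGranuleFile(moveFileParam, new FilePgModel(), trx, granuleCumulusId));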
/**
 * Check whether the filename carries the timestamp suffix '.vYYYYMMDDTHHmmssSSS'
 *
 * @param {string} filename - name of the file
 * @returns {boolean} whether the file has been renamed with a version timestamp
 */
function isFileRenamed(filename) {
const suffixRegex = '\\.v[0-9]{4}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])T(2[0-3]|[01][0-9])[0-5][0-9][0-5][0-9][0-9]{3}$';
return (filename.match(suffixRegex) !== null);
}
/**
* Returns the input filename stripping off any versioned timestamp.
*
* @param {string} filename
* @returns {string} - filename with timestamp removed
*/
function unversionFilename(filename) {
return isFileRenamed(filename)
? filename.split('.').slice(0, -1).join('.')
: filename;
}
exports.unversionFilename = unversionFilename;
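// Examples:
//
//   unversionFilename('file.hdf.v20240101T120000000'); // => 'file.hdf'
//   unversionFilename('file.hdf');                     // => 'file.hdf' (unchanged)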
/**
 * Generates a unique granule ID by appending a truncated MD5 hash of the
 * collection ID (optionally salted with a high-resolution timestamp)
 *
 * @param id - an ID associated with the granule, likely the ID
 * assigned by the granule producer
 * @param collectionId - the api collection ID (name___version) associated with the granule
 * @param hashLength - the length of the hash to append to the granuleId
 * @param includeTimestampHashKey - whether the hashed string should include a timestamp
 * @returns - a unique granule ID in the format: granuleId_hash
 */
function generateUniqueGranuleId(id, collectionId, hashLength, includeTimestampHashKey) {
    // use MD5 to generate a truncated hash of the collection ID, optionally
    // salted with a high-resolution timestamp for per-call uniqueness
    const hashStringWithTimestamp = `${collectionId}_${process.hrtime.bigint().toString()}`;
    const hashString = includeTimestampHashKey ? hashStringWithTimestamp : collectionId;
const hashBuffer = crypto.createHash('md5').update(hashString).digest();
return `${id}_${hashBuffer.toString('base64url').replace(/_/g, '').slice(0, hashLength)}`;
}
exports.generateUniqueGranuleId = generateUniqueGranuleId;
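// Example (hash shown is illustrative; the real value depends on the collection
// ID and, when includeTimestampHashKey is true, the current high-resolution time):
//
//   generateUniqueGranuleId('GRAN.001', 'MOD09GQ___006', 8, true);
//   // => e.g. 'GRAN.001_k3JM9fQx'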
/**
 * Returns a directive on how to act when duplicate files are encountered.
 *
 * @param {Object} event - lambda function event
 * @param {Object} [event.cumulus_config] - cumulus message config; a truthy
 * `cumulus_context.forceDuplicateOverwrite` forces `replace`
 * @param {Object} event.config - the config object
 * @param {string} [event.config.duplicateHandling] - workflow-level duplicateHandling override
 * @param {Object} event.config.collection - collection object
 * @returns {DuplicateHandling} - duplicate handling directive
 */
function duplicateHandlingType(event) {
if (event?.cumulus_config?.cumulus_context?.forceDuplicateOverwrite) {
return 'replace';
}
return event.config.duplicateHandling
?? event.config.collection.duplicateHandling
?? 'error';
}
exports.duplicateHandlingType = duplicateHandlingType;
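// Examples (hypothetical events): the collection-level setting applies unless a
// task-config value or a forced overwrite takes precedence.
//
//   duplicateHandlingType({
//     config: { collection: { duplicateHandling: 'version' } },
//   }); // => 'version'
//
//   duplicateHandlingType({
//     cumulus_config: { cumulus_context: { forceDuplicateOverwrite: true } },
//     config: { collection: {} },
//   }); // => 'replace'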
//# sourceMappingURL=granule.js.map