openrxiv-cli
Version:
CLI tool to download openRxiv MECA files from AWS S3 for text and data mining
221 lines (220 loc) • 9.21 kB
JavaScript
import { ListObjectsV2Command, HeadObjectCommand } from '@aws-sdk/client-s3';
import chalk from 'chalk';
import { getS3Client } from './config.js';
import { getFolderStructure } from 'openrxiv-utils';
import { getDefaultServer } from '../utils/default-server.js';
/**
* Get the S3 bucket name based on the server
*/
export function getBucketName(server = getDefaultServer()) {
switch (server.toLowerCase()) {
case 'medrxiv':
return 'medrxiv-src-monthly';
case 'biorxiv':
return 'biorxiv-src-monthly';
default:
console.error(`❌ Error: Invalid server ${server}, must be "biorxiv" or "medrxiv"`);
process.exit(1);
}
}
export async function listBucketContent(options) {
const client = await getS3Client();
const { month, batch, limit = 50, server = getDefaultServer() } = options;
const bucketName = getBucketName(server);
console.log(chalk.blue(`Listing ${server} bucket content...`));
console.log(chalk.blue('===================================='));
try {
// If no month or batch specified, show the available content structure
if (!month && !batch) {
await listFolder(client, server);
return;
}
let prefix = '';
let folder = null;
if (month || batch) {
// Use folder structure utility to determine the correct prefix
folder = getFolderStructure({ month, batch, server });
prefix = folder.prefix;
console.log(chalk.gray(`🔍 Content Type: ${folder.type === 'current' ? 'Current Content' : 'Back Content'}`));
if (folder.batch) {
console.log(chalk.gray(`🔍 Batch: ${folder.batch}`));
}
}
const commandOptions = {
Bucket: bucketName,
Prefix: prefix,
MaxKeys: parseInt(limit.toString()),
RequestPayer: 'requester',
};
const command = new ListObjectsV2Command(commandOptions);
const response = await client.send(command);
if (!response.Contents || response.Contents.length === 0) {
console.log(chalk.yellow('No content found'));
return;
}
console.log(chalk.green(`Found ${response.Contents.length} items:`));
console.log('');
for (const item of response.Contents) {
if (!item.Key)
continue;
const type = getContentType(item.Key);
const size = formatFileSize(item.Size || 0);
const date = item.LastModified ? item.LastModified.toLocaleDateString() : 'Unknown';
console.log(`${chalk.cyan(item.Key)}`);
console.log(` Type: ${chalk.yellow(type)} | Size: ${chalk.blue(size)} | Modified: ${chalk.gray(date)}`);
console.log('');
}
}
catch (error) {
if (error instanceof Error) {
throw new Error(`Failed to list bucket content: ${error.message}`);
}
throw error;
}
}
/**
* Lists the available content structure in the specified server bucket
* Shows available months and batches
*/
async function listFolder(client, server = getDefaultServer()) {
console.log(chalk.cyan('📁 Available Content Structure'));
console.log(chalk.cyan('=============================='));
console.log('');
try {
// List Current_Content folders (monthly content)
console.log(chalk.blue('📅 Current Content (Monthly):'));
console.log(chalk.gray(' Recent content organized by month'));
console.log('');
const bucketName = getBucketName(server);
const currentContentCommand = new ListObjectsV2Command({
Bucket: bucketName,
Prefix: 'Current_Content/',
Delimiter: '/',
MaxKeys: 1000,
RequestPayer: 'requester',
});
const currentResponse = await client.send(currentContentCommand);
if (currentResponse.CommonPrefixes && currentResponse.CommonPrefixes.length > 0) {
const months = currentResponse.CommonPrefixes.map((prefix) => { var _a; return (_a = prefix.Prefix) === null || _a === void 0 ? void 0 : _a.replace('Current_Content/', '').replace('/', ''); })
.filter(Boolean)
.sort((a, b) => {
// Sort by year first, then by month
const [monthA, yearA] = a.split('_');
const [monthB, yearB] = b.split('_');
if (yearA !== yearB)
return parseInt(yearB) - parseInt(yearA); // Newest year first
const monthOrder = [
'January',
'February',
'March',
'April',
'May',
'June',
'July',
'August',
'September',
'October',
'November',
'December',
];
return monthOrder.indexOf(monthB) - monthOrder.indexOf(monthA);
});
for (const month of months) {
console.log(` ${chalk.green('📁')} ${chalk.cyan(month)}`);
}
}
else {
console.log(chalk.gray(' No monthly content found'));
}
console.log('');
// List Back_Content batches
console.log(chalk.blue('📦 Back Content (Historical Batches):'));
console.log(chalk.gray(' Legacy content organized in batches'));
console.log('');
const backContentCommand = new ListObjectsV2Command({
Bucket: bucketName,
Prefix: 'Back_Content/',
Delimiter: '/',
MaxKeys: 1000,
RequestPayer: 'requester',
});
const backResponse = await client.send(backContentCommand);
if (backResponse.CommonPrefixes && backResponse.CommonPrefixes.length > 0) {
const batches = backResponse.CommonPrefixes.map((prefix) => { var _a; return (_a = prefix.Prefix) === null || _a === void 0 ? void 0 : _a.replace('Back_Content/', '').replace('/', ''); })
.filter(Boolean)
.sort();
for (const batch of batches) {
console.log(` ${chalk.green('📁')} ${chalk.cyan(batch)}`);
}
}
else {
console.log(chalk.gray(' No historical batches found'));
}
console.log('');
console.log(chalk.blue('💡 Usage Examples:'));
console.log(chalk.gray(` List specific month: ${server} list --month 2024-01`));
console.log(chalk.gray(` List specific batch: ${server} list --batch Batch_01`));
console.log(chalk.gray(` List with limit: ${server} list --month 2024-01 --limit 100`));
console.log('');
}
catch (error) {
if (error instanceof Error) {
console.log(chalk.yellow(`⚠️ Warning: Could not fetch content structure: ${error.message}`));
console.log(chalk.gray(' This may be due to AWS permissions or network issues'));
console.log('');
}
}
}
export async function getContentInfo(path, options = {}) {
var _a;
const client = await getS3Client();
const { detailed = false, server = getDefaultServer() } = options;
const bucketName = getBucketName(server);
console.log(chalk.blue(`Getting info for: ${path}`));
console.log(chalk.blue('=============================='));
try {
const commandOptions = {
Bucket: bucketName,
Key: path,
RequestPayer: 'requester',
};
const command = new HeadObjectCommand(commandOptions);
const response = await client.send(command);
console.log(chalk.green('✓ Content found'));
console.log('');
console.log(`Key: ${chalk.cyan(path)}`);
console.log(`Size: ${chalk.blue(formatFileSize(response.ContentLength || 0))}`);
console.log(`Type: ${chalk.yellow(response.ContentType || 'Unknown')}`);
console.log(`Last Modified: ${chalk.gray(((_a = response.LastModified) === null || _a === void 0 ? void 0 : _a.toLocaleString()) || 'Unknown')}`);
if (detailed && response.Metadata) {
console.log('');
console.log(chalk.blue('Metadata:'));
for (const [key, value] of Object.entries(response.Metadata)) {
console.log(` ${key}: ${value}`);
}
}
}
catch (error) {
if (error instanceof Error) {
throw new Error(`Failed to get content info: ${error.message}`);
}
throw error;
}
}
function getContentType(key) {
if (key.endsWith('.meca'))
return 'meca';
if (key.endsWith('.pdf'))
return 'pdf';
if (key.endsWith('.xml'))
return 'xml';
return 'other';
}
function formatFileSize(bytes) {
if (bytes === 0)
return '0 B';
const k = 1024;
const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
const i = Math.floor(Math.log(bytes) / Math.log(k));
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}