UNPKG

openrxiv-utils

Version:

Utility functions for bioRxiv operations including URL parsing and DOI handling

258 lines (228 loc) 7.72 kB
/**
 * Utility functions for determining bioRxiv folder structure
 * based on the date requested.
 *
 * The bioRxiv structure is:
 * - Before late 2018: Files are in Back_Content/Batch_[nn]/ folders
 * - After late 2018: Files are in Current_Content/[Month]_[Year]/ folders
 */

/** Location of one content folder, as produced by {@link getFolderStructure}. */
export interface FolderStructure {
  /** Server the folder belongs to. */
  server: 'biorxiv' | 'medrxiv';
  /** `'back'` for `Back_Content/` batch folders, `'current'` for `Current_Content/` month folders. */
  type: 'current' | 'back';
  /** Folder prefix including the trailing slash, e.g. `Back_Content/Batch_01/`. */
  prefix: string;
  /** Folder name: a normalized batch (`Batch_01`) or a month folder (`January_2019`). */
  batch: string;
}

/** Input for {@link getFolderStructure}; exactly one of `month` or `batch` must be given. */
export interface FolderStructureOptions {
  /** Target server; treated as `'biorxiv'` when omitted. */
  server?: 'biorxiv' | 'medrxiv';
  /** Month as `YYYY-MM` or `Month_YYYY` / `Month-YYYY` (full or three-letter month name). */
  month?: string;
  /** Batch identifier in any format accepted by {@link normalizeBatch}. */
  batch?: string;
}

/** English month names, indexed by month number minus one. */
const MONTH_NAMES = [
  'January',
  'February',
  'March',
  'April',
  'May',
  'June',
  'July',
  'August',
  'September',
  'October',
  'November',
  'December',
] as const;

const MONTHS_PER_YEAR = 12;

// bioRxiv switched from Back_Content to Current_Content in late 2018.
// December 2018 is used as the first Current_Content month to be safe.
const CURRENT_CONTENT_START_ORDINAL = 2018 * MONTHS_PER_YEAR + 12;

/**
 * Normalizes batch input to the standard "Batch_XX" format
 * @param batch - Batch input in various formats (e.g., 1, "1", "batch-1", "Batch_01", "medRxiv_Batch_01")
 * @param server - Server type to determine batch format (e.g., "biorxiv", "medrxiv"); compared case-insensitively
 * @returns `medRxiv_Batch_NN` when `server` is medrxiv, `Batch_NN` otherwise (NN zero-padded to two digits)
 * @throws Error when the input does not describe a positive whole batch number
 */
export function normalizeBatch(batch: string | number, server: string = 'biorxiv'): string {
  const batchNum = typeof batch === 'number' ? batch : parseBatchNumber(batch);

  // isSafeInteger also rejects NaN, Infinity and fractions, which a bare
  // `< 1` comparison lets through (producing names such as "Batch_NaN").
  if (batchNum === null || !Number.isSafeInteger(batchNum) || batchNum < 1) {
    throw new Error(
      `Invalid batch format: ${batch}. Expected a positive number or batch identifier.`,
    );
  }

  const padded = batchNum.toString().padStart(2, '0');
  return server.toLowerCase() === 'medrxiv' ? `medRxiv_Batch_${padded}` : `Batch_${padded}`;
}

/** Extracts the numeric part of a textual batch identifier, or null when there is none. */
function parseBatchNumber(batch: string): number | null {
  const digits = batch
    .trim() // Trim first so surrounding whitespace cannot hide the prefix
    .toLowerCase()
    .replace(/^batch[-_]?/, '') // Remove "batch", "batch-", "batch_"
    .replace(/^medrxiv[-_]?batch[-_]?/, '') // Remove "medrxiv_batch", "medrxiv-batch", etc.
    .replace(/^0+/, '') // Remove leading zeros
    .trim();

  return /^\d+$/.test(digits) ? parseInt(digits, 10) : null;
}

/**
 * Determines the folder structure for a given month or batch
 * @param options - Options containing month or batch (exactly one of them)
 * @returns FolderStructure with the appropriate prefix and type
 * @throws Error when both or neither of month/batch are given, when the month
 *   cannot be parsed, or when the month is before December 2018 (such content
 *   lives in batch folders, so the batch has to be named explicitly)
 */
export function getFolderStructure(options: FolderStructureOptions): FolderStructure {
  const { month, batch } = options;
  const server = options.server || 'biorxiv';

  if (month && batch) {
    throw new Error('Either month or batch must be specified, not both');
  }
  if (!month && !batch) {
    throw new Error('Either month or batch must be specified');
  }

  if (batch) {
    // If batch is specified, use Back_Content structure
    const normalizedBatch = normalizeBatch(batch, server);
    return {
      server,
      type: 'back',
      prefix: `Back_Content/${normalizedBatch}/`,
      batch: normalizedBatch,
    };
  }

  if (!month) {
    // Unreachable after the checks above; kept so the compiler can narrow `month`.
    throw new Error('Invalid folder structure options');
  }

  // Normalize month format to YYYY-MM
  const normalizedMonth = normalizeMonthToYYYYMM(month);
  if (!normalizedMonth) {
    throw new Error(`Invalid month format: ${month}. Expected YYYY-MM or Month_YYYY format.`);
  }

  const year = Number(normalizedMonth.slice(0, 4));
  const monthNum = Number(normalizedMonth.slice(5, 7));

  if (year * MONTHS_PER_YEAR + monthNum < CURRENT_CONTENT_START_ORDINAL) {
    // Back_Content period - but we don't know which batch.
    // User should specify batch explicitly for pre-December-2018 content.
    throw new Error(
      `Date ${month} is in the Back_Content period. Please specify a batch using --batch option. ` +
        `Available batches can be listed with 'biorxiv list' command.`,
    );
  }

  // Use Current_Content structure
  const folderName = `${getMonthName(monthNum)}_${year}`;
  return {
    server,
    type: 'current',
    prefix: `Current_Content/${folderName}/`,
    batch: folderName,
  };
}

/**
 * Removes folders that are identical in server, type, prefix and batch.
 * The first occurrence of each folder is kept and the input order is preserved.
 * @param folders - Folders to de-duplicate (not modified)
 * @returns New array without duplicates
 */
export function removeDuplicateFolders(folders: FolderStructure[]): FolderStructure[] {
  const seen = new Set<string>();
  return folders.filter((folder) => {
    // JSON-encode the tuple so field boundaries cannot blur between entries.
    const key = JSON.stringify([folder.server, folder.type, folder.prefix, folder.batch]);
    if (seen.has(key)) {
      return false;
    }
    seen.add(key);
    return true;
  });
}

/**
 * Sort folders chronologically, putting batches before months.
 *
 * Batches are ordered by ascending batch number and months from oldest to
 * newest. Entries whose batch name cannot be interpreted are placed after the
 * interpretable ones of the same type.
 *
 * Note: the input array is sorted in place and the same array is returned.
 * @param folders - Folders to sort
 * @returns The same array instance, sorted
 */
export function sortFoldersChronologically(folders: FolderStructure[]): FolderStructure[] {
  return folders.sort((a, b) => {
    // Put batches before months
    if (a.type !== b.type) {
      return a.type === 'back' ? -1 : 1;
    }

    return a.type === 'back'
      ? compareUnknownLast(extractBatchNumber(a.batch), extractBatchNumber(b.batch))
      : compareUnknownLast(toMonthOrdinal(a.batch), toMonthOrdinal(b.batch));
  });
}

/** Ascending numeric comparison where null (unknown) values sort after known ones. */
function compareUnknownLast(a: number | null, b: number | null): number {
  if (a === null || b === null) {
    return (a === null ? 1 : 0) - (b === null ? 1 : 0);
  }
  return a - b;
}

/** Reads the digits of a batch name as a base-10 number, or null when it has none. */
function extractBatchNumber(batch: string): number | null {
  const value = parseInt(batch.replace(/\D/g, ''), 10);
  return Number.isNaN(value) ? null : value;
}

/**
 * Converts a month string to a number that grows by one per calendar month.
 * Parsing is explicit rather than via `new Date(string)`, whose handling of
 * names like "January_2019" is implementation-defined.
 */
function toMonthOrdinal(month: string): number | null {
  const normalized = normalizeMonthToYYYYMM(month);
  if (normalized === null) {
    return null;
  }
  return Number(normalized.slice(0, 4)) * MONTHS_PER_YEAR + Number(normalized.slice(5, 7));
}

/**
 * Normalizes various month formats to YYYY-MM
 * @param month - Month as "YYYY-MM", or a month name and year joined by "_" or "-"
 *   (e.g., "November_2018", "nov-2018")
 * @returns Normalized YYYY-MM format or null if invalid
 */
export function normalizeMonthToYYYYMM(month: string): string | null {
  // Already in YYYY-MM format
  if (/^\d{4}-\d{2}$/.test(month)) {
    const monthNum = Number(month.slice(5));
    return monthNum >= 1 && monthNum <= MONTHS_PER_YEAR ? month : null;
  }

  // Month_YYYY format (e.g., "November_2018")
  const monthYearMatch = /^([A-Za-z]+)[-_](\d{4})$/.exec(month);
  if (!monthYearMatch) {
    return null;
  }

  const [, monthName, year] = monthYearMatch;
  if (monthName === undefined || year === undefined) {
    return null;
  }

  const monthNum = getMonthNumber(monthName);
  return monthNum === null ? null : `${year}-${monthNum.toString().padStart(2, '0')}`;
}

/**
 * Gets month number from month name
 * @param monthName - Full month name or its first three letters (case insensitive)
 * @returns Month number (1-12) or null if invalid
 */
function getMonthNumber(monthName: string): number | null {
  const wanted = monthName.toLowerCase();
  const monthIndex = MONTH_NAMES.findIndex((name) => {
    const full = name.toLowerCase();
    return full === wanted || full.slice(0, 3) === wanted;
  });
  return monthIndex === -1 ? null : monthIndex + 1;
}

/**
 * Gets month name from month number
 * @param monthNum - Month number (1-12)
 * @returns Month name (e.g., "January")
 * @throws Error when monthNum is not a whole number between 1 and 12
 */
function getMonthName(monthNum: number): string {
  const name = MONTH_NAMES[monthNum - 1];
  // A fractional or out-of-range number indexes nothing, so `name` is undefined.
  if (name === undefined) {
    throw new Error(`Invalid month number: ${monthNum}. Must be 1-12.`);
  }
  return name;
}