UNPKG

openrxiv-utils

Version:

Utility functions for bioRxiv operations including URL parsing and DOI handling

259 lines (233 loc) 8.28 kB
/**
 * Unit tests for the URL and DOI helpers exported by ./biorxiv-parser.js.
 *
 * Each nested `describe` covers one exported helper. Table-driven cases use
 * `it.each` with a `%s` placeholder in the title so that every row is reported
 * as its own, individually identifiable test.
 */
import { describe, it, expect } from 'vitest';
import {
  extractDOIFromURL,
  parseDOI,
  extractBaseDOI,
  extractVersion,
  isValidBiorxivDOI,
  isValidBiorxivURL,
  parseBiorxivURL,
} from './biorxiv-parser.js';

describe('BioRxiv URL Parser', () => {
  describe('extractDOIFromURL', () => {
    // Inputs cover content URLs (with and without a trailing view suffix such
    // as `.full` or `.pdf`), a doi.org URL, and a bare DOI string.
    it.each([
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.medrxiv.org/content/10.1101/2020.03.19.20039131v2',
        '10.1101/2020.03.19.20039131v2',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.article-info',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.full',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.abstract',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.pdf',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.suppl',
        '10.1101/2024.01.25.577295v3',
      ],
      ['https://doi.org/10.1101/2024.01.25.577295v3', '10.1101/2024.01.25.577295v3'],
      ['10.1101/2024.01.25.577295v3', '10.1101/2024.01.25.577295v3'],
    ])('should extract DOI from %s', (url, expected) => {
      expect(extractDOIFromURL(url)).toBe(expected);
    });

    it.each([
      ['https://example.com/not-biorxiv', null],
      ['https://biorxiv.org/invalid-path', null],
      ['not-a-url', null],
      ['', null],
      ['https://biorxiv.org/', null],
    ])('should return null for invalid URL %s', (url, expected) => {
      expect(extractDOIFromURL(url)).toBe(expected);
    });
  });

  describe('parseDOI', () => {
    // Note: the expectations below keep the leading "v" in `version`
    // (e.g. 'v3'), whereas the extractVersion tests expect digits only.
    it('should parse DOI with version', () => {
      const doi = '10.1101/2024.01.25.577295v3';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/2024.01.25.577295v3',
        prefix: '10.1101',
        date: '2024-01-25',
        identifier: '577295',
        suffix: '2024.01.25.577295',
        version: 'v3',
      });
    });

    it('should parse DOI without version', () => {
      const doi = '10.1101/2024.01.25.577295';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/2024.01.25.577295',
        prefix: '10.1101',
        date: '2024-01-25',
        identifier: '577295',
        suffix: '2024.01.25.577295',
        version: null,
      });
    });

    it('should parse DOI with zero-padded month and day', () => {
      const doi = '10.1101/2024.01.05.123456v1';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/2024.01.05.123456v1',
        prefix: '10.1101',
        date: '2024-01-05',
        identifier: '123456',
        suffix: '2024.01.05.123456',
        version: 'v1',
      });
    });

    // One test per input so a failure names the offending DOI instead of
    // aborting a shared loop on the first mismatch.
    it.each([
      '10.1000/123.456.789', // different prefix, no date component
      '10.1101/2024.1.25.577295', // month is not zero-padded
      '10.1101/2024.01.25.57729', // five-digit identifier
      '10.1101/2024.01.25.5772955666', // ten-digit identifier
      '10.1101/2024.01.25.577295v', // "v" without a version number
      'invalid-doi',
      '',
    ])('should return null for invalid DOI format %s', (doi) => {
      expect(parseDOI(doi)).toBeNull();
    });
  });

  describe('extractBaseDOI', () => {
    it.each([
      ['10.1101/2024.01.25.577295', '10.1101/2024.01.25.577295'], // Same DOI
      ['10.1101/2024.01.25.577295v3', '10.1101/2024.01.25.577295'], // Remove version
      ['10.1101/2024.01.25.577295v12', '10.1101/2024.01.25.577295'], // Remove double digit version
      ['10.1101/2020.03.19.20039131v2', '10.1101/2020.03.19.20039131'], // medrxiv variant
    ])('should extract base DOI from %s', (doi, expected) => {
      expect(extractBaseDOI(doi)).toBe(expected);
    });
  });

  describe('extractVersion', () => {
    it.each([
      ['10.1101/2024.01.25.577295v3', '3'],
      ['10.1101/2024.01.25.577295', null],
      ['10.1101/2024.01.25.577295v12', '12'],
    ])('should extract version from DOI %s', (doi, expected) => {
      expect(extractVersion(doi)).toBe(expected);
    });
  });

  describe('isValidBiorxivDOI', () => {
    it.each([
      // Valid DOIs
      ['10.1101/2024.01.25.577295v3', true],
      ['10.1101/2024.01.25.577295', true],
      ['10.1101/2020.01.15.123456v2', true],
      ['10.1101/2018.01.15.789012', true],
      ['10.1101/789012', true],
      ['10.1101/789012v12', true],
      ['10.1101/789012v3', true],
      ['10.1101/2020.03.19.20039131v2', true],
      // Invalid DOIs
      ['10.1101/2024.1.25.577295', false],
      ['10.1101/2024.01.25.57729', false],
      ['invalid-doi', false],
      ['10.1101/78901', false],
      ['10.1101/78901v3', false],
      ['', false],
    ])('should validate bioRxiv DOI %s as %s', (doi, expected) => {
      expect(isValidBiorxivDOI(doi)).toBe(expected);
    });
  });

  describe('isValidBiorxivURL', () => {
    it.each([
      // Valid
      ['https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3', true],
      ['https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.article-info', true],
      ['https://doi.org/10.1101/2024.01.25.577295v3', true],
      ['10.1101/2024.01.25.577295v3', true],
      ['https://www.biorxiv.org/content/10.1101/486050v2.article-info', true],
      ['https://www.biorxiv.org/content/10.1101/486050', true],
      // Invalid
      ['https://example.com/not-biorxiv', false],
      ['https://biorxiv.org/invalid-path', false],
      ['10.1000/123.456.789', false],
      ['invalid-url', false],
      ['', false],
    ])('should validate bioRxiv URL %s as %s', (url, expected) => {
      expect(isValidBiorxivURL(url)).toBe(expected);
    });
  });

  describe('parseBiorxivURL', () => {
    it('should parse valid bioRxiv URL', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.article-info';

      expect(parseBiorxivURL(url)).toEqual({
        doi: '10.1101/2024.01.25.577295v3',
        baseDOI: '10.1101/2024.01.25.577295',
        version: '3',
        fullURL: url,
        isValid: true,
      });
    });

    it('should parse URL without version', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295';

      expect(parseBiorxivURL(url)).toEqual({
        doi: '10.1101/2024.01.25.577295',
        baseDOI: '10.1101/2024.01.25.577295',
        version: null,
        fullURL: url,
        isValid: true,
      });
    });

    it('should return null for invalid URL', () => {
      const url = 'https://example.com/not-biorxiv';

      expect(parseBiorxivURL(url)).toBeNull();
    });
  });

  describe('Edge Cases', () => {
    it('should handle DOIs with leading zeros in month/day', () => {
      const doi = '10.1101/2024.01.05.123456v1';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/2024.01.05.123456v1',
        prefix: '10.1101',
        date: '2024-01-05',
        identifier: '123456',
        suffix: '2024.01.05.123456',
        version: 'v1',
      });
    });

    // NOTE(review): despite the title, only a six-digit identifier is
    // exercised here; other accepted lengths for parseDOI are not visible
    // from this file — confirm against the parser before adding cases.
    it('should handle DOIs with different identifier lengths', () => {
      const doi = '10.1101/2024.01.25.123456v1';
      const result = parseDOI(doi);

      expect(result?.identifier).toBe('123456');
    });

    it('should handle URLs with query parameters', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3?query=test';

      expect(extractDOIFromURL(url)).toBe('10.1101/2024.01.25.577295v3');
    });

    it('should handle URLs with fragments', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3#section';

      expect(extractDOIFromURL(url)).toBe('10.1101/2024.01.25.577295v3');
    });
  });
});