UNPKG

apr144-bam

Version:

Parser for BAM and BAM index (bai) files

1,124 lines (1,076 loc) 35.1 kB
import { Buffer } from 'buffer'
import crc32 from 'buffer-crc32'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle'
import AbortablePromiseCache from 'abortable-promise-cache'
import QuickLRU from 'quick-lru'

// locals
import BAI from './bai'
import CSI from './csi'
import Chunk from './chunk'
import BAMFeature from './record'
import { parseHeaderText } from './sam'
import { checkAbortSignal, timeout, makeOpts, BamOpts, BaseOpts } from './util'

/** Value expected in the first little-endian int32 of the decompressed file. */
export const BAM_MAGIC = 21840194

// 65536; added as padding when sizing the buffers used for header reads
// (presumably the maximum size of one BGZF block -- confirm against the spec)
const blockLen = 1 << 16

/**
 * Built-in reference sequence table used when `assemblyName === 'hg38'`.
 * Each entry is `[refName, length]`; the position of an entry in this array is
 * the numeric reference id assigned to it.
 */
const HG38_REF_SEQS: readonly (readonly [string, number])[] = [
  ['chr1', 248956422], ['chr2', 242193529], ['chr3', 198295559],
  ['chr4', 190214555], ['chr5', 181538259], ['chr6', 170805979],
  ['chr7', 159345973], ['chr8', 145138636], ['chr9', 138394717],
  ['chr10', 133797422], ['chr11', 135086622], ['chr12', 133275309],
  ['chr13', 114364328], ['chr14', 107043718], ['chr15', 101991189],
  ['chr16', 90338345], ['chr17', 83257441], ['chr18', 80373285],
  ['chr19', 58617616], ['chr20', 64444167], ['chr21', 46709983],
  ['chr22', 50818468], ['chrX', 156040895], ['chrY', 57227415],
  ['chrM', 16569],
  ['GL000008.2', 209709], ['GL000009.2', 201709], ['GL000194.1', 191469],
  ['GL000195.1', 182896], ['GL000205.2', 185591], ['GL000208.1', 92689],
  ['GL000213.1', 164239], ['GL000214.1', 137718], ['GL000216.2', 176608],
  ['GL000218.1', 161147], ['GL000219.1', 179198], ['GL000220.1', 161802],
  ['GL000221.1', 155397], ['GL000224.1', 179693], ['GL000225.1', 211173],
  ['GL000226.1', 15008],
  ['KI270302.1', 2274], ['KI270303.1', 1942], ['KI270304.1', 2165],
  ['KI270305.1', 1472], ['KI270310.1', 1201], ['KI270311.1', 12399],
  ['KI270312.1', 998], ['KI270315.1', 2276], ['KI270316.1', 1444],
  ['KI270317.1', 37690], ['KI270320.1', 4416], ['KI270322.1', 21476],
  ['KI270329.1', 1040], ['KI270330.1', 1652], ['KI270333.1', 2699],
  ['KI270334.1', 1368], ['KI270335.1', 1048], ['KI270336.1', 1026],
  ['KI270337.1', 1121], ['KI270338.1', 1428], ['KI270340.1', 1428],
  ['KI270362.1', 3530], ['KI270363.1', 1803], ['KI270364.1', 2855],
  ['KI270366.1', 8320], ['KI270371.1', 2805], ['KI270372.1', 1650],
  ['KI270373.1', 1451], ['KI270374.1', 2656], ['KI270375.1', 2378],
  ['KI270376.1', 1136], ['KI270378.1', 1048], ['KI270379.1', 1045],
  ['KI270381.1', 1930], ['KI270382.1', 4215], ['KI270383.1', 1750],
  ['KI270384.1', 1658], ['KI270385.1', 990], ['KI270386.1', 1788],
  ['KI270387.1', 1537], ['KI270388.1', 1216], ['KI270389.1', 1298],
  ['KI270390.1', 2387], ['KI270391.1', 1484], ['KI270392.1', 971],
  ['KI270393.1', 1308], ['KI270394.1', 970], ['KI270395.1', 1143],
  ['KI270396.1', 1880], ['KI270411.1', 2646], ['KI270412.1', 1179],
  ['KI270414.1', 2489], ['KI270417.1', 2043], ['KI270418.1', 2145],
  ['KI270419.1', 1029], ['KI270420.1', 2321], ['KI270422.1', 1445],
  ['KI270423.1', 981], ['KI270424.1', 2140], ['KI270425.1', 1884],
  ['KI270429.1', 1361], ['KI270435.1', 92983], ['KI270438.1', 112505],
  ['KI270442.1', 392061], ['KI270448.1', 7992], ['KI270465.1', 1774],
  ['KI270466.1', 1233], ['KI270467.1', 3920], ['KI270468.1', 4055],
  ['KI270507.1', 5353], ['KI270508.1', 1951], ['KI270509.1', 2318],
  ['KI270510.1', 2415], ['KI270511.1', 8127], ['KI270512.1', 22689],
  ['KI270515.1', 6361], ['KI270516.1', 1300], ['KI270517.1', 3253],
  ['KI270518.1', 2186], ['KI270519.1', 138126], ['KI270521.1', 7642],
  ['KI270522.1', 5674], ['KI270528.1', 2983], ['KI270529.1', 1899],
  ['KI270530.1', 2168], ['KI270538.1', 91309], ['KI270539.1', 993],
  ['KI270544.1', 1202], ['KI270548.1', 1599], ['KI270579.1', 31033],
  ['KI270580.1', 1553], ['KI270581.1', 7046], ['KI270582.1', 6504],
  ['KI270583.1', 1400], ['KI270584.1', 4513], ['KI270587.1', 2969],
  ['KI270588.1', 6158], ['KI270589.1', 44474], ['KI270590.1', 4685],
  ['KI270591.1', 5796], ['KI270593.1', 3041], ['KI270706.1', 175055],
  ['KI270707.1', 32032], ['KI270708.1', 127682], ['KI270709.1', 66860],
  ['KI270710.1', 40176], ['KI270711.1', 42210], ['KI270712.1', 176043],
  ['KI270713.1', 40745], ['KI270714.1', 41717], ['KI270715.1', 161471],
  ['KI270716.1', 153799], ['KI270717.1', 40062], ['KI270718.1', 38054],
  ['KI270719.1', 176845], ['KI270720.1', 39050], ['KI270721.1', 100316],
  ['KI270722.1', 194050], ['KI270723.1', 38115], ['KI270724.1', 39555],
  ['KI270725.1', 172810], ['KI270726.1', 43739], ['KI270727.1', 448248],
  ['KI270728.1', 1872759], ['KI270729.1', 280839], ['KI270730.1', 112551],
  ['KI270731.1', 150754], ['KI270732.1', 41543], ['KI270733.1', 179772],
  ['KI270734.1', 165050], ['KI270735.1', 42811], ['KI270736.1', 181920],
  ['KI270737.1', 103838], ['KI270738.1', 99375], ['KI270739.1', 73985],
  ['KI270740.1', 37240], ['KI270741.1', 157432], ['KI270742.1', 186739],
  ['KI270743.1', 210658], ['KI270744.1', 168472], ['KI270745.1', 41891],
  ['KI270746.1', 66486], ['KI270747.1', 198735], ['KI270748.1', 93321],
  ['KI270749.1', 158759], ['KI270750.1', 148850], ['KI270751.1', 150742],
  ['KI270752.1', 27745], ['KI270753.1', 62944], ['KI270754.1', 40191],
  ['KI270755.1', 36723], ['KI270756.1', 79590], ['KI270757.1', 71251],
]

/**
 * Builds fresh `chrToIndex` / `indexToChr` lookup tables from HG38_REF_SEQS.
 * New objects are created on every call so instances never share mutable state.
 */
function hg38RefSeqTables(): {
  chrToIndex: Record<string, number>
  indexToChr: { refName: string; length: number }[]
} {
  const chrToIndex: Record<string, number> = {}
  const indexToChr: { refName: string; length: number }[] = []
  for (const [refName, length] of HG38_REF_SEQS) {
    chrToIndex[refName] = indexToChr.length
    indexToChr.push({ refName, length })
  }
  return { chrToIndex, indexToChr }
}

/**
 * Percent-decodes one URL userinfo component. `URL.username` / `URL.password`
 * return percent-encoded text; if the text is not valid percent-encoding the
 * raw value is returned unchanged instead of throwing.
 */
function decodeUserinfo(value: string): string {
  try {
    return decodeURIComponent(value)
  } catch {
    return value
  }
}

/**
 * Opens a RemoteFile for `url`, optionally appending `suffix` to the path.
 *
 * When the URL embeds both a username and a password, they are removed from
 * the URL that is actually requested and sent as an HTTP Basic `Authorization`
 * header instead; `suffix` is then inserted between the path and the query
 * string. Otherwise (no credentials, or a URL that `new URL` cannot parse,
 * e.g. a relative one) the file is opened at `url + suffix` unchanged.
 *
 * @param url - location of the remote resource
 * @param suffix - text appended to the resource path (e.g. `'.bai'`)
 */
function openRemoteFile(url: string, suffix = ''): RemoteFile {
  let parsed: URL | undefined
  try {
    parsed = new URL(url)
  } catch {
    // relative / unparsable URLs cannot carry credentials; pass them through
    parsed = undefined
  }
  if (parsed && parsed.username && parsed.password) {
    const bareUrl = `${parsed.protocol}//${parsed.host}${parsed.pathname}${suffix}${parsed.search}`
    const userPass = `${decodeUserinfo(parsed.username)}:${decodeUserinfo(parsed.password)}`
    return new RemoteFile(bareUrl, {
      overrides: {
        credentials: 'include',
        headers: {
          // Buffer (rather than btoa) so non-Latin1 credentials do not throw
          Authorization:
            'Basic ' + Buffer.from(userPass, 'utf8').toString('base64'),
        },
      },
    })
  }
  return new RemoteFile(`${url}${suffix}`)
}

/** Drains an async iterable of arrays into one flat array. */
async function gen2array<T>(gen: AsyncIterable<T[]>): Promise<T[]> {
  const out: T[] = []
  for await (const batch of gen) {
    // push in place; repeated concat() re-copies everything collected so far
    for (const item of batch) {
      out.push(item)
    }
  }
  return out
}

/** Argument bundle passed to the feature cache's fill callback. */
interface Args {
  chunk: Chunk
  opts: BaseOpts
}

/**
 * Placeholder filehandle assigned to `bam` in htsget mode, where no BAM source
 * was supplied. Every method throws.
 */
class NullFilehandle {
  public read(): Promise<any> {
    throw new Error('never called')
  }
  public stat(): Promise<any> {
    throw new Error('never called')
  }

  public readFile(): Promise<any> {
    throw new Error('never called')
  }

  public close(): Promise<any> {
    throw new Error('never called')
  }
}

/**
 * Reader for a BAM file and its BAI or CSI index.
 */
export default class BamFile {
  public renameRefSeq: (a: string) => string
  public bam: GenericFilehandle
  public header?: string
  public chrToIndex?: Record<string, number>
  public indexToChr?: { refName: string; length: number }[]
  public yieldThreadTime: number
  public index?: BAI | CSI
  public htsget = false
  public headerP?: ReturnType<BamFile['getHeaderPre']>

  // caches the parsed features of up to 50 chunks, keyed by chunk.toString()
  private featureCache = new AbortablePromiseCache<Args, BAMFeature[]>({
    cache: new QuickLRU({
      maxSize: 50,
    }),
    fill: async (args: Args, signal) => {
      const { chunk, opts } = args
      const { data, cpositions, dpositions } = await this._readChunk({
        chunk,
        opts: { ...opts, signal },
      })
      return this.readBamFeatures(data, cpositions, dpositions, chunk)
    },
  })

  /**
   * The BAM source is taken from the first of `bamFilehandle`, `bamPath`,
   * `bamUrl` that is set (or a throwing placeholder when only `htsget` is
   * set). The index is taken from the first of `csiFilehandle`, `csiPath`,
   * `csiUrl`, `baiFilehandle`, `baiPath`, `baiUrl` that is set, otherwise it
   * is inferred as `<bamPath>.bai` / `<bamUrl>.bai`.
   *
   * URLs of the form `scheme://user:password@host/...` are requested without
   * the userinfo part and with an HTTP Basic `Authorization` header instead.
   *
   * @throws Error if no BAM source can be determined
   * @throws Error if no index can be determined or inferred
   */
  constructor({
    bamFilehandle,
    bamPath,
    bamUrl,
    baiPath,
    baiFilehandle,
    baiUrl,
    csiPath,
    csiFilehandle,
    csiUrl,
    htsget,
    yieldThreadTime = 100,
    renameRefSeqs = n => n,
  }: {
    bamFilehandle?: GenericFilehandle
    bamPath?: string
    bamUrl?: string
    baiPath?: string
    baiFilehandle?: GenericFilehandle
    baiUrl?: string
    csiPath?: string
    csiFilehandle?: GenericFilehandle
    csiUrl?: string
    renameRefSeqs?: (a: string) => string
    yieldThreadTime?: number
    htsget?: boolean
  }) {
    this.renameRefSeq = renameRefSeqs

    if (bamFilehandle) {
      this.bam = bamFilehandle
    } else if (bamPath) {
      this.bam = new LocalFile(bamPath)
    } else if (bamUrl) {
      // NOTE: bamUrl itself must stay untouched here; the index inference
      // below needs the original URL including any embedded credentials
      this.bam = openRemoteFile(bamUrl)
    } else if (htsget) {
      this.htsget = true
      this.bam = new NullFilehandle()
    } else {
      throw new Error('unable to initialize bam')
    }

    if (csiFilehandle) {
      this.index = new CSI({ filehandle: csiFilehandle })
    } else if (csiPath) {
      this.index = new CSI({ filehandle: new LocalFile(csiPath) })
    } else if (csiUrl) {
      this.index = new CSI({ filehandle: openRemoteFile(csiUrl) })
    } else if (baiFilehandle) {
      this.index = new BAI({ filehandle: baiFilehandle })
    } else if (baiPath) {
      this.index = new BAI({ filehandle: new LocalFile(baiPath) })
    } else if (baiUrl) {
      this.index = new BAI({ filehandle: openRemoteFile(baiUrl) })
    } else if (bamPath) {
      this.index = new BAI({ filehandle: new LocalFile(`${bamPath}.bai`) })
    } else if (bamUrl) {
      // same credentials as the BAM itself, '.bai' appended to the path
      this.index = new BAI({ filehandle: openRemoteFile(bamUrl, '.bai') })
    } else if (htsget) {
      this.htsget = true
    } else {
      throw new Error('unable to infer index format')
    }
    this.yieldThreadTime = yieldThreadTime
  }

  /**
   * Reads and parses the BAM header; use {@link getHeader} for the cached
   * variant.
   *
   * When `opts.assemblyName` is `'hg38'`, `chrToIndex` / `indexToChr` are first
   * filled from a built-in table (synchronously, before any I/O). If an index
   * is present the header is then read from the file and both tables are
   * replaced by the values found there.
   *
   * @returns the parsed header text, or `undefined` when there is no index
   * @throws Error when nothing could be read or the magic number is wrong
   */
  async getHeaderPre(origOpts?: BaseOpts) {
    const opts = makeOpts(origOpts)

    if (opts.assemblyName && opts.assemblyName === 'hg38') {
      const { chrToIndex, indexToChr } = hg38RefSeqTables()
      this.chrToIndex = chrToIndex
      this.indexToChr = indexToChr
    }

    if (!this.index) {
      return
    }
    const indexData = await this.index.parse(opts)
    // read up to (and a little past) the first data line if the index knows
    // where it is, otherwise fall back to reading the whole file
    const ret = indexData.firstDataLine
      ? indexData.firstDataLine.blockPosition + 65535
      : undefined
    let buffer
    if (ret) {
      const s = ret + blockLen
      const res = await this.bam.read(Buffer.alloc(s), 0, s, 0, opts)
      if (!res.bytesRead) {
        throw new Error('Error reading header')
      }
      buffer = res.buffer.subarray(0, Math.min(res.bytesRead, ret))
    } else {
      buffer = await this.bam.readFile(opts)
    }

    const uncba = await unzip(buffer)

    if (uncba.readInt32LE(0) !== BAM_MAGIC) {
      throw new Error('Not a BAM file')
    }
    const headLen = uncba.readInt32LE(4)

    this.header = uncba.toString('utf8', 8, 8 + headLen)
    const { chrToIndex, indexToChr } = await this._readRefSeqs(
      headLen + 8,
      65535,
      opts,
    )
    this.chrToIndex = chrToIndex
    this.indexToChr = indexToChr

    return parseHeaderText(this.header)
  }

  /**
   * Cached wrapper around {@link getHeaderPre}. The in-flight/settled promise
   * is reused by later calls; a rejected promise is discarded so the next
   * call retries.
   */
  getHeader(opts?: BaseOpts) {
    if (!this.headerP) {
      this.headerP = this.getHeaderPre(opts).catch(e => {
        this.headerP = undefined
        throw e
      })
    }
    return this.headerP
  }

  /** Loads the header (if needed) and returns its raw text. */
  async getHeaderText(opts: BaseOpts = {}) {
    await this.getHeader(opts)
    return this.header
  }

  /**
   * Parses the reference sequence list that follows the header text.
   *
   * The full length of the refseq block is not given in advance, so this
   * grabs `refSeqBytes` bytes from the start of the file and retries with
   * twice as many if that did not cover all reference sequences.
   *
   * @param start - offset of the reference list in the decompressed data
   * @param refSeqBytes - number of compressed bytes to fetch
   * @throws Error when nothing could be read
   */
  async _readRefSeqs(
    start: number,
    refSeqBytes: number,
    opts?: BaseOpts,
  ): Promise<{
    chrToIndex: Record<string, number>
    indexToChr: { refName: string; length: number }[]
  }> {
    if (start > refSeqBytes) {
      return this._readRefSeqs(start, refSeqBytes * 2, opts)
    }
    const size = refSeqBytes + blockLen
    const { bytesRead, buffer } = await this.bam.read(
      Buffer.alloc(size),
      0,
      refSeqBytes,
      0,
      opts,
    )
    if (!bytesRead) {
      throw new Error('Error reading refseqs from header')
    }
    const uncba = await unzip(
      buffer.subarray(0, Math.min(bytesRead, refSeqBytes)),
    )
    const nRef = uncba.readInt32LE(start)
    let p = start + 4
    const chrToIndex: Record<string, number> = {}
    const indexToChr: { refName: string; length: number }[] = []
    for (let i = 0; i < nRef; i += 1) {
      const lName = uncba.readInt32LE(p)
      // lName - 1: the last byte of the name is excluded from the string
      // (presumably a NUL terminator -- confirm against the BAM spec)
      const refName = this.renameRefSeq(
        uncba.toString('utf8', p + 4, p + 4 + lName - 1),
      )
      const lRef = uncba.readInt32LE(p + lName + 4)

      chrToIndex[refName] = i
      indexToChr.push({ refName, length: lRef })

      p = p + 8 + lName
      if (p > uncba.length) {
        console.warn(
          `BAM header is very big. Re-fetching ${refSeqBytes} bytes.`,
        )
        return this._readRefSeqs(start, refSeqBytes * 2, opts)
      }
    }
    return { chrToIndex, indexToChr }
  }

  /**
   * Collects everything produced by {@link streamRecordsForRange} into one
   * array.
   */
  async getRecordsForRange(
    chr: string,
    min: number,
    max: number,
    opts?: BamOpts,
  ) {
    return gen2array(this.streamRecordsForRange(chr, min, max, opts))
  }

  /**
   * Yields batches of records for `chr` between `min` and `max`.
   *
   * The header is loaded first when the reference name table is not yet
   * available, or when an assembly other than `'hg38'` was requested. Yields a
   * single empty batch when `chr` is unknown or there is no index.
   */
  async *streamRecordsForRange(
    chr: string,
    min: number,
    max: number,
    opts?: BamOpts,
  ) {
    // Without a populated chrToIndex every lookup below would silently miss,
    // so the header must be loaded in that case regardless of assemblyName.
    const needsHeader =
      this.chrToIndex === undefined ||
      (opts?.assemblyName && opts?.assemblyName !== 'hg38')
    if (needsHeader) {
      await this.getHeader(opts)
    }
    const chrId = this.chrToIndex?.[chr]
    if (chrId === undefined || !this.index) {
      yield []
    } else {
      const chunks = await this.index.blocksForRange(chrId, min - 1, max, opts)
      yield* this._fetchChunkFeatures(chunks, chrId, min, max, opts)
    }
  }

  /**
   * Reads each chunk (through the feature cache) and yields, per chunk, the
   * records on `chrId` whose `end >= min`, stopping at the first record whose
   * `start >= max`. With `opts.viewAsPairs` one extra batch containing the
   * mates found by {@link fetchPairs} is yielded at the end.
   */
  async *_fetchChunkFeatures(
    chunks: Chunk[],
    chrId: number,
    min: number,
    max: number,
    opts: BamOpts = {},
  ) {
    const { viewAsPairs } = opts
    const feats = [] as BAMFeature[][]
    let done = false

    for (const chunk of chunks) {
      const records = await this.featureCache.get(
        chunk.toString(),
        { chunk, opts },
        opts.signal,
      )

      const recs = [] as BAMFeature[]
      for (const feature of records) {
        if (feature.seq_id() === chrId) {
          if (feature.get('start') >= max) {
            // past end of range, can stop iterating
            done = true
            break
          } else if (feature.get('end') >= min) {
            // must be in range
            recs.push(feature)
          }
        }
      }
      feats.push(recs)
      yield recs
      if (done) {
        break
      }
    }

    checkAbortSignal(opts.signal)
    if (viewAsPairs) {
      yield this.fetchPairs(chrId, feats, opts)
    }
  }

  /**
   * Finds mates for reads whose name occurs exactly once within a batch of
   * `feats`.
   *
   * A mate is looked up when `opts.pairAcrossChr` is set, or when it lies on
   * `chrId` within `opts.maxInsertSize` (default 200000) of the read's start.
   * Records whose id is already present in `feats` are not returned again.
   */
  async fetchPairs(chrId: number, feats: BAMFeature[][], opts: BamOpts) {
    const { pairAcrossChr, maxInsertSize = 200000 } = opts
    const unmatedPairs: Record<string, boolean> = {}
    const readIds: Record<string, number> = {}

    // pass 1: note every record id, and every read name seen once in a batch
    for (const ret of feats) {
      const readNames: Record<string, number> = {}
      for (const element of ret) {
        const name = element.name()
        const id = element.id()
        if (!readNames[name]) {
          readNames[name] = 0
        }
        readNames[name]++
        readIds[id] = 1
      }
      for (const [k, v] of Object.entries(readNames)) {
        if (v === 1) {
          unmatedPairs[k] = true
        }
      }
    }

    // pass 2: ask the index where each unmated read's mate lives
    const matePromises: Promise<Chunk[]>[] = []
    for (const ret of feats) {
      for (const f of ret) {
        const name = f.name()
        const start = f.get('start')
        const pnext = f._next_pos()
        const rnext = f._next_refid()
        if (
          this.index &&
          unmatedPairs[name] &&
          (pairAcrossChr ||
            (rnext === chrId && Math.abs(start - pnext) < maxInsertSize))
        ) {
          matePromises.push(
            this.index.blocksForRange(rnext, pnext, pnext + 1, opts),
          )
        }
      }
    }

    // filter out duplicate chunks (the blocks are lists of chunks, blocks are
    // concatenated, then filter dup chunks)
    const map = new Map<string, Chunk>()
    const res = await Promise.all(matePromises)
    for (const m of res.flat()) {
      const key = m.toString()
      if (!map.has(key)) {
        map.set(key, m)
      }
    }

    const mateFeatPromises = await Promise.all(
      [...map.values()].map(async c => {
        const { data, cpositions, dpositions, chunk } = await this._readChunk({
          chunk: c,
          opts,
        })
        const mateRecs = [] as BAMFeature[]
        for (const feature of await this.readBamFeatures(
          data,
          cpositions,
          dpositions,
          chunk,
        )) {
          if (unmatedPairs[feature.get('name')] && !readIds[feature.id()]) {
            mateRecs.push(feature)
          }
        }
        return mateRecs
      }),
    )
    return mateFeatPromises.flat()
  }

  /**
   * Reads up to `size` bytes at `position` from the BAM filehandle and returns
   * only the bytes actually read.
   */
  async _readRegion(position: number, size: number, opts: BaseOpts = {}) {
    const { bytesRead, buffer } = await this.bam.read(
      Buffer.alloc(size),
      0,
      size,
      position,
      opts,
    )

    return buffer.subarray(0, Math.min(bytesRead, size))
  }

  /**
   * Fetches the compressed bytes of `chunk` and decompresses them.
   *
   * @returns the decompressed data together with the `cpositions` /
   * `dpositions` arrays reported by `unzipChunkSlice`, and the chunk itself
   */
  async _readChunk({ chunk, opts }: { chunk: Chunk; opts: BaseOpts }) {
    const buffer = await this._readRegion(
      chunk.minv.blockPosition,
      chunk.fetchedSize(),
      opts,
    )

    const {
      buffer: data,
      cpositions,
      dpositions,
    } = await unzipChunkSlice(buffer, chunk)
    return { data, cpositions, dpositions, chunk }
  }

  /**
   * Splits decompressed chunk data into BAMFeature records.
   *
   * Each record is a little-endian int32 size followed by that many bytes; a
   * trailing record that is not completely contained in `ba` is skipped.
   * Every `yieldThreadTime` ms (when non-zero) the loop awaits `timeout(1)`.
   */
  async readBamFeatures(
    ba: Buffer,
    cpositions: number[],
    dpositions: number[],
    chunk: Chunk,
  ) {
    let blockStart = 0
    const sink = [] as BAMFeature[]
    let pos = 0
    let last = +Date.now()

    while (blockStart + 4 < ba.length) {
      const blockSize = ba.readInt32LE(blockStart)
      const blockEnd = blockStart + 4 + blockSize - 1

      // increment position to the current decompressed status
      if (dpositions) {
        while (blockStart + chunk.minv.dataPosition >= dpositions[pos++]) {}
        pos--
      }

      // only try to read the feature if we have all the bytes for it
      if (blockEnd < ba.length) {
        const feature = new BAMFeature({
          bytes: {
            byteArray: ba,
            start: blockStart,
            end: blockEnd,
          },
          // the below results in an automatically calculated file-offset based
          // ID if the info for that is available, otherwise crc32 of the
          // features
          //
          // cpositions[pos] refers to actual file offset of a bgzip block
          // boundaries
          //
          // we multiply by (1 <<8) in order to make sure each block has a
          // "unique" address space so that data in that block could never
          // overlap
          //
          // then the blockStart-dpositions is an uncompressed file offset from
          // that bgzip block boundary, and since the cpositions are multiplied
          // by (1 << 8) these uncompressed offsets get a unique space
          //
          // this has an extra chunk.minv.dataPosition added on because it
          // blockStart starts at 0 instead of chunk.minv.dataPosition
          //
          // the +1 is just to avoid any possible uniqueId 0 but this does not
          // realistically happen
          fileOffset:
            cpositions.length > 0
              ? cpositions[pos] * (1 << 8) +
                (blockStart - dpositions[pos]) +
                chunk.minv.dataPosition +
                1
              : // must be slice, not subarray for buffer polyfill on web
                crc32.signed(ba.slice(blockStart, blockEnd)),
        })

        sink.push(feature)
        if (this.yieldThreadTime && +Date.now() - last > this.yieldThreadTime) {
          await timeout(1)
          last = +Date.now()
        }
      }

      blockStart = blockEnd + 1
    }
    return sink
  }

  /**
   * Whether the index has data for `seqName`. Uses the reference table as
   * currently loaded; does not load the header itself.
   */
  async hasRefSeq(seqName: string) {
    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined ? false : this.index?.hasRefSeq(seqId)
  }

  /**
   * Line count the index reports for `seqName`, or 0 when the name is unknown
   * or there is no index. Does not load the header itself.
   */
  async lineCount(seqName: string) {
    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined || !this.index ? 0 : this.index.lineCount(seqId)
  }

  /**
   * Coverage estimate from the index for `seqName`; empty when the name is
   * unknown or there is no index. Does not load the header itself.
   */
  async indexCov(seqName: string, start?: number, end?: number) {
    if (!this.index) {
      return []
    }

    await this.index.parse()

    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined ? [] : this.index.indexCov(seqId, start, end)
  }

  /**
   * Chunks the index lists for `seqName` between `start` and `end`; empty when
   * the name is unknown or there is no index. Does not load the header itself.
   */
  async blocksForRange(
    seqName: string,
    start: number,
    end: number,
    opts?: BaseOpts,
  ) {
    if (!this.index) {
      return []
    }

    await this.index.parse()

    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined
      ? []
      : this.index.blocksForRange(seqId, start, end, opts)
  }
}