@gmod/bam

import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import crc32 from 'crc/calculators/crc32'
import { LocalFile, RemoteFile } from 'generic-filehandle2'
import QuickLRU from 'quick-lru'

import BAI from './bai'
import Chunk from './chunk'
import CSI from './csi'
import NullFilehandle from './nullFilehandle'
import BAMFeature from './record'
import { parseHeaderText } from './sam'
import {
  BamOpts,
  BaseOpts,
  checkAbortSignal,
  gen2array,
  makeOpts,
  timeout,
} from './util'

import type { GenericFilehandle } from 'generic-filehandle2'

/** The bytes "BAM\1" read as a little-endian int32 (0x014D4142). */
export const BAM_MAGIC = 21840194

/** Extra bytes (64 KiB) fetched past the first data line when reading the header. */
const blockLen = 1 << 16

/** Arguments handed to the feature cache's fill callback. */
interface Args {
  chunk: Chunk
  opts: BaseOpts
}

/**
 * Reader for a BAM file paired with a BAI or CSI index.
 *
 * The header (and the refName <-> refId tables derived from it) is loaded
 * lazily through `getHeader`; the range query methods call it themselves.
 */
export default class BamFile {
  public renameRefSeq: (a: string) => string
  public bam: GenericFilehandle
  public header?: string
  public chrToIndex?: Record<string, number>
  public indexToChr?: { refName: string; length: number }[]
  public yieldThreadTime: number
  public index?: BAI | CSI
  public htsget = false
  public headerP?: ReturnType<BamFile['getHeaderPre']>

  // parsed features per chunk, keyed by chunk.toString() (see
  // _fetchChunkFeatures), bounded to the 50 most recently used chunks
  private featureCache = new AbortablePromiseCache<Args, BAMFeature[]>({
    cache: new QuickLRU({
      maxSize: 50,
    }),
    fill: async (args: Args, signal) => {
      const { chunk, opts } = args
      const { data, cpositions, dpositions } = await this._readChunk({
        chunk,
        opts: { ...opts, signal },
      })
      return this.readBamFeatures(data, cpositions, dpositions, chunk)
    },
  })

  /**
   * Exactly one BAM source is used, in priority order: `bamFilehandle`,
   * `bamPath`, `bamUrl`, then `htsget`. The index is chosen in priority
   * order: explicit CSI (filehandle, path, url), explicit BAI (filehandle,
   * path, url), then `<bamPath>.bai` / `<bamUrl>.bai`.
   *
   * @throws Error if no BAM source is given, or no index can be inferred and
   * `htsget` is not set
   */
  constructor({
    bamFilehandle,
    bamPath,
    bamUrl,
    baiPath,
    baiFilehandle,
    baiUrl,
    csiPath,
    csiFilehandle,
    csiUrl,
    htsget,
    yieldThreadTime = 100,
    renameRefSeqs = n => n,
  }: {
    bamFilehandle?: GenericFilehandle
    bamPath?: string
    bamUrl?: string
    baiPath?: string
    baiFilehandle?: GenericFilehandle
    baiUrl?: string
    csiPath?: string
    csiFilehandle?: GenericFilehandle
    csiUrl?: string
    renameRefSeqs?: (a: string) => string
    yieldThreadTime?: number
    htsget?: boolean
  }) {
    this.renameRefSeq = renameRefSeqs

    if (bamFilehandle) {
      this.bam = bamFilehandle
    } else if (bamPath) {
      this.bam = new LocalFile(bamPath)
    } else if (bamUrl) {
      this.bam = new RemoteFile(bamUrl)
    } else if (htsget) {
      this.htsget = true
      this.bam = new NullFilehandle()
    } else {
      throw new Error('unable to initialize bam')
    }

    if (csiFilehandle) {
      this.index = new CSI({ filehandle: csiFilehandle })
    } else if (csiPath) {
      this.index = new CSI({ filehandle: new LocalFile(csiPath) })
    } else if (csiUrl) {
      this.index = new CSI({ filehandle: new RemoteFile(csiUrl) })
    } else if (baiFilehandle) {
      this.index = new BAI({ filehandle: baiFilehandle })
    } else if (baiPath) {
      this.index = new BAI({ filehandle: new LocalFile(baiPath) })
    } else if (baiUrl) {
      this.index = new BAI({ filehandle: new RemoteFile(baiUrl) })
    } else if (bamPath) {
      this.index = new BAI({ filehandle: new LocalFile(`${bamPath}.bai`) })
    } else if (bamUrl) {
      this.index = new BAI({ filehandle: new RemoteFile(`${bamUrl}.bai`) })
    } else if (htsget) {
      this.htsget = true
    } else {
      throw new Error('unable to infer index format')
    }
    this.yieldThreadTime = yieldThreadTime
  }

  /**
   * Reads and parses the BAM header, populating `header`, `chrToIndex` and
   * `indexToChr`. Prefer `getHeader`, which memoizes this call.
   *
   * @returns the parsed header, or undefined when there is no index
   * @throws Error if the decompressed data does not start with BAM_MAGIC
   */
  async getHeaderPre(origOpts?: BaseOpts) {
    const opts = makeOpts(origOpts)
    if (!this.index) {
      return
    }
    const indexData = await this.index.parse(opts)
    const ret = indexData.firstDataLine
      ? indexData.firstDataLine.blockPosition + 65535
      : undefined
    let buffer
    if (ret) {
      const s = ret + blockLen
      // forward opts so an abort signal also cancels the header read
      buffer = await this.bam.read(s, 0, opts)
    } else {
      buffer = await this.bam.readFile(opts)
    }

    const uncba = await unzip(buffer)
    // honour the view's offset/length so DataView offsets always agree with
    // uncba.subarray() offsets, even if uncba is a view into a larger buffer
    const dataView = new DataView(
      uncba.buffer,
      uncba.byteOffset,
      uncba.byteLength,
    )

    if (dataView.getInt32(0, true) !== BAM_MAGIC) {
      throw new Error('Not a BAM file')
    }
    const headLen = dataView.getInt32(4, true)

    const decoder = new TextDecoder('utf8')
    this.header = decoder.decode(uncba.subarray(8, 8 + headLen))
    const { chrToIndex, indexToChr } = await this._readRefSeqs(
      headLen + 8,
      65535,
      opts,
    )
    this.chrToIndex = chrToIndex
    this.indexToChr = indexToChr

    return parseHeaderText(this.header)
  }

  /**
   * Memoized `getHeaderPre`. A failed attempt clears the memo so that a later
   * call retries instead of returning the cached rejection.
   */
  getHeader(opts?: BaseOpts) {
    if (!this.headerP) {
      this.headerP = this.getHeaderPre(opts).catch((e: unknown) => {
        this.headerP = undefined
        throw e
      })
    }
    return this.headerP
  }

  /** Loads the header if needed and returns its raw text. */
  async getHeaderText(opts: BaseOpts = {}) {
    await this.getHeader(opts)
    return this.header
  }

  /**
   * Parses the reference sequence list that follows the header text.
   *
   * The full length of the refseq block is not given in advance, so this
   * grabs a chunk and doubles it if all refseqs haven't been processed.
   *
   * @param start - offset of the reference count in the decompressed data
   * @param refSeqBytes - number of compressed bytes to fetch from the file
   * @throws Error if the list runs past the end of the file
   */
  async _readRefSeqs(
    start: number,
    refSeqBytes: number,
    opts?: BaseOpts,
  ): Promise<{
    chrToIndex: Record<string, number>
    indexToChr: { refName: string; length: number }[]
  }> {
    if (start > refSeqBytes) {
      return this._readRefSeqs(start, refSeqBytes * 2, opts)
    }
    // const size = refSeqBytes + blockLen <-- use this?
    const buffer = await this.bam.read(refSeqBytes, 0, opts)
    const uncba = await unzip(buffer)
    const dataView = new DataView(
      uncba.buffer,
      uncba.byteOffset,
      uncba.byteLength,
    )

    // retry with twice as many bytes; a read that came back shorter than
    // requested means the file is exhausted, so retrying can never succeed
    const refetch = () => {
      if (buffer.length < refSeqBytes) {
        throw new Error(
          'BAM header is truncated: reference sequence list runs past the end of the file',
        )
      }
      console.warn(
        `BAM header is very big.  Re-fetching ${refSeqBytes} bytes.`,
      )
      return this._readRefSeqs(start, refSeqBytes * 2, opts)
    }

    // every bounds check below happens BEFORE the corresponding read, because
    // DataView throws a RangeError on out-of-range access
    if (start + 4 > uncba.length) {
      return refetch()
    }
    const nRef = dataView.getInt32(start, true)
    let p = start + 4
    const chrToIndex: Record<string, number> = {}
    const indexToChr: { refName: string; length: number }[] = []
    const decoder = new TextDecoder('utf8')
    for (let i = 0; i < nRef; i += 1) {
      if (p + 4 > uncba.length) {
        return refetch()
      }
      const lName = dataView.getInt32(p, true)
      // record layout: int32 name length, name bytes, int32 sequence length
      const recordEnd = p + 8 + lName
      if (recordEnd > uncba.length) {
        return refetch()
      }
      // lName - 1 drops the last byte of the name (the NUL terminator in BAM)
      const refName = this.renameRefSeq(
        decoder.decode(uncba.subarray(p + 4, p + 4 + lName - 1)),
      )
      const lRef = dataView.getInt32(p + lName + 4, true)

      chrToIndex[refName] = i
      indexToChr.push({ refName, length: lRef })

      p = recordEnd
    }
    return { chrToIndex, indexToChr }
  }

  /** Collects everything yielded by `streamRecordsForRange` via `gen2array`. */
  async getRecordsForRange(
    chr: string,
    min: number,
    max: number,
    opts?: BamOpts,
  ) {
    return gen2array(this.streamRecordsForRange(chr, min, max, opts))
  }

  /**
   * Yields batches of features overlapping the range, one batch per index
   * chunk. Yields a single empty batch when `chr` is unknown or there is no
   * index.
   */
  async *streamRecordsForRange(
    chr: string,
    min: number,
    max: number,
    opts?: BamOpts,
  ) {
    await this.getHeader(opts)
    const chrId = this.chrToIndex?.[chr]
    if (chrId === undefined || !this.index) {
      yield []
    } else {
      const chunks = await this.index.blocksForRange(chrId, min - 1, max, opts)
      yield* this._fetchChunkFeatures(chunks, chrId, min, max, opts)
    }
  }

  /**
   * Reads each chunk through the feature cache and yields the features on
   * `chrId` that overlap the range. With `opts.viewAsPairs`, a final batch
   * of mates found by `fetchPairs` is yielded as well.
   */
  async *_fetchChunkFeatures(
    chunks: Chunk[],
    chrId: number,
    min: number,
    max: number,
    opts: BamOpts = {},
  ) {
    const { viewAsPairs } = opts
    const feats = [] as BAMFeature[][]
    let done = false

    for (const chunk of chunks) {
      const records = await this.featureCache.get(
        chunk.toString(),
        { chunk, opts },
        opts.signal,
      )

      const recs = [] as BAMFeature[]
      for (const feature of records) {
        if (feature.ref_id === chrId) {
          if (feature.start >= max) {
            // past end of range, can stop iterating
            done = true
            break
          } else if (feature.end >= min) {
            // must be in range
            recs.push(feature)
          }
        }
      }
      feats.push(recs)
      yield recs
      if (done) {
        break
      }
    }

    checkAbortSignal(opts.signal)
    if (viewAsPairs) {
      yield this.fetchPairs(chrId, feats, opts)
    }
  }

  /**
   * Finds mates of reads whose name appears only once within a batch of
   * `feats`.
   *
   * A mate is looked up when `opts.pairAcrossChr` is set, or when it lies on
   * `chrId` within `opts.maxInsertSize` (default 200000) of the read.
   *
   * @returns mate features that were not already present in `feats`
   */
  async fetchPairs(chrId: number, feats: BAMFeature[][], opts: BamOpts) {
    const { pairAcrossChr, maxInsertSize = 200000 } = opts
    const unmatedPairs: Record<string, boolean> = {}
    const readIds: Record<string, number> = {}

    for (const ret of feats) {
      // occurrences of each read name within this batch
      const readNames: Record<string, number> = {}
      for (const element of ret) {
        const name = element.name
        readNames[name] = (readNames[name] ?? 0) + 1
        readIds[element.id] = 1
      }
      for (const [k, v] of Object.entries(readNames)) {
        if (v === 1) {
          unmatedPairs[k] = true
        }
      }
    }

    const matePromises: Promise<Chunk[]>[] = []
    for (const ret of feats) {
      for (const f of ret) {
        const name = f.name
        const start = f.start
        const pnext = f.next_pos
        const rnext = f.next_refid
        if (
          this.index &&
          unmatedPairs[name] &&
          (pairAcrossChr ||
            (rnext === chrId && Math.abs(start - pnext) < maxInsertSize))
        ) {
          matePromises.push(
            this.index.blocksForRange(rnext, pnext, pnext + 1, opts),
          )
        }
      }
    }

    // filter out duplicate chunks (the blocks are lists of chunks, blocks are
    // concatenated, then filter dup chunks)
    const map = new Map<string, Chunk>()
    const res = await Promise.all(matePromises)
    for (const m of res.flat()) {
      const key = m.toString()
      if (!map.has(key)) {
        map.set(key, m)
      }
    }

    const mateFeatPromises = await Promise.all(
      [...map.values()].map(async c => {
        const { data, cpositions, dpositions, chunk } = await this._readChunk({
          chunk: c,
          opts,
        })
        const mateRecs = [] as BAMFeature[]
        for (const feature of await this.readBamFeatures(
          data,
          cpositions,
          dpositions,
          chunk,
        )) {
          // keep only mates we were looking for and have not already seen
          if (unmatedPairs[feature.name] && !readIds[feature.id]) {
            mateRecs.push(feature)
          }
        }
        return mateRecs
      }),
    )
    return mateFeatPromises.flat()
  }

  /** Reads a chunk's bytes from the BAM file and decompresses them. */
  async _readChunk({ chunk, opts }: { chunk: Chunk; opts: BaseOpts }) {
    const buf = await this.bam.read(
      chunk.fetchedSize(),
      chunk.minv.blockPosition,
      opts,
    )

    const {
      buffer: data,
      cpositions,
      dpositions,
    } = await unzipChunkSlice(buf, chunk)
    return { data, cpositions, dpositions, chunk }
  }

  /**
   * Parses the records in a decompressed chunk into features.
   *
   * Records whose bytes are not fully contained in `ba` are skipped. When
   * `yieldThreadTime` is non-zero, parsing pauses via `timeout(1)` whenever
   * more than that many milliseconds have passed since the last pause.
   *
   * @param ba - decompressed chunk data
   * @param cpositions - positions used as the high part of each feature's id
   * @param dpositions - decompressed positions matching `cpositions`
   * @param chunk - the chunk `ba` was read from
   */
  async readBamFeatures(
    ba: Uint8Array,
    cpositions: number[],
    dpositions: number[],
    chunk: Chunk,
  ) {
    let blockStart = 0
    const sink = [] as BAMFeature[]
    let pos = 0
    let last = +Date.now()

    // honour the view's offset/length so that DataView offsets agree with the
    // ba-relative offsets handed to BAMFeature and ba.subarray()
    const dataView = new DataView(ba.buffer, ba.byteOffset, ba.byteLength)
    while (blockStart + 4 < ba.length) {
      const blockSize = dataView.getInt32(blockStart, true)
      const blockEnd = blockStart + 4 + blockSize - 1

      // increment position to the current decompressed status
      // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
      if (dpositions) {
        // running off the end of dpositions compares against undefined, which
        // is false and ends the scan
        while (blockStart + chunk.minv.dataPosition >= dpositions[pos++]!) {}
        pos--
      }

      // only try to read the feature if we have all the bytes for it
      if (blockEnd < ba.length) {
        const feature = new BAMFeature({
          bytes: {
            byteArray: ba,
            start: blockStart,
            end: blockEnd,
          },
          // the below results in an automatically calculated file-offset based
          // ID if the info for that is available, otherwise crc32 of the
          // features
          //
          // cpositions[pos] refers to actual file offset of a bgzip block
          // boundaries
          //
          // we multiply by (1 <<8) in order to make sure each block has a
          // "unique" address space so that data in that block could never
          // overlap
          //
          // then the blockStart-dpositions is an uncompressed file offset from
          // that bgzip block boundary, and since the cpositions are multiplied
          // by (1 << 8) these uncompressed offsets get a unique space
          //
          // this has an extra chunk.minv.dataPosition added on because
          // blockStart starts at 0 instead of chunk.minv.dataPosition
          //
          // the +1 is just to avoid any possible uniqueId 0 but this does not
          // realistically happen
          fileOffset:
            cpositions.length > 0
              ? cpositions[pos]! * (1 << 8) +
                (blockStart - dpositions[pos]!) +
                chunk.minv.dataPosition +
                1
              : // this shift >>> 0 is equivalent to crc32(b).unsigned but uses the
                // internal calculator of crc32 to avoid accidentally importing buffer
                // https://github.com/alexgorbatchev/crc/blob/31fc3853e417b5fb5ec83335428805842575f699/src/define_crc.ts#L5
                crc32(ba.subarray(blockStart, blockEnd)) >>> 0,
        })

        sink.push(feature)
        if (this.yieldThreadTime && +Date.now() - last > this.yieldThreadTime) {
          await timeout(1)
          last = +Date.now()
        }
      }

      blockStart = blockEnd + 1
    }
    return sink
  }

  /**
   * Asks the index whether it has the given reference sequence.
   *
   * Resolves to false when the name is not in `chrToIndex`, which is only
   * populated once the header has been loaded.
   */
  async hasRefSeq(seqName: string) {
    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined ? false : this.index?.hasRefSeq(seqId)
  }

  /**
   * Returns the index's line count for the given reference sequence, or 0
   * when the name is not in `chrToIndex` (populated by loading the header)
   * or there is no index.
   */
  async lineCount(seqName: string) {
    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined || !this.index ? 0 : this.index.lineCount(seqId)
  }

  /**
   * Returns the index's coverage estimate for a reference sequence, or an
   * empty array when there is no index or the name is unknown.
   */
  async indexCov(seqName: string, start?: number, end?: number) {
    if (!this.index) {
      return []
    }
    await this.index.parse()
    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined ? [] : this.index.indexCov(seqId, start, end)
  }

  /**
   * Returns the index chunks overlapping a range on a named reference
   * sequence, or an empty array when there is no index or the name is
   * unknown.
   */
  async blocksForRange(
    seqName: string,
    start: number,
    end: number,
    opts?: BaseOpts,
  ) {
    if (!this.index) {
      return []
    }
    // forward opts so an abort signal also applies to parsing the index
    await this.index.parse(opts)
    const seqId = this.chrToIndex?.[seqName]
    return seqId === undefined
      ? []
      : this.index.blocksForRange(seqId, start, end, opts)
  }
}