UNPKG

@gmod/bam

Version:

Parser for BAM and BAM index (bai) files

295 lines 11.9 kB
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle';
import QuickLRU from '@jbrowse/quick-lru';
import crc32 from 'crc/calculators/crc32';
import { LocalFile, RemoteFile } from 'generic-filehandle2';
import BAI from "./bai.js";
import CSI from "./csi.js";
import NullFilehandle from "./nullFilehandle.js";
import BAMFeature from "./record.js";
import { parseHeaderText } from "./sam.js";
import { appendInRange, applyFilters, filterCacheKey, parseRefSeqs, } from "./util.js";

/**
 * Value expected in the first four bytes of the decompressed BAM stream when
 * read as a little-endian int32 (0x014D4142, i.e. the bytes "BAM\1").
 */
export const BAM_MAGIC = 21840194;

// Extra bytes read past the first data block's position when fetching the
// header (1 << 16 = 65536).
const blockLen = 1 << 16;

/**
 * Pick a filehandle from the three alternative ways a caller can specify one.
 *
 * @param {object} [filehandle] - Ready-made filehandle; wins when not nullish.
 * @param {string} [path] - Local path, wrapped in a `LocalFile` when truthy.
 * @param {string} [url] - Remote URL, wrapped in a `RemoteFile` when truthy.
 * @returns {object|undefined} The resolved filehandle, or `undefined` when
 *   none of the three inputs is usable.
 */
function resolveFilehandle(filehandle, path, url) {
  return (
    filehandle ??
    (path ? new LocalFile(path) : url ? new RemoteFile(url) : undefined)
  );
}

/**
 * Build the feature-cache key for a chunk: its start/end virtual offsets
 * followed by whatever `filterCacheKey` produces for the active filter.
 *
 * @param {object} chunk - Chunk exposing `minv` / `maxv`, each with
 *   `blockPosition` and `dataPosition`.
 * @param {object} [filterBy] - Filter settings passed to `filterCacheKey`.
 * @returns {string} Cache key.
 */
function chunkCacheKey(chunk, filterBy) {
  const { minv, maxv } = chunk;
  return `${minv.blockPosition}:${minv.dataPosition}-${maxv.blockPosition}:${maxv.dataPosition}${filterCacheKey(filterBy)}`;
}

/**
 * Reader for a BAM file backed by a BAI or CSI index.
 */
export default class BamFile {
  // Callback applied to reference sequence names (handed to parseRefSeqs).
  renameRefSeq;

  // Filehandle for the BAM data (a NullFilehandle in htsget mode).
  bam;

  // Raw header text; populated by getHeaderPre().
  header;

  // Reference name <-> numeric id lookups; populated by getHeaderPre().
  chrToIndex;

  indexToChr;

  // CSI or BAI instance; left unset when running in htsget mode without one.
  index;

  htsget = false;

  // In-flight or settled promise from getHeaderPre(); see getHeader().
  headerP;

  // Cache for parsed features by chunk
  // When a new chunk overlaps a cached chunk, we evict the cached one
  chunkFeatureCache = new QuickLRU({
    maxSize: 100,
  });

  // Constructor used to instantiate each record (defaults to BAMFeature).
  RecordClass;

  /**
   * @param {object} args
   * @param {object} [args.bamFilehandle] - Filehandle for the BAM data.
   * @param {string} [args.bamPath] - Local path of the BAM file.
   * @param {string} [args.bamUrl] - URL of the BAM file.
   * @param {string} [args.baiPath]
   * @param {object} [args.baiFilehandle]
   * @param {string} [args.baiUrl]
   * @param {string} [args.csiPath]
   * @param {object} [args.csiFilehandle]
   * @param {string} [args.csiUrl]
   * @param {boolean} [args.htsget] - Allows construction without a BAM source
   *   and/or without an index.
   * @param {Function} [args.renameRefSeqs] - Reference-name mapper; identity
   *   by default.
   * @param {Function} [args.recordClass] - Record constructor; `BAMFeature`
   *   by default.
   * @throws {Error} When no BAM source is given and `htsget` is falsy, or when
   *   no index source can be resolved and `htsget` is falsy.
   */
  constructor({
    bamFilehandle,
    bamPath,
    bamUrl,
    baiPath,
    baiFilehandle,
    baiUrl,
    csiPath,
    csiFilehandle,
    csiUrl,
    htsget,
    renameRefSeqs = n => n,
    recordClass,
  }) {
    this.renameRefSeq = renameRefSeqs;
    this.RecordClass = (recordClass ?? BAMFeature);

    const bamFh = resolveFilehandle(bamFilehandle, bamPath, bamUrl);
    if (bamFh) {
      this.bam = bamFh;
    } else if (htsget) {
      this.htsget = true;
      this.bam = new NullFilehandle();
    } else {
      throw new Error('no bam source: pass bamFilehandle, bamPath, bamUrl, or htsget: true');
    }

    const csiFh = resolveFilehandle(csiFilehandle, csiPath, csiUrl);
    // When no explicit BAI source is given, fall back to "<bam>.bai" next to
    // the BAM path/URL.
    const baiFh =
      resolveFilehandle(baiFilehandle, baiPath, baiUrl) ??
      resolveFilehandle(
        undefined,
        bamPath ? `${bamPath}.bai` : undefined,
        bamUrl ? `${bamUrl}.bai` : undefined,
      );

    // An explicit CSI source takes precedence over any BAI source.
    if (csiFh) {
      this.index = new CSI({ filehandle: csiFh });
    } else if (baiFh) {
      this.index = new BAI({ filehandle: baiFh });
    } else if (!htsget) {
      throw new Error('no index source: pass csi*/bai* options or a bamPath/bamUrl so the .bai sibling can be inferred');
    }
    // htsget mode operates without a parsed index
  }

  /**
   * Parse the index, read the start of the BAM file, and populate
   * `header`, `chrToIndex` and `indexToChr`.
   *
   * @param {object} [opts] - Options forwarded to `index.parse`.
   * @returns {Promise<*>} Result of `parseHeaderText` on the header text, or
   *   `undefined` when there is no index.
   * @throws {Error} 'Not a BAM file' on a magic mismatch; 'Insufficient data
   *   for reference sequences' when the reference table cannot be parsed
   *   even from the whole file.
   */
  async getHeaderPre(opts = {}) {
    if (!this.index) {
      return undefined;
    }
    const indexData = await this.index.parse(opts);

    // firstDataLine is not defined in cases where there is no data in the file
    // (just bam header and nothing else)
    const readLen =
      indexData.firstDataLine === undefined
        ? undefined
        : indexData.firstDataLine.blockPosition + blockLen;
    const buffer =
      readLen === undefined
        ? await this.bam.readFile()
        : await this.bam.read(readLen, 0);

    let uncba = await unzip(buffer);
    // Honor the view's own offset/length so a Uint8Array that is a window
    // into a larger ArrayBuffer is still read correctly.
    const dataView = new DataView(uncba.buffer, uncba.byteOffset, uncba.byteLength);
    if (dataView.getInt32(0, true) !== BAM_MAGIC) {
      throw new Error('Not a BAM file');
    }
    const headLen = dataView.getInt32(4, true);
    const refSeqStart = headLen + 8;
    const decodeHeaderText = bytes =>
      new TextDecoder('utf8').decode(bytes.subarray(8, refSeqStart));
    this.header = decodeHeaderText(uncba);

    // BAM files with many reference sequences may need more data than the
    // initial read covers. If the first attempt comes up short, fall back to
    // reading the whole file (the index's firstDataLine is just an
    // optimization hint, not a guaranteed cap on the ref-seq table size).
    let parsed = parseRefSeqs(uncba, refSeqStart, this.renameRefSeq);
    if (!parsed) {
      uncba = await unzip(await this.bam.readFile());
      // The first read was short, so the header text decoded from it may
      // have been cut off as well; decode it again from the full data.
      this.header = decodeHeaderText(uncba);
      parsed = parseRefSeqs(uncba, refSeqStart, this.renameRefSeq);
    }
    if (!parsed) {
      throw new Error('Insufficient data for reference sequences');
    }
    this.chrToIndex = parsed.chrToIndex;
    this.indexToChr = parsed.indexToChr;
    return parseHeaderText(this.header);
  }

  /**
   * Memoized wrapper around `getHeaderPre`. A rejected attempt clears the
   * memo so that a later call retries.
   *
   * @param {object} [opts] - Forwarded to `getHeaderPre` on the call that
   *   actually starts the load.
   * @returns {Promise<*>} Same value as `getHeaderPre`.
   */
  getHeader(opts) {
    if (!this.headerP) {
      this.headerP = this.getHeaderPre(opts).catch((e) => {
        this.headerP = undefined;
        throw e;
      });
    }
    return this.headerP;
  }

  /**
   * @param {object} [opts] - Forwarded to `getHeader`.
   * @returns {Promise<string|undefined>} Raw header text (`undefined` when
   *   the header was never read, e.g. no index).
   */
  async getHeaderText(opts = {}) {
    await this.getHeader(opts);
    return this.header;
  }

  /**
   * Fetch the records of reference `chr` that `appendInRange` accepts for
   * `[min, max]`.
   *
   * @param {string} chr - Reference sequence name.
   * @param {number} min
   * @param {number} max
   * @param {object} [opts] - May carry `viewAsPairs`, `filterBy`, and options
   *   understood by the index / filehandle.
   * @returns {Promise<Array>} Matching records; empty when the reference is
   *   unknown or there is no index.
   */
  async getRecordsForRange(chr, min, max, opts) {
    await this.getHeader(opts);
    const chrId = this.chrToIndex?.[chr];
    if (chrId === undefined || !this.index) {
      return [];
    }
    const chunks = await this.index.blocksForRange(chrId, min - 1, max, opts);
    return this._fetchChunkFeaturesDirect(chunks, chrId, min, max, opts);
  }

  // Evict any cached chunks whose block range overlaps [minBlock, maxBlock]
  evictOverlappingChunks(minBlock, maxBlock) {
    for (const [key, entry] of this.chunkFeatureCache) {
      if (minBlock <= entry.maxBlock && maxBlock >= entry.minBlock) {
        this.chunkFeatureCache.delete(key);
      }
    }
  }

  /**
   * Read (or take from cache) the features of each chunk, keep those that
   * `appendInRange` selects, and optionally append mates.
   *
   * @param {Array} chunks - Chunks as returned by `index.blocksForRange`.
   * @param {number} chrId - Numeric reference id.
   * @param {number} min
   * @param {number} max
   * @param {object} [opts] - `filterBy` is applied before caching;
   *   `viewAsPairs` triggers `fetchPairs`.
   * @returns {Promise<Array>} Collected records.
   */
  async _fetchChunkFeaturesDirect(chunks, chrId, min, max, opts = {}) {
    const { viewAsPairs, filterBy } = opts;
    const result = [];

    for (let ci = 0, cl = chunks.length; ci < cl; ci++) {
      const chunk = chunks[ci];
      const cacheKey = chunkCacheKey(chunk, filterBy);
      const minBlock = chunk.minv.blockPosition;
      const maxBlock = chunk.maxv.blockPosition;

      let records;
      const cached = this.chunkFeatureCache.get(cacheKey);
      if (cached) {
        records = cached.features;
      } else {
        this.evictOverlappingChunks(minBlock, maxBlock);
        const allRecords = await this._readChunkFeatures(chunk, opts);
        records = filterBy ? applyFilters(allRecords, filterBy) : allRecords;
        this.chunkFeatureCache.set(cacheKey, {
          minBlock,
          maxBlock,
          features: records,
        });
      }
      appendInRange(records, chrId, min, max, result);
    }

    if (viewAsPairs) {
      const pairs = await this.fetchPairs(chrId, result, opts);
      for (let i = 0, l = pairs.length; i < l; i++) {
        result.push(pairs[i]);
      }
    }
    return result;
  }

  /**
   * For every record whose read name occurs exactly once in `records`, look
   * up the chunks covering its mate position and return the records found
   * there that share such a name and are not already in `records`.
   *
   * @param {number} chrId - Reference id the original query was made on.
   * @param {Array} records - Records already fetched.
   * @param {object} opts
   * @param {boolean} [opts.pairAcrossChr] - When truthy, follow mates
   *   regardless of reference and distance.
   * @param {number} [opts.maxInsertSize=200000] - Upper bound (exclusive) on
   *   `|start - next_pos|` for same-reference mates.
   * @returns {Promise<Array>} Mate records.
   */
  async fetchPairs(chrId, records, opts) {
    const { pairAcrossChr, maxInsertSize = 200000 } = opts;
    // A Map rather than a plain object: read names are arbitrary text, and a
    // name such as "constructor" or "toString" would otherwise collide with
    // members inherited from Object.prototype and never count as 1.
    const readNameCounts = new Map();
    const readIds = new Set();
    for (let i = 0, l = records.length; i < l; i++) {
      const r = records[i];
      const name = r.name;
      readNameCounts.set(name, (readNameCounts.get(name) ?? 0) + 1);
      readIds.add(r.fileOffset);
    }

    const matePromises = [];
    for (let i = 0, l = records.length; i < l; i++) {
      const f = records[i];
      const name = f.name;
      if (
        this.index &&
        readNameCounts.get(name) === 1 &&
        (pairAcrossChr ||
          (f.next_refid === chrId &&
            Math.abs(f.start - f.next_pos) < maxInsertSize))
      ) {
        matePromises.push(
          this.index.blocksForRange(f.next_refid, f.next_pos, f.next_pos + 1, opts),
        );
      }
    }

    // Deduplicate chunks by their string form so each is read only once.
    const map = new Map();
    const res = await Promise.all(matePromises);
    for (let i = 0, l = res.length; i < l; i++) {
      const chunks = res[i];
      for (let j = 0, jl = chunks.length; j < jl; j++) {
        const m = chunks[j];
        map.set(m.toString(), m);
      }
    }

    const mateFeatLists = await Promise.all(
      [...map.values()].map(async (c) => {
        const features = await this._readChunkFeatures(c, opts);
        const mateRecs = [];
        for (let i = 0, l = features.length; i < l; i++) {
          const feature = features[i];
          if (
            readNameCounts.get(feature.name) === 1 &&
            !readIds.has(feature.fileOffset)
          ) {
            mateRecs.push(feature);
          }
        }
        return mateRecs;
      }),
    );
    return mateFeatLists.flat();
  }

  /**
   * Read the bytes of one chunk, decompress them with `unzipChunkSlice`, and
   * parse them into records.
   *
   * @param {object} chunk - Chunk exposing `fetchedSize()` and `minv`.
   * @param {object} [opts] - Forwarded to the filehandle's `read`.
   * @returns {Promise<Array>} Parsed records.
   */
  async _readChunkFeatures(chunk, opts) {
    const buf = await this.bam.read(chunk.fetchedSize(), chunk.minv.blockPosition, opts);
    const {
      buffer: data,
      cpositions,
      dpositions,
    } = await unzipChunkSlice(buf, chunk);
    return this.readBamFeatures(data, cpositions, dpositions, chunk);
  }

  /**
   * Walk a decompressed buffer of length-prefixed records and instantiate
   * `RecordClass` for every record that lies completely inside the buffer.
   *
   * @param {Uint8Array} ba - Decompressed bytes.
   * @param {number[]} cpositions - From `unzipChunkSlice` (presumably the
   *   compressed-file position of each block; confirm against that library).
   * @param {number[]} dpositions - From `unzipChunkSlice` (presumably the
   *   matching decompressed positions; confirm against that library).
   * @param {object} chunk - Chunk the bytes came from.
   * @returns {Promise<Array>} Parsed records.
   * @throws {Error} When a record declares a negative length.
   */
  async readBamFeatures(ba, cpositions, dpositions, chunk) {
    let blockStart = 0;
    const sink = [];
    let pos = 0;
    // Honor the view's own offset/length so offsets relative to `ba` stay
    // valid even when `ba` is a window into a larger ArrayBuffer.
    const dataView = new DataView(ba.buffer, ba.byteOffset, ba.byteLength);
    const hasDpositions = dpositions.length > 0;
    const hasCpositions = cpositions.length > 0;

    while (blockStart + 4 < ba.length) {
      const blockSize = dataView.getInt32(blockStart, true);
      // The length is read as a signed int32. A negative value can only come
      // from corrupt data and would stall the loop or move it backwards.
      if (blockSize < 0) {
        throw new Error('Invalid BAM record: negative block size');
      }
      const blockEnd = blockStart + 4 + blockSize - 1;

      if (hasDpositions) {
        // Advance `pos` past every dpositions entry that is <= the current
        // record's offset (shifted by the chunk's starting dataPosition).
        const target = blockStart + chunk.minv.dataPosition;
        while (pos < dpositions.length && target >= dpositions[pos]) {
          pos++;
        }
      }

      // Only emit a record when all of its bytes are present in the buffer.
      if (blockEnd < ba.length) {
        const feature = new this.RecordClass({
          bytes: {
            byteArray: ba,
            start: blockStart,
            end: blockEnd,
          },
          // With block positions available the id is computed from them;
          // otherwise it falls back to a CRC32 of the record's bytes.
          fileOffset: hasCpositions
            ? cpositions[pos] * (1 << 8) +
              (blockStart - dpositions[pos]) +
              chunk.minv.dataPosition +
              1
            : crc32(ba.subarray(blockStart, blockEnd)) >>> 0,
          dataView,
        });
        sink.push(feature);
      }
      blockStart = blockEnd + 1;
    }
    return sink;
  }

  /**
   * Delegates to `index.hasRefSeq`. Does not load the header itself, so it
   * resolves to `false` until `chrToIndex` has been populated.
   *
   * @param {string} seqName
   * @returns {Promise<boolean>}
   */
  async hasRefSeq(seqName) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined
      ? false
      : this.index.hasRefSeq(seqId);
  }

  /**
   * Delegates to `index.lineCount`; resolves to `0` when there is no index or
   * the name is not in `chrToIndex`.
   *
   * @param {string} seqName
   * @returns {Promise<number>}
   */
  async lineCount(seqName) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined
      ? 0
      : this.index.lineCount(seqId);
  }

  /**
   * Delegates to `index.indexCov`; resolves to `[]` when there is no index or
   * the name is not in `chrToIndex`.
   *
   * @param {string} seqName
   * @param {number} [start]
   * @param {number} [end]
   * @returns {Promise<Array>}
   */
  async indexCov(seqName, start, end) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined
      ? []
      : this.index.indexCov(seqId, start, end);
  }

  /**
   * Delegates to `index.blocksForRange`; resolves to `[]` when there is no
   * index or the name is not in `chrToIndex`.
   *
   * @param {string} seqName
   * @param {number} start
   * @param {number} end
   * @param {object} [opts]
   * @returns {Promise<Array>}
   */
  async blocksForRange(seqName, start, end, opts) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined
      ? []
      : this.index.blocksForRange(seqId, start, end, opts);
  }

  /** Drop every entry from the per-chunk feature cache. */
  clearFeatureCache() {
    this.chunkFeatureCache.clear();
  }

  /**
   * Translate region names to reference ids and delegate to
   * `index.estimatedBytesForRegions`. Regions whose `refName` is unknown are
   * skipped.
   *
   * @param {Array<{refName: string, start: number, end: number}>} regions
   * @param {object} [opts]
   * @returns {Promise<number>} `0` when there is no index.
   * @throws {Error} 'Header not yet parsed' when `chrToIndex` is still unset
   *   after loading the header.
   */
  async estimatedBytesForRegions(regions, opts) {
    if (!this.index) {
      return 0;
    }
    await this.getHeader(opts);
    const chrToIndex = this.chrToIndex;
    if (!chrToIndex) {
      throw new Error('Header not yet parsed');
    }
    const mapped = regions.flatMap(r => {
      const refId = chrToIndex[r.refName];
      if (refId === undefined) {
        return [];
      }
      return [{ refId, start: r.start, end: r.end }];
    });
    return this.index.estimatedBytesForRegions(mapped, opts);
  }
}
//# sourceMappingURL=bamFile.js.map