// @gmod/bam
// Version:
// Parser for BAM and BAM index (bai) files
// 295 lines • 11.9 kB
// JavaScript
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle';
import QuickLRU from '@jbrowse/quick-lru';
import crc32 from 'crc/calculators/crc32';
import { LocalFile, RemoteFile } from 'generic-filehandle2';
import BAI from "./bai.js";
import CSI from "./csi.js";
import NullFilehandle from "./nullFilehandle.js";
import BAMFeature from "./record.js";
import { parseHeaderText } from "./sam.js";
import { appendInRange, applyFilters, filterCacheKey, parseRefSeqs, } from "./util.js";
// The four bytes 'B', 'A', 'M', 0x01 interpreted as a little-endian int32
// (decimal 21840194). The first word of the decompressed file is compared
// against this value when the header is parsed.
export const BAM_MAGIC = 0x014d4142;
// 65536 bytes: the amount read past firstDataLine.blockPosition when the
// header is loaded (presumably the maximum size of one compressed block —
// confirm against the BGZF documentation).
const blockLen = 2 ** 16;
/**
 * Choose the filehandle for a resource from the three ways a caller may
 * specify it.
 *
 * Precedence: an explicit filehandle (anything other than null/undefined),
 * then a truthy local path, then a truthy URL.
 *
 * @param {object} [filehandle] - Ready-made filehandle, returned unchanged.
 * @param {string} [path] - Local path, wrapped in a LocalFile.
 * @param {string} [url] - Remote URL, wrapped in a RemoteFile.
 * @returns {object|undefined} The filehandle, or undefined if nothing usable
 *   was supplied.
 */
function resolveFilehandle(filehandle, path, url) {
  if (filehandle !== undefined && filehandle !== null) {
    return filehandle;
  }
  if (path) {
    return new LocalFile(path);
  }
  if (url) {
    return new RemoteFile(url);
  }
  return undefined;
}
/**
 * Build the feature-cache key for a chunk under a given filter.
 *
 * The key is "<min block>:<min data>-<max block>:<max data>" followed by
 * whatever filterCacheKey returns for the filter, so the same chunk read
 * under different filters is cached separately.
 *
 * @param {object} chunk - Object with `minv` and `maxv`, each carrying
 *   `blockPosition` and `dataPosition`.
 * @param {object} [filterBy] - Filter settings, forwarded to filterCacheKey.
 * @returns {string} The cache key.
 */
function chunkCacheKey(chunk, filterBy) {
  const describeOffset = ({ blockPosition, dataPosition }) =>
    `${blockPosition}:${dataPosition}`;
  const span = `${describeOffset(chunk.minv)}-${describeOffset(chunk.maxv)}`;
  return `${span}${filterCacheKey(filterBy)}`;
}
/**
 * Reader for a BAM file together with its BAI or CSI index.
 *
 * Construct it with a BAM source (filehandle, path or URL) and optionally an
 * index source. With `htsget: true` and no BAM source the instance uses a
 * NullFilehandle and may have no index at all; in that state the
 * index-backed queries on this class return empty/zero results.
 */
export default class BamFile {
  /** Callback applied to reference names; forwarded to parseRefSeqs. */
  renameRefSeq;
  /** Filehandle for the BAM data (a NullFilehandle in htsget mode). */
  bam;
  /** Raw header text; populated by getHeaderPre. */
  header;
  /** Reference name -> id passed to the index; populated by getHeaderPre. */
  chrToIndex;
  /** Companion table returned by parseRefSeqs; populated by getHeaderPre. */
  indexToChr;
  /** BAI or CSI instance; undefined when htsget is used and no index was given. */
  index;
  htsget = false;
  /** Memoized promise of the header parse (reset if the parse rejects). */
  headerP;
  // Cache for parsed features by chunk
  // When a new chunk overlaps a cached chunk, we evict the cached one
  chunkFeatureCache = new QuickLRU({
    maxSize: 100,
  });
  /** Constructor used for every record produced by readBamFeatures. */
  RecordClass;

  /**
   * @param {object} args
   * @param {object} [args.bamFilehandle] - Filehandle for the BAM data.
   * @param {string} [args.bamPath] - Local path of the BAM file.
   * @param {string} [args.bamUrl] - URL of the BAM file.
   * @param {string} [args.baiPath] - Local path of a BAI index.
   * @param {object} [args.baiFilehandle] - Filehandle of a BAI index.
   * @param {string} [args.baiUrl] - URL of a BAI index.
   * @param {string} [args.csiPath] - Local path of a CSI index.
   * @param {object} [args.csiFilehandle] - Filehandle of a CSI index.
   * @param {string} [args.csiUrl] - URL of a CSI index.
   * @param {boolean} [args.htsget] - Permit construction without a BAM
   *   source and without an index.
   * @param {Function} [args.renameRefSeqs] - Reference-name mapper; identity
   *   by default.
   * @param {Function} [args.recordClass] - Record constructor; BAMFeature by
   *   default.
   * @throws {Error} If no BAM source is given and htsget is falsy, or if no
   *   index can be resolved and htsget is falsy.
   */
  constructor({
    bamFilehandle,
    bamPath,
    bamUrl,
    baiPath,
    baiFilehandle,
    baiUrl,
    csiPath,
    csiFilehandle,
    csiUrl,
    htsget,
    renameRefSeqs = n => n,
    recordClass,
  }) {
    this.renameRefSeq = renameRefSeqs;
    this.RecordClass = recordClass ?? BAMFeature;
    const bamFh = resolveFilehandle(bamFilehandle, bamPath, bamUrl);
    if (bamFh) {
      this.bam = bamFh;
    } else if (htsget) {
      this.htsget = true;
      this.bam = new NullFilehandle();
    } else {
      throw new Error('no bam source: pass bamFilehandle, bamPath, bamUrl, or htsget: true');
    }
    const csiFh = resolveFilehandle(csiFilehandle, csiPath, csiUrl);
    // With no explicit BAI source, fall back to "<bam>.bai" next to the BAM.
    const baiFh =
      resolveFilehandle(baiFilehandle, baiPath, baiUrl) ??
      resolveFilehandle(
        undefined,
        bamPath ? `${bamPath}.bai` : undefined,
        bamUrl ? `${bamUrl}.bai` : undefined,
      );
    // An explicit CSI wins over any BAI (including the inferred sibling).
    if (csiFh) {
      this.index = new CSI({ filehandle: csiFh });
    } else if (baiFh) {
      this.index = new BAI({ filehandle: baiFh });
    } else if (!htsget) {
      throw new Error('no index source: pass csi*/bai* options or a bamPath/bamUrl so the .bai sibling can be inferred');
    }
    // htsget mode operates without a parsed index
  }

  /**
   * Read and parse the BAM header and reference-sequence table.
   *
   * Sets `header`, `chrToIndex` and `indexToChr` as a side effect.
   *
   * @param {object} [opts] - Forwarded to the index's `parse`.
   * @returns {Promise<*>} Result of parseHeaderText on the header text, or
   *   undefined when there is no index.
   * @throws {Error} 'Not a BAM file' on a magic mismatch; 'Insufficient data
   *   for reference sequences' if the table cannot be parsed even from the
   *   whole file.
   */
  async getHeaderPre(opts = {}) {
    if (!this.index) {
      return undefined;
    }
    const indexData = await this.index.parse(opts);
    // firstDataLine is not defined in cases where there is no data in the file
    // (just bam header and nothing else)
    const readLen =
      indexData.firstDataLine === undefined
        ? undefined
        : indexData.firstDataLine.blockPosition + blockLen;
    const buffer =
      readLen === undefined
        ? await this.bam.readFile()
        : await this.bam.read(readLen, 0);
    let uncba = await unzip(buffer);
    // Respect the typed array's own window in case it is a view into a
    // larger ArrayBuffer.
    const dataView = new DataView(uncba.buffer, uncba.byteOffset, uncba.byteLength);
    if (dataView.getInt32(0, true) !== BAM_MAGIC) {
      throw new Error('Not a BAM file');
    }
    const headLen = dataView.getInt32(4, true);
    // Header text occupies bytes [8, 8 + headLen) of the decompressed data.
    const decodeHeaderText = bytes =>
      new TextDecoder('utf8').decode(bytes.subarray(8, 8 + headLen));
    this.header = decodeHeaderText(uncba);
    // BAM files with many reference sequences may need more data than the
    // initial read covers. If the first attempt comes up short, fall back to
    // reading the whole file (the index's firstDataLine is just an
    // optimization hint, not a guaranteed cap on the ref-seq table size).
    const refSeqStart = headLen + 8;
    let parsed = parseRefSeqs(uncba, refSeqStart, this.renameRefSeq);
    if (!parsed) {
      uncba = await unzip(await this.bam.readFile());
      // The short read may also have cut the header text itself, so decode
      // it again from the complete data.
      this.header = decodeHeaderText(uncba);
      parsed = parseRefSeqs(uncba, refSeqStart, this.renameRefSeq);
    }
    if (!parsed) {
      throw new Error('Insufficient data for reference sequences');
    }
    this.chrToIndex = parsed.chrToIndex;
    this.indexToChr = parsed.indexToChr;
    return parseHeaderText(this.header);
  }

  /**
   * Memoized wrapper around getHeaderPre. A rejected parse clears the memo
   * so that a later call retries.
   *
   * @param {object} [opts] - Forwarded to getHeaderPre on the first call.
   * @returns {Promise<*>} Same value as getHeaderPre.
   */
  getHeader(opts) {
    if (!this.headerP) {
      this.headerP = this.getHeaderPre(opts).catch((e) => {
        this.headerP = undefined;
        throw e;
      });
    }
    return this.headerP;
  }

  /**
   * @param {object} [opts] - Forwarded to getHeader.
   * @returns {Promise<string|undefined>} The raw header text (undefined when
   *   getHeaderPre did not set it, e.g. without an index).
   */
  async getHeaderText(opts = {}) {
    await this.getHeader(opts);
    return this.header;
  }

  /**
   * Fetch the records of one reference sequence that fall in a range.
   *
   * @param {string} chr - Reference name, looked up in chrToIndex.
   * @param {number} min - Range start; the index is queried from `min - 1`.
   * @param {number} max - Range end.
   * @param {object} [opts] - Forwarded to the header, index and chunk reads;
   *   `viewAsPairs` and `filterBy` are honored here.
   * @returns {Promise<Array>} Matching records; empty if the reference is
   *   unknown or there is no index.
   */
  async getRecordsForRange(chr, min, max, opts) {
    await this.getHeader(opts);
    const chrId = this.chrToIndex?.[chr];
    if (chrId === undefined || !this.index) {
      return [];
    }
    const chunks = await this.index.blocksForRange(chrId, min - 1, max, opts);
    return this._fetchChunkFeaturesDirect(chunks, chrId, min, max, opts);
  }

  // Evict any cached chunks whose block range overlaps [minBlock, maxBlock]
  evictOverlappingChunks(minBlock, maxBlock) {
    for (const [key, entry] of this.chunkFeatureCache) {
      if (minBlock <= entry.maxBlock && maxBlock >= entry.minBlock) {
        this.chunkFeatureCache.delete(key);
      }
    }
  }

  /**
   * Read (or take from cache) the features of each chunk, keep the ones
   * accepted by appendInRange, and optionally append mates.
   *
   * @param {Array} chunks - Chunks as returned by the index.
   * @param {number} chrId - Reference id.
   * @param {number} min - Range start handed to appendInRange.
   * @param {number} max - Range end handed to appendInRange.
   * @param {object} [opts] - `filterBy` is applied before caching;
   *   `viewAsPairs` triggers fetchPairs; the object is also forwarded to the
   *   chunk reads.
   * @returns {Promise<Array>} Collected records.
   */
  async _fetchChunkFeaturesDirect(chunks, chrId, min, max, opts = {}) {
    const { viewAsPairs, filterBy } = opts;
    const result = [];
    for (const chunk of chunks) {
      const cacheKey = chunkCacheKey(chunk, filterBy);
      const minBlock = chunk.minv.blockPosition;
      const maxBlock = chunk.maxv.blockPosition;
      let records;
      const cached = this.chunkFeatureCache.get(cacheKey);
      if (cached) {
        records = cached.features;
      } else {
        // A miss replaces any cached entries covering overlapping blocks.
        this.evictOverlappingChunks(minBlock, maxBlock);
        const allRecords = await this._readChunkFeatures(chunk, opts);
        records = filterBy ? applyFilters(allRecords, filterBy) : allRecords;
        this.chunkFeatureCache.set(cacheKey, {
          minBlock,
          maxBlock,
          features: records,
        });
      }
      appendInRange(records, chrId, min, max, result);
    }
    if (viewAsPairs) {
      const pairs = await this.fetchPairs(chrId, result, opts);
      for (const mate of pairs) {
        result.push(mate);
      }
    }
    return result;
  }

  /**
   * Find mates of the given records that are not already among them.
   *
   * Only reads whose name occurs exactly once in `records` are considered.
   * Unless `pairAcrossChr` is set, the mate must be on `chrId` and closer
   * than `maxInsertSize` to the read's start.
   *
   * @param {number} chrId - Reference id of the current query.
   * @param {Array} records - Records already collected.
   * @param {object} opts - `pairAcrossChr`, `maxInsertSize` (default 200000);
   *   also forwarded to the index and chunk reads.
   * @returns {Promise<Array>} Mate records.
   */
  async fetchPairs(chrId, records, opts) {
    const { pairAcrossChr, maxInsertSize = 200000 } = opts;
    // A Map, not a plain object: read names are arbitrary strings, and names
    // such as "constructor" or "toString" would otherwise pick up inherited
    // Object.prototype members and corrupt the count.
    const readNameCounts = new Map();
    const readIds = new Set();
    for (const r of records) {
      readNameCounts.set(r.name, (readNameCounts.get(r.name) ?? 0) + 1);
      readIds.add(r.fileOffset);
    }
    const matePromises = [];
    for (const f of records) {
      if (
        this.index &&
        readNameCounts.get(f.name) === 1 &&
        (pairAcrossChr ||
          (f.next_refid === chrId &&
            Math.abs(f.start - f.next_pos) < maxInsertSize))
      ) {
        matePromises.push(
          this.index.blocksForRange(f.next_refid, f.next_pos, f.next_pos + 1, opts),
        );
      }
    }
    // De-duplicate chunks by their string form so each is read only once.
    const uniqueChunks = new Map();
    const chunkLists = await Promise.all(matePromises);
    for (const chunks of chunkLists) {
      for (const c of chunks) {
        uniqueChunks.set(c.toString(), c);
      }
    }
    const mateFeatLists = await Promise.all(
      [...uniqueChunks.values()].map(async (c) => {
        const features = await this._readChunkFeatures(c, opts);
        // Keep features that share a name with an unpaired read but are not
        // one of the records we already hold.
        return features.filter(
          feature =>
            readNameCounts.get(feature.name) === 1 &&
            !readIds.has(feature.fileOffset),
        );
      }),
    );
    return mateFeatLists.flat();
  }

  /**
   * Read one chunk from the BAM filehandle, decompress it and parse it into
   * records.
   *
   * @param {object} chunk - Chunk providing `fetchedSize()` and `minv`.
   * @param {object} [opts] - Forwarded to the filehandle read.
   * @returns {Promise<Array>} Parsed records.
   */
  async _readChunkFeatures(chunk, opts) {
    const buf = await this.bam.read(chunk.fetchedSize(), chunk.minv.blockPosition, opts);
    const { buffer: data, cpositions, dpositions } = await unzipChunkSlice(buf, chunk);
    return this.readBamFeatures(data, cpositions, dpositions, chunk);
  }

  /**
   * Split decompressed chunk data into records.
   *
   * Each record is a little-endian int32 length followed by that many bytes.
   * A trailing record that does not fit entirely inside `ba` is dropped.
   *
   * @param {Uint8Array} ba - Decompressed bytes.
   * @param {Array<number>} cpositions - Positions paired with `dpositions`
   *   (as produced by unzipChunkSlice); may be empty.
   * @param {Array<number>} dpositions - Positions compared against offsets
   *   in the decompressed data; may be empty.
   * @param {object} chunk - Chunk whose `minv.dataPosition` offsets the data.
   * @returns {Promise<Array>} One RecordClass instance per complete record.
   */
  async readBamFeatures(ba, cpositions, dpositions, chunk) {
    let blockStart = 0;
    const sink = [];
    let pos = 0;
    // Build the view over exactly the bytes of `ba`, so offsets used with
    // the DataView and with `ba` agree even if `ba` is a sub-view of a
    // larger ArrayBuffer.
    const dataView = new DataView(ba.buffer, ba.byteOffset, ba.byteLength);
    const hasDpositions = dpositions.length > 0;
    const hasCpositions = cpositions.length > 0;
    while (blockStart + 4 < ba.length) {
      const blockSize = dataView.getInt32(blockStart, true);
      // Inclusive index of the record's last byte.
      const blockEnd = blockStart + 4 + blockSize - 1;
      if (hasDpositions) {
        // Advance past every dposition that is <= this record's offset.
        const target = blockStart + chunk.minv.dataPosition;
        while (pos < dpositions.length && target >= dpositions[pos]) {
          pos++;
        }
      }
      if (blockEnd < ba.length) {
        const feature = new this.RecordClass({
          bytes: {
            byteArray: ba,
            start: blockStart,
            end: blockEnd,
          },
          // Identifier derived from block positions when available;
          // otherwise a CRC32 of the record bytes. NOTE(review): subarray's
          // end is exclusive, so the final byte of the record is not part of
          // the checksum — left as-is to keep existing ids stable.
          fileOffset: hasCpositions
            ? cpositions[pos] * (1 << 8) +
              (blockStart - dpositions[pos]) +
              chunk.minv.dataPosition +
              1
            : crc32(ba.subarray(blockStart, blockEnd)) >>> 0,
          dataView,
        });
        sink.push(feature);
      }
      blockStart = blockEnd + 1;
    }
    return sink;
  }

  /**
   * Ask the index whether it has the named reference. Does not parse the
   * header itself: before getHeader has completed (or without an index) the
   * answer is false.
   *
   * @param {string} seqName - Reference name.
   * @returns {Promise<boolean>}
   */
  async hasRefSeq(seqName) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined
      ? false
      : this.index.hasRefSeq(seqId);
  }

  /**
   * Line count reported by the index for the named reference; 0 when the
   * reference is unknown, the header is not parsed yet, or there is no index.
   *
   * @param {string} seqName - Reference name.
   * @returns {Promise<number>}
   */
  async lineCount(seqName) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined ? 0 : this.index.lineCount(seqId);
  }

  /**
   * Delegate to the index's indexCov for the named reference; an empty array
   * when the reference is unknown, the header is not parsed yet, or there is
   * no index.
   *
   * @param {string} seqName - Reference name.
   * @param {number} [start] - Forwarded to the index.
   * @param {number} [end] - Forwarded to the index.
   * @returns {Promise<Array>}
   */
  async indexCov(seqName, start, end) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined
      ? []
      : this.index.indexCov(seqId, start, end);
  }

  /**
   * Delegate to the index's blocksForRange for the named reference; an empty
   * array when the reference is unknown, the header is not parsed yet, or
   * there is no index.
   *
   * @param {string} seqName - Reference name.
   * @param {number} start - Forwarded to the index.
   * @param {number} end - Forwarded to the index.
   * @param {object} [opts] - Forwarded to the index.
   * @returns {Promise<Array>}
   */
  async blocksForRange(seqName, start, end, opts) {
    const seqId = this.chrToIndex?.[seqName];
    return !this.index || seqId === undefined
      ? []
      : this.index.blocksForRange(seqId, start, end, opts);
  }

  /** Drop every cached chunk. */
  clearFeatureCache() {
    this.chunkFeatureCache.clear();
  }

  /**
   * Ask the index for a byte estimate covering the given regions. Regions
   * whose `refName` is not in the header are skipped.
   *
   * @param {Array<{refName: string, start: number, end: number}>} regions
   * @param {object} [opts] - Forwarded to getHeader and the index.
   * @returns {Promise<number>} The index's estimate, or 0 without an index.
   * @throws {Error} 'Header not yet parsed' if chrToIndex is still unset
   *   after the header has been requested.
   */
  async estimatedBytesForRegions(regions, opts) {
    if (!this.index) {
      return 0;
    }
    await this.getHeader(opts);
    const chrToIndex = this.chrToIndex;
    if (!chrToIndex) {
      throw new Error('Header not yet parsed');
    }
    const mapped = regions.flatMap(r => {
      const refId = chrToIndex[r.refName];
      if (refId === undefined) {
        return [];
      }
      return [{ refId, start: r.start, end: r.end }];
    });
    return this.index.estimatedBytesForRegions(mapped, opts);
  }
}
//# sourceMappingURL=bamFile.js.map