// @gmod/bam
// Version:
// Parser for BAM and BAM index (bai) files
// 359 lines • 14.2 kB
// JavaScript
import AbortablePromiseCache from '@gmod/abortable-promise-cache';
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle';
import crc32 from 'crc/calculators/crc32';
import { LocalFile, RemoteFile } from 'generic-filehandle2';
import QuickLRU from 'quick-lru';
import BAI from './bai';
import CSI from './csi';
import NullFilehandle from './nullFilehandle';
import BAMFeature from './record';
import { parseHeaderText } from './sam';
import { checkAbortSignal, gen2array, makeOpts, timeout, } from './util';
// Value getHeaderPre expects in the first little-endian int32 of the
// decompressed file; 0x014d4142 is the bytes 'B' 'A' 'M' 0x01 read little-endian.
export const BAM_MAGIC = 0x014d4142;
// 64 KiB (2 ** 16). Added on top of the first data block's position when
// sizing the header read (presumably one maximum-size BGZF block — confirm).
const blockLen = 65536;
/**
 * Reader for a BAM alignment file together with its BAI or CSI index.
 *
 * Reference names are resolved through `chrToIndex`, which is only populated
 * by `getHeaderPre` (normally reached through `getHeader`). The name-based
 * helpers `hasRefSeq`, `lineCount`, `indexCov` and `blocksForRange` do not load
 * the header themselves, so they report "not found" until it has been loaded.
 */
export default class BamFile {
  /** Callback applied to every reference sequence name read from the header. */
  renameRefSeq;
  /** Filehandle the BAM bytes are read from. */
  bam;
  /** Raw header text; set by getHeaderPre. */
  header;
  /** Object mapping (renamed) reference name -> numeric reference id; set by getHeaderPre. */
  chrToIndex;
  /** Array of `{ refName, length }` indexed by reference id; set by getHeaderPre. */
  indexToChr;
  /** Milliseconds of decoding after which readBamFeatures awaits `timeout(1)`; falsy disables it. */
  yieldThreadTime;
  /** BAI or CSI index instance; left undefined in htsget mode. */
  index;
  htsget = false;
  /** Memoized promise created by getHeader. */
  headerP;
  /**
   * Cache of decoded records per chunk (at most 50 entries). Entries are
   * requested in _fetchChunkFeatures under the key `chunk.toString()`.
   */
  featureCache = new AbortablePromiseCache({
    cache: new QuickLRU({
      maxSize: 50,
    }),
    fill: async (args, signal) => {
      const { chunk, opts } = args;
      const { data, cpositions, dpositions } = await this._readChunk({
        chunk,
        opts: { ...opts, signal },
      });
      return this.readBamFeatures(data, cpositions, dpositions, chunk);
    },
  });

  /**
   * @param {object} args
   * @param {object} [args.bamFilehandle] ready-made filehandle for the BAM
   * @param {string} [args.bamPath] local path of the BAM
   * @param {string} [args.bamUrl] URL of the BAM
   * @param {string} [args.baiPath] local path of a BAI index
   * @param {object} [args.baiFilehandle] ready-made filehandle for a BAI index
   * @param {string} [args.baiUrl] URL of a BAI index
   * @param {string} [args.csiPath] local path of a CSI index
   * @param {object} [args.csiFilehandle] ready-made filehandle for a CSI index
   * @param {string} [args.csiUrl] URL of a CSI index
   * @param {boolean} [args.htsget] run without a local BAM and/or index
   * @param {number} [args.yieldThreadTime=100] see the `yieldThreadTime` field
   * @param {(name: string) => string} [args.renameRefSeqs] reference renamer,
   *   identity by default
   * @throws {Error} when no BAM source is given, or no index can be chosen
   */
  constructor({
    bamFilehandle,
    bamPath,
    bamUrl,
    baiPath,
    baiFilehandle,
    baiUrl,
    csiPath,
    csiFilehandle,
    csiUrl,
    htsget,
    yieldThreadTime = 100,
    renameRefSeqs = n => n,
  }) {
    this.renameRefSeq = renameRefSeqs;

    // BAM source precedence: filehandle, then path, then URL, then htsget.
    if (bamFilehandle) {
      this.bam = bamFilehandle;
    } else if (bamPath) {
      this.bam = new LocalFile(bamPath);
    } else if (bamUrl) {
      this.bam = new RemoteFile(bamUrl);
    } else if (htsget) {
      this.htsget = true;
      this.bam = new NullFilehandle();
    } else {
      throw new Error('unable to initialize bam');
    }

    // Index precedence: any explicit CSI, then any explicit BAI, then a
    // `.bai` sibling derived from the BAM path/URL, then htsget (no index).
    if (csiFilehandle) {
      this.index = new CSI({ filehandle: csiFilehandle });
    } else if (csiPath) {
      this.index = new CSI({ filehandle: new LocalFile(csiPath) });
    } else if (csiUrl) {
      this.index = new CSI({ filehandle: new RemoteFile(csiUrl) });
    } else if (baiFilehandle) {
      this.index = new BAI({ filehandle: baiFilehandle });
    } else if (baiPath) {
      this.index = new BAI({ filehandle: new LocalFile(baiPath) });
    } else if (baiUrl) {
      this.index = new BAI({ filehandle: new RemoteFile(baiUrl) });
    } else if (bamPath) {
      this.index = new BAI({ filehandle: new LocalFile(`${bamPath}.bai`) });
    } else if (bamUrl) {
      this.index = new BAI({ filehandle: new RemoteFile(`${bamUrl}.bai`) });
    } else if (htsget) {
      this.htsget = true;
    } else {
      throw new Error('unable to infer index format');
    }

    this.yieldThreadTime = yieldThreadTime;
  }

  /**
   * Reads and decodes the BAM header, populating `header`, `chrToIndex` and
   * `indexToChr`.
   *
   * @param {object} [origOpts] options (normalised through `makeOpts`) that are
   *   forwarded to the index and to every filehandle read
   * @returns {Promise<*>} the result of `parseHeaderText(this.header)`, or
   *   `undefined` when there is no index
   * @throws {Error} 'Not a BAM file' when the magic number does not match
   */
  async getHeaderPre(origOpts) {
    const opts = makeOpts(origOpts);
    if (!this.index) {
      return;
    }
    const indexData = await this.index.parse(opts);
    const ret = indexData.firstDataLine
      ? indexData.firstDataLine.blockPosition + 65535
      : undefined;
    let buffer;
    if (ret) {
      // The index tells us where alignment data starts, so only the leading
      // part of the file is needed. `opts` is forwarded so that the read
      // honours the caller's abort signal like every other read here.
      const s = ret + blockLen;
      buffer = await this.bam.read(s, 0, opts);
    } else {
      buffer = await this.bam.readFile(opts);
    }
    const uncba = await unzip(buffer);
    if (uncba.length < 8) {
      // too short to even hold the magic number and the header length
      throw new Error('Not a BAM file');
    }
    // Respect byteOffset/byteLength: the decompressed bytes may be a view
    // into a larger ArrayBuffer.
    const dataView = new DataView(
      uncba.buffer,
      uncba.byteOffset,
      uncba.byteLength,
    );
    if (dataView.getInt32(0, true) !== BAM_MAGIC) {
      throw new Error('Not a BAM file');
    }
    const headLen = dataView.getInt32(4, true);
    const decoder = new TextDecoder('utf8');
    this.header = decoder.decode(uncba.subarray(8, 8 + headLen));
    const { chrToIndex, indexToChr } = await this._readRefSeqs(
      headLen + 8,
      65535,
      opts,
    );
    this.chrToIndex = chrToIndex;
    this.indexToChr = indexToChr;
    return parseHeaderText(this.header);
  }

  /**
   * Memoized wrapper around getHeaderPre. A failed attempt clears the memo so
   * that a later call retries.
   *
   * @param {object} [opts] forwarded to getHeaderPre on the first call only
   * @returns {Promise<*>} see getHeaderPre
   */
  getHeader(opts) {
    if (!this.headerP) {
      this.headerP = this.getHeaderPre(opts).catch((e) => {
        this.headerP = undefined;
        throw e;
      });
    }
    return this.headerP;
  }

  /**
   * @param {object} [opts] forwarded to getHeader
   * @returns {Promise<string|undefined>} the raw header text
   */
  async getHeaderText(opts = {}) {
    await this.getHeader(opts);
    return this.header;
  }

  /**
   * Decodes the reference sequence list that follows the header text.
   *
   * The full length of the refseq block is not given in advance, so this
   * fetches `refSeqBytes` bytes from the start of the file and doubles the
   * size whenever the list does not fit in what was decompressed.
   *
   * @param {number} start offset of the reference count in the decompressed data
   * @param {number} refSeqBytes number of (compressed) bytes to fetch
   * @param {object} [opts] forwarded to the filehandle read
   * @returns {Promise<{chrToIndex: object, indexToChr: Array<{refName: string, length: number}>}>}
   * @throws {Error} when the file ends before the reference list is complete
   */
  async _readRefSeqs(start, refSeqBytes, opts) {
    if (start > refSeqBytes) {
      return this._readRefSeqs(start, refSeqBytes * 2, opts);
    }
    const buffer = await this.bam.read(refSeqBytes, 0, opts);
    const uncba = await unzip(buffer);
    const dataView = new DataView(
      uncba.buffer,
      uncba.byteOffset,
      uncba.byteLength,
    );

    // Retry with twice as many bytes. If the filehandle already returned
    // fewer bytes than requested there is nothing more to fetch, so asking
    // for more can never succeed.
    const refetch = () => {
      if (buffer.length < refSeqBytes) {
        throw new Error(
          'BAM header is truncated: file ended before all reference sequences were read',
        );
      }
      const nextSize = refSeqBytes * 2;
      console.warn(`BAM header is very big. Re-fetching ${nextSize} bytes.`);
      return this._readRefSeqs(start, nextSize, opts);
    };

    // Every read below is bounds-checked first: reading past the end of a
    // DataView throws a RangeError, which would bypass the re-fetch.
    if (start + 4 > uncba.length) {
      return refetch();
    }
    const nRef = dataView.getInt32(start, true);
    let p = start + 4;
    const chrToIndex = {};
    const indexToChr = [];
    const decoder = new TextDecoder('utf8');
    for (let i = 0; i < nRef; i += 1) {
      if (p + 4 > uncba.length) {
        return refetch();
      }
      // record layout: int32 lName, lName name bytes (the last one is
      // dropped before decoding), int32 sequence length
      const lName = dataView.getInt32(p, true);
      const recordEnd = p + 8 + lName;
      if (recordEnd > uncba.length) {
        return refetch();
      }
      const refName = this.renameRefSeq(
        decoder.decode(uncba.subarray(p + 4, p + 4 + lName - 1)),
      );
      const lRef = dataView.getInt32(p + lName + 4, true);
      chrToIndex[refName] = i;
      indexToChr.push({ refName, length: lRef });
      p = recordEnd;
    }
    return { chrToIndex, indexToChr };
  }

  /**
   * Collects everything streamRecordsForRange yields (via `gen2array`).
   *
   * @param {string} chr reference name
   * @param {number} min range start
   * @param {number} max range end
   * @param {object} [opts]
   * @returns {Promise<Array>}
   */
  async getRecordsForRange(chr, min, max, opts) {
    return gen2array(this.streamRecordsForRange(chr, min, max, opts));
  }

  /**
   * Async generator yielding arrays of records overlapping a range. Yields a
   * single empty array when the reference is unknown or there is no index.
   *
   * @param {string} chr reference name (looked up in `chrToIndex`)
   * @param {number} min range start
   * @param {number} max range end
   * @param {object} [opts]
   */
  async *streamRecordsForRange(chr, min, max, opts) {
    await this.getHeader(opts);
    const chrId = this.chrToIndex?.[chr];
    if (chrId === undefined || !this.index) {
      yield [];
    } else {
      const chunks = await this.index.blocksForRange(chrId, min - 1, max, opts);
      yield* this._fetchChunkFeatures(chunks, chrId, min, max, opts);
    }
  }

  /**
   * Yields, chunk by chunk, the cached records that lie on `chrId` and pass
   * the range filter. With `opts.viewAsPairs` a final array of mate records
   * (see fetchPairs) is yielded as well.
   *
   * @param {Iterable} chunks index chunks; each is cached under `chunk.toString()`
   * @param {number} chrId numeric reference id
   * @param {number} min range start
   * @param {number} max range end
   * @param {object} [opts]
   */
  async *_fetchChunkFeatures(chunks, chrId, min, max, opts = {}) {
    const { viewAsPairs } = opts;
    const feats = [];
    let done = false;
    for (const chunk of chunks) {
      const records = await this.featureCache.get(
        chunk.toString(),
        { chunk, opts },
        opts.signal,
      );
      const recs = [];
      for (const feature of records) {
        if (feature.ref_id === chrId) {
          if (feature.start >= max) {
            // past end of range, can stop iterating (this relies on records
            // being ordered by start — presumably coordinate-sorted input)
            done = true;
            break;
          } else if (feature.end >= min) {
            // must be in range
            recs.push(feature);
          }
        }
      }
      feats.push(recs);
      yield recs;
      if (done) {
        break;
      }
    }
    checkAbortSignal(opts.signal);
    if (viewAsPairs) {
      yield this.fetchPairs(chrId, feats, opts);
    }
  }

  /**
   * Finds mates for reads whose name occurs exactly once within one of the
   * given record arrays.
   *
   * @param {number} chrId numeric reference id of the query
   * @param {Array<Array>} feats record arrays, one per chunk
   * @param {object} opts
   * @param {boolean} [opts.pairAcrossChr] also follow mates on other references
   * @param {number} [opts.maxInsertSize=200000] largest start-to-mate distance
   *   followed when `pairAcrossChr` is not set
   * @returns {Promise<Array>} mate records that were not already in `feats`
   */
  async fetchPairs(chrId, feats, opts) {
    const { pairAcrossChr, maxInsertSize = 200000 } = opts;
    // Map/Set rather than plain objects: read names are arbitrary strings and
    // must not collide with Object.prototype members such as "toString".
    const unmatedPairs = new Set();
    const readIds = new Set();
    for (const ret of feats) {
      const readNames = new Map();
      for (const element of ret) {
        const name = element.name;
        readNames.set(name, (readNames.get(name) ?? 0) + 1);
        readIds.add(element.id);
      }
      for (const [name, count] of readNames) {
        if (count === 1) {
          unmatedPairs.add(name);
        }
      }
    }

    const matePromises = [];
    for (const ret of feats) {
      for (const f of ret) {
        const name = f.name;
        const start = f.start;
        const pnext = f.next_pos;
        const rnext = f.next_refid;
        if (
          this.index &&
          unmatedPairs.has(name) &&
          (pairAcrossChr ||
            (rnext === chrId && Math.abs(start - pnext) < maxInsertSize))
        ) {
          matePromises.push(
            this.index.blocksForRange(rnext, pnext, pnext + 1, opts),
          );
        }
      }
    }

    // filter out duplicate chunks (the blocks are lists of chunks, blocks are
    // concatenated, then filter dup chunks)
    const map = new Map();
    const res = await Promise.all(matePromises);
    for (const m of res.flat()) {
      const key = m.toString();
      if (!map.has(key)) {
        map.set(key, m);
      }
    }

    const mateFeatPromises = await Promise.all(
      [...map.values()].map(async (c) => {
        const { data, cpositions, dpositions, chunk } = await this._readChunk({
          chunk: c,
          opts,
        });
        const mateRecs = [];
        const candidates = await this.readBamFeatures(
          data,
          cpositions,
          dpositions,
          chunk,
        );
        for (const feature of candidates) {
          // keep only mates of unmated reads that we do not already have
          if (unmatedPairs.has(feature.name) && !readIds.has(feature.id)) {
            mateRecs.push(feature);
          }
        }
        return mateRecs;
      }),
    );
    return mateFeatPromises.flat();
  }

  /**
   * Reads one index chunk from the BAM filehandle and passes it through
   * `unzipChunkSlice`.
   *
   * @param {{chunk: object, opts: object}} args
   * @returns {Promise<{data: Uint8Array, cpositions: number[], dpositions: number[], chunk: object}>}
   */
  async _readChunk({ chunk, opts }) {
    const buf = await this.bam.read(
      chunk.fetchedSize(),
      chunk.minv.blockPosition,
      opts,
    );
    const {
      buffer: data,
      cpositions,
      dpositions,
    } = await unzipChunkSlice(buf, chunk);
    return { data, cpositions, dpositions, chunk };
  }

  /**
   * Splits decompressed chunk data into records and wraps each complete one in
   * a BAMFeature.
   *
   * @param {Uint8Array} ba decompressed bytes: int32 block_size followed by
   *   that many bytes, repeated
   * @param {number[]} cpositions positions from `unzipChunkSlice` (per the
   *   original notes, file offsets of bgzip block boundaries)
   * @param {number[]} dpositions positions from `unzipChunkSlice` that are
   *   compared against decompressed offsets
   * @param {object} chunk the index chunk; `chunk.minv.dataPosition` is read
   * @returns {Promise<Array>} the decoded records in file order
   * @throws {Error} on a negative block size
   */
  async readBamFeatures(ba, cpositions, dpositions, chunk) {
    let blockStart = 0;
    const sink = [];
    let pos = 0;
    let last = +Date.now();
    // Respect byteOffset/byteLength in case `ba` is a view into a larger buffer.
    const dataView = new DataView(ba.buffer, ba.byteOffset, ba.byteLength);
    while (blockStart + 4 < ba.length) {
      const blockSize = dataView.getInt32(blockStart, true);
      if (blockSize < 0) {
        // A negative size would move blockStart backwards (or not at all)
        // and the loop would never terminate.
        throw new Error(
          `invalid BAM record: negative block size ${blockSize} at offset ${blockStart}`,
        );
      }
      const blockEnd = blockStart + 4 + blockSize - 1;
      // increment position to the current decompressed status
      // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
      if (dpositions) {
        while (blockStart + chunk.minv.dataPosition >= dpositions[pos++]) { }
        pos--;
      }
      // only try to read the feature if we have all the bytes for it
      if (blockEnd < ba.length) {
        const feature = new BAMFeature({
          bytes: {
            byteArray: ba,
            start: blockStart,
            end: blockEnd,
          },
          // File-offset based id when block positions are available,
          // otherwise a crc32 of the record bytes.
          //
          // cpositions[pos] is scaled by (1 << 8) so that each bgzip block
          // gets its own address space; the offset of the record relative to
          // dpositions[pos] is then added inside that space.
          // chunk.minv.dataPosition is added because blockStart counts from
          // 0 rather than from chunk.minv.dataPosition, and the +1 merely
          // avoids an id of 0.
          fileOffset:
            cpositions.length > 0
              ? cpositions[pos] * (1 << 8) +
                (blockStart - dpositions[pos]) +
                chunk.minv.dataPosition +
                1
              : // `>>> 0` is equivalent to crc32(b).unsigned but uses the
                // internal calculator of crc32 to avoid accidentally importing buffer
                // https://github.com/alexgorbatchev/crc/blob/31fc3853e417b5fb5ec83335428805842575f699/src/define_crc.ts#L5
                crc32(ba.subarray(blockStart, blockEnd)) >>> 0,
        });
        sink.push(feature);
        if (this.yieldThreadTime && +Date.now() - last > this.yieldThreadTime) {
          await timeout(1);
          last = +Date.now();
        }
      }
      blockStart = blockEnd + 1;
    }
    return sink;
  }

  /**
   * @param {string} seqName reference name
   * @returns {Promise<boolean>} false when the name is unknown (or the header
   *   has not been loaded) or there is no index; otherwise whatever the index
   *   reports
   */
  async hasRefSeq(seqName) {
    const seqId = this.chrToIndex?.[seqName];
    if (seqId === undefined) {
      return false;
    }
    // without an index report `false` rather than leaking `undefined`
    return this.index?.hasRefSeq(seqId) ?? false;
  }

  /**
   * @param {string} seqName reference name
   * @returns {Promise<number>} the index's line count, or 0 when the name is
   *   unknown (or the header has not been loaded) or there is no index
   */
  async lineCount(seqName) {
    const seqId = this.chrToIndex?.[seqName];
    return seqId === undefined || !this.index ? 0 : this.index.lineCount(seqId);
  }

  /**
   * @param {string} seqName reference name
   * @param {number} [start]
   * @param {number} [end]
   * @returns {Promise<Array>} the index's coverage estimate, or [] when the
   *   name is unknown (or the header has not been loaded) or there is no index
   */
  async indexCov(seqName, start, end) {
    if (!this.index) {
      return [];
    }
    await this.index.parse();
    const seqId = this.chrToIndex?.[seqName];
    return seqId === undefined ? [] : this.index.indexCov(seqId, start, end);
  }

  /**
   * @param {string} seqName reference name
   * @param {number} start
   * @param {number} end
   * @param {object} [opts] forwarded to the index
   * @returns {Promise<Array>} index chunks for the range, or [] when the name
   *   is unknown (or the header has not been loaded) or there is no index
   */
  async blocksForRange(seqName, start, end, opts) {
    if (!this.index) {
      return [];
    }
    await this.index.parse();
    const seqId = this.chrToIndex?.[seqName];
    return seqId === undefined
      ? []
      : this.index.blocksForRange(seqId, start, end, opts);
  }
}
//# sourceMappingURL=bamFile.js.map