// @gmod/bam
// Version:
// Parser for BAM and BAM index (bai) files
// 179 lines • 6.92 kB
// JavaScript
import Chunk from "./chunk.js";
import IndexFile, { memoizeByRefId } from "./indexFile.js";
import { findFirstData, parsePseudoBin } from "./util.js";
import { fromBytes } from "./virtualOffset.js";
// Magic number expected in the first four bytes of a BAI file: the bytes
// "BAI\1" interpreted as a little-endian unsigned 32-bit integer.
const BAI_MAGIC = 0x01494142;
// The BAI format has a fixed 5-level binning scheme, and its linear index has
// one entry per 2^14 bp (16 KiB) window. See SAMv1.pdf §5.1.3 (hts-specs):
// https://github.com/samtools/hts-specs/blob/master/SAMv1.pdf
const BAI_LINEAR_SHIFT = 14;
// Width of one linear index window (16384).
const BAI_LINEAR_INTERVAL = 2 ** BAI_LINEAR_SHIFT;
/**
 * Snap `n` to the nearest multiple of `multiple` towards zero by discarding
 * the remainder of `n / multiple`. Values that are already a multiple are
 * returned unchanged.
 *
 * @param {number} n - value to round
 * @param {number} multiple - step size
 * @returns {number} `n` with its remainder removed
 */
function roundDown(n, multiple) {
  const remainder = n % multiple;
  return n - remainder;
}
/**
 * Advance `n` to the next multiple of `multiple`.
 *
 * Note that a value which is already an exact multiple is NOT returned as-is:
 * it moves on to the following multiple (e.g. `roundUp(16384, 16384)` is
 * `32768`), because one full step is always added after the remainder is
 * removed.
 *
 * @param {number} n - value to round
 * @param {number} multiple - step size
 * @returns {number} `n` minus its remainder, plus one `multiple`
 */
function roundUp(n, multiple) {
  const floored = n - (n % multiple);
  return floored + multiple;
}
// List, for every level of the binning scheme, the inclusive [firstBin,
// lastBin] range of bins overlapping the half-open region [beg, end). The
// first bin number of level L is (8^L - 1) / 7, i.e. 0, 1, 9, 73, 585, 4681.
// See SAMv1.pdf §5.1.1 for the binning derivation.
function reg2bins(beg, end) {
  // Work with the last covered position, since `end` itself is exclusive.
  const last = end - 1;
  // [first bin number of the level, right-shift that maps a position to its
  // bin within that level]; level 0 is the single all-covering bin 0.
  const levels = [
    [1, 26],
    [9, 23],
    [73, 20],
    [585, 17],
    [4681, BAI_LINEAR_SHIFT],
  ];
  const ranges = [[0, 0]];
  for (const [firstBin, shift] of levels) {
    ranges.push([firstBin + (beg >> shift), firstBin + (last >> shift)]);
  }
  return ranges;
}
/**
 * Reader for BAI (BAM index) files.
 *
 * The file is read in one go; the binning and linear indexes of each
 * reference sequence are decoded on demand through the `indices(refId)`
 * accessor returned by `_parse`.
 */
export default class BAI extends IndexFile {
  /**
   * Read and validate the index file, record where each reference's index
   * section starts, and determine the first data position.
   *
   * @param {object} [opts] - forwarded to `filehandle.readFile`
   * @returns {Promise<object>} `{ bai, firstDataLine, maxBlockSize, indices,
   *   refCount }`, where `indices(refId)` yields `{ binIndex, linearIndex,
   *   stats }` or `undefined` for an unknown reference
   * @throws {Error} if the magic number is wrong or a bin number exceeds what
   *   the BAI binning scheme allows
   */
  async _parse(opts) {
    const bytes = await this.filehandle.readFile(opts);
    // `bytes` may be a view into a larger ArrayBuffer (for example a pooled
    // Node.js Buffer). The DataView must cover exactly the same window so
    // that its offsets agree with the `bytes`-relative offsets handed to
    // fromBytes/parsePseudoBin below.
    const dataView = new DataView(
      bytes.buffer,
      bytes.byteOffset,
      bytes.byteLength,
    );
    // check BAI magic numbers
    if (dataView.getUint32(0, true) !== BAI_MAGIC) {
      throw new Error('Not a BAI file');
    }
    const refCount = dataView.getInt32(4, true);
    const depth = 5;
    // Number of the last regular bin: ((8^(depth+1)) - 1) / 7 - 1 would be the
    // 0-based maximum; `binLimit + 1` is the number used by the pseudo-bin
    // that carries per-reference statistics.
    const binLimit = ((1 << ((depth + 1) * 3)) - 1) / 7;
    // First pass: skip over every reference's section to record its start
    // offset and to scan the linear indexes. Chunks are not materialised here.
    let curr = 8;
    let firstDataLine;
    const offsets = [];
    for (let i = 0; i < refCount; i++) {
      offsets.push(curr);
      const binCount = dataView.getInt32(curr, true);
      curr += 4;
      for (let j = 0; j < binCount; j += 1) {
        const bin = dataView.getUint32(curr, true);
        curr += 4;
        if (bin === binLimit + 1) {
          // pseudo-bin: 4-byte chunk count followed by 32 bytes of payload
          curr += 4;
          curr += 32;
        }
        else if (bin > binLimit + 1) {
          throw new Error('bai index contains too many bins, please use CSI');
        }
        else {
          const chunkCount = dataView.getInt32(curr, true);
          curr += 4;
          // each chunk is a pair of 8-byte virtual offsets
          for (let k = 0; k < chunkCount; k++) {
            curr += 8;
            curr += 8;
          }
        }
      }
      // walk the linear index to find the smallest virtual offset, which
      // marks where the BAM header ends and data begins
      const linearCount = dataView.getInt32(curr, true);
      curr += 4;
      for (let j = 0; j < linearCount; j++) {
        firstDataLine = findFirstData(firstDataLine, fromBytes(bytes, curr));
        curr += 8;
      }
    }
    /**
     * Decode the index section of one reference sequence.
     *
     * @param {number} refId - index of the reference sequence
     * @returns {object|undefined} `{ binIndex, linearIndex, stats }`, or
     *   `undefined` when no section was recorded for `refId`
     */
    function getIndices(refId) {
      let curr = offsets[refId];
      if (curr === undefined) {
        return undefined;
      }
      const binCount = dataView.getInt32(curr, true);
      let stats;
      curr += 4;
      // bin number -> array of Chunk
      const binIndex = {};
      for (let j = 0; j < binCount; j += 1) {
        const bin = dataView.getUint32(curr, true);
        curr += 4;
        if (bin === binLimit + 1) {
          // pseudo-bin: skip the chunk count, then read the statistics from
          // the second 16 bytes of its 32-byte payload
          curr += 4;
          stats = parsePseudoBin(bytes, curr + 16);
          curr += 32;
        }
        else if (bin > binLimit + 1) {
          throw new Error('bai index contains too many bins, please use CSI');
        }
        else {
          const chunkCount = dataView.getInt32(curr, true);
          curr += 4;
          const chunks = new Array(chunkCount);
          for (let k = 0; k < chunkCount; k++) {
            const u = fromBytes(bytes, curr);
            curr += 8;
            const v = fromBytes(bytes, curr);
            curr += 8;
            chunks[k] = new Chunk(u, v, bin);
          }
          binIndex[bin] = chunks;
        }
      }
      const linearCount = dataView.getInt32(curr, true);
      curr += 4;
      const linearIndex = new Array(linearCount);
      for (let j = 0; j < linearCount; j++) {
        linearIndex[j] = fromBytes(bytes, curr);
        curr += 8;
      }
      return {
        binIndex,
        linearIndex,
        stats,
      };
    }
    return {
      bai: true,
      firstDataLine,
      maxBlockSize: 1 << 16,
      // NOTE(review): memoizeByRefId presumably caches the decoded result per
      // refId — confirm in indexFile.js.
      indices: memoizeByRefId(getIndices),
      refCount,
    };
  }
  /**
   * Estimate coverage per linear-index window (16384 bp) from the index
   * alone.
   *
   * For each window the raw score is the difference in `blockPosition`
   * between consecutive linear index entries; it is then scaled by
   * `stats.lineCount / totalSize`, where `totalSize` is the `blockPosition`
   * of the last linear index entry (`lineCount` counts as 0 when the
   * reference has no stats).
   *
   * @param {number} seqId - reference sequence index
   * @param {number} [start] - region start; rounded down to a window boundary
   * @param {number} [end] - region end; advanced to the next window boundary
   * @param {object} [opts] - forwarded to `parse`
   * @returns {Promise<Array<{score: number, start: number, end: number}>>}
   *   one entry per window; empty when the reference is unknown or has no
   *   linear index
   * @throws {Error} if the rounded end lies beyond the linear index
   */
  async indexCov(seqId, start, end, opts) {
    const v = BAI_LINEAR_INTERVAL;
    const range = start !== undefined;
    const indexData = await this.parse(opts);
    const seqIdx = indexData.indices(seqId);
    if (!seqIdx) {
      return [];
    }
    const { linearIndex, stats } = seqIdx;
    if (linearIndex.length === 0) {
      return [];
    }
    // Window-aligned bounds; without an explicit end, cover the whole index.
    const e = end === undefined ? (linearIndex.length - 1) * v : roundUp(end, v);
    const s = start === undefined ? 0 : roundDown(start, v);
    const depths = range
      ? new Array((e - s) / v)
      : new Array(linearIndex.length - 1);
    const totalSize = linearIndex[linearIndex.length - 1].blockPosition;
    if (e > (linearIndex.length - 1) * v) {
      throw new Error('query outside of range of linear index');
    }
    let currentPos = linearIndex[s / v].blockPosition;
    for (let i = s / v, j = 0; i < e / v; i++, j++) {
      depths[j] = {
        score: linearIndex[i + 1].blockPosition - currentPos,
        start: i * v,
        end: i * v + v,
      };
      currentPos = linearIndex[i + 1].blockPosition;
    }
    return depths.map(d => ({
      ...d,
      score: (d.score * (stats?.lineCount ?? 0)) / totalSize,
    }));
  }
  /**
   * Bin ranges overlapping [min, max); delegates to the module-level
   * `reg2bins`.
   *
   * @param {number} min - region start
   * @param {number} max - region end (exclusive)
   * @returns {Array<[number, number]>} one [firstBin, lastBin] pair per level
   */
  reg2bins(min, max) {
    return reg2bins(min, max);
  }
  // Use the linear index to find minimum file position of chunks that could
  // contain alignments in the region. Linear index entries are monotonically
  // non-decreasing, so the first entry at minLin is the minimum.
  // The window number is clamped to the last linear index entry; an empty
  // linear index yields undefined.
  getLowestChunk(refIndex, min) {
    const { linearIndex } = refIndex;
    const nintv = linearIndex.length;
    return linearIndex[Math.min(min >> BAI_LINEAR_SHIFT, nintv - 1)];
  }
}
//# sourceMappingURL=bai.js.map