@gmod/bam
Version:
Parser for BAM and BAM index (bai) files
677 lines • 26 kB
JavaScript
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const cigar_ts_1 = require("./cigar.js");
const constants_ts_1 = __importDefault(require("./constants.js"));
const SEQRET_DECODER = '=ACMGRSVTWYHKDBN'.split('');
// precomputed pair orientation strings indexed by ((flags >> 4) & 0xF) | (isize > 0 ? 16 : 0)
// bits 0-3 encode flag bits 0x10(reverse),0x20(mate reverse),0x40(read1),0x80(read2)
// bit 4 encodes whether isize > 0
// prettier-ignore
const PAIR_ORIENTATION_TABLE = [
'F F ', 'F R ', 'R F ', 'R R ', 'F2F1', 'F2R1', 'R2F1', 'R2R1',
'F1F2', 'F1R2', 'R1F2', 'R1R2', 'F2F1', 'F2R1', 'R2F1', 'R2R1',
'F F ', 'R F ', 'F R ', 'R R ', 'F1F2', 'R1F2', 'F1R2', 'R1R2',
'F2F1', 'R2F1', 'F2R1', 'R2R1', 'F1F2', 'R1F2', 'F1R2', 'R1R2',
];
const ASCII_CIGAR_CODES = [
77, 73, 68, 78, 83, 72, 80, 61, 88, 63, 63, 63, 63, 63, 63, 63,
];
const textDecoder = new TextDecoder();
// Bitmask for ops that consume ref: M=0, D=2, N=3, P=6, ==7, X=8
// Binary: 0b111001101 = 0x1CD
const CIGAR_CONSUMES_REF_MASK = 0x1cd;
class BamRecord {
fileOffset;
_byteArray;
_start;
_end;
_dataView;
_cachedEnd;
_cachedTags;
_cachedLengthOnRef;
_cachedNumericCigar;
_cachedNUMERIC_MD;
_cachedSeqStart;
constructor(args) {
this._byteArray = args.bytes.byteArray;
this._start = args.bytes.start;
this._end = args.bytes.end;
this.fileOffset = args.fileOffset;
this._dataView = args.dataView;
}
get byteArray() {
return this._byteArray;
}
get flags() {
return (this._dataView.getInt32(this._start + 16, true) & 0xffff0000) >> 16;
}
get ref_id() {
return this._dataView.getInt32(this._start + 4, true);
}
get start() {
return this._dataView.getInt32(this._start + 8, true);
}
get end() {
if (this._cachedEnd === undefined) {
this._cachedEnd = this.start + this.length_on_ref;
}
return this._cachedEnd;
}
get mq() {
const mq = (this.bin_mq_nl & 0xff00) >> 8;
return mq === 255 ? undefined : mq;
}
get score() {
return this.mq;
}
get qual() {
if (this.isSegmentUnmapped()) {
return null;
}
else {
const seqLen = this.seq_length;
const p = this.seqStart + ((seqLen + 1) >> 1);
return this._byteArray.subarray(p, p + seqLen);
}
}
get strand() {
return this.isReverseComplemented() ? -1 : 1;
}
get b0() {
return this._start + 36;
}
// start of the SEQ section (and end of CIGAR). All downstream sections
// (qual, tags) are offsets from here, so cache once and reuse.
get seqStart() {
if (this._cachedSeqStart === undefined) {
this._cachedSeqStart =
this.b0 + this.read_name_length + this.num_cigar_bytes;
}
return this._cachedSeqStart;
}
get tagsStart() {
const seqLen = this.seq_length;
return this.seqStart + ((seqLen + 1) >> 1) + seqLen;
}
// batch fromCharCode: fastest for typical name lengths (see benchmarks/string-building.bench.ts)
get name() {
const len = this.read_name_length - 1;
const start = this.b0;
const ba = this._byteArray;
const codes = new Array(len);
for (let i = 0; i < len; i++) {
codes[i] = ba[start + i];
}
return String.fromCharCode(...codes);
}
get NUMERIC_MD() {
if (this._cachedNUMERIC_MD === undefined) {
const result = this.getTagRaw('MD');
this._cachedNUMERIC_MD = result instanceof Uint8Array ? result : null;
}
return this._cachedNUMERIC_MD === null ? undefined : this._cachedNUMERIC_MD;
}
get tags() {
if (this._cachedTags === undefined) {
this._cachedTags = this._computeTags();
}
return this._cachedTags;
}
getTag(tagName) {
if (this._cachedTags !== undefined) {
return this._cachedTags[tagName];
}
return this._findTag(tagName, false);
}
getTagRaw(tagName) {
return this._findTag(tagName, true);
}
_findTag(tagName, raw) {
const tag1 = tagName.charCodeAt(0);
const tag2 = tagName.charCodeAt(1);
let p = this.tagsStart;
const blockEnd = this._end;
const ba = this._byteArray;
while (p < blockEnd) {
const currentTag1 = ba[p];
const currentTag2 = ba[p + 1];
const type = ba[p + 2];
p += 3;
const isMatch = currentTag1 === tag1 && currentTag2 === tag2;
switch (type) {
case 0x41: // 'A'
if (isMatch) {
return String.fromCharCode(ba[p]);
}
p += 1;
break;
case 0x69: // 'i'
if (isMatch) {
return this._dataView.getInt32(p, true);
}
p += 4;
break;
case 0x49: // 'I'
if (isMatch) {
return this._dataView.getUint32(p, true);
}
p += 4;
break;
case 0x63: // 'c'
if (isMatch) {
return this._dataView.getInt8(p);
}
p += 1;
break;
case 0x43: // 'C'
if (isMatch) {
return this._dataView.getUint8(p);
}
p += 1;
break;
case 0x73: // 's'
if (isMatch) {
return this._dataView.getInt16(p, true);
}
p += 2;
break;
case 0x53: // 'S'
if (isMatch) {
return this._dataView.getUint16(p, true);
}
p += 2;
break;
case 0x66: // 'f'
if (isMatch) {
return this._dataView.getFloat32(p, true);
}
p += 4;
break;
case 0x5a: // 'Z'
case 0x48: {
// 'H'
const start = p;
while (p < blockEnd && ba[p] !== 0) {
p++;
}
if (isMatch) {
return raw
? ba.subarray(start, p)
: textDecoder.decode(ba.subarray(start, p));
}
p++; // advance past null terminator
break;
}
case 0x42: {
// 'B'
const Btype = ba[p++];
const limit = this._dataView.getInt32(p, true);
p += 4;
const absOffset = ba.byteOffset + p;
if (isMatch) {
if (Btype === 0x69) {
// 'i'
if (absOffset % 4 === 0) {
return new Int32Array(ba.buffer, absOffset, limit);
}
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getInt32(p + i * 4, true);
}
return arr;
}
else if (Btype === 0x49) {
// 'I'
if (absOffset % 4 === 0) {
return new Uint32Array(ba.buffer, absOffset, limit);
}
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getUint32(p + i * 4, true);
}
return arr;
}
else if (Btype === 0x73) {
// 's'
if (absOffset % 2 === 0) {
return new Int16Array(ba.buffer, absOffset, limit);
}
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getInt16(p + i * 2, true);
}
return arr;
}
else if (Btype === 0x53) {
// 'S'
if (absOffset % 2 === 0) {
return new Uint16Array(ba.buffer, absOffset, limit);
}
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getUint16(p + i * 2, true);
}
return arr;
}
else if (Btype === 0x63) {
// 'c'
return new Int8Array(ba.buffer, absOffset, limit);
}
else if (Btype === 0x43) {
// 'C'
return new Uint8Array(ba.buffer, absOffset, limit);
}
else if (Btype === 0x66) {
// 'f'
if (absOffset % 4 === 0) {
return new Float32Array(ba.buffer, absOffset, limit);
}
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getFloat32(p + i * 4, true);
}
return arr;
}
}
if (Btype === 0x69 || Btype === 0x49 || Btype === 0x66) {
p += limit << 2;
}
else if (Btype === 0x73 || Btype === 0x53) {
p += limit << 1;
}
else if (Btype === 0x63 || Btype === 0x43) {
p += limit;
}
break;
}
}
}
return undefined;
}
_computeTags() {
let p = this.tagsStart;
const blockEnd = this._end;
const ba = this._byteArray;
const tags = {};
while (p < blockEnd) {
const tag = String.fromCharCode(ba[p], ba[p + 1]);
const type = ba[p + 2];
p += 3;
switch (type) {
case 0x41: // 'A'
tags[tag] = String.fromCharCode(ba[p]);
p += 1;
break;
case 0x69: // 'i'
tags[tag] = this._dataView.getInt32(p, true);
p += 4;
break;
case 0x49: // 'I'
tags[tag] = this._dataView.getUint32(p, true);
p += 4;
break;
case 0x63: // 'c'
tags[tag] = this._dataView.getInt8(p);
p += 1;
break;
case 0x43: // 'C'
tags[tag] = this._dataView.getUint8(p);
p += 1;
break;
case 0x73: // 's'
tags[tag] = this._dataView.getInt16(p, true);
p += 2;
break;
case 0x53: // 'S'
tags[tag] = this._dataView.getUint16(p, true);
p += 2;
break;
case 0x66: // 'f'
tags[tag] = this._dataView.getFloat32(p, true);
p += 4;
break;
case 0x5a: // 'Z'
case 0x48: {
// 'H'
const start = p;
while (p < blockEnd && ba[p] !== 0) {
p++;
}
tags[tag] = textDecoder.decode(ba.subarray(start, p));
p++; // advance past null terminator
break;
}
case 0x42: {
// 'B'
const Btype = ba[p++];
const limit = this._dataView.getInt32(p, true);
p += 4;
const absOffset = ba.byteOffset + p;
if (Btype === 0x69) {
// 'i'
if (absOffset % 4 === 0) {
tags[tag] = new Int32Array(ba.buffer, absOffset, limit);
}
else {
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getInt32(p + i * 4, true);
}
tags[tag] = arr;
}
p += limit << 2;
}
else if (Btype === 0x49) {
// 'I'
if (absOffset % 4 === 0) {
tags[tag] = new Uint32Array(ba.buffer, absOffset, limit);
}
else {
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getUint32(p + i * 4, true);
}
tags[tag] = arr;
}
p += limit << 2;
}
else if (Btype === 0x73) {
// 's'
if (absOffset % 2 === 0) {
tags[tag] = new Int16Array(ba.buffer, absOffset, limit);
}
else {
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getInt16(p + i * 2, true);
}
tags[tag] = arr;
}
p += limit << 1;
}
else if (Btype === 0x53) {
// 'S'
if (absOffset % 2 === 0) {
tags[tag] = new Uint16Array(ba.buffer, absOffset, limit);
}
else {
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getUint16(p + i * 2, true);
}
tags[tag] = arr;
}
p += limit << 1;
}
else if (Btype === 0x63) {
// 'c'
tags[tag] = new Int8Array(ba.buffer, absOffset, limit);
p += limit;
}
else if (Btype === 0x43) {
// 'C'
tags[tag] = new Uint8Array(ba.buffer, absOffset, limit);
p += limit;
}
else if (Btype === 0x66) {
// 'f'
if (absOffset % 4 === 0) {
tags[tag] = new Float32Array(ba.buffer, absOffset, limit);
}
else {
const arr = new Array(limit);
for (let i = 0; i < limit; i++) {
arr[i] = this._dataView.getFloat32(p + i * 4, true);
}
tags[tag] = arr;
}
p += limit << 2;
}
break;
}
default:
console.error('Unknown BAM tag type', type);
break;
}
}
return tags;
}
isPaired() {
return !!(this.flags & constants_ts_1.default.BAM_FPAIRED);
}
isProperlyPaired() {
return !!(this.flags & constants_ts_1.default.BAM_FPROPER_PAIR);
}
isSegmentUnmapped() {
return !!(this.flags & constants_ts_1.default.BAM_FUNMAP);
}
isMateUnmapped() {
return !!(this.flags & constants_ts_1.default.BAM_FMUNMAP);
}
isReverseComplemented() {
return !!(this.flags & constants_ts_1.default.BAM_FREVERSE);
}
isMateReverseComplemented() {
return !!(this.flags & constants_ts_1.default.BAM_FMREVERSE);
}
isRead1() {
return !!(this.flags & constants_ts_1.default.BAM_FREAD1);
}
isRead2() {
return !!(this.flags & constants_ts_1.default.BAM_FREAD2);
}
isSecondary() {
return !!(this.flags & constants_ts_1.default.BAM_FSECONDARY);
}
isFailedQc() {
return !!(this.flags & constants_ts_1.default.BAM_FQCFAIL);
}
isDuplicate() {
return !!(this.flags & constants_ts_1.default.BAM_FDUP);
}
isSupplementary() {
return !!(this.flags & constants_ts_1.default.BAM_FSUPPLEMENTARY);
}
// Benchmark results for CIGAR parsing strategies (see benchmarks/cigar-lifecycle.bench.ts):
//
// Aligned data:
// - Plain array is 1.6-1.8x faster than Uint32Array for small CIGARs (≤50 ops)
// - Uint32Array view is 1.3-2.2x faster for large CIGARs (≥200 ops)
// - Crossover point is around 50-100 ops
//
// Unaligned data (requires slice+copy for Uint32Array):
// - Plain array is 3.7-6.1x faster for typical sizes (50-200 ops)
// - Plain array is 9-10x faster for small CIGARs (1-7 ops)
// - Uint32Array slice+copy only wins at extreme sizes (10000 ops: 1.4x faster)
//
// Using |0 to force 32-bit integers in plain array path:
// - 1.67x faster for medium CIGARs (50 ops)
// - Neutral for small CIGARs (1-7 ops)
//
// Strategy: use plain array with |0 for small aligned (≤50 ops) and all unaligned,
// Uint32Array view only for large aligned CIGARs.
// CG tag pattern: first op is soft-clip consuming entire sequence, second op is N encoding length-on-ref
_isCGTagPattern(p) {
const cigop = this._dataView.getInt32(p, true);
return (cigop & 0xf) === cigar_ts_1.CIGAR_SOFT_CLIP && cigop >> 4 === this.seq_length;
}
_computeLengthOnRef() {
const flag_nc = this._dataView.getInt32(this._start + 16, true);
if (flag_nc & (constants_ts_1.default.BAM_FUNMAP << 16)) {
return 0;
}
const numCigarOps = flag_nc & 0xffff;
const p = this.b0 + this.read_name_length;
if (this._isCGTagPattern(p)) {
const cigop2 = this._dataView.getInt32(p + 4, true);
if ((cigop2 & 0xf) !== cigar_ts_1.CIGAR_REF_SKIP) {
console.warn('CG tag with no N tag');
}
return cigop2 >> 4;
}
const absOffset = this._byteArray.byteOffset + p;
if (absOffset % 4 === 0 && numCigarOps > 50) {
const cigarView = new Uint32Array(this._byteArray.buffer, absOffset, numCigarOps);
this._cachedNumericCigar = cigarView;
let lref = 0;
for (let c = 0; c < numCigarOps; ++c) {
const co = cigarView[c];
lref += (co >> 4) * ((CIGAR_CONSUMES_REF_MASK >> (co & 0xf)) & 1);
}
return lref;
}
let lref = 0;
for (let c = 0; c < numCigarOps; ++c) {
const co = this._dataView.getInt32(p + c * 4, true);
lref += (co >> 4) * ((CIGAR_CONSUMES_REF_MASK >> (co & 0xf)) & 1);
}
return lref;
}
_computeNumericCigar() {
const flag_nc = this._dataView.getInt32(this._start + 16, true);
if (flag_nc & (constants_ts_1.default.BAM_FUNMAP << 16)) {
return new Uint32Array(0);
}
const numCigarOps = flag_nc & 0xffff;
const p = this.b0 + this.read_name_length;
if (this._isCGTagPattern(p)) {
return (this.tags.CG ??
new Uint32Array(0));
}
const absOffset = this._byteArray.byteOffset + p;
if (absOffset % 4 === 0 && numCigarOps > 50) {
return new Uint32Array(this._byteArray.buffer, absOffset, numCigarOps);
}
const cigarArray = new Array(numCigarOps);
for (let c = 0; c < numCigarOps; ++c) {
cigarArray[c] = this._dataView.getInt32(p + c * 4, true) | 0;
}
return cigarArray;
}
get length_on_ref() {
if (this._cachedLengthOnRef === undefined) {
this._cachedLengthOnRef = this._computeLengthOnRef();
}
return this._cachedLengthOnRef;
}
get NUMERIC_CIGAR() {
if (this._cachedNumericCigar === undefined) {
this._cachedNumericCigar = this._computeNumericCigar();
}
return this._cachedNumericCigar;
}
get CIGAR() {
const numeric = this.NUMERIC_CIGAR;
let result = '';
for (let i = 0, l = numeric.length; i < l; i++) {
const packed = numeric[i];
const length = packed >> 4;
const opCode = ASCII_CIGAR_CODES[packed & 0xf];
result += length + String.fromCharCode(opCode);
}
return result;
}
get num_cigar_ops() {
return this.flag_nc & 0xffff;
}
get num_cigar_bytes() {
return this.num_cigar_ops << 2;
}
get read_name_length() {
return this.bin_mq_nl & 0xff;
}
get num_seq_bytes() {
return (this.seq_length + 1) >> 1;
}
get NUMERIC_SEQ() {
const p = this.seqStart;
return this._byteArray.subarray(p, p + this.num_seq_bytes);
}
get seq() {
const len = this.seq_length;
const seqStart = this.seqStart;
const numeric = this._byteArray;
const buf = new Array(len);
let i = 0;
const fullBytes = len >> 1;
for (let j = 0; j < fullBytes; ++j) {
const sb = numeric[seqStart + j];
buf[i++] = SEQRET_DECODER[(sb & 0xf0) >> 4];
buf[i++] = SEQRET_DECODER[sb & 0x0f];
}
if (i < len) {
const sb = numeric[seqStart + fullBytes];
buf[i] = SEQRET_DECODER[(sb & 0xf0) >> 4];
}
return buf.join('');
}
// adapted from igv.js
// uses precomputed lookup table indexed by flag bits + isize sign.
// the BAM spec defines tlen as positive for the leftmost segment and
// negative for the rightmost, so tlen > 0 reliably indicates which
// read comes first without needing position-based correction
// (see also: gmod/cram-js src/cramFile/record.ts getPairOrientation)
get pair_orientation() {
const f = this.flags;
// unmapped (0x4) or mate unmapped (0x8) -> undefined
if (f & 0xc || this.ref_id !== this.next_refid) {
return undefined;
}
return PAIR_ORIENTATION_TABLE[((f >> 4) & 0xf) | (this.template_length > 0 ? 16 : 0)];
}
get bin_mq_nl() {
return this._dataView.getInt32(this._start + 12, true);
}
get flag_nc() {
return this._dataView.getInt32(this._start + 16, true);
}
get seq_length() {
return this._dataView.getInt32(this._start + 20, true);
}
get next_refid() {
return this._dataView.getInt32(this._start + 24, true);
}
get next_pos() {
return this._dataView.getInt32(this._start + 28, true);
}
get template_length() {
return this._dataView.getInt32(this._start + 32, true);
}
seqAt(idx) {
if (idx < this.seq_length) {
const sb = this._byteArray[this.seqStart + (idx >> 1)];
return idx % 2 === 0
? SEQRET_DECODER[(sb & 0xf0) >> 4]
: SEQRET_DECODER[sb & 0x0f];
}
else {
return undefined;
}
}
// Most public BamRecord fields are getters on the prototype, so
// Object.keys(this) wouldn't include them — JSON.stringify needs an explicit
// list. Returns the meaningful BAM-spec fields. Return type is widened so
// subclasses can override with their own serialized shape.
toJSON() {
return {
fileOffset: this.fileOffset,
ref_id: this.ref_id,
start: this.start,
end: this.end,
name: this.name,
flags: this.flags,
mq: this.mq,
CIGAR: this.CIGAR,
seq: this.seq,
next_refid: this.next_refid,
next_pos: this.next_pos,
template_length: this.template_length,
tags: this.tags,
};
}
}
exports.default = BamRecord;
//# sourceMappingURL=record.js.map