UNPKG

@gmod/bam

Version:

Parser for BAM and BAM index (bai) files

671 lines 25.5 kB
import { CIGAR_REF_SKIP, CIGAR_SOFT_CLIP } from "./cigar.js";
import Constants from "./constants.js";

// Maps a 4-bit packed base code (one nibble of a SEQ byte) to its letter.
const SEQRET_DECODER = '=ACMGRSVTWYHKDBN'.split('');

// precomputed pair orientation strings indexed by ((flags >> 4) & 0xF) | (isize > 0 ? 16 : 0)
// bits 0-3 encode flag bits 0x10(reverse),0x20(mate reverse),0x40(read1),0x80(read2)
// bit 4 encodes whether isize > 0
// prettier-ignore
const PAIR_ORIENTATION_TABLE = [
  'F F ', 'F R ', 'R F ', 'R R ',
  'F2F1', 'F2R1', 'R2F1', 'R2R1',
  'F1F2', 'F1R2', 'R1F2', 'R1R2',
  'F2F1', 'F2R1', 'R2F1', 'R2R1',
  'F F ', 'R F ', 'F R ', 'R R ',
  'F1F2', 'R1F2', 'F1R2', 'R1R2',
  'F2F1', 'R2F1', 'F2R1', 'R2R1',
  'F1F2', 'R1F2', 'F1R2', 'R1R2',
];

// Char codes for CIGAR op letters "MIDNSHP=X"; unused op codes map to '?' (63).
const ASCII_CIGAR_CODES = [
  77, 73, 68, 78, 83, 72, 80, 61, 88, 63, 63, 63, 63, 63, 63, 63,
];

const textDecoder = new TextDecoder();

// Bitmask for ops that consume ref: M=0, D=2, N=3, P=6, ==7, X=8
// Binary: 0b111001101 = 0x1CD
const CIGAR_CONSUMES_REF_MASK = 0x1cd;

/**
 * Returns log2 of the element byte width for a 'B' (array) tag subtype, or -1
 * when the subtype is not recognised (in which case callers do not advance).
 */
function bArrayShift(subtype) {
  switch (subtype) {
    case 0x69: // 'i'
    case 0x49: // 'I'
    case 0x66: // 'f'
      return 2;
    case 0x73: // 's'
    case 0x53: // 'S'
      return 1;
    case 0x63: // 'c'
    case 0x43: // 'C'
      return 0;
    default:
      return -1;
  }
}

/** Builds a plain array of `count` values by calling `readAt(i)` for each index. */
function collect(count, readAt) {
  const out = new Array(count);
  for (let i = 0; i < count; i++) {
    out[i] = readAt(i);
  }
  return out;
}

/**
 * Decodes the payload of a 'B' (numeric array) tag.
 *
 * When the payload is suitably aligned inside the underlying ArrayBuffer a
 * zero-copy typed-array view is returned; otherwise the values are copied one
 * by one through the DataView into a plain array.
 *
 * @param dataView DataView used for unaligned little-endian reads
 * @param ba       Uint8Array holding the record bytes
 * @param subtype  char code of the array element type (c, C, s, S, i, I, f)
 * @param p        offset of the first element (same offset space as `ba`/`dataView`)
 * @param count    number of elements
 * @returns a typed array or plain array, or undefined for an unknown subtype
 */
function readBArray(dataView, ba, subtype, p, count) {
  const absOffset = ba.byteOffset + p;
  switch (subtype) {
    case 0x69: // 'i'
      return absOffset % 4 === 0
        ? new Int32Array(ba.buffer, absOffset, count)
        : collect(count, (i) => dataView.getInt32(p + i * 4, true));
    case 0x49: // 'I'
      return absOffset % 4 === 0
        ? new Uint32Array(ba.buffer, absOffset, count)
        : collect(count, (i) => dataView.getUint32(p + i * 4, true));
    case 0x73: // 's'
      return absOffset % 2 === 0
        ? new Int16Array(ba.buffer, absOffset, count)
        : collect(count, (i) => dataView.getInt16(p + i * 2, true));
    case 0x53: // 'S'
      return absOffset % 2 === 0
        ? new Uint16Array(ba.buffer, absOffset, count)
        : collect(count, (i) => dataView.getUint16(p + i * 2, true));
    case 0x63: // 'c'
      return new Int8Array(ba.buffer, absOffset, count);
    case 0x43: // 'C'
      return new Uint8Array(ba.buffer, absOffset, count);
    case 0x66: // 'f'
      return absOffset % 4 === 0
        ? new Float32Array(ba.buffer, absOffset, count)
        : collect(count, (i) => dataView.getFloat32(p + i * 4, true));
    default:
      return undefined;
  }
}

/**
 * Lazy view over a single BAM alignment record.
 *
 * Fixed-width fields are read on demand from `_dataView` at offsets relative
 * to `_start`; variable-length sections (name, CIGAR, SEQ, QUAL, tags) are
 * located from the lengths stored in those fixed fields. Derived values that
 * are costly to compute are cached on first access.
 */
export default class BamRecord {
  fileOffset;
  _byteArray;
  _start;
  _end;
  _dataView;
  _cachedEnd;
  _cachedTags;
  _cachedLengthOnRef;
  _cachedNumericCigar;
  _cachedNUMERIC_MD;
  _cachedSeqStart;

  /**
   * @param args.bytes      `{ byteArray, start, end }` — buffer and the record's byte range in it
   * @param args.fileOffset value stored as-is on `fileOffset`
   * @param args.dataView   DataView used for all multi-byte reads
   */
  constructor(args) {
    this._byteArray = args.bytes.byteArray;
    this._start = args.bytes.start;
    this._end = args.bytes.end;
    this.fileOffset = args.fileOffset;
    this._dataView = args.dataView;
  }

  get byteArray() {
    return this._byteArray;
  }

  /**
   * Upper 16 bits of the 32-bit word at offset 16. Uses an unsigned shift so
   * the result is always in 0..65535, even when bit 31 of the word is set.
   */
  get flags() {
    return this._dataView.getInt32(this._start + 16, true) >>> 16;
  }

  get ref_id() {
    return this._dataView.getInt32(this._start + 4, true);
  }

  get start() {
    return this._dataView.getInt32(this._start + 8, true);
  }

  /** `start + length_on_ref`, cached after the first access. */
  get end() {
    if (this._cachedEnd === undefined) {
      this._cachedEnd = this.start + this.length_on_ref;
    }
    return this._cachedEnd;
  }

  /** Mapping quality byte; 255 is reported as undefined. */
  get mq() {
    const mq = (this.bin_mq_nl & 0xff00) >> 8;
    return mq === 255 ? undefined : mq;
  }

  get score() {
    return this.mq;
  }

  /** Per-base quality bytes as a subarray view, or null when the unmapped flag is set. */
  get qual() {
    if (this.isSegmentUnmapped()) {
      return null;
    }
    const seqLen = this.seq_length;
    const p = this.seqStart + ((seqLen + 1) >> 1);
    return this._byteArray.subarray(p, p + seqLen);
  }

  get strand() {
    return this.isReverseComplemented() ? -1 : 1;
  }

  /** Offset of the first byte after the 36-byte fixed-width header (start of the read name). */
  get b0() {
    return this._start + 36;
  }

  // start of the SEQ section (and end of CIGAR). All downstream sections
  // (qual, tags) are offsets from here, so cache once and reuse.
  get seqStart() {
    if (this._cachedSeqStart === undefined) {
      this._cachedSeqStart =
        this.b0 + this.read_name_length + this.num_cigar_bytes;
    }
    return this._cachedSeqStart;
  }

  /** Offset of the first tag: past the packed SEQ bytes and the QUAL bytes. */
  get tagsStart() {
    const seqLen = this.seq_length;
    return this.seqStart + ((seqLen + 1) >> 1) + seqLen;
  }

  // batch fromCharCode: fastest for typical name lengths (see benchmarks/string-building.bench.ts)
  get name() {
    // the stored length includes the trailing NUL, which is dropped here
    const len = this.read_name_length - 1;
    const start = this.b0;
    const ba = this._byteArray;
    const codes = new Array(len);
    for (let i = 0; i < len; i++) {
      codes[i] = ba[start + i];
    }
    return String.fromCharCode(...codes);
  }

  /** Raw bytes of the MD tag (undecoded), or undefined when absent or not a string tag. */
  get NUMERIC_MD() {
    if (this._cachedNUMERIC_MD === undefined) {
      const result = this.getTagRaw('MD');
      // null marks "looked up, not found" so the scan is not repeated
      this._cachedNUMERIC_MD = result instanceof Uint8Array ? result : null;
    }
    return this._cachedNUMERIC_MD === null ? undefined : this._cachedNUMERIC_MD;
  }

  /** All tags decoded into a plain object keyed by two-character tag name (cached). */
  get tags() {
    if (this._cachedTags === undefined) {
      this._cachedTags = this._computeTags();
    }
    return this._cachedTags;
  }

  /**
   * Returns the decoded value of one tag. Uses the cached tag object when it
   * has already been built, otherwise scans the tag section directly.
   */
  getTag(tagName) {
    if (this._cachedTags !== undefined) {
      return this._cachedTags[tagName];
    }
    return this._findTag(tagName, false);
  }

  /** Like a direct tag scan, but 'Z'/'H' values are returned as byte subarrays instead of strings. */
  getTagRaw(tagName) {
    return this._findTag(tagName, true);
  }

  /**
   * Scans the tag section for `tagName` and returns its value, skipping over
   * every non-matching tag.
   *
   * @param tagName two-character tag name
   * @param raw     when true, 'Z'/'H' values are returned as Uint8Array views
   * @returns the tag value, or undefined when the tag is not present
   */
  _findTag(tagName, raw) {
    const tag1 = tagName.charCodeAt(0);
    const tag2 = tagName.charCodeAt(1);
    let p = this.tagsStart;
    const blockEnd = this._end;
    const ba = this._byteArray;
    const dv = this._dataView;
    while (p < blockEnd) {
      const isMatch = ba[p] === tag1 && ba[p + 1] === tag2;
      const type = ba[p + 2];
      p += 3;
      switch (type) {
        case 0x41: // 'A'
          if (isMatch) {
            return String.fromCharCode(ba[p]);
          }
          p += 1;
          break;
        case 0x69: // 'i'
          if (isMatch) {
            return dv.getInt32(p, true);
          }
          p += 4;
          break;
        case 0x49: // 'I'
          if (isMatch) {
            return dv.getUint32(p, true);
          }
          p += 4;
          break;
        case 0x63: // 'c'
          if (isMatch) {
            return dv.getInt8(p);
          }
          p += 1;
          break;
        case 0x43: // 'C'
          if (isMatch) {
            return dv.getUint8(p);
          }
          p += 1;
          break;
        case 0x73: // 's'
          if (isMatch) {
            return dv.getInt16(p, true);
          }
          p += 2;
          break;
        case 0x53: // 'S'
          if (isMatch) {
            return dv.getUint16(p, true);
          }
          p += 2;
          break;
        case 0x66: // 'f'
          if (isMatch) {
            return dv.getFloat32(p, true);
          }
          p += 4;
          break;
        case 0x5a: // 'Z'
        case 0x48: {
          // 'H'
          const start = p;
          while (p < blockEnd && ba[p] !== 0) {
            p++;
          }
          if (isMatch) {
            const bytes = ba.subarray(start, p);
            return raw ? bytes : textDecoder.decode(bytes);
          }
          p++; // advance past null terminator
          break;
        }
        case 0x42: {
          // 'B'
          const Btype = ba[p++];
          const limit = dv.getInt32(p, true);
          p += 4;
          if (isMatch) {
            const arr = readBArray(dv, ba, Btype, p, limit);
            if (arr !== undefined) {
              return arr;
            }
          }
          const shift = bArrayShift(Btype);
          if (shift >= 0) {
            p += limit << shift;
          }
          break;
        }
      }
    }
    return undefined;
  }

  /**
   * Decodes every tag in the record into a plain object. An unrecognised tag
   * type is reported via console.error and the scan continues.
   */
  _computeTags() {
    let p = this.tagsStart;
    const blockEnd = this._end;
    const ba = this._byteArray;
    const dv = this._dataView;
    const tags = {};
    while (p < blockEnd) {
      const tag = String.fromCharCode(ba[p], ba[p + 1]);
      const type = ba[p + 2];
      p += 3;
      switch (type) {
        case 0x41: // 'A'
          tags[tag] = String.fromCharCode(ba[p]);
          p += 1;
          break;
        case 0x69: // 'i'
          tags[tag] = dv.getInt32(p, true);
          p += 4;
          break;
        case 0x49: // 'I'
          tags[tag] = dv.getUint32(p, true);
          p += 4;
          break;
        case 0x63: // 'c'
          tags[tag] = dv.getInt8(p);
          p += 1;
          break;
        case 0x43: // 'C'
          tags[tag] = dv.getUint8(p);
          p += 1;
          break;
        case 0x73: // 's'
          tags[tag] = dv.getInt16(p, true);
          p += 2;
          break;
        case 0x53: // 'S'
          tags[tag] = dv.getUint16(p, true);
          p += 2;
          break;
        case 0x66: // 'f'
          tags[tag] = dv.getFloat32(p, true);
          p += 4;
          break;
        case 0x5a: // 'Z'
        case 0x48: {
          // 'H'
          const start = p;
          while (p < blockEnd && ba[p] !== 0) {
            p++;
          }
          tags[tag] = textDecoder.decode(ba.subarray(start, p));
          p++; // advance past null terminator
          break;
        }
        case 0x42: {
          // 'B'
          const Btype = ba[p++];
          const limit = dv.getInt32(p, true);
          p += 4;
          const arr = readBArray(dv, ba, Btype, p, limit);
          if (arr !== undefined) {
            tags[tag] = arr;
          }
          const shift = bArrayShift(Btype);
          if (shift >= 0) {
            p += limit << shift;
          }
          break;
        }
        default:
          console.error('Unknown BAM tag type', type);
          break;
      }
    }
    return tags;
  }

  isPaired() {
    return !!(this.flags & Constants.BAM_FPAIRED);
  }

  isProperlyPaired() {
    return !!(this.flags & Constants.BAM_FPROPER_PAIR);
  }

  isSegmentUnmapped() {
    return !!(this.flags & Constants.BAM_FUNMAP);
  }

  isMateUnmapped() {
    return !!(this.flags & Constants.BAM_FMUNMAP);
  }

  isReverseComplemented() {
    return !!(this.flags & Constants.BAM_FREVERSE);
  }

  isMateReverseComplemented() {
    return !!(this.flags & Constants.BAM_FMREVERSE);
  }

  isRead1() {
    return !!(this.flags & Constants.BAM_FREAD1);
  }

  isRead2() {
    return !!(this.flags & Constants.BAM_FREAD2);
  }

  isSecondary() {
    return !!(this.flags & Constants.BAM_FSECONDARY);
  }

  isFailedQc() {
    return !!(this.flags & Constants.BAM_FQCFAIL);
  }

  isDuplicate() {
    return !!(this.flags & Constants.BAM_FDUP);
  }

  isSupplementary() {
    return !!(this.flags & Constants.BAM_FSUPPLEMENTARY);
  }

  // Benchmark results for CIGAR parsing strategies (see benchmarks/cigar-lifecycle.bench.ts):
  //
  // Aligned data:
  // - Plain array is 1.6-1.8x faster than Uint32Array for small CIGARs (≤50 ops)
  // - Uint32Array view is 1.3-2.2x faster for large CIGARs (≥200 ops)
  // - Crossover point is around 50-100 ops
  //
  // Unaligned data (requires slice+copy for Uint32Array):
  // - Plain array is 3.7-6.1x faster for typical sizes (50-200 ops)
  // - Plain array is 9-10x faster for small CIGARs (1-7 ops)
  // - Uint32Array slice+copy only wins at extreme sizes (10000 ops: 1.4x faster)
  //
  // Using |0 to force 32-bit integers in plain array path:
  // - 1.67x faster for medium CIGARs (50 ops)
  // - Neutral for small CIGARs (1-7 ops)
  //
  // Strategy: use plain array with |0 for small aligned (≤50 ops) and all unaligned,
  // Uint32Array view only for large aligned CIGARs.

  // CG tag pattern: first op is soft-clip consuming entire sequence, second op is N encoding length-on-ref
  /**
   * @param p           offset of the first CIGAR op
   * @param numCigarOps number of CIGAR ops in the record (defaults to `num_cigar_ops`)
   * @returns true when the in-record CIGAR looks like the CG placeholder
   */
  _isCGTagPattern(p, numCigarOps = this.num_cigar_ops) {
    // The pattern needs two ops. With fewer, the words at p / p + 4 belong to
    // the SEQ/QUAL sections (or lie past the buffer) and must not be decoded.
    if (numCigarOps < 2) {
      return false;
    }
    const cigop = this._dataView.getInt32(p, true);
    // >>> keeps the 28-bit op length unsigned
    return (cigop & 0xf) === CIGAR_SOFT_CLIP && cigop >>> 4 === this.seq_length;
  }

  /**
   * Sums the lengths of all reference-consuming CIGAR ops. Returns 0 when the
   * unmapped flag is set. As a side effect, the large-aligned path stores its
   * Uint32Array view in `_cachedNumericCigar`.
   */
  _computeLengthOnRef() {
    const flag_nc = this._dataView.getInt32(this._start + 16, true);
    if (flag_nc & (Constants.BAM_FUNMAP << 16)) {
      return 0;
    }
    const numCigarOps = flag_nc & 0xffff;
    const p = this.b0 + this.read_name_length;
    if (this._isCGTagPattern(p, numCigarOps)) {
      const cigop2 = this._dataView.getInt32(p + 4, true);
      if ((cigop2 & 0xf) !== CIGAR_REF_SKIP) {
        console.warn('CG tag with no N tag');
      }
      return cigop2 >>> 4;
    }
    const absOffset = this._byteArray.byteOffset + p;
    let lref = 0;
    if (absOffset % 4 === 0 && numCigarOps > 50) {
      const cigarView = new Uint32Array(
        this._byteArray.buffer,
        absOffset,
        numCigarOps,
      );
      this._cachedNumericCigar = cigarView;
      for (let c = 0; c < numCigarOps; ++c) {
        const co = cigarView[c];
        // multiply by 1 or 0 depending on whether the op consumes reference
        lref += (co >>> 4) * ((CIGAR_CONSUMES_REF_MASK >> (co & 0xf)) & 1);
      }
      return lref;
    }
    for (let c = 0; c < numCigarOps; ++c) {
      const co = this._dataView.getInt32(p + c * 4, true);
      lref += (co >>> 4) * ((CIGAR_CONSUMES_REF_MASK >> (co & 0xf)) & 1);
    }
    return lref;
  }

  /**
   * Returns the packed CIGAR ops (length << 4 | opcode). Yields an empty
   * Uint32Array when the unmapped flag is set, the CG tag contents when the
   * in-record CIGAR is the CG placeholder, a Uint32Array view for large
   * aligned CIGARs, and a plain array of int32 values otherwise.
   */
  _computeNumericCigar() {
    const flag_nc = this._dataView.getInt32(this._start + 16, true);
    if (flag_nc & (Constants.BAM_FUNMAP << 16)) {
      return new Uint32Array(0);
    }
    const numCigarOps = flag_nc & 0xffff;
    const p = this.b0 + this.read_name_length;
    if (this._isCGTagPattern(p, numCigarOps)) {
      return this.tags.CG ?? new Uint32Array(0);
    }
    const absOffset = this._byteArray.byteOffset + p;
    if (absOffset % 4 === 0 && numCigarOps > 50) {
      return new Uint32Array(this._byteArray.buffer, absOffset, numCigarOps);
    }
    const cigarArray = new Array(numCigarOps);
    for (let c = 0; c < numCigarOps; ++c) {
      cigarArray[c] = this._dataView.getInt32(p + c * 4, true) | 0;
    }
    return cigarArray;
  }

  get length_on_ref() {
    if (this._cachedLengthOnRef === undefined) {
      this._cachedLengthOnRef = this._computeLengthOnRef();
    }
    return this._cachedLengthOnRef;
  }

  get NUMERIC_CIGAR() {
    if (this._cachedNumericCigar === undefined) {
      this._cachedNumericCigar = this._computeNumericCigar();
    }
    return this._cachedNumericCigar;
  }

  /** CIGAR rendered as text, e.g. "76M". Built from NUMERIC_CIGAR on every access. */
  get CIGAR() {
    const numeric = this.NUMERIC_CIGAR;
    let result = '';
    for (let i = 0, l = numeric.length; i < l; i++) {
      const packed = numeric[i];
      // plain-array entries are signed int32; >>> recovers the unsigned length
      const length = packed >>> 4;
      const opCode = ASCII_CIGAR_CODES[packed & 0xf];
      result += `${length}${String.fromCharCode(opCode)}`;
    }
    return result;
  }

  get num_cigar_ops() {
    return this.flag_nc & 0xffff;
  }

  get num_cigar_bytes() {
    return this.num_cigar_ops << 2;
  }

  get read_name_length() {
    return this.bin_mq_nl & 0xff;
  }

  get num_seq_bytes() {
    return (this.seq_length + 1) >> 1;
  }

  /** Packed (two bases per byte) sequence bytes as a subarray view. */
  get NUMERIC_SEQ() {
    const p = this.seqStart;
    return this._byteArray.subarray(p, p + this.num_seq_bytes);
  }

  /** Sequence decoded to a string; high nibble of each byte comes first. */
  get seq() {
    const len = this.seq_length;
    const seqStart = this.seqStart;
    const numeric = this._byteArray;
    const buf = new Array(len);
    let i = 0;
    const fullBytes = len >> 1;
    for (let j = 0; j < fullBytes; ++j) {
      const sb = numeric[seqStart + j];
      buf[i++] = SEQRET_DECODER[(sb & 0xf0) >> 4];
      buf[i++] = SEQRET_DECODER[sb & 0x0f];
    }
    if (i < len) {
      // odd length: the final base occupies only the high nibble
      const sb = numeric[seqStart + fullBytes];
      buf[i] = SEQRET_DECODER[(sb & 0xf0) >> 4];
    }
    return buf.join('');
  }

  // adapted from igv.js
  // uses precomputed lookup table indexed by flag bits + isize sign.
  // the BAM spec defines tlen as positive for the leftmost segment and
  // negative for the rightmost, so tlen > 0 reliably indicates which
  // read comes first without needing position-based correction
  // (see also: gmod/cram-js src/cramFile/record.ts getPairOrientation)
  get pair_orientation() {
    const f = this.flags;
    // unmapped (0x4) or mate unmapped (0x8) -> undefined
    if (f & 0xc || this.ref_id !== this.next_refid) {
      return undefined;
    }
    return PAIR_ORIENTATION_TABLE[
      ((f >> 4) & 0xf) | (this.template_length > 0 ? 16 : 0)
    ];
  }

  get bin_mq_nl() {
    return this._dataView.getInt32(this._start + 12, true);
  }

  get flag_nc() {
    return this._dataView.getInt32(this._start + 16, true);
  }

  get seq_length() {
    return this._dataView.getInt32(this._start + 20, true);
  }

  get next_refid() {
    return this._dataView.getInt32(this._start + 24, true);
  }

  get next_pos() {
    return this._dataView.getInt32(this._start + 28, true);
  }

  get template_length() {
    return this._dataView.getInt32(this._start + 32, true);
  }

  /**
   * Returns the base letter at zero-based position `idx`, or undefined when
   * `idx` is outside `[0, seq_length)`.
   */
  seqAt(idx) {
    // a negative idx would otherwise index bytes that precede the SEQ section
    if (idx >= 0 && idx < this.seq_length) {
      const sb = this._byteArray[this.seqStart + (idx >> 1)];
      return idx % 2 === 0
        ? SEQRET_DECODER[(sb & 0xf0) >> 4]
        : SEQRET_DECODER[sb & 0x0f];
    }
    return undefined;
  }

  // Most public BamRecord fields are getters on the prototype, so
  // Object.keys(this) wouldn't include them — JSON.stringify needs an explicit
  // list. Returns the meaningful BAM-spec fields. Return type is widened so
  // subclasses can override with their own serialized shape.
  toJSON() {
    return {
      fileOffset: this.fileOffset,
      ref_id: this.ref_id,
      start: this.start,
      end: this.end,
      name: this.name,
      flags: this.flags,
      mq: this.mq,
      CIGAR: this.CIGAR,
      seq: this.seq,
      next_refid: this.next_refid,
      next_pos: this.next_pos,
      template_length: this.template_length,
      tags: this.tags,
    };
  }
}
//# sourceMappingURL=record.js.map