UNPKG

@gmod/bam

Version:

Parser for BAM and BAM index (bai) files

742 lines (662 loc) 21.3 kB
import { CIGAR_REF_SKIP, CIGAR_SOFT_CLIP } from './cigar.ts'
import Constants from './constants.ts'

// 4-bit base code -> character lookup used when decoding the packed SEQ field
const SEQRET_DECODER = '=ACMGRSVTWYHKDBN'.split('')

// precomputed pair orientation strings indexed by ((flags >> 4) & 0xF) | (isize > 0 ? 16 : 0)
// bits 0-3 encode flag bits 0x10(reverse),0x20(mate reverse),0x40(read1),0x80(read2)
// bit 4 encodes whether isize > 0
// prettier-ignore
const PAIR_ORIENTATION_TABLE = [
  'F F ','F R ','R F ','R R ','F2F1','F2R1','R2F1','R2R1',
  'F1F2','F1R2','R1F2','R1R2','F2F1','F2R1','R2F1','R2R1',
  'F F ','R F ','F R ','R R ','F1F2','R1F2','F1R2','R1R2',
  'F2F1','R2F1','F2R1','R2R1','F1F2','R1F2','F1R2','R1R2',
]

// char codes for 'MIDNSHP=X' followed by '?' for the seven unassigned op codes
const ASCII_CIGAR_CODES = [
  77, 73, 68, 78, 83, 72, 80, 61, 88, 63, 63, 63, 63, 63, 63, 63,
]

const textDecoder = new TextDecoder()

// Bitmask for ops that consume ref: M=0, D=2, N=3, ==7, X=8
// Binary: 0b110001101 = 0x18D
// P (op 6, padding) is deliberately NOT in the mask: the SAM spec CIGAR table
// lists it as consuming neither query nor reference, so it must not add to
// the reference span.
const CIGAR_CONSUMES_REF_MASK = 0x18d

// A byte range [start, end) inside byteArray that holds one record
export interface Bytes {
  start: number
  end: number
  byteArray: Uint8Array
}

// Lazy view over one BAM alignment record. Fields are decoded on demand from
// the underlying bytes; the more expensive derived values (tags, CIGAR,
// length on reference, end) are cached after first access.
//
// NOTE(review): the same offsets are used to index both `bytes.byteArray` and
// `dataView`, so the two are presumably views over the same byte range —
// confirm against the caller that constructs records.
export default class BamRecord {
  public fileOffset: number

  private _byteArray: Uint8Array
  private _start: number
  private _end: number
  private _dataView: DataView
  private _cachedEnd?: number
  private _cachedTags?: Record<string, unknown>
  private _cachedLengthOnRef?: number
  private _cachedNumericCigar?: Uint32Array | number[]
  // null means "looked up, MD tag absent/not a byte string"; undefined means
  // "not looked up yet"
  private _cachedNUMERIC_MD?: Uint8Array | null
  private _cachedSeqStart?: number

  constructor(args: { bytes: Bytes; fileOffset: number; dataView: DataView }) {
    this._byteArray = args.bytes.byteArray
    this._start = args.bytes.start
    this._end = args.bytes.end
    this.fileOffset = args.fileOffset
    this._dataView = args.dataView
  }

  get byteArray() {
    return this._byteArray
  }

  // Upper 16 bits of the 32-bit word at offset 16. The unsigned shift keeps
  // the result non-negative even when the top flag bit (0x8000) is set.
  get flags() {
    return (this._dataView.getInt32(this._start + 16, true) & 0xffff0000) >>> 16
  }

  get ref_id() {
    return this._dataView.getInt32(this._start + 4, true)
  }

  get start() {
    return this._dataView.getInt32(this._start + 8, true)
  }

  // start + length_on_ref, cached
  get end() {
    if (this._cachedEnd === undefined) {
      this._cachedEnd = this.start + this.length_on_ref
    }
    return this._cachedEnd
  }

  // Mapping quality; the stored value 255 is reported as undefined
  get mq() {
    const mq = (this.bin_mq_nl & 0xff00) >> 8
    return mq === 255 ? undefined : mq
  }

  get score() {
    return this.mq
  }

  // Per-base quality bytes (a subarray view, not a copy), or null when the
  // segment is flagged unmapped
  get qual() {
    if (this.isSegmentUnmapped()) {
      return null
    } else {
      const seqLen = this.seq_length
      const p = this.seqStart + ((seqLen + 1) >> 1)
      return this._byteArray.subarray(p, p + seqLen)
    }
  }

  get strand() {
    return this.isReverseComplemented() ? -1 : 1
  }

  // offset of the first variable-length field (the read name)
  get b0() {
    return this._start + 36
  }

  // start of the SEQ section (and end of CIGAR). All downstream sections
  // (qual, tags) are offsets from here, so cache once and reuse.
  get seqStart() {
    if (this._cachedSeqStart === undefined) {
      this._cachedSeqStart =
        this.b0 + this.read_name_length + this.num_cigar_bytes
    }
    return this._cachedSeqStart
  }

  // tags follow the packed sequence ((seqLen + 1) >> 1 bytes) and the
  // qualities (seqLen bytes)
  get tagsStart() {
    const seqLen = this.seq_length
    return this.seqStart + ((seqLen + 1) >> 1) + seqLen
  }

  // batch fromCharCode: fastest for typical name lengths (see benchmarks/string-building.bench.ts)
  get name() {
    // read_name_length includes the trailing NUL, which is dropped here
    const len = this.read_name_length - 1
    const start = this.b0
    const ba = this._byteArray
    const codes = new Array(len)
    for (let i = 0; i < len; i++) {
      codes[i] = ba[start + i]!
    }
    return String.fromCharCode(...codes)
  }

  // Raw bytes of the MD tag (no string decoding), or undefined when the tag
  // is missing or is not stored as a byte string. Cached.
  get NUMERIC_MD() {
    if (this._cachedNUMERIC_MD === undefined) {
      const result = this.getTagRaw('MD')
      this._cachedNUMERIC_MD = result instanceof Uint8Array ? result : null
    }
    return this._cachedNUMERIC_MD === null ? undefined : this._cachedNUMERIC_MD
  }

  // All tags decoded into a plain object, cached
  get tags() {
    if (this._cachedTags === undefined) {
      this._cachedTags = this._computeTags()
    }
    return this._cachedTags
  }

  // Look up a single tag. Uses the decoded tag cache when it already exists,
  // otherwise scans the tag section without decoding the other tags.
  getTag(tagName: string) {
    if (this._cachedTags !== undefined) {
      return this._cachedTags[tagName]
    }
    return this._findTag(tagName, false)
  }

  // Like getTag, but Z/H values are returned as a Uint8Array view instead of
  // a decoded string. Always scans; never consults the tag cache.
  getTagRaw(tagName: string) {
    return this._findTag(tagName, true)
  }

  // Linear scan of the tag section for the two-character tagName. Returns the
  // decoded value of the first match, or undefined if the tag is absent.
  // When `raw` is true, Z/H values are returned as byte subarrays.
  private _findTag(tagName: string, raw: boolean) {
    const tag1 = tagName.charCodeAt(0)
    const tag2 = tagName.charCodeAt(1)
    let p = this.tagsStart
    const blockEnd = this._end
    const ba = this._byteArray
    while (p < blockEnd) {
      const currentTag1 = ba[p]
      const currentTag2 = ba[p + 1]
      const type = ba[p + 2]
      p += 3

      const isMatch = currentTag1 === tag1 && currentTag2 === tag2

      // each case either returns the value (on match) or skips its payload
      switch (type) {
        case 0x41: // 'A'
          if (isMatch) {
            return String.fromCharCode(ba[p]!)
          }
          p += 1
          break
        case 0x69: // 'i'
          if (isMatch) {
            return this._dataView.getInt32(p, true)
          }
          p += 4
          break
        case 0x49: // 'I'
          if (isMatch) {
            return this._dataView.getUint32(p, true)
          }
          p += 4
          break
        case 0x63: // 'c'
          if (isMatch) {
            return this._dataView.getInt8(p)
          }
          p += 1
          break
        case 0x43: // 'C'
          if (isMatch) {
            return this._dataView.getUint8(p)
          }
          p += 1
          break
        case 0x73: // 's'
          if (isMatch) {
            return this._dataView.getInt16(p, true)
          }
          p += 2
          break
        case 0x53: // 'S'
          if (isMatch) {
            return this._dataView.getUint16(p, true)
          }
          p += 2
          break
        case 0x66: // 'f'
          if (isMatch) {
            return this._dataView.getFloat32(p, true)
          }
          p += 4
          break
        case 0x5a: // 'Z'
        case 0x48: {
          // 'H'
          const start = p
          while (p < blockEnd && ba[p] !== 0) {
            p++
          }
          if (isMatch) {
            return raw
              ? ba.subarray(start, p)
              : textDecoder.decode(ba.subarray(start, p))
          }
          p++ // advance past null terminator
          break
        }
        case 0x42: {
          // 'B'
          const Btype = ba[p++]
          const limit = this._dataView.getInt32(p, true)
          p += 4
          // typed-array views need an offset relative to the underlying
          // ArrayBuffer, and multi-byte views need it element-aligned;
          // unaligned data is copied out element by element instead
          const absOffset = ba.byteOffset + p
          if (isMatch) {
            if (Btype === 0x69) {
              // 'i'
              if (absOffset % 4 === 0) {
                return new Int32Array(ba.buffer, absOffset, limit)
              }
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getInt32(p + i * 4, true)
              }
              return arr
            } else if (Btype === 0x49) {
              // 'I'
              if (absOffset % 4 === 0) {
                return new Uint32Array(ba.buffer, absOffset, limit)
              }
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getUint32(p + i * 4, true)
              }
              return arr
            } else if (Btype === 0x73) {
              // 's'
              if (absOffset % 2 === 0) {
                return new Int16Array(ba.buffer, absOffset, limit)
              }
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getInt16(p + i * 2, true)
              }
              return arr
            } else if (Btype === 0x53) {
              // 'S'
              if (absOffset % 2 === 0) {
                return new Uint16Array(ba.buffer, absOffset, limit)
              }
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getUint16(p + i * 2, true)
              }
              return arr
            } else if (Btype === 0x63) {
              // 'c'
              return new Int8Array(ba.buffer, absOffset, limit)
            } else if (Btype === 0x43) {
              // 'C'
              return new Uint8Array(ba.buffer, absOffset, limit)
            } else if (Btype === 0x66) {
              // 'f'
              if (absOffset % 4 === 0) {
                return new Float32Array(ba.buffer, absOffset, limit)
              }
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getFloat32(p + i * 4, true)
              }
              return arr
            }
          }
          // not a match (or unrecognised subtype): skip the array payload
          if (Btype === 0x69 || Btype === 0x49 || Btype === 0x66) {
            p += limit << 2
          } else if (Btype === 0x73 || Btype === 0x53) {
            p += limit << 1
          } else if (Btype === 0x63 || Btype === 0x43) {
            p += limit
          }
          break
        }
      }
    }
    return undefined
  }

  // Decode every tag in the tag section into a { TAG: value } object. If a
  // tag name occurs more than once, the later occurrence overwrites the
  // earlier one.
  private _computeTags() {
    let p = this.tagsStart
    const blockEnd = this._end
    const ba = this._byteArray
    const tags: Record<string, unknown> = {}
    while (p < blockEnd) {
      const tag = String.fromCharCode(ba[p]!, ba[p + 1]!)
      const type = ba[p + 2]!
      p += 3

      switch (type) {
        case 0x41: // 'A'
          tags[tag] = String.fromCharCode(ba[p]!)
          p += 1
          break
        case 0x69: // 'i'
          tags[tag] = this._dataView.getInt32(p, true)
          p += 4
          break
        case 0x49: // 'I'
          tags[tag] = this._dataView.getUint32(p, true)
          p += 4
          break
        case 0x63: // 'c'
          tags[tag] = this._dataView.getInt8(p)
          p += 1
          break
        case 0x43: // 'C'
          tags[tag] = this._dataView.getUint8(p)
          p += 1
          break
        case 0x73: // 's'
          tags[tag] = this._dataView.getInt16(p, true)
          p += 2
          break
        case 0x53: // 'S'
          tags[tag] = this._dataView.getUint16(p, true)
          p += 2
          break
        case 0x66: // 'f'
          tags[tag] = this._dataView.getFloat32(p, true)
          p += 4
          break
        case 0x5a: // 'Z'
        case 0x48: {
          // 'H'
          const start = p
          while (p < blockEnd && ba[p] !== 0) {
            p++
          }
          tags[tag] = textDecoder.decode(ba.subarray(start, p))
          p++ // advance past null terminator
          break
        }
        case 0x42: {
          // 'B'
          const Btype = ba[p++]
          const limit = this._dataView.getInt32(p, true)
          p += 4
          // zero-copy typed-array view when the data is element-aligned in
          // the underlying buffer, element-by-element copy otherwise
          const absOffset = ba.byteOffset + p
          if (Btype === 0x69) {
            // 'i'
            if (absOffset % 4 === 0) {
              tags[tag] = new Int32Array(ba.buffer, absOffset, limit)
            } else {
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getInt32(p + i * 4, true)
              }
              tags[tag] = arr
            }
            p += limit << 2
          } else if (Btype === 0x49) {
            // 'I'
            if (absOffset % 4 === 0) {
              tags[tag] = new Uint32Array(ba.buffer, absOffset, limit)
            } else {
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getUint32(p + i * 4, true)
              }
              tags[tag] = arr
            }
            p += limit << 2
          } else if (Btype === 0x73) {
            // 's'
            if (absOffset % 2 === 0) {
              tags[tag] = new Int16Array(ba.buffer, absOffset, limit)
            } else {
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getInt16(p + i * 2, true)
              }
              tags[tag] = arr
            }
            p += limit << 1
          } else if (Btype === 0x53) {
            // 'S'
            if (absOffset % 2 === 0) {
              tags[tag] = new Uint16Array(ba.buffer, absOffset, limit)
            } else {
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getUint16(p + i * 2, true)
              }
              tags[tag] = arr
            }
            p += limit << 1
          } else if (Btype === 0x63) {
            // 'c'
            tags[tag] = new Int8Array(ba.buffer, absOffset, limit)
            p += limit
          } else if (Btype === 0x43) {
            // 'C'
            tags[tag] = new Uint8Array(ba.buffer, absOffset, limit)
            p += limit
          } else if (Btype === 0x66) {
            // 'f'
            if (absOffset % 4 === 0) {
              tags[tag] = new Float32Array(ba.buffer, absOffset, limit)
            } else {
              const arr: number[] = new Array(limit)
              for (let i = 0; i < limit; i++) {
                arr[i] = this._dataView.getFloat32(p + i * 4, true)
              }
              tags[tag] = arr
            }
            p += limit << 2
          }
          break
        }
        default:
          console.error('Unknown BAM tag type', type)
          break
      }
    }
    return tags
  }

  isPaired() {
    return !!(this.flags & Constants.BAM_FPAIRED)
  }

  isProperlyPaired() {
    return !!(this.flags & Constants.BAM_FPROPER_PAIR)
  }

  isSegmentUnmapped() {
    return !!(this.flags & Constants.BAM_FUNMAP)
  }

  isMateUnmapped() {
    return !!(this.flags & Constants.BAM_FMUNMAP)
  }

  isReverseComplemented() {
    return !!(this.flags & Constants.BAM_FREVERSE)
  }

  isMateReverseComplemented() {
    return !!(this.flags & Constants.BAM_FMREVERSE)
  }

  isRead1() {
    return !!(this.flags & Constants.BAM_FREAD1)
  }

  isRead2() {
    return !!(this.flags & Constants.BAM_FREAD2)
  }

  isSecondary() {
    return !!(this.flags & Constants.BAM_FSECONDARY)
  }

  isFailedQc() {
    return !!(this.flags & Constants.BAM_FQCFAIL)
  }

  isDuplicate() {
    return !!(this.flags & Constants.BAM_FDUP)
  }

  isSupplementary() {
    return !!(this.flags & Constants.BAM_FSUPPLEMENTARY)
  }

  // Benchmark results for CIGAR parsing strategies (see benchmarks/cigar-lifecycle.bench.ts):
  //
  // Aligned data:
  // - Plain array is 1.6-1.8x faster than Uint32Array for small CIGARs (≤50 ops)
  // - Uint32Array view is 1.3-2.2x faster for large CIGARs (≥200 ops)
  // - Crossover point is around 50-100 ops
  //
  // Unaligned data (requires slice+copy for Uint32Array):
  // - Plain array is 3.7-6.1x faster for typical sizes (50-200 ops)
  // - Plain array is 9-10x faster for small CIGARs (1-7 ops)
  // - Uint32Array slice+copy only wins at extreme sizes (10000 ops: 1.4x faster)
  //
  // Using |0 to force 32-bit integers in plain array path:
  // - 1.67x faster for medium CIGARs (50 ops)
  // - Neutral for small CIGARs (1-7 ops)
  //
  // Strategy: use plain array with |0 for small aligned (≤50 ops) and all unaligned,
  // Uint32Array view only for large aligned CIGARs.

  // CG tag pattern: first op is soft-clip consuming entire sequence, second op is N encoding length-on-ref
  //
  // `p` is the offset of the first CIGAR op. `numCigarOps` defaults to the
  // record's own op count. The pattern needs two in-record ops, so with fewer
  // than two we return false without reading anything: otherwise SEQ bytes
  // would be interpreted as CIGAR ops (e.g. for a fully soft-clipped read
  // whose only op is <seq_length>S, or a record with no CIGAR at all).
  private _isCGTagPattern(p: number, numCigarOps = this.num_cigar_ops) {
    if (numCigarOps < 2) {
      return false
    }
    const cigop = this._dataView.getInt32(p, true)
    return (
      (cigop & 0xf) === CIGAR_SOFT_CLIP && cigop >>> 4 === this.seq_length
    )
  }

  // Number of reference bases covered by the alignment: 0 for unmapped
  // segments, the N-op length for the CG placeholder pattern, otherwise the
  // summed length of all reference-consuming CIGAR ops.
  private _computeLengthOnRef(): number {
    const flag_nc = this._dataView.getInt32(this._start + 16, true)
    if (flag_nc & (Constants.BAM_FUNMAP << 16)) {
      return 0
    }
    const numCigarOps = flag_nc & 0xffff
    const p = this.b0 + this.read_name_length

    if (this._isCGTagPattern(p, numCigarOps)) {
      const cigop2 = this._dataView.getInt32(p + 4, true)
      if ((cigop2 & 0xf) !== CIGAR_REF_SKIP) {
        console.warn('CG tag with no N tag')
      }
      // unsigned shift: op length is the upper 28 bits of an unsigned word
      return cigop2 >>> 4
    }

    const absOffset = this._byteArray.byteOffset + p
    if (absOffset % 4 === 0 && numCigarOps > 50) {
      const cigarView = new Uint32Array(
        this._byteArray.buffer,
        absOffset,
        numCigarOps,
      )
      // the view is exactly what NUMERIC_CIGAR would build, so cache it now
      this._cachedNumericCigar = cigarView
      let lref = 0
      for (let c = 0; c < numCigarOps; ++c) {
        const co = cigarView[c]!
        // multiply by 0 or 1 depending on whether the op consumes reference
        lref += (co >>> 4) * ((CIGAR_CONSUMES_REF_MASK >> (co & 0xf)) & 1)
      }
      return lref
    }

    let lref = 0
    for (let c = 0; c < numCigarOps; ++c) {
      const co = this._dataView.getInt32(p + c * 4, true)
      lref += (co >>> 4) * ((CIGAR_CONSUMES_REF_MASK >> (co & 0xf)) & 1)
    }
    return lref
  }

  // Packed CIGAR ops (length << 4 | op). Empty for unmapped segments; taken
  // from the CG tag (or empty if that tag is missing) for the CG placeholder
  // pattern; otherwise a Uint32Array view or plain array per the strategy
  // described above.
  private _computeNumericCigar(): Uint32Array | number[] {
    const flag_nc = this._dataView.getInt32(this._start + 16, true)
    if (flag_nc & (Constants.BAM_FUNMAP << 16)) {
      return new Uint32Array(0)
    }
    const numCigarOps = flag_nc & 0xffff
    const p = this.b0 + this.read_name_length

    if (this._isCGTagPattern(p, numCigarOps)) {
      return (
        (this.tags.CG as Uint32Array | number[] | undefined) ??
        new Uint32Array(0)
      )
    }

    const absOffset = this._byteArray.byteOffset + p
    if (absOffset % 4 === 0 && numCigarOps > 50) {
      return new Uint32Array(this._byteArray.buffer, absOffset, numCigarOps)
    }

    const cigarArray: number[] = new Array(numCigarOps)
    for (let c = 0; c < numCigarOps; ++c) {
      cigarArray[c] = this._dataView.getInt32(p + c * 4, true) | 0
    }
    return cigarArray
  }

  get length_on_ref() {
    if (this._cachedLengthOnRef === undefined) {
      this._cachedLengthOnRef = this._computeLengthOnRef()
    }
    return this._cachedLengthOnRef
  }

  get NUMERIC_CIGAR() {
    if (this._cachedNumericCigar === undefined) {
      this._cachedNumericCigar = this._computeNumericCigar()
    }
    return this._cachedNumericCigar
  }

  // CIGAR as text, e.g. '<len><op><len><op>...'; op codes with no assigned
  // letter render as '?'
  get CIGAR() {
    const numeric = this.NUMERIC_CIGAR
    let result = ''
    for (let i = 0, l = numeric.length; i < l; i++) {
      const packed = numeric[i]!
      // unsigned shift so very long ops never render as negative lengths
      const length = packed >>> 4
      const opCode = ASCII_CIGAR_CODES[packed & 0xf]!
      result += length + String.fromCharCode(opCode)
    }
    return result
  }

  get num_cigar_ops() {
    return this.flag_nc & 0xffff
  }

  get num_cigar_bytes() {
    return this.num_cigar_ops << 2
  }

  get read_name_length() {
    return this.bin_mq_nl & 0xff
  }

  // two bases per byte, rounded up
  get num_seq_bytes() {
    return (this.seq_length + 1) >> 1
  }

  // Packed (4 bits per base) sequence bytes as a subarray view
  get NUMERIC_SEQ() {
    const p = this.seqStart
    return this._byteArray.subarray(p, p + this.num_seq_bytes)
  }

  // Sequence decoded to a string via SEQRET_DECODER
  get seq() {
    const len = this.seq_length
    const seqStart = this.seqStart
    const numeric = this._byteArray
    const buf = new Array(len)
    let i = 0
    const fullBytes = len >> 1

    // high nibble holds the earlier base, low nibble the later one
    for (let j = 0; j < fullBytes; ++j) {
      const sb = numeric[seqStart + j]!
      buf[i++] = SEQRET_DECODER[(sb & 0xf0) >> 4]!
      buf[i++] = SEQRET_DECODER[sb & 0x0f]!
    }

    // odd length: the final base sits alone in the high nibble of the last byte
    if (i < len) {
      const sb = numeric[seqStart + fullBytes]!
      buf[i] = SEQRET_DECODER[(sb & 0xf0) >> 4]!
    }

    return buf.join('')
  }

  // adapted from igv.js
  // uses precomputed lookup table indexed by flag bits + isize sign.
  // the BAM spec defines tlen as positive for the leftmost segment and
  // negative for the rightmost, so tlen > 0 reliably indicates which
  // read comes first without needing position-based correction
  // (see also: gmod/cram-js src/cramFile/record.ts getPairOrientation)
  get pair_orientation() {
    const f = this.flags
    // unmapped (0x4) or mate unmapped (0x8) -> undefined
    if (f & 0xc || this.ref_id !== this.next_refid) {
      return undefined
    }
    return PAIR_ORIENTATION_TABLE[
      ((f >> 4) & 0xf) | (this.template_length > 0 ? 16 : 0)
    ]
  }

  // 32-bit word at offset 12; mq and read_name_length are extracted from it
  get bin_mq_nl() {
    return this._dataView.getInt32(this._start + 12, true)
  }

  // 32-bit word at offset 16; flags (upper 16 bits) and num_cigar_ops (lower
  // 16 bits) are extracted from it
  get flag_nc() {
    return this._dataView.getInt32(this._start + 16, true)
  }

  get seq_length() {
    return this._dataView.getInt32(this._start + 20, true)
  }

  get next_refid() {
    return this._dataView.getInt32(this._start + 24, true)
  }

  get next_pos() {
    return this._dataView.getInt32(this._start + 28, true)
  }

  get template_length() {
    return this._dataView.getInt32(this._start + 32, true)
  }

  // Decode the single base at 0-based position idx, or undefined when idx is
  // outside [0, seq_length). The lower bound matters: a negative idx would
  // otherwise index bytes that precede the SEQ section and decode them as a
  // base.
  seqAt(idx: number): string | undefined {
    if (idx >= 0 && idx < this.seq_length) {
      const sb = this._byteArray[this.seqStart + (idx >> 1)]!
      return idx % 2 === 0
        ? SEQRET_DECODER[(sb & 0xf0) >> 4]!
        : SEQRET_DECODER[sb & 0x0f]!
    } else {
      return undefined
    }
  }

  // Most public BamRecord fields are getters on the prototype, so
  // Object.keys(this) wouldn't include them — JSON.stringify needs an explicit
  // list. Returns the meaningful BAM-spec fields. Return type is widened so
  // subclasses can override with their own serialized shape.
  toJSON(): Record<string, unknown> {
    return {
      fileOffset: this.fileOffset,
      ref_id: this.ref_id,
      start: this.start,
      end: this.end,
      name: this.name,
      flags: this.flags,
      mq: this.mq,
      CIGAR: this.CIGAR,
      seq: this.seq,
      next_refid: this.next_refid,
      next_pos: this.next_pos,
      template_length: this.template_length,
      tags: this.tags,
    }
  }
}