UNPKG

apr144-bam

Version:

Parser for BAM and BAM index (bai) files

618 lines (549 loc) 14.9 kB
import Constants from './constants'

/** Maps a 4-bit packed base code to its sequence character. */
const SEQRET_DECODER = '=ACMGRSVTWYHKDBN'.split('')

/** Maps the low 4 bits of a packed CIGAR word to its operation character. */
const CIGAR_DECODER = 'MIDNSHP=X???????'.split('')

/**
 * Fixed-width readers for the element subtypes of a 'B' (array) tag, keyed by
 * the subtype character stored in the record.
 */
const ARRAY_ELEMENT_READERS: Record<
  string,
  { width: number; read: (buf: Buffer, offset: number) => number } | undefined
> = {
  c: { width: 1, read: (buf, offset) => buf.readInt8(offset) },
  C: { width: 1, read: (buf, offset) => buf.readUInt8(offset) },
  s: { width: 2, read: (buf, offset) => buf.readInt16LE(offset) },
  S: { width: 2, read: (buf, offset) => buf.readUInt16LE(offset) },
  i: { width: 4, read: (buf, offset) => buf.readInt32LE(offset) },
  I: { width: 4, read: (buf, offset) => buf.readUInt32LE(offset) },
  f: { width: 4, read: (buf, offset) => buf.readFloatLE(offset) },
}

/**
 * Class of each BAM record returned by this API.
 *
 * A record is a lazy view over a slice of a shared byte buffer: only the
 * reference id, start position and flags are decoded eagerly; every other
 * field is decoded on first access through `get()` and cached in `data`.
 */
export default class BamRecord {
  /** Cache of decoded fields and tags (tags are stored under lower-cased names). */
  private data = {} as Record<string, any>
  /** Backing buffer plus the offsets delimiting this record inside it. */
  private bytes: { start: number; end: number; byteArray: Buffer }
  private _id: number
  /** Offset of the first tag that has not been parsed yet. */
  private _tagOffset: number | undefined
  /** Tag names (original case) in the order they were encountered. */
  private _tagList: string[] = []
  private _allTagsParsed = false

  public flags: any
  public _refID: number

  /**
   * @param args - object with `bytes` ({ byteArray, start, end }) and
   *   `fileOffset`, which becomes the record id.
   */
  constructor(args: any) {
    const { bytes, fileOffset } = args
    const { byteArray, start } = bytes
    this.data = {}
    this.bytes = bytes
    this._id = fileOffset
    this._refID = byteArray.readInt32LE(start + 4)
    this.data.start = byteArray.readInt32LE(start + 8)
    // The flags occupy the upper 16 bits of the word at start + 16. Use an
    // unsigned read and shift so a set top bit cannot produce a negative value.
    this.flags = byteArray.readUInt32LE(start + 16) >>> 16
  }

  /**
   * Get a field or tag value by name.
   *
   * If this object has a method named `field`, its (cached) result is
   * returned; otherwise the lower-cased name is looked up in the cache and
   * then among the record's tags.
   */
  get(field: string) {
    const member: unknown = Reflect.get(this, field)
    // Only callable members are field parsers. Plain properties such as
    // `flags` or `_refID` must not be invoked.
    if (typeof member === 'function') {
      if (this.data[field]) {
        return this.data[field]
      }
      this.data[field] = member.call(this)
      return this.data[field]
    }
    return this._get(field.toLowerCase())
  }

  /** End coordinate: start plus the length the alignment covers on the reference. */
  end() {
    return this.get('start') + this.get('length_on_ref')
  }

  seq_id() {
    return this._refID
  }

  // same as get(), except requires lower-case arguments. used
  // internally to save lots of calls to field.toLowerCase()
  _get(field: string) {
    if (field in this.data) {
      return this.data[field]
    }
    this.data[field] = this._parseTag(field)
    return this.data[field]
  }

  /**
   * List the names that can be passed to `get()` for this record: built-in
   * fields, parsed tags and cached keys, de-duplicated case-insensitively.
   * Names cached as `undefined` and the CG tag are omitted.
   */
  _tags() {
    this._parseAllTags()

    let tags = ['seq']

    if (!this.isSegmentUnmapped()) {
      tags.push(
        'start',
        'end',
        'strand',
        'score',
        'qual',
        'MQ',
        'CIGAR',
        'length_on_ref',
        'template_length',
      )
    }
    if (this.isPaired()) {
      tags.push('next_segment_position', 'pair_orientation')
    }
    tags = tags.concat(this._tagList || [])

    for (const k of Object.keys(this.data)) {
      if (!k.startsWith('_') && k !== 'next_seq_id') {
        tags.push(k)
      }
    }

    const seen: Record<string, boolean> = {}
    return tags.filter(t => {
      if (
        (t in this.data && this.data[t] === undefined) ||
        t === 'CG' ||
        t === 'cg'
      ) {
        return false
      }

      const lt = t.toLowerCase()
      const s = seen[lt]
      seen[lt] = true
      return !s
    })
  }

  parent() {
    return
  }

  children() {
    return this.get('subfeatures')
  }

  id() {
    return this._id
  }

  // special parsers

  /**
   * Mapping quality score.
   * @returns the score, or undefined when the stored value is 255
   */
  mq() {
    const mq = (this.get('_bin_mq_nl') & 0xff00) >> 8
    return mq === 255 ? undefined : mq
  }

  score() {
    return this.get('mq')
  }

  /** Quality values joined with single spaces, or undefined if unavailable. */
  qual() {
    return this.qualRaw()?.join(' ')
  }

  /**
   * Raw quality bytes as a view into the backing buffer.
   * @returns undefined when the segment is flagged as unmapped
   */
  qualRaw() {
    if (this.isSegmentUnmapped()) {
      return
    }

    const { start, byteArray } = this.bytes
    // Qualities follow the read name, the CIGAR words and the packed sequence.
    const p =
      start +
      36 +
      this.get('_l_read_name') +
      this.get('_n_cigar_op') * 4 +
      this.get('_seq_bytes')
    const lseq = this.get('seq_length')
    return byteArray.subarray(p, p + lseq)
  }

  strand() {
    return this.isReverseComplemented() ? -1 : 1
  }

  multi_segment_next_segment_strand() {
    if (this.isMateUnmapped()) {
      return
    }
    return this.isMateReverseComplemented() ? -1 : 1
  }

  name() {
    return this.get('_read_name')
  }

  _read_name() {
    const nl = this.get('_l_read_name')
    const { byteArray, start } = this.bytes
    // nl - 1 leaves the last byte of the name field out of the string
    return byteArray.toString('ascii', start + 36, start + 36 + nl - 1)
  }

  /**
   * Get the value of a tag, parsing the tags as far as necessary.
   * Only called if we have not already parsed that field.
   *
   * Every tag passed over on the way is cached in `data` under its
   * lower-cased name, so later lookups do not need to re-scan.
   *
   * @param tagName - lower-cased tag name to stop at
   * @returns the tag value, or undefined if the record has no such tag
   */
  _parseTag(tagName?: string) {
    // if all of the tags have been parsed and we're still being
    // called, we already know that we have no such tag, because
    // it would already have been cached.
    if (this._allTagsParsed) {
      return
    }

    const { byteArray, start } = this.bytes
    let p =
      this._tagOffset ||
      start +
        36 +
        this.get('_l_read_name') +
        this.get('_n_cigar_op') * 4 +
        this.get('_seq_bytes') +
        this.get('seq_length')

    const blockEnd = this.bytes.end
    let lcTag: string | undefined
    while (p < blockEnd && lcTag !== tagName) {
      const tag = String.fromCharCode(byteArray[p], byteArray[p + 1])
      lcTag = tag.toLowerCase()
      const type = String.fromCharCode(byteArray[p + 2])
      p += 3

      let value: string | number | undefined
      switch (type) {
        case 'A': {
          value = String.fromCharCode(byteArray[p])
          p += 1
          break
        }
        case 'i': {
          value = byteArray.readInt32LE(p)
          p += 4
          break
        }
        case 'I': {
          value = byteArray.readUInt32LE(p)
          p += 4
          break
        }
        case 'c': {
          value = byteArray.readInt8(p)
          p += 1
          break
        }
        case 'C': {
          value = byteArray.readUInt8(p)
          p += 1
          break
        }
        case 's': {
          value = byteArray.readInt16LE(p)
          p += 2
          break
        }
        case 'S': {
          value = byteArray.readUInt16LE(p)
          p += 2
          break
        }
        case 'f': {
          value = byteArray.readFloatLE(p)
          p += 4
          break
        }
        case 'Z':
        case 'H': {
          // NUL-terminated string
          let text = ''
          while (p <= blockEnd) {
            const cc = byteArray[p++]
            if (cc === 0) {
              break
            }
            text += String.fromCharCode(cc)
          }
          value = text
          break
        }
        case 'B': {
          // Array tag: subtype character, 32-bit element count, then elements.
          const subtype = String.fromCharCode(byteArray[p++])
          const limit = byteArray.readInt32LE(p)
          p += 4

          let text = ''
          const reader = ARRAY_ELEMENT_READERS[subtype]
          if (reader) {
            // A 32-bit integer array tagged CG holds packed CIGAR words and is
            // rendered as a CIGAR string. Any other array is comma-separated.
            const isCigar = tag === 'CG' && (subtype === 'i' || subtype === 'I')
            const parts: string[] = []
            for (let k = 0; k < limit; k++) {
              if (isCigar) {
                // unsigned read and shift so long operations stay positive
                const cigop = byteArray.readUInt32LE(p)
                parts.push(`${cigop >>> 4}${CIGAR_DECODER[cigop & 0xf]}`)
              } else {
                parts.push(String(reader.read(byteArray, p)))
              }
              p += reader.width
            }
            text = parts.join(isCigar ? '' : ',')
          }
          value = text
          break
        }
        default: {
          console.warn(`Unknown BAM tag type '${type}', tags may be incomplete`)
          value = undefined
          p = blockEnd // stop parsing tags
        }
      }

      this._tagOffset = p

      this._tagList.push(tag)
      if (lcTag === tagName) {
        return value
      }
      this.data[lcTag] = value
    }
    this._allTagsParsed = true
    return
  }

  /** Parse every remaining tag into the cache ('' never matches a tag name). */
  _parseAllTags() {
    this._parseTag('')
  }

  /**
   * Split a CIGAR string into `[operation, length]` pairs, with the operation
   * upper-cased.
   * @returns an empty array when the string contains no operations
   */
  _parseCigar(cigar: string): [string, number][] {
    const ops = cigar.match(/\d+\D/g) ?? []
    // each match is digits followed by exactly one non-digit operation char
    return ops.map((op): [string, number] => [
      op.slice(-1).toUpperCase(),
      Number.parseInt(op, 10),
    ])
  }

  /**
   * @returns true if the read is paired, regardless of whether both segments are mapped
   */
  isPaired() {
    return !!(this.flags & Constants.BAM_FPAIRED)
  }

  /** @returns true if the read is paired, and both segments are mapped */
  isProperlyPaired() {
    return !!(this.flags & Constants.BAM_FPROPER_PAIR)
  }

  /** @returns true if the read itself is unmapped; conflictive with isProperlyPaired */
  isSegmentUnmapped() {
    return !!(this.flags & Constants.BAM_FUNMAP)
  }

  /** @returns true if the mate is unmapped; conflictive with isProperlyPaired */
  isMateUnmapped() {
    return !!(this.flags & Constants.BAM_FMUNMAP)
  }

  /** @returns true if the read is mapped to the reverse strand */
  isReverseComplemented() {
    return !!(this.flags & Constants.BAM_FREVERSE)
  }

  /** @returns true if the mate is mapped to the reverse strand */
  isMateReverseComplemented() {
    return !!(this.flags & Constants.BAM_FMREVERSE)
  }

  /** @returns true if this is read number 1 in a pair */
  isRead1() {
    return !!(this.flags & Constants.BAM_FREAD1)
  }

  /** @returns true if this is read number 2 in a pair */
  isRead2() {
    return !!(this.flags & Constants.BAM_FREAD2)
  }

  /** @returns true if this is a secondary alignment */
  isSecondary() {
    return !!(this.flags & Constants.BAM_FSECONDARY)
  }

  /** @returns true if this read has failed QC checks */
  isFailedQc() {
    return !!(this.flags & Constants.BAM_FQCFAIL)
  }

  /** @returns true if the read is an optical or PCR duplicate */
  isDuplicate() {
    return !!(this.flags & Constants.BAM_FDUP)
  }

  /** @returns true if this is a supplementary alignment */
  isSupplementary() {
    return !!(this.flags & Constants.BAM_FSUPPLEMENTARY)
  }

  /**
   * CIGAR string of the alignment. As a side effect, caches the length the
   * alignment covers on the reference in `data.length_on_ref`.
   * @returns undefined when the segment is flagged as unmapped
   */
  cigar() {
    if (this.isSegmentUnmapped()) {
      return
    }

    const { byteArray, start } = this.bytes
    const numCigarOps = this.get('_n_cigar_op')
    let p = start + 36 + this.get('_l_read_name')
    const seqLen = this.get('seq_length')

    // With no CIGAR operations the bytes at p belong to the next field, so
    // only inspect the first operation when one exists.
    if (numCigarOps > 0) {
      // check for CG tag by inspecting whether the CIGAR field
      // contains a clip that consumes entire seqLen
      const first = byteArray.readUInt32LE(p)
      if (CIGAR_DECODER[first & 0xf] === 'S' && first >>> 4 === seqLen) {
        // if there is a CG the second CIGAR field will
        // be a N tag the represents the length on ref
        const second = byteArray.readUInt32LE(p + 4)
        if (CIGAR_DECODER[second & 0xf] !== 'N') {
          console.warn('CG tag with no N tag')
        }
        this.data.length_on_ref = second >>> 4
        return this.get('CG')
      }
    }

    let cigar = ''
    let lref = 0
    for (let c = 0; c < numCigarOps; ++c) {
      // Each word is length << 4 | op. Decode unsigned so long operations
      // are not turned into negative lengths.
      const cigop = byteArray.readUInt32LE(p)
      const lop = cigop >>> 4
      const op = CIGAR_DECODER[cigop & 0xf]
      cigar += lop + op

      // soft clip, hard clip, and insertion don't count toward
      // the length on the reference
      if (op !== 'H' && op !== 'S' && op !== 'I') {
        lref += lop
      }

      p += 4
    }

    this.data.length_on_ref = lref
    return cigar
  }

  /** Length covered on the reference, computing the CIGAR first if needed. */
  length_on_ref() {
    if (this.data.length_on_ref) {
      return this.data.length_on_ref
    } else {
      this.get('cigar') // the length_on_ref is set as a side effect
      return this.data.length_on_ref
    }
  }

  _n_cigar_op() {
    return this.get('_flag_nc') & 0xffff
  }

  _l_read_name() {
    return this.get('_bin_mq_nl') & 0xff
  }

  /**
   * number of bytes in the sequence field
   */
  _seq_bytes() {
    return (this.get('seq_length') + 1) >> 1
  }

  getReadBases() {
    return this.seq()
  }

  /** Decode the packed sequence: two bases per byte, high nibble first. */
  seq() {
    const { byteArray, start } = this.bytes
    const p =
      start + 36 + this.get('_l_read_name') + this.get('_n_cigar_op') * 4
    const seqBytes = this.get('_seq_bytes')
    const len = this.get('seq_length')
    let buf = ''
    let i = 0
    for (let j = 0; j < seqBytes; ++j) {
      const sb = byteArray[p + j]
      buf += SEQRET_DECODER[(sb & 0xf0) >> 4]
      i++
      // the low nibble of the final byte is padding when len is odd
      if (i < len) {
        buf += SEQRET_DECODER[sb & 0x0f]
        i++
      }
    }
    return buf
  }

  // adapted from igv.js
  /**
   * Four-character pair orientation: strand (F/R) and read number (1/2, or a
   * space when neither read flag is set) for each segment of the pair. This
   * read comes first when the template length is positive, the mate
   * otherwise.
   * @returns '' unless both segments are mapped to the same reference
   */
  getPairOrientation() {
    if (
      !this.isSegmentUnmapped() &&
      !this.isMateUnmapped() &&
      this._refID === this._next_refid()
    ) {
      const s1 = this.isReverseComplemented() ? 'R' : 'F'
      const s2 = this.isMateReverseComplemented() ? 'R' : 'F'
      let o1 = ' '
      let o2 = ' '
      if (this.isRead1()) {
        o1 = '1'
        o2 = '2'
      } else if (this.isRead2()) {
        o1 = '2'
        o2 = '1'
      }

      const isize = this.template_length()
      return isize > 0 ? s1 + o1 + s2 + o2 : s2 + o2 + s1 + o1
    }
    return ''
  }

  _bin_mq_nl() {
    return this.bytes.byteArray.readInt32LE(this.bytes.start + 12)
  }

  _flag_nc() {
    return this.bytes.byteArray.readInt32LE(this.bytes.start + 16)
  }

  seq_length() {
    return this.bytes.byteArray.readInt32LE(this.bytes.start + 20)
  }

  _next_refid() {
    return this.bytes.byteArray.readInt32LE(this.bytes.start + 24)
  }

  _next_pos() {
    return this.bytes.byteArray.readInt32LE(this.bytes.start + 28)
  }

  template_length() {
    return this.bytes.byteArray.readInt32LE(this.bytes.start + 32)
  }

  /**
   * Plain-object snapshot of the own enumerable properties, skipping
   * underscore-prefixed ones and the raw `bytes`.
   */
  toJSON() {
    const data: Record<string, any> = {}
    for (const k of Object.keys(this)) {
      if (k.startsWith('_') || k === 'bytes') {
        continue
      }
      data[k] = Reflect.get(this, k)
    }
    return data
  }
}