UNPKG

@gmod/jbrowse

Version:

JBrowse - client-side genome browser

482 lines (437 loc) • 16.3 kB

JavaScript

define( [
            'dojo/_base/array',
            'JBrowse/Util',
            'JBrowse/Digest/Crc32',
            './Util',
            'JBrowse/Model/SimpleFeature'
        ],
        function( array, Util, Crc32, BAMUtil, SimpleFeature ) {

// Lookup from a 4-bit packed base code to its letter.  The SAM/BAM
// specification defines the alphabet as "=ACMGRSVTWYHKDBN", so the
// IUPAC ambiguity codes are decoded to their real letters.
var SEQRET_DECODER = ['=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'];

// Lookup from the low 4 bits of a packed CIGAR operation to its letter.
var CIGAR_DECODER  = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', '?', '?', '?', '?', '?', '?', '?'];

// Binary readers supplied by the sibling ./Util module (not visible
// here); each is called as reader( byteArray, offset ).
var readInt   = BAMUtil.readInt;
var readShort = BAMUtil.readShort;
var readFloat = BAMUtil.readFloat;
var readByte  = BAMUtil.readByte;

// Element reader and element width in bytes for each supported
// 'B' (array) tag subtype.
var B_ARRAY_READERS = {
    'i': { read: readInt,   size: 4 },
    'I': { read: readInt,   size: 4 },
    's': { read: readShort, size: 2 },
    'S': { read: readShort, size: 2 },
    'c': { read: readByte,  size: 1 },
    'C': { read: readByte,  size: 1 },
    'f': { read: readFloat, size: 4 }
};

/**
 * Read the payload of a 'B' (array) tag.
 *
 * @param {Array|Uint8Array} byteArray bytes holding the record
 * @param {Number} p offset of the array subtype byte
 * @param {Number} blockEnd end offset of the record
 * @returns {Object} { value, next }: the elements joined with commas,
 *   and the offset of the first byte after the array
 */
function readArrayTag( byteArray, p, blockEnd ) {
    var subtype = String.fromCharCode( byteArray[p] );
    p += 1;

    var reader = B_ARRAY_READERS.hasOwnProperty( subtype ) ? B_ARRAY_READERS[subtype] : undefined;
    if( ! reader ) {
        // without a known element width the following tags cannot be
        // located, so give up on the rest of the record
        console.warn( "Unknown BAM array tag subtype '"+subtype+"', tags may be incomplete" );
        return { value: '', next: blockEnd };
    }

    var count = readInt( byteArray, p );
    p += 4;

    var items = [];
    for( var k = 0; k < count; k++ ) {
        items.push( String( reader.read( byteArray, p ) ) );
        p += reader.size;
    }
    return { value: items.join(','), next: p };
}

/**
 * Feature backed by the raw bytes of one BAM alignment record.  Only
 * the reference ID and start are decoded eagerly; every other field
 * is decoded from the byte array the first time it is requested and
 * then cached in this.data.
 */
var Feature = Util.fastDeclare(
{
    /**
     * @param {Object} args
     * @param {Object} args.file   object whose indexToChr array maps a
     *                             reference ID to an object with a name
     * @param {Object} args.store  object whose source is copied into the
     *                             feature data
     * @param {Object} args.bytes  { start, end, byteArray } locating
     *                             this record inside byteArray
     */
    constructor: function( args ) {
        this.file  = args.file;
        this.data  = {
            type: 'match',
            source: args.store.source
        };
        this.bytes = {
            start: args.bytes.start,
            end: args.bytes.end,
            byteArray: args.bytes.byteArray
        };

        this._coreParse();
    },

    /**
     * Get the value of a field; the field name is case-insensitive.
     */
    get: function( field ) {
        return this._get( field.toLowerCase() );
    },

    // same as get(), except requires lower-case arguments.  used
    // internally to save lots of calls to field.toLowerCase()
    _get: function( field ) {
        // have we already parsed it out?
        if( field in this.data )
            return this.data[field];

        var value =
            this[field]            ? this[field]()             : // maybe we have a special parser for it
            this._flagMasks[field] ? this._parseFlag( field )  : // or is it a flag?
                                     this._parseTag( field );    // otherwise, look for it in the tags

        // cache the result (even undefined) so it is computed only once
        this.data[field] = value;
        return value;
    },

    /**
     * List the names of the fields available on this feature.
     */
    tags: function() {
        return this._get('_tags');
    },

    _tags: function() {
        this._parseAllTags();

        var tags = [ 'seq', 'seq_reverse_complemented', 'unmapped','qc_failed','duplicate','secondary_alignment','supplementary_alignment' ];

        if( ! this._get('unmapped') )
            tags.push( 'start', 'end', 'strand', 'score', 'qual', 'MQ', 'CIGAR', 'length_on_ref', 'template_length' );
        if( this._get('multi_segment_template') ) {
            tags.push( 'multi_segment_all_correctly_aligned',
                       'multi_segment_next_segment_unmapped',
                       'multi_segment_next_segment_reversed',
                       'multi_segment_first',
                       'multi_segment_last',
                       'next_segment_position'
                     );
        }
        tags = tags.concat( this._tagList || [] );

        // add everything already cached, except internal (underscore)
        // fields and the two alias fields
        var d = this.data;
        for( var k in d ) {
            if( d.hasOwnProperty( k ) && k[0] != '_'
                && k != 'multi_segment_all_aligned'
                && k != 'next_seq_id')
                tags.push( k );
        }

        // drop fields known to be undefined, and de-duplicate
        // case-insensitively, keeping the first spelling seen
        var seen = {};
        tags = array.filter( tags, function(t) {
            if( t in this.data && this.data[t] === undefined )
                return false;

            var lt = t.toLowerCase();
            var s = seen[lt];
            seen[lt] = true;
            return ! s;
        },this);

        return tags;
    },

    parent: function() {
        return undefined;
    },

    children: function() {
        return this._get('subfeatures');
    },

    /**
     * Identifier computed by Crc32.crc32_raw over this record's bytes.
     */
    id: function() {
        return Crc32.crc32_raw(this.bytes.byteArray, this.bytes.start, this.bytes.end);
    },

    // alias for multi_segment_all_correctly_aligned
    multi_segment_all_aligned: function() {
        return this._get('multi_segment_all_correctly_aligned');
    },

    // special parsers

    /**
     * Mapping quality score.  A stored value of 255 is reported as
     * undefined.
     */
    mq: function() {
        var mq = (this._get('_bin_mq_nl') & 0xff00) >> 8;
        return mq === 255 ? undefined : mq;
    },

    score: function() {
        return this._get('mq');
    },

    /**
     * Per-base quality values joined with single spaces, or undefined
     * for unmapped reads.
     */
    qual: function() {
        if( this._get('unmapped') )
            return undefined;

        var qseq = [];
        var byteArray = this.bytes.byteArray;
        // qualities follow the fixed 36-byte header, the read name,
        // the CIGAR operations and the packed sequence
        var p = this.bytes.start + 36 + this._get('_l_read_name') + this._get('_n_cigar_op')*4 + this._get('_seq_bytes');
        var lseq = this._get('seq_length');
        for (var j = 0; j < lseq; ++j) {
            qseq.push( byteArray[p + j] );
        }
        return qseq.join(' ');
    },

    strand: function() {
        return this._get('seq_reverse_complemented') ? -1 : 1;
    },

    multi_segment_next_segment_strand: function() {
        if( this._get('multi_segment_next_segment_unmapped') )
            return undefined;
        return this._get('multi_segment_next_segment_reversed') ? -1 : 1;
    },

    /**
     * Length in characters of the read name.
     */
    _l_read_name: function() {
        return this._get('_bin_mq_nl') & 0xff;
    },

    /**
     * number of bytes in the sequence field
     */
    _seq_bytes: function() {
        return (this._get('seq_length') + 1) >> 1;
    },

    /**
     * Decode the packed sequence: two bases per byte, high nibble
     * first.
     */
    seq: function() {
        var seq = '';
        var byteArray = this.bytes.byteArray;
        var p = this.bytes.start + 36 + this._get('_l_read_name') + this._get('_n_cigar_op')*4;
        var seqBytes  = this._get('_seq_bytes');
        var seqLength = this._get('seq_length');
        for (var j = 0; j < seqBytes; ++j) {
            var sb = byteArray[p + j];
            seq += SEQRET_DECODER[(sb & 0xf0) >> 4];
            // for an odd-length read the low nibble of the last byte
            // is padding and must not be emitted
            if (seq.length < seqLength)
                seq += SEQRET_DECODER[(sb & 0x0f)];
        }
        return seq;
    },

    name: function() {
        return this._get('_read_name');
    },

    _read_name: function() {
        var byteArray = this.bytes.byteArray;
        var readName = '';
        var nl = this._get('_l_read_name');
        var p = this.bytes.start + 36;
        // the final byte counted by _l_read_name is not part of the name
        for (var j = 0; j < nl-1; ++j) {
            readName += String.fromCharCode(byteArray[p+j]);
        }
        return readName;
    },

    _n_cigar_op: function() {
        return this._get('_flag_nc') & 0xffff;
    },

    /**
     * Build the CIGAR string, or return undefined for unmapped reads.
     * As a side effect, stores the length on the reference in
     * this.data.length_on_ref.
     */
    cigar: function() {
        if( this._get('unmapped') )
            return undefined;

        var byteArray   = this.bytes.byteArray;
        var numCigarOps = this._get('_n_cigar_op');
        var p = this.bytes.start + 36 + this._get('_l_read_name');
        var cigar = '';
        var lref = 0;
        for (var c = 0; c < numCigarOps; ++c) {
            var cigop = readInt(byteArray, p);
            // unsigned shift: the length occupies the upper 28 bits
            // and must never come out negative
            var lop = cigop >>> 4;
            var op = CIGAR_DECODER[cigop & 0xf];
            cigar += lop + op;

            // soft clip, hard clip, and insertion don't count toward
            // the length on the reference
            if( op !== 'H' && op !== 'S' && op !== 'I' )
                lref += lop;

            p += 4;
        }

        this.data.length_on_ref = lref;
        return cigar;
    },

    next_segment_position: function() {
        // NOTE: next_segment_position is a JBrowse location string, so
        // it is in 1-based coordinates. Thus, we add 1 to the position.
        var nextSegment = this.file.indexToChr[this._get('_next_refid')];
        if( nextSegment )
            return nextSegment.name+':'+(parseInt(this._get('_next_pos'), 10)+1);
        else
            return undefined;
    },

    subfeatures: function() {
        var cigar = this._get('cigar');
        if( cigar )
            return this._cigarToSubfeats( cigar );

        return undefined;
    },

    length_on_ref: function() {
        // the length_on_ref is set as a side effect of the CIGAR parsing
        this._get('cigar');
        return this.data.length_on_ref;
    },

    /**
     * The flag bits: the upper 16 bits of the _flag_nc word.
     */
    _flags: function() {
        return this._get('_flag_nc') >>> 16;
    },

    /**
     * start plus the length on the reference, falling back to the
     * sequence length.  NOTE: evaluates to NaN when both are falsy.
     */
    end: function() {
        return this._get('start') + ( this._get('length_on_ref') || this._get('seq_length') || undefined );
    },

    seq_id: function() {
        if( this._get('unmapped') )
            return undefined;

        return ( this.file.indexToChr[ this._refID ] || {} ).name;
    },

    next_seq_id: function() {
        if( this._get('multi_segment_next_segment_unmapped') )
            return undefined;
        return ( this.file.indexToChr[this._get('_next_refid')] || {} ).name;
    },

    // readers for the fixed-position words of the record header
    _bin_mq_nl: function() {
        return readInt( this.bytes.byteArray, this.bytes.start + 12 );
    },
    _flag_nc: function() {
        return readInt( this.bytes.byteArray, this.bytes.start + 16 );
    },
    seq_length: function() {
        return readInt( this.bytes.byteArray, this.bytes.start + 20 );
    },
    _next_refid: function() {
        return readInt( this.bytes.byteArray, this.bytes.start + 24 );
    },
    _next_pos: function() {
        return readInt( this.bytes.byteArray, this.bytes.start + 28 );
    },
    template_length: function() {
        return readInt( this.bytes.byteArray, this.bytes.start + 32 );
    },

    /**
     * parse the core data: ref ID and start
     */
    _coreParse: function() {
        this._refID      = readInt( this.bytes.byteArray, this.bytes.start + 4 );
        this.data.start  = readInt( this.bytes.byteArray, this.bytes.start + 8 );
    },

    /**
     * Get the value of a tag, parsing the tags as far as necessary.
     * Only called if we have not already parsed that field.
     *
     * Parsing resumes where the previous call stopped.  Every tag
     * passed over on the way is cached in this.data under its
     * lower-cased name, and its original name is appended to
     * this._tagList.
     *
     * @param {String} [tagName] lower-case tag name to look for; when
     *   omitted, all remaining tags are parsed and cached
     * @returns the tag value, or undefined if there is no such tag
     */
    _parseTag: function( tagName ) {
        // if all of the tags have been parsed and we're still being
        // called, we already know that we have no such tag, because
        // it would already have been cached.
        if( this._allTagsParsed )
            return undefined;

        this._tagList = this._tagList || [];
        var byteArray = this.bytes.byteArray;
        var p = this._tagOffset || this.bytes.start + 36 + this._get('_l_read_name') + this._get('_n_cigar_op')*4 + this._get('_seq_bytes') + this._get('seq_length');

        var blockEnd = this.bytes.end;
        // The loop exits early by returning on a match.  It must not
        // also compare against tagName in its condition: with tagName
        // omitted that comparison is undefined-vs-undefined before
        // the first iteration, and nothing would ever be parsed.
        while( p < blockEnd ) {
            var tag   = String.fromCharCode( byteArray[p], byteArray[ p+1 ] );
            var lcTag = tag.toLowerCase();
            var type  = String.fromCharCode( byteArray[ p+2 ] );
            p += 3;

            var value;
            switch( type.toLowerCase() ) {
            case 'a':
                value = String.fromCharCode( byteArray[p] );
                p += 1;
                break;
            case 'i':
                value = readInt( byteArray, p );
                p += 4;
                break;
            case 'c':
                value = byteArray[p];
                p += 1;
                break;
            case 's':
                value = readShort( byteArray, p );
                p += 2;
                break;
            case 'f':
                value = readFloat( byteArray, p );
                p += 4;
                break;
            case 'z':
            case 'h':
                // NUL-terminated string
                value = '';
                while( p <= blockEnd ) {
                    var cc = byteArray[p++];
                    if( cc === 0 ) {
                        break;
                    }
                    else {
                        value += String.fromCharCode(cc);
                    }
                }
                break;
            case 'b':
                var arrayTag = readArrayTag( byteArray, p, blockEnd );
                value = arrayTag.value;
                p = arrayTag.next;
                break;
            default:
                console.warn( "Unknown BAM tag type '"+type
                              +"', tags may be incomplete"
                            );
                value = undefined;
                p = blockEnd; // stop parsing tags
            }

            this._tagOffset = p;

            this._tagList.push( tag );
            if( lcTag === tagName )
                return value;

            this.data[ lcTag ] = value;
        }
        this._allTagsParsed = true;
        return undefined;
    },

    _parseAllTags: function() {
        // calling _parseTag with no arg parses and caches all of the
        // remaining tags
        this._parseTag();
    },

    // bit masks of the named flags within the value of _flags
    _flagMasks: {
        multi_segment_template:              0x1,
        multi_segment_all_correctly_aligned: 0x2,
        unmapped:                            0x4,
        multi_segment_next_segment_unmapped: 0x8,
        seq_reverse_complemented:            0x10,
        multi_segment_next_segment_reversed: 0x20,
        multi_segment_first:                 0x40,
        multi_segment_last:                  0x80,
        secondary_alignment:                 0x100,
        qc_failed:                           0x200,
        duplicate:                           0x400,
        supplementary_alignment:             0x800
    },

    _parseFlag: function( flagName ) {
        return !!( this._get('_flags') & this._flagMasks[flagName] );
    },

    /**
     * Split a CIGAR string into [ OP, length ] pairs, with OP
     * upper-cased.  Returns an empty array when nothing matches.
     */
    _parseCigar: function( cigar ) {
        return array.map( cigar.match(/\d+\D/g) || [], function( op ) {
            return [ op.match(/\D/)[0].toUpperCase(), parseInt( op, 10 ) ];
        });
    },

    /**
     *  take a cigar string, and initial position, return an array of subfeatures
     */
    _cigarToSubfeats: function(cigar) {
        var subfeats = [];
        var strand = this._get('strand');
        var min = this._get('start');
        // max starts at min so that operations which do not advance
        // along the reference (clipping, padding) yield a zero-width
        // span even when they come first, instead of leaving the
        // coordinates undefined for the rest of the alignment
        var max = min;
        var ops = this._parseCigar( cigar );
        for (var i = 0; i < ops.length; i++) {
            var lop = ops[i][1];
            var op  = ops[i][0];  // operation type
            // converting "=" to "E" to avoid possible problems later with non-alphanumeric type name
            if (op === "=") { op = "E"; }

            switch (op) {
            case 'M':
            case 'D':
            case 'N':
            case 'E':
            case 'X':
                max = min + lop;
                break;
            case 'I':
                max = min;
                break;
            case 'P':  // padding: does not advance along the reference
            case 'H':  // hard clipping: unaligned, does not advance
            case 'S':  // soft clipping: unaligned, does not advance
                break;
                // other possible cases
            }
            // skipped regions ('N') advance the position but get no subfeature
            if( op !== 'N' ) {
                subfeats.push(
                    new SimpleFeature(
                        {
                            data: {
                                type: op,
                                start: min,
                                end: max,
                                strand: strand,
                                cigar_op: lop+op
                            },
                            parent: this
                        })
                );
            }
            min = max;
        }
        return subfeats;
    }

});

return Feature;
});