@gmod/tabix

Read Tabix-indexed files; supports both .tbi and .csi indexes.
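A minimal usage sketch for orientation (assuming the package's documented TabixIndexedFile entry point; the file paths here are placeholders):

    import { TabixIndexedFile } from '@gmod/tabix'

    const file = new TabixIndexedFile({
      path: 'my.vcf.gz',
      tbiPath: 'my.vcf.gz.tbi', // or csiPath for a .csi index
    })
    const lines: string[] = []
    await file.getLines('ctgA', 200, 300, line => lines.push(line))

The source below is the .tbi index parser used underneath that API.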

import { unzip } from '@gmod/bgzf-filehandle'

import Chunk from './chunk.ts'
import IndexFile, { Options } from './indexFile.ts'
import { longFromBytesToUnsigned } from './long.ts'
import { checkAbortSignal, optimizeChunks } from './util.ts'
import VirtualOffset, { fromBytes } from './virtualOffset.ts'

const TBI_MAGIC = 21578324 // TBI\1
const TAD_LIDX_SHIFT = 14

/**
 * calculate the list of bins that may overlap with region [beg,end)
 * (zero-based half-open)
 */
function reg2bins(beg: number, end: number) {
  beg += 1 // < convert to 1-based closed
  end -= 1
  return [
    [0, 0],
    [1 + (beg >> 26), 1 + (end >> 26)],
    [9 + (beg >> 23), 9 + (end >> 23)],
    [73 + (beg >> 20), 73 + (end >> 20)],
    [585 + (beg >> 17), 585 + (end >> 17)],
    [4681 + (beg >> 14), 4681 + (end >> 14)],
  ] as const
}
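// Illustrative example (not in the original source): for the zero-based
// half-open region [1000000, 1001000), reg2bins returns
// [[0, 0], [1, 1], [9, 9], [73, 73], [585, 585], [4742, 4742]], one candidate
// bin range per level of the 5-level binning scheme with 16 kb leaf bins.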
export default class TabixIndex extends IndexFile {
  async lineCount(refName: string, opts: Options = {}) {
    const indexData = await this.parse(opts)
    const refId = indexData.refNameToId[refName]
    if (refId === undefined) {
      return -1
    }
    const idx = indexData.indices[refId]
    if (!idx) {
      return -1
    }
    return idx.stats?.lineCount ?? -1
  }

  // fetch and parse the index
  async _parse(opts: Options = {}) {
    const buf = await this.filehandle.readFile(opts)
    const bytes = await unzip(buf)
    checkAbortSignal(opts.signal)

    const dataView = new DataView(bytes.buffer)
    const magic = dataView.getUint32(0, true)
    if (magic !== TBI_MAGIC /* "TBI\1" */) {
      throw new Error('Not a TBI file')
    }

    // number of reference sequences in the index
    const refCount = dataView.getUint32(4, true)
    const formatFlags = dataView.getUint32(8, true)
    const coordinateType =
      formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
    const formatOpts: Record<number, string> = {
      0: 'generic',
      1: 'SAM',
      2: 'VCF',
    }
    const format = formatOpts[formatFlags & 0xf]
    if (!format) {
      throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
    }
    const columnNumbers = {
      ref: dataView.getInt32(12, true),
      start: dataView.getInt32(16, true),
      end: dataView.getInt32(20, true),
    }
    const metaValue = dataView.getInt32(24, true)
    const depth = 5
    const maxBinNumber = ((1 << ((depth + 1) * 3)) - 1) / 7
    const maxRefLength = 2 ** (14 + depth * 3)
    const metaChar = metaValue ? String.fromCharCode(metaValue) : null
    const skipLines = dataView.getInt32(28, true)

    // read sequence dictionary
    const nameSectionLength = dataView.getInt32(32, true)
    const { refNameToId, refIdToName } = this._parseNameBytes(
      bytes.slice(36, 36 + nameSectionLength),
    )

    // read the indexes for each reference sequence
    let currOffset = 36 + nameSectionLength
    let firstDataLine: VirtualOffset | undefined
    const indices = new Array(refCount).fill(0).map(() => {
      // the binning index
      const binCount = dataView.getInt32(currOffset, true)
      currOffset += 4
      const binIndex: Record<number, Chunk[]> = {}
      let stats
      for (let j = 0; j < binCount; j += 1) {
        const bin = dataView.getUint32(currOffset, true)
        currOffset += 4
        if (bin > maxBinNumber + 1) {
          throw new Error(
            'tabix index contains too many bins, please use a CSI index',
          )
        } else if (bin === maxBinNumber + 1) {
          // pseudo-bin carrying per-reference statistics
          const chunkCount = dataView.getInt32(currOffset, true)
          currOffset += 4
          if (chunkCount === 2) {
            stats = this.parsePseudoBin(bytes, currOffset)
          }
          currOffset += 16 * chunkCount
        } else {
          const chunkCount = dataView.getInt32(currOffset, true)
          currOffset += 4
          const chunks = new Array(chunkCount)
          for (let k = 0; k < chunkCount; k += 1) {
            const u = fromBytes(bytes, currOffset)
            const v = fromBytes(bytes, currOffset + 8)
            currOffset += 16
            firstDataLine = this._findFirstData(firstDataLine, u)
            chunks[k] = new Chunk(u, v, bin)
          }
          binIndex[bin] = chunks
        }
      }

      // the linear index
      const linearCount = dataView.getInt32(currOffset, true)
      currOffset += 4
      const linearIndex = new Array(linearCount)
      for (let k = 0; k < linearCount; k += 1) {
        linearIndex[k] = fromBytes(bytes, currOffset)
        currOffset += 8
        firstDataLine = this._findFirstData(firstDataLine, linearIndex[k])
      }
      return {
        binIndex,
        linearIndex,
        stats,
      }
    })

    return {
      indices,
      metaChar,
      maxBinNumber,
      maxRefLength,
      skipLines,
      firstDataLine,
      columnNumbers,
      coordinateType,
      format,
      refIdToName,
      refNameToId,
      maxBlockSize: 1 << 16,
    }
  }

  parsePseudoBin(bytes: Uint8Array, offset: number) {
    return {
      lineCount: longFromBytesToUnsigned(bytes, offset + 16),
    }
  }

  _parseNameBytes(namesBytes: Uint8Array) {
    let currRefId = 0
    let currNameStart = 0
    const refIdToName: string[] = []
    const refNameToId: Record<string, number> = {}
    const decoder = new TextDecoder('utf8')
    for (let i = 0; i < namesBytes.length; i += 1) {
      if (!namesBytes[i]) {
        if (currNameStart < i) {
          const refName = this.renameRefSeq(
            decoder.decode(namesBytes.subarray(currNameStart, i)),
          )
          refIdToName[currRefId] = refName
          refNameToId[refName] = currRefId
        }
        currNameStart = i + 1
        currRefId += 1
      }
    }
    return {
      refNameToId,
      refIdToName,
    }
  }
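  // Illustrative note (not in the original source): the name section is a
  // run of NUL-terminated reference names, so bytes spelling "ctgA\0ctgB\0"
  // parse to refIdToName = ['ctgA', 'ctgB'] and
  // refNameToId = { ctgA: 0, ctgB: 1 }, after any renameRefSeq remapping.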
  async blocksForRange(
    refName: string,
    min: number,
    max: number,
    opts: Options = {},
  ) {
    if (min < 0) {
      min = 0
    }
    const indexData = await this.parse(opts)
    const refId = indexData.refNameToId[refName]
    if (refId === undefined) {
      return []
    }
    const ba = indexData.indices[refId]
    if (!ba) {
      return []
    }

    const minOffset = ba.linearIndex.length
      ? ba.linearIndex[
          min >> TAD_LIDX_SHIFT >= ba.linearIndex.length
            ? ba.linearIndex.length - 1
            : min >> TAD_LIDX_SHIFT
        ]
      : new VirtualOffset(0, 0)
    if (!minOffset) {
      console.warn('querying outside of possible tabix range')
    }

    const overlappingBins = reg2bins(min, max) // List of bin #s that overlap min, max
    const chunks: Chunk[] = []

    // Find chunks in overlapping bins. Leaf bins (< 4681) are not pruned
    for (const [start, end] of overlappingBins) {
      for (let bin = start; bin <= end; bin++) {
        if (ba.binIndex[bin]) {
          for (const c of ba.binIndex[bin]) {
            chunks.push(new Chunk(c.minv, c.maxv, bin))
          }
        }
      }
    }

    // Use the linear index to find minimum file position of chunks that could
    // contain alignments in the region
    const nintv = ba.linearIndex.length
    let lowest = null
    const minLin = Math.min(min >> 14, nintv - 1)
    const maxLin = Math.min(max >> 14, nintv - 1)
    for (let i = minLin; i <= maxLin; ++i) {
      const vp = ba.linearIndex[i]
      if (vp) {
        if (!lowest || vp.compareTo(lowest) < 0) {
          lowest = vp
        }
      }
    }

    return optimizeChunks(chunks, lowest)
  }
}
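// Usage sketch (illustrative, not part of the original file): TabixIndex is
// normally constructed for you by this package's TabixIndexedFile, but
// assuming a filehandle from the generic-filehandle package, it could be
// driven directly along these lines:
//
//   import { LocalFile } from 'generic-filehandle'
//   const index = new TabixIndex({ filehandle: new LocalFile('my.vcf.gz.tbi') })
//   const chunks = await index.blocksForRange('ctgA', 200, 300)
//   // each Chunk pairs minv/maxv VirtualOffsets into the bgzipped data file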