@gmod/tabix
Version:
Read Tabix-indexed files, supports both .tbi and .csi indexes
270 lines (245 loc) • 7.78 kB
text/typescript
import { unzip } from '@gmod/bgzf-filehandle'
import Chunk from './chunk.ts'
import IndexFile, { Options } from './indexFile.ts'
import { longFromBytesToUnsigned } from './long.ts'
import { optimizeChunks } from './util.ts'
import VirtualOffset, { fromBytes } from './virtualOffset.ts'
// Magic numbers found in the first four bytes of a CSI index, expressed as the
// little-endian uint32 that the parser reads from offset 0.
const CSI1_MAGIC = 0x01495343 // "CSI\1"
const CSI2_MAGIC = 0x02495343 // "CSI\2"

// Preset names selected by the low four bits of the format flags word.
const formats = { 0: 'generic', 1: 'SAM', 2: 'VCF' }
/**
 * Arithmetic left shift: scales `num` by two raised to `bits`.
 *
 * Uses ordinary multiplication instead of the `<<` operator, so the result is
 * not truncated to a 32-bit integer.
 */
function lshift(num: number, bits: number) {
  const factor = Math.pow(2, bits)
  return num * factor
}
/**
 * Arithmetic right shift: divides `num` by two raised to `bits` and rounds
 * toward negative infinity.
 *
 * Uses ordinary division instead of the `>>` operator, so operands are not
 * truncated to 32-bit integers.
 */
function rshift(num: number, bits: number) {
  const divisor = Math.pow(2, bits)
  return Math.floor(num / divisor)
}
/**
 * Reader for CSI-format index files (magic `CSI\1` or `CSI\2`).
 *
 * The index is fetched and decompressed in `_parse`; the binning parameters
 * read from its header (`minShift`, `depth`) are cached on the instance and
 * used by `reg2bins` to translate coordinate ranges into bin-number ranges.
 */
export default class CSI extends IndexFile {
  /** Largest real bin number for the current depth; larger ids are pseudo-bins. */
  private maxBinNumber: number
  /** Number of binning levels below the root, read from the index header. */
  private depth: number
  /** log2 of the smallest bin's span, read from the index header. */
  private minShift: number

  constructor(args: any) {
    super(args)
    // Placeholders until `_parse` reads the real values from the index header.
    this.maxBinNumber = 0
    this.depth = 0
    this.minShift = 0
  }

  /**
   * Look up the line count recorded in the index for a reference sequence.
   *
   * @param refName - reference sequence name
   * @param opts - options forwarded to `parse`
   * @returns the recorded line count, or -1 when the reference is unknown or
   *   the index holds no stats pseudo-bin for it
   */
  async lineCount(refName: string, opts: Options = {}): Promise<number> {
    const indexData = await this.parse(opts)
    const refId = indexData.refNameToId[refName]
    if (refId === undefined) {
      return -1
    }
    const idx = indexData.indices[refId]
    if (!idx) {
      return -1
    }
    const { stats } = idx
    if (stats) {
      return stats.lineCount
    }
    return -1
  }

  /**
   * Not available for CSI indexes.
   *
   * @throws Error always
   */
  indexCov() {
    throw new Error('CSI indexes do not support indexcov')
  }

  /**
   * Decode the auxiliary (Tabix) header block embedded in the index.
   *
   * Layout, as little-endian int32 values starting at `offset`: format flags,
   * ref column, start column, end column, meta character, skip-line count,
   * length of the names section; the NUL-separated names follow at
   * `offset + 28`.
   *
   * @param bytes - decompressed index bytes
   * @param offset - byte position of the auxiliary block within `bytes`
   * @returns parsed column configuration, format, and reference name maps
   * @throws Error if the low four bits of the format flags are not a known preset
   */
  parseAuxData(bytes: Uint8Array, offset: number) {
    // Honour byteOffset/byteLength so a `bytes` that is a view into a larger
    // buffer is read at the right position.
    const dataView = new DataView(
      bytes.buffer,
      bytes.byteOffset,
      bytes.byteLength,
    )
    const formatFlags = dataView.getInt32(offset, true)
    const coordinateType =
      formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
    const format = formats[(formatFlags & 0xf) as 0 | 1 | 2]
    if (!format) {
      throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
    }
    const columnNumbers = {
      ref: dataView.getInt32(offset + 4, true),
      start: dataView.getInt32(offset + 8, true),
      end: dataView.getInt32(offset + 12, true),
    }
    const metaValue = dataView.getInt32(offset + 16, true)
    // A zero meta value means "no meta character".
    const metaChar = metaValue ? String.fromCharCode(metaValue) : null
    const skipLines = dataView.getInt32(offset + 20, true)
    const nameSectionLength = dataView.getInt32(offset + 24, true)
    const { refIdToName, refNameToId } = this._parseNameBytes(
      bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
    )
    return {
      refIdToName,
      refNameToId,
      skipLines,
      metaChar,
      columnNumbers,
      format,
      coordinateType,
    }
  }

  /**
   * Split a block of NUL-terminated names into id<->name lookup tables.
   *
   * Each NUL byte advances the reference id, so an empty name still consumes
   * an id but is not recorded. Names are passed through `renameRefSeq` before
   * being stored.
   *
   * @param namesBytes - the raw names section
   * @returns `refIdToName` (indexed by id) and `refNameToId`
   */
  _parseNameBytes(namesBytes: Uint8Array) {
    let currRefId = 0
    let currNameStart = 0
    const refIdToName: string[] = []
    const refNameToId: Record<string, number> = {}
    const decoder = new TextDecoder('utf8')
    for (let i = 0; i < namesBytes.length; i += 1) {
      if (!namesBytes[i]) {
        // Reached a NUL terminator; record the name if it is non-empty.
        if (currNameStart < i) {
          const refName = this.renameRefSeq(
            decoder.decode(namesBytes.subarray(currNameStart, i)),
          )
          refIdToName[currRefId] = refName
          refNameToId[refName] = currRefId
        }
        currNameStart = i + 1
        currRefId += 1
      }
    }
    return {
      refNameToId,
      refIdToName,
    }
  }

  /**
   * Fetch, decompress and parse the index.
   *
   * Side effect: stores `minShift`, `depth` and `maxBinNumber` on the
   * instance for later use by `reg2bins`.
   *
   * @param opts - options forwarded to the file handle's `readFile`
   * @returns the parsed index: auxiliary header fields, per-reference bin
   *   indexes and stats, and the binning parameters
   * @throws Error if the file does not start with a CSI magic number
   */
  async _parse(opts: Options = {}) {
    const bytes = await unzip(await this.filehandle.readFile(opts))
    // Honour byteOffset/byteLength in case `bytes` is a view into a larger
    // buffer.
    const dataView = new DataView(
      bytes.buffer,
      bytes.byteOffset,
      bytes.byteLength,
    )

    // check CSI magic numbers
    const magic = dataView.getUint32(0, true)
    let csiVersion
    if (magic === CSI1_MAGIC) {
      csiVersion = 1
    } else if (magic === CSI2_MAGIC) {
      csiVersion = 2
    } else {
      throw new Error('Not a CSI file')
    }

    this.minShift = dataView.getInt32(4, true)
    this.depth = dataView.getInt32(8, true)
    // (8^(depth+1) - 1) / 7. Computed with `**` rather than `<<`, because the
    // shift operator masks its count to 5 bits and would wrap once
    // (depth + 1) * 3 reaches 31, i.e. for depth >= 10.
    this.maxBinNumber = (2 ** ((this.depth + 1) * 3) - 1) / 7
    const maxRefLength = 2 ** (this.minShift + this.depth * 3)

    const auxLength = dataView.getInt32(12, true)
    // Auxiliary blocks shorter than 30 bytes are not parsed; defaults are
    // used instead.
    const aux =
      auxLength >= 30
        ? this.parseAuxData(bytes, 16)
        : {
            refIdToName: [],
            refNameToId: {},
            metaChar: null,
            columnNumbers: { ref: 0, start: 1, end: 2 },
            coordinateType: 'zero-based-half-open',
            format: 'generic',
          }
    const refCount = dataView.getInt32(16 + auxLength, true)

    // read the indexes for each reference sequence
    let firstDataLine: VirtualOffset | undefined
    let currOffset = 16 + auxLength + 4
    const indices = new Array(refCount).fill(0).map(() => {
      const binCount = dataView.getInt32(currOffset, true)
      currOffset += 4
      const binIndex: Record<string, Chunk[]> = {}
      let stats
      for (let j = 0; j < binCount; j += 1) {
        const bin = dataView.getUint32(currOffset, true)
        if (bin > this.maxBinNumber) {
          // this is a fake bin that actually has stats information about the
          // reference sequence in it
          stats = this.parsePseudoBin(bytes, currOffset + 4)
          // Skip the bin id, the same 8 + 4 byte header a regular bin has,
          // and two 16-byte records.
          currOffset += 4 + 8 + 4 + 16 + 16
        } else {
          const loffset = fromBytes(bytes, currOffset + 4)
          firstDataLine = this._findFirstData(firstDataLine, loffset)
          const chunkCount = dataView.getInt32(currOffset + 12, true)
          currOffset += 16
          const chunks = new Array(chunkCount)
          for (let k = 0; k < chunkCount; k += 1) {
            // Each chunk is a pair of 8-byte virtual offsets.
            const u = fromBytes(bytes, currOffset)
            const v = fromBytes(bytes, currOffset + 8)
            currOffset += 16
            chunks[k] = new Chunk(u, v, bin)
          }
          binIndex[bin] = chunks
        }
      }
      return { binIndex, stats }
    })

    return {
      ...aux,
      csi: true,
      refCount,
      maxBlockSize: 1 << 16,
      firstDataLine,
      csiVersion,
      indices,
      depth: this.depth,
      maxBinNumber: this.maxBinNumber,
      maxRefLength,
    }
  }

  /**
   * Extract statistics from a pseudo-bin.
   *
   * @param bytes - decompressed index bytes
   * @param offset - byte position just past the pseudo-bin's 4-byte bin id
   * @returns an object whose `lineCount` is the value read by
   *   `longFromBytesToUnsigned` at `offset + 28`
   */
  parsePseudoBin(bytes: Uint8Array, offset: number) {
    return {
      lineCount: longFromBytesToUnsigned(bytes, offset + 28),
    }
  }

  /**
   * Collect the file chunks that may contain records overlapping a region.
   *
   * @param refName - reference sequence name
   * @param min - region start; negative values are clamped to 0
   * @param max - region end
   * @param opts - options forwarded to `parse`
   * @returns the result of `optimizeChunks` over every chunk in every
   *   overlapping bin, or an empty array when the reference is unknown
   * @throws Error (from `reg2bins`) if the region spans too many bins
   */
  async blocksForRange(
    refName: string,
    min: number,
    max: number,
    opts: Options = {},
  ) {
    if (min < 0) {
      min = 0
    }
    const indexData = await this.parse(opts)
    const refId = indexData.refNameToId[refName]
    if (refId === undefined) {
      return []
    }
    const ba = indexData.indices[refId]
    if (!ba) {
      return []
    }

    // List of bin-number ranges that overlap min, max
    const overlappingBins = this.reg2bins(min, max)
    const chunks: Chunk[] = []

    // Gather the chunks of every bin in every overlapping range; no bins are
    // pruned here.
    for (const [start, end] of overlappingBins) {
      for (let bin = start; bin <= end; bin++) {
        const binChunks = ba.binIndex[bin]
        if (binChunks) {
          for (const c of binChunks) {
            chunks.push(new Chunk(c.minv, c.maxv, bin))
          }
        }
      }
    }
    return optimizeChunks(chunks, new VirtualOffset(0, 0))
  }

  /**
   * Calculate the bin-number ranges, one per binning level, that may overlap
   * the region `beg`..`end`.
   *
   * NOTE(review): the original header comment describes the input as
   * zero-based half-open `[beg,end)`, while the inline comment on the first
   * statement speaks of converting to 1-based closed. The arithmetic is kept
   * exactly as written; confirm the intended coordinate convention.
   *
   * @param beg - region start
   * @param end - region end
   * @returns an array of `[firstBin, lastBin]` pairs (inclusive), ordered
   *   from the root level down to the leaf level
   * @throws Error if a level's bin span plus the number of levels already
   *   emitted exceeds `maxBinNumber`
   */
  reg2bins(beg: number, end: number) {
    beg -= 1 // < convert to 1-based closed
    if (beg < 1) {
      beg = 1
    }
    if (end > 2 ** 50) {
      end = 2 ** 34
    } // 17 GiB ought to be enough for anybody
    end -= 1
    let l = 0 // current level, 0 = root
    let t = 0 // bin number of the first bin at the current level
    let s = this.minShift + this.depth * 3 // log2 of the bin span at this level
    const bins = []
    // Each level down has 8x as many bins, each spanning 1/8 as much.
    for (; l <= this.depth; s -= 3, t += lshift(1, l * 3), l += 1) {
      const b = t + rshift(beg, s)
      const e = t + rshift(end, s)
      if (e - b + bins.length > this.maxBinNumber) {
        throw new Error(
          `query ${beg}-${end} is too large for current binning scheme (shift ${this.minShift}, depth ${this.depth}), try a smaller query or a coarser index binning scheme`,
        )
      }
      bins.push([b, e] as const)
    }
    return bins
  }
}