UNPKG

@gmod/bbi

Version:

Parser for BigWig/BigBed files

320 lines (288 loc) 9.22 kB
import AbortablePromiseCache from '@gmod/abortable-promise-cache' import QuickLRU from '@jbrowse/quick-lru' import { Observable, firstValueFrom, merge } from 'rxjs' import { map, reduce } from 'rxjs/operators' import { BBI } from './bbi.ts' import type { Feature, RequestOptions } from './types.ts' import type { GenericFilehandle } from 'generic-filehandle2' const decoder = new TextDecoder('utf8') interface Loc { key: string offset: number length: number field?: number } interface Index { type: number fieldcount: number offset: number field: number } export function filterUndef<T>(ts: (T | undefined)[]): T[] { return ts.filter((t: T | undefined): t is T => !!t) } function getTabField(str: string, fieldIndex: number) { if (fieldIndex < 0) { return undefined } let start = 0 for (let i = 0; i < fieldIndex; i++) { start = str.indexOf('\t', start) if (start === -1) { return undefined } start++ } const end = str.indexOf('\t', start) return end === -1 ? str.slice(start) : str.slice(start, end) } // Parses a null-terminated string key from a B+ tree node function parseKey(buffer: Uint8Array, offset: number, keySize: number) { const keyEnd = buffer.indexOf(0, offset) const effectiveKeyEnd = keyEnd !== -1 && keyEnd < offset + keySize ? keyEnd : offset + keySize return decoder.decode(buffer.subarray(offset, effectiveKeyEnd)) } // Recursively traverses a B+ tree to search for a specific name in the BigBed extraIndex // B+ trees are balanced tree structures optimized for disk-based searches async function readBPlusTreeNode( bbi: GenericFilehandle, nodeOffset: number, blockSize: number, keySize: number, valSize: number, name: string, field: number, opts: RequestOptions, ): Promise<Loc | undefined> { const len = 4 + blockSize * (keySize + valSize) const buffer = await bbi.read(len, nodeOffset, opts) const dataView = new DataView(buffer.buffer, buffer.byteOffset, buffer.length) let offset = 0 const nodeType = dataView.getInt8(offset) offset += 2 // skip nodeType byte + 1 reserved byte const cnt = dataView.getInt16(offset, true) offset += 2 // Non-leaf node (nodeType === 0): contains keys and child node pointers for navigation if (nodeType === 0) { const leafkeys = [] for (let i = 0; i < cnt; i++) { const key = parseKey(buffer, offset, keySize) offset += keySize const dataOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 leafkeys.push({ key, offset: dataOffset, }) } // Binary search to find the appropriate child node let left = 0 let right = leafkeys.length - 1 let targetIndex = leafkeys.length - 1 while (left <= right) { const mid = Math.floor((left + right) / 2) const cmp = name.localeCompare(leafkeys[mid]!.key) if (cmp < 0) { targetIndex = mid - 1 right = mid - 1 } else { left = mid + 1 } } const childOffset = targetIndex >= 0 ? leafkeys[targetIndex]!.offset : leafkeys[0]!.offset return readBPlusTreeNode( bbi, childOffset, blockSize, keySize, valSize, name, field, opts, ) } else if (nodeType === 1) { // Leaf node (nodeType === 1): contains actual key-value data const keys = [] for (let i = 0; i < cnt; i++) { const key = parseKey(buffer, offset, keySize) offset += keySize const dataOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const length = dataView.getUint32(offset, true) offset += 4 offset += 4 // skip reserved keys.push({ key, offset: dataOffset, length, }) } // Binary search for exact key match in sorted leaf node let left = 0 let right = keys.length - 1 while (left <= right) { const mid = Math.floor((left + right) / 2) const cmp = name.localeCompare(keys[mid]!.key) if (cmp === 0) { return { ...keys[mid]!, field } } else if (cmp < 0) { right = mid - 1 } else { left = mid + 1 } } return undefined } } export class BigBed extends BBI { public readIndicesCache = new AbortablePromiseCache<RequestOptions, Index[]>({ cache: new QuickLRU({ maxSize: 1 }), fill: (args: RequestOptions, signal?: AbortSignal) => this._readIndices({ ...args, signal }), }) public readIndices(opts: RequestOptions = {}) { const { signal, ...rest } = opts return this.readIndicesCache.get(JSON.stringify(rest), opts, signal) } /* * retrieve unzoomed view for any scale */ protected async getView(_scale: number, opts?: RequestOptions) { return this.getUnzoomedView(opts) } /* * parse the bigbed extraIndex fields * * * @return a Promise for an array of Index data structure since there can be * multiple extraIndexes in a bigbed, see bedToBigBed documentation */ private async _readIndices(opts: RequestOptions) { const { extHeaderOffset } = await this.getHeader(opts) const b = await this.bbi.read(64, extHeaderOffset) const dataView = new DataView(b.buffer, b.byteOffset, b.length) let offset = 0 // const _size = dataView.getUint16(offset, true) offset += 2 const count = dataView.getUint16(offset, true) offset += 2 const dataOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 // no extra index is defined if count==0 if (count === 0) { return [] } const blocklen = 20 const len = blocklen * count const buffer = await this.bbi.read(len, dataOffset) const indices: Index[] = [] for (let i = 0; i < count; i += 1) { const b = buffer.subarray(i * blocklen) const dataView = new DataView(b.buffer, b.byteOffset, b.length) let offset = 0 const type = dataView.getInt16(offset, true) offset += 2 const fieldcount = dataView.getInt16(offset, true) offset += 2 const dataOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 + 4 // skip 8-byte offset + 4 reserved bytes const field = dataView.getInt16(offset, true) indices.push({ type, fieldcount, offset: dataOffset, field, }) } return indices } /* * perform a search in the bigbed extraIndex to find which blocks in the * bigbed data to look for the actual feature data * * @param name - the name to search for * * @param opts - a SearchOptions argument with optional signal * * @return a Promise for an array of bigbed block Loc entries */ private async searchExtraIndexBlocks( name: string, opts: RequestOptions = {}, ): Promise<Loc[]> { const indices = await this.readIndices(opts) if (indices.length === 0) { return [] } const locs = indices.map(async index => { const { offset: offset2, field } = index const b = await this.bbi.read(32, offset2, opts) const dataView = new DataView(b.buffer, b.byteOffset, b.length) let offset = 0 // const _magic = dataView.getInt32(offset, true) offset += 4 const blockSize = dataView.getInt32(offset, true) offset += 4 const keySize = dataView.getInt32(offset, true) offset += 4 const valSize = dataView.getInt32(offset, true) offset += 4 // const _itemCount = Number(dataView.getBigUint64(offset, true)) offset += 8 return readBPlusTreeNode( this.bbi, offset2 + 32, blockSize, keySize, valSize, name, field, opts, ) }) return filterUndef(await Promise.all(locs)) } /* * retrieve the features from the bigbed data that were found through the * lookup of the extraIndex note that there can be multiple extraIndex, see * the BigBed specification and the -extraIndex argument to bedToBigBed * * @param name - the name to search for * * @param opts - options object with optional AbortSignal * * @return array of Feature */ public async searchExtraIndex(name: string, opts: RequestOptions = {}) { const blocks = await this.searchExtraIndexBlocks(name, opts) if (blocks.length === 0) { return [] } const view = await this.getUnzoomedView(opts) const res = blocks.map(block => { return new Observable<Feature[]>(observer => { view.readFeatures(observer, [block], opts).catch((e: unknown) => { observer.error(e) }) }).pipe( reduce((acc, curr) => { acc.push(...curr) return acc }, [] as Feature[]), map(features => features.map(f => ({ ...f, field: block.field }))), ) }) const ret = await firstValueFrom(merge(...res)) // Filter to features where the indexed field matches the search name // field offset is adjusted by -3 to account for chrom, chromStart, chromEnd columns return ret.filter(f => { if (!f.rest) { return false } const fieldIndex = (f.field || 0) - 3 return getTabField(f.rest, fieldIndex) === name }) } }