UNPKG

@gmod/bbi

Version:

Parser for BigWig/BigBed files

475 lines (443 loc) 13.5 kB
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import QuickLRU from 'quick-lru'

import Range from './range.ts'
import { unzip } from './unzip.ts'
import { checkAbortSignal, groupBlocks } from './util.ts'

import type { Feature } from './types.ts'
import type { GenericFilehandle } from 'generic-filehandle2'
import type { Observer } from 'rxjs'

// TextDecoder may be missing in some runtimes; callers fall back to
// `toString()` on the byte array when it is.
const decoder =
  typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') : undefined

/** Region of interest used to filter parsed features. */
interface CoordRequest {
  chrId: number
  start: number
  end: number
}

/** Byte range to read from the underlying file handle. */
interface ReadData {
  offset: number
  length: number
}

interface Options {
  signal?: AbortSignal
  request?: CoordRequest
}

/**
 * Returns true when the interval (s1, e1) passes the overlap test against
 * (s2, e2): it must start before `e2` and end at or after `s2`.
 */
function coordFilter(s1: number, e1: number, s2: number, e2: number): boolean {
  return s1 < e2 && e1 >= s2
}

/**
 * View into a subset of the data in a BigWig file.
 *
 * Adapted by Robert Buels and Colin Diesh from bigwig.js in the Dalliance
 * Genome Explorer by Thomas Down.
 */
export class BlockView {
  // Cached read of the 48-byte tree header located at `cirTreeOffset`.
  private cirTreePromise?: Promise<Uint8Array>

  // LRU cache of raw byte ranges, keyed by `${length}_${offset}`.
  private featureCache = new AbortablePromiseCache<ReadData, Uint8Array>({
    cache: new QuickLRU({ maxSize: 1000 }),
    fill: async ({ length, offset }, signal) =>
      this.bbi.read(length, offset, { signal }),
  })

  /**
   * @param bbi - file handle the index and data blocks are read from
   * @param refsByName - lookup from chromosome name to numeric chromosome id
   * @param cirTreeOffset - byte offset of the tree header in the file
   * @param isCompressed - whether data blocks must be passed through `unzip`
   * @param blockType - one of 'summary', 'bigwig' or 'bigbed'
   * @throws Error if `cirTreeOffset` is not a number >= 0
   */
  public constructor(
    private bbi: GenericFilehandle,
    private refsByName: any,
    private cirTreeOffset: number,
    private isCompressed: boolean,
    private blockType: string,
  ) {
    // Written as a negated comparison so that NaN/undefined are rejected too.
    if (!(cirTreeOffset >= 0)) {
      throw new Error('invalid cirTreeOffset!')
    }
  }

  /**
   * Walks the tree index to find the data blocks overlapping
   * `chrName:start-end`, then hands them to {@link readFeatures}, which emits
   * the parsed features on `observer` and completes it.
   *
   * Errors are never thrown to the caller; they are reported through
   * `observer.error`. An unknown `chrName` completes the observer without
   * emitting anything.
   *
   * @param chrName - chromosome name, resolved through `refsByName`
   * @param start - start coordinate of the query
   * @param end - end coordinate of the query
   * @param observer - receives one `Feature[]` per parsed data block
   * @param opts - optional abort signal
   */
  public async readWigData(
    chrName: string,
    start: number,
    end: number,
    observer: Observer<Feature[]>,
    opts?: Options,
  ) {
    try {
      const chrId = this.refsByName[chrName]
      if (chrId === undefined) {
        // Unknown chromosome: nothing to fetch. Return here so we neither
        // touch the file nor complete the observer a second time later on.
        observer.complete()
        return
      }
      const request = { chrId, start, end }

      let headerPromise = this.cirTreePromise
      if (!headerPromise) {
        const pending = this.bbi.read(48, this.cirTreeOffset, opts)
        headerPromise = pending
        this.cirTreePromise = pending
        // Do not keep a failed (e.g. aborted) read cached forever; drop it so
        // that the next call retries. The rejection itself still reaches the
        // `await` below and is reported through the observer.
        pending.catch(() => {
          if (this.cirTreePromise === pending) {
            this.cirTreePromise = undefined
          }
        })
      }
      const buffer = await headerPromise

      // Respect byteOffset/byteLength: the returned array may be a view into
      // a larger ArrayBuffer.
      const headerView = new DataView(
        buffer.buffer,
        buffer.byteOffset,
        buffer.byteLength,
      )
      const cirBlockSize = headerView.getUint32(4, true)
      let blocksToFetch: ReadData[] = []
      // Number of tree nodes scheduled but not yet parsed. When it drops back
      // to zero the traversal is finished and the data blocks are read.
      let outstanding = 0

      // True when a tree entry's (chrom, base) bounds can overlap the query.
      const filterFeats = (b: {
        startChrom: number
        startBase: number
        endChrom: number
        endBase: number
      }) => {
        const { startChrom, startBase, endChrom, endBase } = b
        return (
          (startChrom < chrId || (startChrom === chrId && startBase <= end)) &&
          (endChrom > chrId || (endChrom === chrId && endBase >= start))
        )
      }

      // Parses one tree node located at `offset2` inside `cirBlockData`.
      // Leaf nodes contribute data blocks; internal nodes schedule children.
      const cirFobRecur2 = (
        cirBlockData: Uint8Array,
        offset2: number,
        level: number,
      ) => {
        try {
          const data = cirBlockData.subarray(offset2)
          const dataView = new DataView(
            data.buffer,
            data.byteOffset,
            data.length,
          )
          let offset = 0

          const isLeaf = dataView.getUint8(offset)
          offset += 2 // 1 skip
          const cnt = dataView.getUint16(offset, true)
          offset += 2

          if (isLeaf === 1) {
            // Leaf entry: four uint32 bounds, then a 64-bit data offset and a
            // 64-bit data size (32 bytes per entry).
            const leafEntries = []
            for (let i = 0; i < cnt; i++) {
              const startChrom = dataView.getUint32(offset, true)
              offset += 4
              const startBase = dataView.getUint32(offset, true)
              offset += 4
              const endChrom = dataView.getUint32(offset, true)
              offset += 4
              const endBase = dataView.getUint32(offset, true)
              offset += 4
              const blockOffset = Number(dataView.getBigUint64(offset, true))
              offset += 8
              const blockSize = Number(dataView.getBigUint64(offset, true))
              offset += 8
              leafEntries.push({
                startChrom,
                startBase,
                endBase,
                endChrom,
                blockOffset,
                blockSize,
              })
            }
            blocksToFetch = blocksToFetch.concat(
              leafEntries
                .filter(f => filterFeats(f))
                .map(l => ({
                  offset: l.blockOffset,
                  length: l.blockSize,
                })),
            )
          } else if (isLeaf === 0) {
            // Internal entry: four uint32 bounds, then a 64-bit offset of the
            // child node (24 bytes per entry).
            const childEntries = []
            for (let i = 0; i < cnt; i++) {
              const startChrom = dataView.getUint32(offset, true)
              offset += 4
              const startBase = dataView.getUint32(offset, true)
              offset += 4
              const endChrom = dataView.getUint32(offset, true)
              offset += 4
              const endBase = dataView.getUint32(offset, true)
              offset += 4
              const blockOffset = Number(dataView.getBigUint64(offset, true))
              offset += 8
              childEntries.push({
                startChrom,
                startBase,
                endChrom,
                endBase,
                blockOffset,
              })
            }
            const childOffsets = childEntries
              .filter(f => filterFeats(f))
              .map(l => l.blockOffset)
            if (childOffsets.length > 0) {
              cirFobRecur(childOffsets, level + 1)
            }
          }
        } catch (e) {
          observer.error(e)
        }
      }

      // Reads the byte span `fr` and parses every scheduled node offset that
      // falls inside it.
      const cirFobStartFetch = async (
        off: number[],
        fr: Range,
        level: number,
      ) => {
        try {
          const length = fr.max - fr.min
          const offset = fr.min
          const resultBuffer = await this.featureCache.get(
            `${length}_${offset}`,
            { length, offset },
            opts?.signal,
          )
          for (const element of off) {
            if (fr.contains(element)) {
              cirFobRecur2(resultBuffer, element - offset, level)
              outstanding -= 1
              if (outstanding === 0) {
                // Traversal finished: fetch and parse the collected blocks.
                this.readFeatures(observer, blocksToFetch, {
                  ...opts,
                  request,
                }).catch((e: unknown) => {
                  observer.error(e)
                })
              }
            }
          }
        } catch (e) {
          observer.error(e)
        }
      }

      // Schedules the tree nodes at the given file offsets, merging their
      // byte spans so that adjacent nodes share one read.
      const cirFobRecur = (offset: number[], level: number) => {
        try {
          outstanding += offset.length

          // Upper bound on size, based on a completely full leaf node.
          const maxCirBlockSpan = 4 + cirBlockSize * 32
          let spans = new Range([
            {
              min: offset[0],
              max: offset[0] + maxCirBlockSpan,
            },
          ])
          for (let i = 1; i < offset.length; i += 1) {
            const blockSpan = new Range([
              {
                min: offset[i],
                max: offset[i] + maxCirBlockSpan,
              },
            ])
            spans = spans.union(blockSpan)
          }
          // cirFobStartFetch reports its own failures through the observer
          // and never rejects, so the promises are intentionally not awaited.
          for (const fr of spans.getRanges()) {
            void cirFobStartFetch(offset, fr, level)
          }
        } catch (e) {
          observer.error(e)
        }
      }

      // The root node follows directly after the 48-byte tree header.
      cirFobRecur([Number(this.cirTreeOffset) + 48], 1)
    } catch (e) {
      observer.error(e)
    }
  }

  /**
   * Parses a block of 32-byte summary records starting at `startOffset`.
   *
   * @param b - block bytes
   * @param startOffset - byte offset of the first record inside `b`
   * @param request - when given, only records on `request.chrId` that pass
   *   `coordFilter` against the requested range are returned
   * @returns summary features whose `score` is sumData / validCnt
   */
  private parseSummaryBlock(
    b: Uint8Array,
    startOffset: number,
    request?: CoordRequest,
  ) {
    const features = [] as any[]
    let offset = startOffset

    const dataView = new DataView(b.buffer, b.byteOffset, b.length)
    while (offset < b.byteLength) {
      // this was extracted from looking at the runtime code generated by
      // binary-parser
      const chromId = dataView.getUint32(offset, true)
      offset += 4
      const start = dataView.getUint32(offset, true)
      offset += 4
      const end = dataView.getUint32(offset, true)
      offset += 4
      const validCnt = dataView.getUint32(offset, true)
      offset += 4
      const minScore = dataView.getFloat32(offset, true)
      offset += 4
      const maxScore = dataView.getFloat32(offset, true)
      offset += 4
      const sumData = dataView.getFloat32(offset, true)
      offset += 4
      // unused
      // const sumSqData = dataView.getFloat32(offset, true)
      offset += 4

      if (
        request
          ? chromId === request.chrId &&
            coordFilter(start, end, request.start, request.end)
          : true
      ) {
        features.push({
          start,
          end,
          maxScore,
          minScore,
          summary: true,
          // `|| 1` avoids dividing by zero when the record covers no values.
          score: sumData / (validCnt || 1),
        })
      }
    }

    return features
  }

  /**
   * Parses a BigBed data block: repeated records of chromId (uint32), start
   * (int32), end (int32) followed by a NUL-terminated string holding the
   * remaining fields.
   *
   * @param data - block bytes
   * @param startOffset - byte offset of the first record inside `data`
   * @param offset - base value combined with each record's position in the
   *   block to build its `uniqueId`
   * @param request - when given, only records passing `coordFilter` against
   *   the requested range are returned
   */
  private parseBigBedBlock(
    data: Uint8Array,
    startOffset: number,
    offset: number,
    request?: CoordRequest,
  ) {
    const items = [] as Feature[]
    let currOffset = startOffset
    const dataView = new DataView(data.buffer, data.byteOffset, data.length)
    while (currOffset < data.byteLength) {
      const recordStart = currOffset
      const chromId = dataView.getUint32(currOffset, true)
      currOffset += 4
      const start = dataView.getInt32(currOffset, true)
      currOffset += 4
      const end = dataView.getInt32(currOffset, true)
      currOffset += 4

      // Scan to the NUL terminator (or the end of the block).
      let i = currOffset
      for (; i < data.length; i++) {
        if (data[i] === 0) {
          break
        }
      }
      const restBytes = data.subarray(currOffset, i)
      const rest = decoder?.decode(restBytes) ?? restBytes.toString()
      currOffset = i + 1
      items.push({
        chromId,
        start,
        end,
        rest,
        uniqueId: `bb-${offset + recordStart}`,
      })
    }

    return request
      ? items.filter((f: any) =>
          coordFilter(f.start, f.end, request.start, request.end),
        )
      : items
  }

  /**
   * Parses a BigWig data block consisting of a 24-byte header followed by
   * `itemCount` items whose layout depends on the header's type byte:
   * 1 = (start, end, score), 2 = (start, score) with a shared span,
   * 3 = (score) at a fixed step from `blockStart`.
   *
   * @param buffer - block bytes
   * @param startOffset - byte offset of the block header inside `buffer`
   * @param req - when given, only items passing `coordFilter` against the
   *   requested range are returned
   */
  private parseBigWigBlock(
    buffer: Uint8Array,
    startOffset: number,
    req?: CoordRequest,
  ) {
    const b = buffer.subarray(startOffset)

    const dataView = new DataView(b.buffer, b.byteOffset, b.length)
    let offset = 0
    offset += 4 // skip the first 4 header bytes
    const blockStart = dataView.getInt32(offset, true)
    offset += 8 // 4 bytes just read plus 4 skipped
    const itemStep = dataView.getUint32(offset, true)
    offset += 4
    const itemSpan = dataView.getUint32(offset, true)
    offset += 4
    const blockType = dataView.getUint8(offset)
    offset += 2 // type byte plus 1 skipped byte
    const itemCount = dataView.getUint16(offset, true)
    offset += 2
    const items = new Array(itemCount)
    switch (blockType) {
      case 1: {
        for (let i = 0; i < itemCount; i++) {
          const start = dataView.getInt32(offset, true)
          offset += 4
          const end = dataView.getInt32(offset, true)
          offset += 4
          const score = dataView.getFloat32(offset, true)
          offset += 4
          items[i] = {
            start,
            end,
            score,
          }
        }
        break
      }
      case 2: {
        for (let i = 0; i < itemCount; i++) {
          const start = dataView.getInt32(offset, true)
          offset += 4
          const score = dataView.getFloat32(offset, true)
          offset += 4
          items[i] = {
            score,
            start,
            end: start + itemSpan,
          }
        }
        break
      }
      case 3: {
        for (let i = 0; i < itemCount; i++) {
          const score = dataView.getFloat32(offset, true)
          offset += 4
          const start = blockStart + i * itemStep
          items[i] = {
            score,
            start,
            end: start + itemSpan,
          }
        }
        break
      }
    }

    return req
      ? items.filter(f => coordFilter(f.start, f.end, req.start, req.end))
      : items
  }

  /**
   * Fetches the given data blocks (grouped into larger reads by
   * `groupBlocks`), decompresses them when the file is compressed, parses
   * them according to `blockType` and emits one `Feature[]` per block on
   * `observer`. Completes the observer once every group is done; any failure
   * (including an abort) is reported through `observer.error`.
   *
   * @param observer - receives the parsed features
   * @param blocks - byte ranges of the data blocks to read
   * @param opts - optional abort signal and coordinate filter
   */
  public async readFeatures(
    observer: Observer<Feature[]>,
    blocks: { offset: number; length: number }[],
    opts: Options = {},
  ) {
    try {
      const { blockType, isCompressed } = this
      const { signal, request } = opts
      const blockGroupsToFetch = groupBlocks(blocks)
      checkAbortSignal(signal)
      await Promise.all(
        blockGroupsToFetch.map(async blockGroup => {
          checkAbortSignal(signal)
          const { length, offset } = blockGroup
          const data = await this.featureCache.get(
            `${length}_${offset}`,
            blockGroup,
            signal,
          )
          for (const block of blockGroup.blocks) {
            checkAbortSignal(signal)
            // Slice from the block's position within the grouped read.
            let resultData = data.subarray(
              Number(block.offset) - Number(blockGroup.offset),
            )
            if (isCompressed) {
              resultData = unzip(resultData)
            }
            checkAbortSignal(signal)

            switch (blockType) {
              case 'summary': {
                observer.next(this.parseSummaryBlock(resultData, 0, request))
                break
              }
              case 'bigwig': {
                observer.next(this.parseBigWigBlock(resultData, 0, request))
                break
              }
              case 'bigbed': {
                observer.next(
                  this.parseBigBedBlock(
                    resultData,
                    0,
                    // Scale the file offset so that adding a record's
                    // position within the block yields a per-record id.
                    Number(block.offset) * (1 << 8),
                    request,
                  ),
                )
                break
              }
              default: {
                console.warn(`Don't know what to do with ${blockType}`)
              }
            }
          }
        }),
      )
      observer.complete()
    } catch (e) {
      observer.error(e)
    }
  }
}