@gmod/bbi

Parser for BigWig/BigBed files

import AbortablePromiseCache from '@gmod/abortable-promise-cache';
import QuickLRU from '@jbrowse/quick-lru';
import Range from "./range.js";
import {
  decompressAndParseBigWigBlocks,
  decompressAndParseSummaryBlocks,
  unzipBatch,
} from "./unzip.js";
import { groupBlocks } from "./util.js";

const decoder = new TextDecoder('utf8');

function coordFilter(s1, e1, s2, e2) {
  return s1 < e2 && e1 >= s2;
}

/**
 * View into a subset of the data in a BigWig file.
 *
 * Adapted by Robert Buels and Colin Diesh from bigwig.js in the Dalliance
 * Genome Explorer by Thomas Down.
 */
export class BlockView {
  bbi;
  refsByName;
  rTreeOffset;
  uncompressBufSize;
  blockType;
  // R-tree index header cache - R-trees are spatial data structures used to
  // efficiently query genomic intervals by chromosome and position
  rTreePromise;
  featureCache = new AbortablePromiseCache({
    cache: new QuickLRU({ maxSize: 1000 }),
    fill: async ({ length, offset }, signal) =>
      this.bbi.read(length, offset, { signal }),
  });
  constructor(
    bbi,
    refsByName,
    // Offset to the R-tree index in the file - this is part of the "cirTree"
    // (combined ID R-tree), which combines a B+ tree for chromosome names
    // with an R-tree for efficient spatial queries
    rTreeOffset,
    uncompressBufSize,
    blockType,
  ) {
    this.bbi = bbi;
    this.refsByName = refsByName;
    this.rTreeOffset = rTreeOffset;
    this.uncompressBufSize = uncompressBufSize;
    this.blockType = blockType;
    if (!(rTreeOffset >= 0)) {
      throw new Error('invalid rTreeOffset!');
    }
  }
  async readWigData(chrName, start, end, observer, opts) {
    try {
      const chrId = this.refsByName[chrName];
      if (chrId === undefined) {
        observer.complete();
        return;
      }
      const request = { chrId, start, end };
      if (!this.rTreePromise) {
        this.rTreePromise = this.bbi.read(48, this.rTreeOffset, opts);
      }
      const buffer = await this.rTreePromise;
      const dataView = new DataView(
        buffer.buffer,
        buffer.byteOffset,
        buffer.length,
      );
      // Maximum number of children per R-tree node - used to calculate memory bounds
      const rTreeBlockSize = dataView.getUint32(4, true);
      const blocksToFetch = [];
      let outstanding = 0;
      // R-tree leaf nodes contain the actual data blocks to fetch
      const processLeafNode = (dataView, startOffset, count) => {
        let offset = startOffset;
        for (let i = 0; i < count; i++) {
          const startChrom = dataView.getUint32(offset, true);
          offset += 4;
          const startBase = dataView.getUint32(offset, true);
          offset += 4;
          const endChrom = dataView.getUint32(offset, true);
          offset += 4;
          const endBase = dataView.getUint32(offset, true);
          offset += 4;
          const blockOffset = Number(dataView.getBigUint64(offset, true));
          offset += 8;
          const blockSize = Number(dataView.getBigUint64(offset, true));
          offset += 8;
          if (blockIntersectsQuery({ startChrom, startBase, endBase, endChrom })) {
            blocksToFetch.push({
              offset: blockOffset,
              length: blockSize,
            });
          }
        }
      };
      // R-tree non-leaf nodes contain pointers to child nodes
      const processNonLeafNode = (dataView, startOffset, count, level) => {
        const recurOffsets = [];
        let offset = startOffset;
        for (let i = 0; i < count; i++) {
          const startChrom = dataView.getUint32(offset, true);
          offset += 4;
          const startBase = dataView.getUint32(offset, true);
          offset += 4;
          const endChrom = dataView.getUint32(offset, true);
          offset += 4;
          const endBase = dataView.getUint32(offset, true);
          offset += 4;
          const blockOffset = Number(dataView.getBigUint64(offset, true));
          offset += 8;
          if (blockIntersectsQuery({ startChrom, startBase, endChrom, endBase })) {
            recurOffsets.push(blockOffset);
          }
        }
        if (recurOffsets.length > 0) {
          traverseRTree(recurOffsets, level + 1);
        }
      };
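      // The two helpers above decode R-tree nodes laid out as in the BigWig
      // spec: a 4-byte node header (isLeaf uint8, reserved uint8, count
      // uint16le) followed by `count` items. Leaf items are 32 bytes
      // (startChrom, startBase, endChrom, endBase as uint32le, then data
      // offset and data size as uint64le); non-leaf items are 24 bytes (the
      // same four uint32le bounds plus a uint64le child node offset), which
      // matches the reads performed in processLeafNode/processNonLeafNode.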
      const processRTreeNode = (rTreeBlockData, offset2, level) => {
        try {
          const data = rTreeBlockData.subarray(offset2);
          const dataView = new DataView(data.buffer, data.byteOffset, data.length);
          let offset = 0;
          const isLeaf = dataView.getUint8(offset);
          offset += 2; // 1 byte isLeaf (just read) + 1 reserved byte
          const count = dataView.getUint16(offset, true);
          offset += 2;
          if (isLeaf === 1) {
            processLeafNode(dataView, offset, count);
          } else if (isLeaf === 0) {
            processNonLeafNode(dataView, offset, count, level);
          }
        } catch (e) {
          observer.error(e);
        }
      };
      const blockIntersectsQuery = (b) => {
        const { startChrom, startBase, endChrom, endBase } = b;
        return (
          (startChrom < chrId || (startChrom === chrId && startBase <= end)) &&
          (endChrom > chrId || (endChrom === chrId && endBase >= start))
        );
      };
      const fetchAndProcessRTreeBlocks = async (offsets, range, level) => {
        try {
          const length = range.max - range.min;
          const offset = range.min;
          const resultBuffer = await this.featureCache.get(
            `${length}_${offset}`,
            { length, offset },
            opts?.signal,
          );
          for (const element of offsets) {
            if (range.contains(element)) {
              processRTreeNode(resultBuffer, element - offset, level);
              outstanding -= 1;
              if (outstanding === 0) {
                this.readFeatures(observer, blocksToFetch, {
                  ...opts,
                  request,
                }).catch((e) => {
                  observer.error(e);
                });
              }
            }
          }
        } catch (e) {
          observer.error(e);
        }
      };
      const traverseRTree = (offsets, level) => {
        try {
          outstanding += offsets.length;
          // Upper bound on size, based on a completely full leaf node.
          const maxRTreeBlockSpan = 4 + rTreeBlockSize * 32;
          let spans = new Range([
            {
              min: offsets[0],
              max: offsets[0] + maxRTreeBlockSpan,
            },
          ]);
          for (let i = 1; i < offsets.length; i += 1) {
            const blockSpan = new Range([
              {
                min: offsets[i],
                max: offsets[i] + maxRTreeBlockSpan,
              },
            ]);
            spans = spans.union(blockSpan);
          }
          spans.getRanges().forEach(range => {
            fetchAndProcessRTreeBlocks(offsets, range, level).catch((e) => {
              observer.error(e);
            });
          });
        } catch (e) {
          observer.error(e);
        }
      };
      traverseRTree([this.rTreeOffset + 48], 1);
      return;
    } catch (e) {
      observer.error(e);
    }
  }
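  // Each summary (zoom) record is 32 bytes: chromId, start, end, and
  // validCnt as uint32le, then minScore, maxScore, and sumData as float32le;
  // the trailing 4 bytes (sum of squared values in the BigWig spec) are not
  // used here, which is why the last read advances the offset by 8.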
  parseSummaryBlock(b, startOffset, request) {
    const features = [];
    let offset = startOffset;
    const dataView = new DataView(b.buffer, b.byteOffset, b.length);
    while (offset < b.byteLength) {
      const chromId = dataView.getUint32(offset, true);
      offset += 4;
      const start = dataView.getUint32(offset, true);
      offset += 4;
      const end = dataView.getUint32(offset, true);
      offset += 4;
      const validCnt = dataView.getUint32(offset, true);
      offset += 4;
      const minScore = dataView.getFloat32(offset, true);
      offset += 4;
      const maxScore = dataView.getFloat32(offset, true);
      offset += 4;
      const sumData = dataView.getFloat32(offset, true);
      offset += 8;
      if (
        !request ||
        (chromId === request.chrId &&
          coordFilter(start, end, request.start, request.end))
      ) {
        features.push({
          start,
          end,
          maxScore,
          minScore,
          summary: true,
          score: sumData / (validCnt || 1),
        });
      }
    }
    return features;
  }
  parseBigBedBlock(data, startOffset, offset, request) {
    const items = [];
    let currOffset = startOffset;
    const dataView = new DataView(data.buffer, data.byteOffset, data.length);
    while (currOffset < data.byteLength) {
      const c2 = currOffset;
      const chromId = dataView.getUint32(currOffset, true);
      currOffset += 4;
      const start = dataView.getInt32(currOffset, true);
      currOffset += 4;
      const end = dataView.getInt32(currOffset, true);
      currOffset += 4;
      let i = currOffset;
      for (; i < data.length; i++) {
        if (data[i] === 0) {
          break;
        }
      }
      const b = data.subarray(currOffset, i);
      const rest = decoder.decode(b);
      currOffset = i + 1;
      if (
        !request ||
        (chromId === request.chrId &&
          coordFilter(start, end, request.start, request.end))
      ) {
        items.push({
          start,
          end,
          rest,
          uniqueId: `bb-${offset + c2}`,
        });
      }
    }
    return items;
  }
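  // BigWig data sections start with a 24-byte header (chromId, chromStart,
  // chromEnd, itemStep, itemSpan, type, reserved, itemCount) followed by
  // itemCount records. The blockType cases below correspond to the BigWig
  // section types: 1 = bedGraph (start, end, score per item), 2 = variable
  // step (start, score; end = start + itemSpan), and 3 = fixed step (score
  // only; start = blockStart + i * itemStep).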
  parseBigWigBlock(buffer, startOffset, req) {
    const b = buffer.subarray(startOffset);
    const dataView = new DataView(b.buffer, b.byteOffset, b.length);
    let offset = 0;
    offset += 4;
    const blockStart = dataView.getInt32(offset, true);
    offset += 8;
    const itemStep = dataView.getUint32(offset, true);
    offset += 4;
    const itemSpan = dataView.getUint32(offset, true);
    offset += 4;
    const blockType = dataView.getUint8(offset);
    offset += 2;
    const itemCount = dataView.getUint16(offset, true);
    offset += 2;
    const items = [];
    switch (blockType) {
      case 1: {
        for (let i = 0; i < itemCount; i++) {
          const start = dataView.getInt32(offset, true);
          offset += 4;
          const end = dataView.getInt32(offset, true);
          offset += 4;
          const score = dataView.getFloat32(offset, true);
          offset += 4;
          if (!req || coordFilter(start, end, req.start, req.end)) {
            items.push({
              start,
              end,
              score,
            });
          }
        }
        break;
      }
      case 2: {
        for (let i = 0; i < itemCount; i++) {
          const start = dataView.getInt32(offset, true);
          offset += 4;
          const score = dataView.getFloat32(offset, true);
          offset += 4;
          const end = start + itemSpan;
          if (!req || coordFilter(start, end, req.start, req.end)) {
            items.push({
              score,
              start,
              end,
            });
          }
        }
        break;
      }
      case 3: {
        for (let i = 0; i < itemCount; i++) {
          const score = dataView.getFloat32(offset, true);
          offset += 4;
          const start = blockStart + i * itemStep;
          const end = start + itemSpan;
          if (!req || coordFilter(start, end, req.start, req.end)) {
            items.push({
              score,
              start,
              end,
            });
          }
        }
        break;
      }
    }
    return items;
  }
  parseBigWigBlockAsArrays(buffer, startOffset, req) {
    const dataView = new DataView(
      buffer.buffer,
      buffer.byteOffset + startOffset,
      buffer.length - startOffset,
    );
    const blockStart = dataView.getInt32(4, true);
    const itemStep = dataView.getUint32(12, true);
    const itemSpan = dataView.getUint32(16, true);
    const blockType = dataView.getUint8(20);
    const itemCount = dataView.getUint16(22, true);
    const starts = new Int32Array(itemCount);
    const ends = new Int32Array(itemCount);
    const scores = new Float32Array(itemCount);
    if (!req) {
      switch (blockType) {
        case 1: {
          let offset = 24;
          for (let i = 0; i < itemCount; i++) {
            starts[i] = dataView.getInt32(offset, true);
            ends[i] = dataView.getInt32(offset + 4, true);
            scores[i] = dataView.getFloat32(offset + 8, true);
            offset += 12;
          }
          return { starts, ends, scores };
        }
        case 2: {
          let offset = 24;
          for (let i = 0; i < itemCount; i++) {
            const start = dataView.getInt32(offset, true);
            starts[i] = start;
            ends[i] = start + itemSpan;
            scores[i] = dataView.getFloat32(offset + 4, true);
            offset += 8;
          }
          return { starts, ends, scores };
        }
        case 3: {
          let offset = 24;
          for (let i = 0; i < itemCount; i++) {
            const start = blockStart + i * itemStep;
            starts[i] = start;
            ends[i] = start + itemSpan;
            scores[i] = dataView.getFloat32(offset, true);
            offset += 4;
          }
          return { starts, ends, scores };
        }
      }
      return { starts, ends, scores };
    }
    const reqStart = req.start;
    const reqEnd = req.end;
    let idx = 0;
    switch (blockType) {
      case 1: {
        let offset = 24;
        for (let i = 0; i < itemCount; i++) {
          const start = dataView.getInt32(offset, true);
          const end = dataView.getInt32(offset + 4, true);
          if (start < reqEnd && end >= reqStart) {
            starts[idx] = start;
            ends[idx] = end;
            scores[idx] = dataView.getFloat32(offset + 8, true);
            idx++;
          }
          offset += 12;
        }
        break;
      }
      case 2: {
        let offset = 24;
        for (let i = 0; i < itemCount; i++) {
          const start = dataView.getInt32(offset, true);
          const end = start + itemSpan;
          if (start < reqEnd && end >= reqStart) {
            starts[idx] = start;
            ends[idx] = end;
            scores[idx] = dataView.getFloat32(offset + 4, true);
            idx++;
          }
          offset += 8;
        }
        break;
      }
      case 3: {
        let offset = 24;
        for (let i = 0; i < itemCount; i++) {
          const start = blockStart + i * itemStep;
          const end = start + itemSpan;
          if (start < reqEnd && end >= reqStart) {
            starts[idx] = start;
            ends[idx] = end;
            scores[idx] = dataView.getFloat32(offset, true);
            idx++;
          }
          offset += 4;
        }
        break;
      }
    }
    if (idx < itemCount) {
      return {
        starts: starts.subarray(0, idx),
        ends: ends.subarray(0, idx),
        scores: scores.subarray(0, idx),
      };
    }
    return { starts, ends, scores };
  }
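  // Fetches the data blocks selected by the R-tree query, coalescing nearby
  // blocks into single reads via groupBlocks, decompressing them when the
  // file declares a nonzero uncompressBufSize, parsing each block according
  // to this view's blockType, and emitting the resulting feature arrays to
  // the observer.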
  async readFeatures(observer, blocks, opts = {}) {
    try {
      const { blockType, uncompressBufSize } = this;
      const { signal, request } = opts;
      const blockGroupsToFetch = groupBlocks(blocks);
      await Promise.all(
        blockGroupsToFetch.map(async (blockGroup) => {
          const { length, offset } = blockGroup;
          const data = await this.featureCache.get(
            `${length}_${offset}`,
            blockGroup,
            signal,
          );
          const localBlocks = blockGroup.blocks.map(block => ({
            offset: block.offset - blockGroup.offset,
            length: block.length,
          }));
          let decompressedData;
          let decompressedOffsets;
          if (uncompressBufSize > 0) {
            const result = await unzipBatch(data, localBlocks, uncompressBufSize);
            decompressedData = result.data;
            decompressedOffsets = result.offsets;
          } else {
            decompressedData = data;
            decompressedOffsets = localBlocks.map(b => b.offset);
            decompressedOffsets.push(data.length);
          }
          for (let i = 0; i < blockGroup.blocks.length; i++) {
            const block = blockGroup.blocks[i];
            const start = decompressedOffsets[i];
            const end = decompressedOffsets[i + 1];
            const resultData = decompressedData.subarray(start, end);
            switch (blockType) {
              case 'summary': {
                observer.next(this.parseSummaryBlock(resultData, 0, request));
                break;
              }
              case 'bigwig': {
                observer.next(this.parseBigWigBlock(resultData, 0, request));
                break;
              }
              case 'bigbed': {
                observer.next(
                  this.parseBigBedBlock(
                    resultData,
                    0,
                    block.offset * (1 << 8),
                    request,
                  ),
                );
                break;
              }
              default: {
                console.warn(`Don't know what to do with ${blockType}`);
              }
            }
          }
        }),
      );
      observer.complete();
    } catch (e) {
      observer.error(e);
    }
  }
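  // Typed-array variant of readFeatures for BigWig data: returns flat
  // Int32Array/Float32Array columns instead of per-feature objects, and
  // delegates decompression and parsing to decompressAndParseBigWigBlocks
  // when the file is compressed.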
  async readBigWigFeaturesAsArrays(blocks, opts = {}) {
    const { uncompressBufSize } = this;
    const { signal, request } = opts;
    const blockGroupsToFetch = groupBlocks(blocks);
    const allStarts = [];
    const allEnds = [];
    const allScores = [];
    let totalCount = 0;
    await Promise.all(
      blockGroupsToFetch.map(async (blockGroup) => {
        const { length, offset } = blockGroup;
        const data = await this.featureCache.get(
          `${length}_${offset}`,
          blockGroup,
          signal,
        );
        const localBlocks = blockGroup.blocks.map(block => ({
          offset: block.offset - blockGroup.offset,
          length: block.length,
        }));
        if (uncompressBufSize > 0) {
          const result = await decompressAndParseBigWigBlocks(
            data,
            localBlocks,
            uncompressBufSize,
            request?.start ?? 0,
            request?.end ?? 0,
          );
          if (result.starts.length > 0) {
            allStarts.push(result.starts);
            allEnds.push(result.ends);
            allScores.push(result.scores);
            totalCount += result.starts.length;
          }
        } else {
          for (const block of localBlocks) {
            const blockData = data.subarray(block.offset, block.offset + block.length);
            const result = this.parseBigWigBlockAsArrays(blockData, 0, request);
            if (result.starts.length > 0) {
              allStarts.push(result.starts);
              allEnds.push(result.ends);
              allScores.push(result.scores);
              totalCount += result.starts.length;
            }
          }
        }
      }),
    );
    if (allStarts.length === 0) {
      return {
        starts: new Int32Array(0),
        ends: new Int32Array(0),
        scores: new Float32Array(0),
        isSummary: false,
      };
    }
    if (allStarts.length === 1) {
      return {
        starts: allStarts[0],
        ends: allEnds[0],
        scores: allScores[0],
        isSummary: false,
      };
    }
    const starts = new Int32Array(totalCount);
    const ends = new Int32Array(totalCount);
    const scores = new Float32Array(totalCount);
    let offset = 0;
    for (let i = 0; i < allStarts.length; i++) {
      starts.set(allStarts[i], offset);
      ends.set(allEnds[i], offset);
      scores.set(allScores[i], offset);
      offset += allStarts[i].length;
    }
    return { starts, ends, scores, isSummary: false };
  }
  async readSummaryFeaturesAsArrays(blocks, opts = {}) {
    const { uncompressBufSize } = this;
    const { signal, request } = opts;
    const blockGroupsToFetch = groupBlocks(blocks);
    const allStarts = [];
    const allEnds = [];
    const allScores = [];
    const allMinScores = [];
    const allMaxScores = [];
    let totalCount = 0;
    await Promise.all(
      blockGroupsToFetch.map(async (blockGroup) => {
        const { length, offset } = blockGroup;
        const data = await this.featureCache.get(
          `${length}_${offset}`,
          blockGroup,
          signal,
        );
        const localBlocks = blockGroup.blocks.map(block => ({
          offset: block.offset - blockGroup.offset,
          length: block.length,
        }));
        if (uncompressBufSize > 0) {
          const result = await decompressAndParseSummaryBlocks(
            data,
            localBlocks,
            uncompressBufSize,
            request?.chrId ?? 0,
            request?.start ?? 0,
            request?.end ?? 0,
          );
          if (result.starts.length > 0) {
            allStarts.push(result.starts);
            allEnds.push(result.ends);
            allScores.push(result.scores);
            allMinScores.push(result.minScores);
            allMaxScores.push(result.maxScores);
            totalCount += result.starts.length;
          }
        } else {
          for (const block of localBlocks) {
            const blockData = data.subarray(block.offset, block.offset + block.length);
            const features = this.parseSummaryBlock(blockData, 0, request);
            if (features.length > 0) {
              const starts = new Int32Array(features.length);
              const ends = new Int32Array(features.length);
              const scores = new Float32Array(features.length);
              const minScores = new Float32Array(features.length);
              const maxScores = new Float32Array(features.length);
              for (let i = 0; i < features.length; i++) {
                const f = features[i];
                starts[i] = f.start;
                ends[i] = f.end;
                scores[i] = f.score ?? 0;
                minScores[i] = f.minScore ?? 0;
                maxScores[i] = f.maxScore ?? 0;
              }
              allStarts.push(starts);
              allEnds.push(ends);
              allScores.push(scores);
              allMinScores.push(minScores);
              allMaxScores.push(maxScores);
              totalCount += features.length;
            }
          }
        }
      }),
    );
    if (allStarts.length === 0) {
      return {
        starts: new Int32Array(0),
        ends: new Int32Array(0),
        scores: new Float32Array(0),
        minScores: new Float32Array(0),
        maxScores: new Float32Array(0),
        isSummary: true,
      };
    }
    if (allStarts.length === 1) {
      return {
        starts: allStarts[0],
        ends: allEnds[0],
        scores: allScores[0],
        minScores: allMinScores[0],
        maxScores: allMaxScores[0],
        isSummary: true,
      };
    }
    const starts = new Int32Array(totalCount);
    const ends = new Int32Array(totalCount);
    const scores = new Float32Array(totalCount);
    const minScores = new Float32Array(totalCount);
    const maxScores = new Float32Array(totalCount);
    let offset = 0;
    for (let i = 0; i < allStarts.length; i++) {
      starts.set(allStarts[i], offset);
      ends.set(allEnds[i], offset);
      scores.set(allScores[i], offset);
      minScores.set(allMinScores[i], offset);
      maxScores.set(allMaxScores[i], offset);
      offset += allStarts[i].length;
    }
    return {
      starts,
      ends,
      scores,
      minScores,
      maxScores,
      isSummary: true,
    };
  }
}
//# sourceMappingURL=block-view.js.map
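// Usage sketch (illustrative only; `blockView` stands in for an instance
// created by this package's BigWig/BigBed readers). readWigData delivers
// arrays of features to an observer with next/error/complete callbacks, so
// results can be collected like this:
//
//   const features = [];
//   await new Promise((resolve, reject) => {
//     blockView.readWigData('chr1', 0, 10000, {
//       next: items => features.push(...items),
//       error: reject,
//       complete: resolve,
//     });
//   });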