@gmod/bbi
Parser for BigWig/BigBed files
import AbortablePromiseCache from '@gmod/abortable-promise-cache';
import QuickLRU from '@jbrowse/quick-lru';
import Range from "./range.js";
import { decompressAndParseBigWigBlocks, decompressAndParseSummaryBlocks, unzipBatch, } from "./unzip.js";
import { groupBlocks } from "./util.js";
const decoder = new TextDecoder('utf8');
function coordFilter(s1, e1, s2, e2) {
return s1 < e2 && e1 >= s2;
}
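// Worked example (illustrative): coordFilter(100, 200, 150, 250) is true since
// [100, 200) overlaps [150, 250); coordFilter(100, 200, 300, 400) is false.
// Note that `e1 >= s2` also admits a feature whose end equals the query start,
// e.g. coordFilter(50, 100, 100, 200) is true.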
/**
* View into a subset of the data in a BigWig file.
*
* Adapted by Robert Buels and Colin Diesh from bigwig.js in the Dalliance
* Genome Explorer by Thomas Down.
*/
export class BlockView {
bbi;
refsByName;
rTreeOffset;
uncompressBufSize;
blockType;
// Cached promise for the R-tree index header read - R-trees are spatial data
// structures used to efficiently query genomic intervals by chromosome and
// position
rTreePromise;
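// Note: the featureCache below keys reads by `${length}_${offset}`, so
// concurrent requests for the same byte range are deduplicated, recently read
// ranges are served from an LRU of up to 1000 entries, and in-flight reads can
// be aborted via the caller's AbortSignal (per AbortablePromiseCache's design).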
featureCache = new AbortablePromiseCache({
cache: new QuickLRU({ maxSize: 1000 }),
fill: async ({ length, offset }, signal) => this.bbi.read(length, offset, { signal }),
});
constructor(bbi, refsByName,
// Offset of the R-tree index ("cirTree") in the file - an R-tree over
// (chromosome id, position) intervals that is used to locate the data
// blocks overlapping a query range
rTreeOffset, uncompressBufSize, blockType) {
this.bbi = bbi;
this.refsByName = refsByName;
this.rTreeOffset = rTreeOffset;
this.uncompressBufSize = uncompressBufSize;
this.blockType = blockType;
if (!(rTreeOffset >= 0)) {
throw new Error('invalid rTreeOffset!');
}
}
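// readWigData first reads the 48-byte R-tree index header at rTreeOffset
// (per the BigWig/BigBed cirTree layout: magic, blockSize, itemCount,
// startChromIx, startBase, endChromIx, endBase, endFileOffset, itemsPerSlot,
// reserved); only blockSize at byte 4 is used here. It then walks the R-tree
// nodes that follow. As parsed below, each node starts with a 4-byte header
// (isLeaf u8, a reserved byte, count u16); leaf items are 32 bytes
// (startChrom, startBase, endChrom, endBase as u32, then data offset and size
// as u64), and non-leaf items are 24 bytes (the same coordinates plus a u64
// child-node offset).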
async readWigData(chrName, start, end, observer, opts) {
try {
const chrId = this.refsByName[chrName];
if (chrId === undefined) {
observer.complete();
return;
}
const request = { chrId, start, end };
if (!this.rTreePromise) {
this.rTreePromise = this.bbi.read(48, this.rTreeOffset, opts);
}
const buffer = await this.rTreePromise;
const dataView = new DataView(buffer.buffer, buffer.byteOffset, buffer.length);
// Maximum number of items per R-tree node - used below to put an upper bound
// on node size when batching index reads
const rTreeBlockSize = dataView.getUint32(4, true);
const blocksToFetch = [];
let outstanding = 0;
// R-tree leaf nodes list the data blocks (file offset and size) to fetch
const processLeafNode = (dataView, startOffset, count) => {
let offset = startOffset;
for (let i = 0; i < count; i++) {
const startChrom = dataView.getUint32(offset, true);
offset += 4;
const startBase = dataView.getUint32(offset, true);
offset += 4;
const endChrom = dataView.getUint32(offset, true);
offset += 4;
const endBase = dataView.getUint32(offset, true);
offset += 4;
const blockOffset = Number(dataView.getBigUint64(offset, true));
offset += 8;
const blockSize = Number(dataView.getBigUint64(offset, true));
offset += 8;
if (blockIntersectsQuery({ startChrom, startBase, endBase, endChrom })) {
blocksToFetch.push({
offset: blockOffset,
length: blockSize,
});
}
}
};
// R-tree non-leaf nodes contain pointers to child nodes
const processNonLeafNode = (dataView, startOffset, count, level) => {
const recurOffsets = [];
let offset = startOffset;
for (let i = 0; i < count; i++) {
const startChrom = dataView.getUint32(offset, true);
offset += 4;
const startBase = dataView.getUint32(offset, true);
offset += 4;
const endChrom = dataView.getUint32(offset, true);
offset += 4;
const endBase = dataView.getUint32(offset, true);
offset += 4;
const blockOffset = Number(dataView.getBigUint64(offset, true));
offset += 8;
if (blockIntersectsQuery({ startChrom, startBase, endChrom, endBase })) {
recurOffsets.push(blockOffset);
}
}
if (recurOffsets.length > 0) {
traverseRTree(recurOffsets, level + 1);
}
};
const processRTreeNode = (rTreeBlockData, offset2, level) => {
try {
const data = rTreeBlockData.subarray(offset2);
const dataView = new DataView(data.buffer, data.byteOffset, data.length);
let offset = 0;
const isLeaf = dataView.getUint8(offset);
offset += 2; // advance past the isLeaf byte and 1 reserved byte
const count = dataView.getUint16(offset, true);
offset += 2;
if (isLeaf === 1) {
processLeafNode(dataView, offset, count);
}
else if (isLeaf === 0) {
processNonLeafNode(dataView, offset, count, level);
}
}
catch (e) {
observer.error(e);
}
};
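// blockIntersectsQuery compares (chromosome, base) pairs rather than bases
// alone because an R-tree block can span a chromosome boundary: a block
// overlaps the query if it starts at or before (chrId, end) and ends at or
// after (chrId, start).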
const blockIntersectsQuery = (b) => {
const { startChrom, startBase, endChrom, endBase } = b;
return ((startChrom < chrId || (startChrom === chrId && startBase <= end)) &&
(endChrom > chrId || (endChrom === chrId && endBase >= start)));
};
const fetchAndProcessRTreeBlocks = async (offsets, range, level) => {
try {
const length = range.max - range.min;
const offset = range.min;
const resultBuffer = await this.featureCache.get(`${length}_${offset}`, { length, offset }, opts?.signal);
for (const element of offsets) {
if (range.contains(element)) {
processRTreeNode(resultBuffer, element - offset, level);
outstanding -= 1;
if (outstanding === 0) {
this.readFeatures(observer, blocksToFetch, {
...opts,
request,
}).catch((e) => {
observer.error(e);
});
}
}
}
}
catch (e) {
observer.error(e);
}
};
const traverseRTree = (offsets, level) => {
try {
outstanding += offsets.length;
// Upper bound on size, based on a completely full leaf node.
const maxRTreeBlockSpan = 4 + rTreeBlockSize * 32;
let spans = new Range([
{
min: offsets[0],
max: offsets[0] + maxRTreeBlockSpan,
},
]);
for (let i = 1; i < offsets.length; i += 1) {
const blockSpan = new Range([
{
min: offsets[i],
max: offsets[i] + maxRTreeBlockSpan,
},
]);
spans = spans.union(blockSpan);
}
spans.getRanges().forEach(range => {
fetchAndProcessRTreeBlocks(offsets, range, level).catch((e) => {
observer.error(e);
});
});
}
catch (e) {
observer.error(e);
}
};
traverseRTree([this.rTreeOffset + 48], 1);
return;
}
catch (e) {
observer.error(e);
}
}
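// parseSummaryBlock reads fixed 32-byte zoom/summary records: chromId, start,
// end, validCount (u32 each), then minVal, maxVal, sumData (float32). The
// final `offset += 8` advances past both sumData and the trailing 4-byte
// sumSquares field, which is not used here. The emitted score is the mean,
// sumData / validCount (falling back to 1 to avoid division by zero).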
parseSummaryBlock(b, startOffset, request) {
const features = [];
let offset = startOffset;
const dataView = new DataView(b.buffer, b.byteOffset, b.length);
while (offset < b.byteLength) {
const chromId = dataView.getUint32(offset, true);
offset += 4;
const start = dataView.getUint32(offset, true);
offset += 4;
const end = dataView.getUint32(offset, true);
offset += 4;
const validCnt = dataView.getUint32(offset, true);
offset += 4;
const minScore = dataView.getFloat32(offset, true);
offset += 4;
const maxScore = dataView.getFloat32(offset, true);
offset += 4;
const sumData = dataView.getFloat32(offset, true);
offset += 8;
if (!request ||
(chromId === request.chrId &&
coordFilter(start, end, request.start, request.end))) {
features.push({
start,
end,
maxScore,
minScore,
summary: true,
score: sumData / (validCnt || 1),
});
}
}
return features;
}
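// parseBigBedBlock reads variable-length BigBed items: chromId (u32), start
// and end (i32), followed by a NUL-terminated string ("rest") holding the
// remaining tab-separated BED fields. uniqueId combines the block's (scaled)
// file offset with the item's offset within the block so ids stay distinct
// across blocks.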
parseBigBedBlock(data, startOffset, offset, request) {
const items = [];
let currOffset = startOffset;
const dataView = new DataView(data.buffer, data.byteOffset, data.length);
while (currOffset < data.byteLength) {
const c2 = currOffset;
const chromId = dataView.getUint32(currOffset, true);
currOffset += 4;
const start = dataView.getInt32(currOffset, true);
currOffset += 4;
const end = dataView.getInt32(currOffset, true);
currOffset += 4;
let i = currOffset;
for (; i < data.length; i++) {
if (data[i] === 0) {
break;
}
}
const b = data.subarray(currOffset, i);
const rest = decoder.decode(b);
currOffset = i + 1;
if (!request ||
(chromId === request.chrId &&
coordFilter(start, end, request.start, request.end))) {
items.push({
start,
end,
rest,
uniqueId: `bb-${offset + c2}`,
});
}
}
return items;
}
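// parseBigWigBlock reads a binary wig data section. The 24-byte section header
// is chromId, chromStart, chromEnd, itemStep, itemSpan (u32 each), type (u8),
// a reserved byte, and itemCount (u16); chromId and chromEnd are skipped
// below. The three section types are: 1 = bedGraph (explicit start, end, score
// per item), 2 = variableStep (explicit start plus a fixed itemSpan), and
// 3 = fixedStep (score only, with start = chromStart + i * itemStep).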
parseBigWigBlock(buffer, startOffset, req) {
const b = buffer.subarray(startOffset);
const dataView = new DataView(b.buffer, b.byteOffset, b.length);
let offset = 0;
offset += 4;
const blockStart = dataView.getInt32(offset, true);
offset += 8;
const itemStep = dataView.getUint32(offset, true);
offset += 4;
const itemSpan = dataView.getUint32(offset, true);
offset += 4;
const blockType = dataView.getUint8(offset);
offset += 2;
const itemCount = dataView.getUint16(offset, true);
offset += 2;
const items = [];
switch (blockType) {
case 1: {
for (let i = 0; i < itemCount; i++) {
const start = dataView.getInt32(offset, true);
offset += 4;
const end = dataView.getInt32(offset, true);
offset += 4;
const score = dataView.getFloat32(offset, true);
offset += 4;
if (!req || coordFilter(start, end, req.start, req.end)) {
items.push({
start,
end,
score,
});
}
}
break;
}
case 2: {
for (let i = 0; i < itemCount; i++) {
const start = dataView.getInt32(offset, true);
offset += 4;
const score = dataView.getFloat32(offset, true);
offset += 4;
const end = start + itemSpan;
if (!req || coordFilter(start, end, req.start, req.end)) {
items.push({
score,
start,
end,
});
}
}
break;
}
case 3: {
for (let i = 0; i < itemCount; i++) {
const score = dataView.getFloat32(offset, true);
offset += 4;
const start = blockStart + i * itemStep;
const end = start + itemSpan;
if (!req || coordFilter(start, end, req.start, req.end)) {
items.push({
score,
start,
end,
});
}
}
break;
}
}
return items;
}
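// parseBigWigBlockAsArrays is the typed-array variant of parseBigWigBlock: it
// reads the same section header at fixed byte offsets (4, 12, 16, 20, 22) and
// fills preallocated Int32Array/Float32Array buffers instead of creating one
// object per feature, trimming with subarray() when a request filters items
// out.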
parseBigWigBlockAsArrays(buffer, startOffset, req) {
const dataView = new DataView(buffer.buffer, buffer.byteOffset + startOffset, buffer.length - startOffset);
const blockStart = dataView.getInt32(4, true);
const itemStep = dataView.getUint32(12, true);
const itemSpan = dataView.getUint32(16, true);
const blockType = dataView.getUint8(20);
const itemCount = dataView.getUint16(22, true);
const starts = new Int32Array(itemCount);
const ends = new Int32Array(itemCount);
const scores = new Float32Array(itemCount);
if (!req) {
switch (blockType) {
case 1: {
let offset = 24;
for (let i = 0; i < itemCount; i++) {
starts[i] = dataView.getInt32(offset, true);
ends[i] = dataView.getInt32(offset + 4, true);
scores[i] = dataView.getFloat32(offset + 8, true);
offset += 12;
}
return { starts, ends, scores };
}
case 2: {
let offset = 24;
for (let i = 0; i < itemCount; i++) {
const start = dataView.getInt32(offset, true);
starts[i] = start;
ends[i] = start + itemSpan;
scores[i] = dataView.getFloat32(offset + 4, true);
offset += 8;
}
return { starts, ends, scores };
}
case 3: {
let offset = 24;
for (let i = 0; i < itemCount; i++) {
const start = blockStart + i * itemStep;
starts[i] = start;
ends[i] = start + itemSpan;
scores[i] = dataView.getFloat32(offset, true);
offset += 4;
}
return { starts, ends, scores };
}
}
return { starts, ends, scores };
}
const reqStart = req.start;
const reqEnd = req.end;
let idx = 0;
switch (blockType) {
case 1: {
let offset = 24;
for (let i = 0; i < itemCount; i++) {
const start = dataView.getInt32(offset, true);
const end = dataView.getInt32(offset + 4, true);
if (start < reqEnd && end >= reqStart) {
starts[idx] = start;
ends[idx] = end;
scores[idx] = dataView.getFloat32(offset + 8, true);
idx++;
}
offset += 12;
}
break;
}
case 2: {
let offset = 24;
for (let i = 0; i < itemCount; i++) {
const start = dataView.getInt32(offset, true);
const end = start + itemSpan;
if (start < reqEnd && end >= reqStart) {
starts[idx] = start;
ends[idx] = end;
scores[idx] = dataView.getFloat32(offset + 4, true);
idx++;
}
offset += 8;
}
break;
}
case 3: {
let offset = 24;
for (let i = 0; i < itemCount; i++) {
const start = blockStart + i * itemStep;
const end = start + itemSpan;
if (start < reqEnd && end >= reqStart) {
starts[idx] = start;
ends[idx] = end;
scores[idx] = dataView.getFloat32(offset, true);
idx++;
}
offset += 4;
}
break;
}
}
if (idx < itemCount) {
return {
starts: starts.subarray(0, idx),
ends: ends.subarray(0, idx),
scores: scores.subarray(0, idx),
};
}
return { starts, ends, scores };
}
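// readFeatures: groupBlocks() groups the requested data blocks into larger
// contiguous reads; each group is fetched through featureCache, decompressed
// in one batch when uncompressBufSize > 0, then split back into per-block
// slices using the returned offsets and parsed according to blockType. Parsed
// batches are pushed with observer.next(), and observer.complete() fires once
// every group has been processed.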
async readFeatures(observer, blocks, opts = {}) {
try {
const { blockType, uncompressBufSize } = this;
const { signal, request } = opts;
const blockGroupsToFetch = groupBlocks(blocks);
await Promise.all(blockGroupsToFetch.map(async (blockGroup) => {
const { length, offset } = blockGroup;
const data = await this.featureCache.get(`${length}_${offset}`, blockGroup, signal);
const localBlocks = blockGroup.blocks.map(block => ({
offset: block.offset - blockGroup.offset,
length: block.length,
}));
let decompressedData;
let decompressedOffsets;
if (uncompressBufSize > 0) {
const result = await unzipBatch(data, localBlocks, uncompressBufSize);
decompressedData = result.data;
decompressedOffsets = result.offsets;
}
else {
decompressedData = data;
decompressedOffsets = localBlocks.map(b => b.offset);
decompressedOffsets.push(data.length);
}
for (let i = 0; i < blockGroup.blocks.length; i++) {
const block = blockGroup.blocks[i];
const start = decompressedOffsets[i];
const end = decompressedOffsets[i + 1];
const resultData = decompressedData.subarray(start, end);
switch (blockType) {
case 'summary': {
observer.next(this.parseSummaryBlock(resultData, 0, request));
break;
}
case 'bigwig': {
observer.next(this.parseBigWigBlock(resultData, 0, request));
break;
}
case 'bigbed': {
observer.next(this.parseBigBedBlock(resultData, 0, block.offset * (1 << 8), request));
break;
}
default: {
console.warn(`Don't know what to do with ${blockType}`);
}
}
}
}));
observer.complete();
}
catch (e) {
observer.error(e);
}
}
async readBigWigFeaturesAsArrays(blocks, opts = {}) {
const { uncompressBufSize } = this;
const { signal, request } = opts;
const blockGroupsToFetch = groupBlocks(blocks);
const allStarts = [];
const allEnds = [];
const allScores = [];
let totalCount = 0;
await Promise.all(blockGroupsToFetch.map(async (blockGroup) => {
const { length, offset } = blockGroup;
const data = await this.featureCache.get(`${length}_${offset}`, blockGroup, signal);
const localBlocks = blockGroup.blocks.map(block => ({
offset: block.offset - blockGroup.offset,
length: block.length,
}));
if (uncompressBufSize > 0) {
const result = await decompressAndParseBigWigBlocks(data, localBlocks, uncompressBufSize, request?.start ?? 0, request?.end ?? 0);
if (result.starts.length > 0) {
allStarts.push(result.starts);
allEnds.push(result.ends);
allScores.push(result.scores);
totalCount += result.starts.length;
}
}
else {
for (const block of localBlocks) {
const blockData = data.subarray(block.offset, block.offset + block.length);
const result = this.parseBigWigBlockAsArrays(blockData, 0, request);
if (result.starts.length > 0) {
allStarts.push(result.starts);
allEnds.push(result.ends);
allScores.push(result.scores);
totalCount += result.starts.length;
}
}
}
}));
if (allStarts.length === 0) {
return {
starts: new Int32Array(0),
ends: new Int32Array(0),
scores: new Float32Array(0),
isSummary: false,
};
}
if (allStarts.length === 1) {
return {
starts: allStarts[0],
ends: allEnds[0],
scores: allScores[0],
isSummary: false,
};
}
const starts = new Int32Array(totalCount);
const ends = new Int32Array(totalCount);
const scores = new Float32Array(totalCount);
let offset = 0;
for (let i = 0; i < allStarts.length; i++) {
starts.set(allStarts[i], offset);
ends.set(allEnds[i], offset);
scores.set(allScores[i], offset);
offset += allStarts[i].length;
}
return { starts, ends, scores, isSummary: false };
}
async readSummaryFeaturesAsArrays(blocks, opts = {}) {
const { uncompressBufSize } = this;
const { signal, request } = opts;
const blockGroupsToFetch = groupBlocks(blocks);
const allStarts = [];
const allEnds = [];
const allScores = [];
const allMinScores = [];
const allMaxScores = [];
let totalCount = 0;
await Promise.all(blockGroupsToFetch.map(async (blockGroup) => {
const { length, offset } = blockGroup;
const data = await this.featureCache.get(`${length}_${offset}`, blockGroup, signal);
const localBlocks = blockGroup.blocks.map(block => ({
offset: block.offset - blockGroup.offset,
length: block.length,
}));
if (uncompressBufSize > 0) {
const result = await decompressAndParseSummaryBlocks(data, localBlocks, uncompressBufSize, request?.chrId ?? 0, request?.start ?? 0, request?.end ?? 0);
if (result.starts.length > 0) {
allStarts.push(result.starts);
allEnds.push(result.ends);
allScores.push(result.scores);
allMinScores.push(result.minScores);
allMaxScores.push(result.maxScores);
totalCount += result.starts.length;
}
}
else {
for (const block of localBlocks) {
const blockData = data.subarray(block.offset, block.offset + block.length);
const features = this.parseSummaryBlock(blockData, 0, request);
if (features.length > 0) {
const starts = new Int32Array(features.length);
const ends = new Int32Array(features.length);
const scores = new Float32Array(features.length);
const minScores = new Float32Array(features.length);
const maxScores = new Float32Array(features.length);
for (let i = 0; i < features.length; i++) {
const f = features[i];
starts[i] = f.start;
ends[i] = f.end;
scores[i] = f.score ?? 0;
minScores[i] = f.minScore ?? 0;
maxScores[i] = f.maxScore ?? 0;
}
allStarts.push(starts);
allEnds.push(ends);
allScores.push(scores);
allMinScores.push(minScores);
allMaxScores.push(maxScores);
totalCount += features.length;
}
}
}
}));
if (allStarts.length === 0) {
return {
starts: new Int32Array(0),
ends: new Int32Array(0),
scores: new Float32Array(0),
minScores: new Float32Array(0),
maxScores: new Float32Array(0),
isSummary: true,
};
}
if (allStarts.length === 1) {
return {
starts: allStarts[0],
ends: allEnds[0],
scores: allScores[0],
minScores: allMinScores[0],
maxScores: allMaxScores[0],
isSummary: true,
};
}
const starts = new Int32Array(totalCount);
const ends = new Int32Array(totalCount);
const scores = new Float32Array(totalCount);
const minScores = new Float32Array(totalCount);
const maxScores = new Float32Array(totalCount);
let offset = 0;
for (let i = 0; i < allStarts.length; i++) {
starts.set(allStarts[i], offset);
ends.set(allEnds[i], offset);
scores.set(allScores[i], offset);
minScores.set(allMinScores[i], offset);
maxScores.set(allMaxScores[i], offset);
offset += allStarts[i].length;
}
return {
starts,
ends,
scores,
minScores,
maxScores,
isSummary: true,
};
}
}
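// Usage sketch (illustrative; BlockView instances are normally created
// internally by this package's BigWig/BigBed classes, so `blockView` here is
// assumed to come from there). readWigData() streams parsed feature arrays
// through an observer rather than returning them:
//
//   const features = [];
//   blockView.readWigData('chr1', 0, 10000, {
//     next: batch => features.push(...batch), // one array per parsed block
//     error: e => console.error(e),
//     complete: () => console.log('got', features.length, 'features'),
//   });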
//# sourceMappingURL=block-view.js.map