// @gmod/bbi
// Version:
// Parser for BigWig/BigBed files
// 346 lines • 13.2 kB
// JavaScript
import { LocalFile, RemoteFile } from 'generic-filehandle2';
import { Observable, firstValueFrom } from 'rxjs';
import { toArray } from 'rxjs/operators';
import { BlockView } from "./block-view.js";
// File-type magic numbers, compared against the first four bytes of the file
// read as a little-endian *signed* 32-bit integer. They are spelled here as
// the unsigned bit pattern; `| 0` reinterprets that pattern as signed.
const BIG_WIG_MAGIC = 0x888ffc26 | 0;
const BIG_BED_MAGIC = 0x8789f2eb | 0;
// Shared decoder for the byte strings embedded in the file (autoSql text and
// chromosome names).
const decoder = new TextDecoder('utf8');
/**
 * Wraps a typed array (or Node Buffer) in a DataView covering exactly the
 * bytes the array covers, without copying.
 *
 * The view length is taken from `byteLength` rather than `length`: `length`
 * counts elements, which only equals the byte count for 1-byte element types.
 *
 * @param buffer - a typed array view over an ArrayBuffer
 * @returns a DataView over the same bytes
 */
function getDataView(buffer) {
  return new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
}
/**
 * Shared reader logic for BigWig and BigBed ("BBI") files: parses the main
 * header and the chromosome-name B+ tree, and exposes feature queries on top
 * of a view object.
 *
 * `getFeatureStream` calls `this.getView(scale, opts)`, which is not defined
 * in this class; it is expected to be supplied elsewhere (presumably by a
 * subclass — confirm against the rest of the package).
 */
export class BBI {
  /** Filehandle all reads go through (`read(length, position, opts)`). */
  bbi;
  /** Cached promise for the parsed header; reset to undefined if parsing fails. */
  headerP;
  /** Function applied to reference sequence names (identity by default). */
  renameRefSeqs;

  /**
   * Returns the parsed header, parsing the file at most once.
   *
   * The promise is cached, so only the `opts` of the call that actually
   * triggers parsing are used. A failed parse clears the cache so that a later
   * call can retry.
   *
   * @param opts - options forwarded to the filehandle reads
   * @returns promise for the main header fields plus refsByName/refsByNumber
   */
  getHeader(opts) {
    if (!this.headerP) {
      this.headerP = this._getHeader(opts).catch((e) => {
        this.headerP = undefined;
        throw e;
      });
    }
    return this.headerP;
  }

  /**
   * @param args.filehandle - a filehandle from generic-filehandle2
   *
   * @param args.path - a local file path as a string
   *
   * @param args.url - a URL string
   *
   * @param args.renameRefSeqs - an optional method to rename the internal
   * reference sequences using a mapping function
   *
   * Precedence when several sources are given: filehandle, then url, then
   * path.
   *
   * @throws Error('no file given') when none of the three sources is provided
   */
  constructor(args) {
    const { filehandle, renameRefSeqs = s => s, path, url } = args;
    this.renameRefSeqs = renameRefSeqs;
    if (filehandle) {
      this.bbi = filehandle;
    }
    else if (url) {
      this.bbi = new RemoteFile(url);
    }
    else if (path) {
      this.bbi = new LocalFile(path);
    }
    else {
      throw new Error('no file given');
    }
  }

  /**
   * Parses the main header and then the chromosome tree it points to, and
   * merges both results into a single object.
   *
   * @param opts - options forwarded to the filehandle reads
   */
  async _getHeader(opts) {
    const header = await this._getMainHeader(opts);
    const chroms = await this._readChromosomeTree(header, opts);
    return {
      ...header,
      ...chroms,
    };
  }

  /**
   * Reads and parses the fixed header, the zoom level table, the total
   * summary record and the autoSql text from the start of the file.
   *
   * Everything is parsed out of one read of `requestSize` bytes. If the
   * autoSql text or the total summary does not fit in that window, the method
   * calls itself again with a window twice as large.
   *
   * @param opts - options forwarded to the filehandle read
   * @param requestSize - number of bytes to fetch from offset 0
   * @throws Error('not a BigWig/BigBed file') when the magic number is unknown
   * @throws Error('no stats') when the header has no total summary offset
   */
  async _getMainHeader(opts, requestSize = 2000) {
    const b = await this.bbi.read(requestSize, 0, opts);
    const dataView = getDataView(b);
    const magic = dataView.getInt32(0, true);
    if (magic !== BIG_WIG_MAGIC && magic !== BIG_BED_MAGIC) {
      throw new Error('not a BigWig/BigBed file');
    }
    let offset = 4;
    const version = dataView.getUint16(offset, true);
    offset += 2;
    const numZoomLevels = dataView.getUint16(offset, true);
    offset += 2;
    // Offset to the B+ tree that maps chromosome names to integer IDs
    const chromosomeTreeOffset = Number(dataView.getBigUint64(offset, true));
    offset += 8;
    const unzoomedDataOffset = Number(dataView.getBigUint64(offset, true));
    offset += 8;
    const unzoomedIndexOffset = Number(dataView.getBigUint64(offset, true));
    offset += 8;
    const fieldCount = dataView.getUint16(offset, true);
    offset += 2;
    const definedFieldCount = dataView.getUint16(offset, true);
    offset += 2;
    const asOffset = Number(dataView.getBigUint64(offset, true));
    offset += 8;
    const totalSummaryOffset = Number(dataView.getBigUint64(offset, true));
    offset += 8;
    const uncompressBufSize = dataView.getUint32(offset, true);
    offset += 4;
    const extHeaderOffset = Number(dataView.getBigUint64(offset, true));
    offset += 8;

    // One 24-byte record per zoom level follows the fixed header
    const zoomLevels = [];
    for (let i = 0; i < numZoomLevels; i++) {
      const reductionLevel = dataView.getUint32(offset, true);
      offset += 4;
      const reserved = dataView.getUint32(offset, true);
      offset += 4;
      const dataOffset = Number(dataView.getBigUint64(offset, true));
      offset += 8;
      const indexOffset = Number(dataView.getBigUint64(offset, true));
      offset += 8;
      zoomLevels.push({
        reductionLevel,
        reserved,
        dataOffset,
        indexOffset,
      });
    }
    const fileType = magic === BIG_BED_MAGIC ? 'bigbed' : 'bigwig';

    // refetch header if it is too large on first pass,
    // 8*5 is the sizeof the totalSummary struct
    if (asOffset > requestSize || totalSummaryOffset > requestSize - 8 * 5) {
      return this._getMainHeader(opts, requestSize * 2);
    }

    let autoSql = '';
    if (asOffset) {
      const terminator = b.indexOf(0, asOffset);
      // No NUL in a completely filled window means the text continues past
      // what was fetched, so fetch more instead of returning a truncated
      // string. A short read means the file ended; then take what is there.
      if (terminator === -1 && b.length >= requestSize) {
        return this._getMainHeader(opts, requestSize * 2);
      }
      // indexOf returns -1 when there is no terminator; passing -1 straight
      // to subarray would silently drop the last byte.
      const textEnd = terminator === -1 ? b.length : terminator;
      autoSql = decoder.decode(b.subarray(asOffset, textEnd));
    }

    if (!totalSummaryOffset) {
      throw new Error('no stats');
    }
    // Total summary: one uint64 followed by four float64 values
    let cursor = totalSummaryOffset;
    const basesCovered = Number(dataView.getBigUint64(cursor, true));
    cursor += 8;
    const scoreMin = dataView.getFloat64(cursor, true);
    cursor += 8;
    const scoreMax = dataView.getFloat64(cursor, true);
    cursor += 8;
    const scoreSum = dataView.getFloat64(cursor, true);
    cursor += 8;
    const scoreSumSquares = dataView.getFloat64(cursor, true);
    const totalSummary = {
      scoreMin,
      scoreMax,
      scoreSum,
      scoreSumSquares,
      basesCovered,
    };

    return {
      zoomLevels,
      magic,
      extHeaderOffset,
      numZoomLevels,
      fieldCount,
      totalSummary,
      definedFieldCount,
      uncompressBufSize,
      asOffset,
      chromosomeTreeOffset,
      totalSummaryOffset,
      unzoomedDataOffset,
      unzoomedIndexOffset,
      fileType,
      version,
      autoSql,
    };
  }

  /**
   * Reads the B+ tree that maps chromosome names to integer IDs.
   * This is part of the "cirTree" (combined ID R-tree) structure, which uses
   * integer chromosome IDs instead of strings for more efficient spatial
   * indexing.
   *
   * `opts` is forwarded to every read, including the ones issued while
   * walking the tree nodes.
   *
   * @param header - parsed main header; only `chromosomeTreeOffset` is used
   * @param opts - options forwarded to the filehandle reads
   * @returns `refsByName` (renamed name -> id) and `refsByNumber`
   * (id -> { name, id, length }, with the name as stored in the file)
   */
  async _readChromosomeTree(header, opts) {
    const refsByNumber = [];
    const refsByName = {};
    const chromosomeTreeOffset = header.chromosomeTreeOffset;
    const treeHeader = getDataView(await this.bbi.read(32, chromosomeTreeOffset, opts));
    // 32-byte tree header: magic (4) and blockSize (4) are skipped, then
    // keySize (4) and valSize (4); the rest (itemCount etc.) is unused here
    const keySize = treeHeader.getUint32(8, true);
    const valSize = treeHeader.getUint32(12, true);

    // Recursively traverses the B+ tree to populate chromosome name-to-ID mappings
    const readBPlusTreeNode = async (nodeOffset) => {
      // 4-byte node header: isLeaf (1), reserved (1), item count (2)
      const nodeHeader = getDataView(await this.bbi.read(4, nodeOffset, opts));
      const isLeafNode = nodeHeader.getUint8(0);
      const count = nodeHeader.getUint16(2, true);
      const itemsOffset = nodeOffset + 4;

      if (isLeafNode) {
        // Leaf nodes contain the actual chromosome name-to-ID mappings:
        // key (keySize bytes, NUL padded) followed by id (4) and size (4)
        const itemSize = keySize + valSize;
        const items = await this.bbi.read(count * itemSize, itemsOffset, opts);
        const itemsView = getDataView(items);
        for (let n = 0; n < count; n++) {
          const base = n * itemSize;
          const keyBytes = items.subarray(base, base + keySize);
          const nul = keyBytes.indexOf(0);
          const key = decoder.decode(nul === -1 ? keyBytes : keyBytes.subarray(0, nul));
          const refId = itemsView.getUint32(base + keySize, true);
          const refSize = itemsView.getUint32(base + keySize + 4, true);
          refsByName[this.renameRefSeqs(key)] = refId;
          refsByNumber[refId] = {
            name: key,
            id: refId,
            length: refSize,
          };
        }
        return;
      }

      // Non-leaf nodes contain pointers to child nodes: key (keySize bytes)
      // followed by a 64-bit child offset. Children are read concurrently.
      const itemSize = keySize + 8;
      const itemsView = getDataView(await this.bbi.read(count * itemSize, itemsOffset, opts));
      const children = [];
      for (let n = 0; n < count; n++) {
        const childOffset = Number(itemsView.getBigUint64(n * itemSize + keySize, true));
        children.push(readBPlusTreeNode(childOffset));
      }
      await Promise.all(children);
    };

    await readBPlusTreeNode(chromosomeTreeOffset + 32);
    return {
      refsByName,
      refsByNumber,
    };
  }

  /**
   * Fetches the "unzoomed" view of the data, built on the unzoomed index
   * recorded in the header.
   *
   * @param opts - options forwarded to getHeader
   * @returns a BlockView over the unzoomed index
   */
  async getUnzoomedView(opts) {
    const { unzoomedIndexOffset, refsByName, uncompressBufSize, fileType } = await this.getHeader(opts);
    return new BlockView(this.bbi, refsByName, unzoomedIndexOffset, uncompressBufSize, fileType);
  }

  /**
   * Gets features from a BigWig file as an Observable.
   *
   * @param refName - The chromosome name
   *
   * @param start - The start of a region
   *
   * @param end - The end of a region
   *
   * @param opts - An object containing basesPerSpan (e.g. pixels per basepair)
   * or scale used to infer the zoomLevel to use. basesPerSpan takes
   * precedence and is converted to a scale of 1 / basesPerSpan; with neither
   * set, a scale of 1 is used.
   *
   * @returns an Observable; the view's readWigData starts when it is
   * subscribed to, and a rejection from it is reported as an Observable error
   */
  async getFeatureStream(refName, start, end, opts) {
    await this.getHeader(opts);
    const chrName = this.renameRefSeqs(refName);
    const { basesPerSpan, scale } = opts || {};
    let viewScale = 1;
    if (basesPerSpan) {
      viewScale = 1 / basesPerSpan;
    }
    else if (scale) {
      viewScale = scale;
    }
    const view = await this.getView(viewScale, opts);
    return new Observable(observer => {
      view
        .readWigData(chrName, start, end, observer, opts)
        .catch((e) => {
          observer.error(e);
        });
    });
  }

  /**
   * Collects everything emitted by getFeatureStream into one flat array.
   *
   * The stream emits arrays of features; they are concatenated in emission
   * order into a preallocated result.
   *
   * @param refName - The chromosome name
   * @param start - The start of a region
   * @param end - The end of a region
   * @param opts - Same options as getFeatureStream
   */
  async getFeatures(refName, start, end, opts) {
    const ob = await this.getFeatureStream(refName, start, end, opts);
    const chunks = await firstValueFrom(ob.pipe(toArray()));
    const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const result = new Array(totalLength);
    let index = 0;
    for (const chunk of chunks) {
      for (const item of chunk) {
        result[index++] = item;
      }
    }
    return result;
  }

  /**
   * Gets features from a BigWig file as typed arrays.
   *
   * Whether the result is treated as summary data is decided from the first
   * feature only (`summary === true`). Missing score values are stored as 0.
   *
   * @param refName - The chromosome name
   * @param start - The start of a region
   * @param end - The end of a region
   * @param opts - Options including basesPerSpan or scale
   * @returns Promise with typed arrays: starts, ends, scores, plus
   * minScores/maxScores when isSummary is true
   */
  async getFeaturesAsArrays(refName, start, end, opts) {
    const features = await this.getFeatures(refName, start, end, opts);
    const count = features.length;
    const isSummary = count > 0 && features[0]?.summary === true;

    const starts = new Int32Array(count);
    const ends = new Int32Array(count);
    const scores = new Float32Array(count);
    for (let i = 0; i < count; i++) {
      const f = features[i];
      starts[i] = f.start;
      ends[i] = f.end;
      scores[i] = f.score ?? 0;
    }
    if (!isSummary) {
      return { starts, ends, scores, isSummary: false };
    }

    const minScores = new Float32Array(count);
    const maxScores = new Float32Array(count);
    for (let i = 0; i < count; i++) {
      const f = features[i];
      minScores[i] = f.minScore ?? 0;
      maxScores[i] = f.maxScore ?? 0;
    }
    return {
      starts,
      ends,
      scores,
      minScores,
      maxScores,
      isSummary: true,
    };
  }
}
//# sourceMappingURL=bbi.js.map