/*
 * @gmod/bbi
 * Version: (unspecified)
 * Parser for BigWig/BigBed files
 * 243 lines • 9.37 kB — JavaScript
 */
import AbortablePromiseCache from '@gmod/abortable-promise-cache';
import QuickLRU from '@jbrowse/quick-lru';
import { Observable, firstValueFrom, merge } from 'rxjs';
import { map, reduce } from 'rxjs/operators';
import { BBI } from "./bbi.js";
// Shared UTF-8 decoder, used by parseKey to decode key bytes read from B+ tree nodes
const decoder = new TextDecoder('utf8');
// Drops falsy entries (undefined, null, 0, '', false, NaN) from an array.
// Despite the name, every falsy value is removed, not only undefined.
export function filterUndef(ts) {
    return ts.filter(Boolean);
}
// Returns the fieldIndex-th tab-separated field of str without splitting the
// whole string. Returns undefined for a negative index or when str has fewer
// than fieldIndex + 1 fields.
function getTabField(str, fieldIndex) {
    if (fieldIndex < 0) {
        return undefined;
    }
    // Advance past fieldIndex tab characters to reach the field's start
    let begin = 0;
    let remaining = fieldIndex;
    while (remaining > 0) {
        const tab = str.indexOf('\t', begin);
        if (tab === -1) {
            // Fewer fields than requested
            return undefined;
        }
        begin = tab + 1;
        remaining -= 1;
    }
    // Field runs to the next tab, or to the end of the string for the last field
    const stop = str.indexOf('\t', begin);
    return stop === -1 ? str.slice(begin) : str.slice(begin, stop);
}
// Decodes a fixed-width key field from a B+ tree node buffer. Keys occupy
// keySize bytes; shorter keys are null-terminated, so decoding stops at the
// first NUL byte within the field (or at the field boundary if there is none).
function parseKey(buffer, offset, keySize) {
    const limit = offset + keySize;
    const nul = buffer.indexOf(0, offset);
    // Ignore a NUL that lies outside this key's keySize-byte window
    const end = nul === -1 || nul >= limit ? limit : nul;
    return decoder.decode(buffer.subarray(offset, end));
}
// Recursively traverses a B+ tree to search for a specific name in the BigBed
// extraIndex. B+ trees are balanced tree structures optimized for disk-based
// searches.
//
// Node layout read here (little-endian):
//   int8  nodeType  (0 = internal node, 1 = leaf node)
//   1 reserved byte
//   int16 count     (number of entries in this node)
// followed by `count` fixed-width entries: an internal entry is a keySize-byte
// key plus an 8-byte child offset; a leaf entry is a keySize-byte key plus an
// 8-byte data offset, a 4-byte length, and 4 reserved bytes.
//
// @param bbi - file accessor exposing read(length, offset, opts)
// @param nodeOffset - absolute file offset of the node to read
// @param blockSize - maximum entries per node (from the B+ tree header)
// @param keySize - fixed byte width of each key
// @param valSize - byte width of each value (from the B+ tree header)
// @param name - the key being searched for
// @param field - which bed column this extraIndex covers; copied into the result
// @param opts - passed through to bbi.read (e.g. carries an abort signal)
// @returns the matching { key, offset, length, field } leaf entry, or undefined
async function readBPlusTreeNode(bbi, nodeOffset, blockSize, keySize, valSize, name, field, opts) {
    // Upper bound on node size: 4-byte header plus blockSize fixed-width entries
    const len = 4 + blockSize * (keySize + valSize);
    const buffer = await bbi.read(len, nodeOffset, opts);
    const dataView = new DataView(buffer.buffer, buffer.byteOffset, buffer.length);
    let offset = 0;
    const nodeType = dataView.getInt8(offset);
    offset += 2; // skip nodeType byte + 1 reserved byte
    const cnt = dataView.getInt16(offset, true);
    offset += 2;
    // Non-leaf node (nodeType === 0): contains keys and child node pointers for navigation
    if (nodeType === 0) {
        const leafkeys = [];
        for (let i = 0; i < cnt; i++) {
            // Internal entry: keySize-byte key followed by 8-byte child offset
            const key = parseKey(buffer, offset, keySize);
            offset += keySize;
            const dataOffset = Number(dataView.getBigUint64(offset, true));
            offset += 8;
            leafkeys.push({
                key,
                offset: dataOffset,
            });
        }
        // Binary search for the last entry whose key <= name; that child's
        // subtree is the one that can contain name.
        // NOTE(review): keys on disk are presumably sorted bytewise, but this
        // comparison uses locale-dependent localeCompare — likely fine for
        // ASCII names; confirm for mixed-case/non-ASCII identifiers.
        let left = 0;
        let right = leafkeys.length - 1;
        let targetIndex = leafkeys.length - 1;
        while (left <= right) {
            const mid = Math.floor((left + right) / 2);
            const cmp = name.localeCompare(leafkeys[mid].key);
            if (cmp < 0) {
                targetIndex = mid - 1;
                right = mid - 1;
            }
            else {
                left = mid + 1;
            }
        }
        // If name sorts before every key (targetIndex === -1), fall back to the
        // first child; the search will simply fail to match in that subtree.
        const childOffset = targetIndex >= 0 ? leafkeys[targetIndex].offset : leafkeys[0].offset;
        return readBPlusTreeNode(bbi, childOffset, blockSize, keySize, valSize, name, field, opts);
    }
    else if (nodeType === 1) {
        // Leaf node (nodeType === 1): contains actual key-value data
        const keys = [];
        for (let i = 0; i < cnt; i++) {
            // Leaf entry: keySize-byte key + 8-byte data offset + 4-byte length + 4 reserved bytes
            const key = parseKey(buffer, offset, keySize);
            offset += keySize;
            const dataOffset = Number(dataView.getBigUint64(offset, true));
            offset += 8;
            const length = dataView.getUint32(offset, true);
            offset += 4;
            offset += 4; // skip reserved
            keys.push({
                key,
                offset: dataOffset,
                length,
            });
        }
        // Binary search for exact key match in sorted leaf node
        let left = 0;
        let right = keys.length - 1;
        while (left <= right) {
            const mid = Math.floor((left + right) / 2);
            const cmp = name.localeCompare(keys[mid].key);
            if (cmp === 0) {
                // Found: return the data location plus which field this index covers
                return { ...keys[mid], field };
            }
            else if (cmp < 0) {
                right = mid - 1;
            }
            else {
                left = mid + 1;
            }
        }
        // Name not present in this leaf
        return undefined;
    }
    // Unknown nodeType: falls through and implicitly returns undefined
}
export class BigBed extends BBI {
    // Caches the parsed extraIndex list; maxSize 1 because a file has a single
    // extra-index header, so all calls share one parsed result
    readIndicesCache = new AbortablePromiseCache({
        cache: new QuickLRU({ maxSize: 1 }),
        fill: (args, signal) => this._readIndices({ ...args, signal }),
    });
    /*
     * cached read of the extraIndex list; the abort signal is stripped from
     * the cache key so identical concurrent requests share one in-flight read
     */
    readIndices(opts = {}) {
        const { signal, ...rest } = opts;
        return this.readIndicesCache.get(JSON.stringify(rest), opts, signal);
    }
    /*
     * retrieve unzoomed view for any scale
     */
    async getView(_scale, opts) {
        return this.getUnzoomedView(opts);
    }
    /*
     * parse the bigbed extraIndex fields
     *
     * @param opts - options object, may carry an AbortSignal as opts.signal
     *
     * @return a Promise for an array of Index data structure since there can be
     * multiple extraIndexes in a bigbed, see bedToBigBed documentation
     */
    async _readIndices(opts) {
        const { extHeaderOffset } = await this.getHeader(opts);
        // Extra-index header: 2-byte size, 2-byte count, 8-byte list offset (LE).
        // BUGFIX: forward opts so the caller's abort signal reaches this read
        // (it was previously dropped here, unlike in searchExtraIndexBlocks)
        const b = await this.bbi.read(64, extHeaderOffset, opts);
        const dataView = new DataView(b.buffer, b.byteOffset, b.length);
        let offset = 0;
        // const _size = dataView.getUint16(offset, true)
        offset += 2;
        const count = dataView.getUint16(offset, true);
        offset += 2;
        const dataOffset = Number(dataView.getBigUint64(offset, true));
        offset += 8;
        // no extra index is defined if count==0
        if (count === 0) {
            return [];
        }
        // each extraIndex record occupies 20 bytes on disk
        const blocklen = 20;
        const len = blocklen * count;
        // BUGFIX: forward opts here as well so the read is abortable
        const buffer = await this.bbi.read(len, dataOffset, opts);
        const indices = [];
        for (let i = 0; i < count; i += 1) {
            const b = buffer.subarray(i * blocklen);
            const dataView = new DataView(b.buffer, b.byteOffset, b.length);
            let offset = 0;
            const type = dataView.getInt16(offset, true);
            offset += 2;
            const fieldcount = dataView.getInt16(offset, true);
            offset += 2;
            const dataOffset = Number(dataView.getBigUint64(offset, true));
            offset += 8 + 4; // skip 8-byte offset + 4 reserved bytes
            const field = dataView.getInt16(offset, true);
            indices.push({
                type,
                fieldcount,
                offset: dataOffset,
                field,
            });
        }
        return indices;
    }
    /*
     * perform a search in the bigbed extraIndex to find which blocks in the
     * bigbed data to look for the actual feature data
     *
     * @param name - the name to search for
     *
     * @param opts - a SearchOptions argument with optional signal
     *
     * @return a Promise for an array of bigbed block Loc entries
     */
    async searchExtraIndexBlocks(name, opts = {}) {
        const indices = await this.readIndices(opts);
        if (indices.length === 0) {
            return [];
        }
        // Each extraIndex points at a B+ tree whose 32-byte header holds the
        // magic, blockSize, keySize, valSize, and item count; all indexes are
        // searched in parallel
        const locs = indices.map(async (index) => {
            const { offset: offset2, field } = index;
            const b = await this.bbi.read(32, offset2, opts);
            const dataView = new DataView(b.buffer, b.byteOffset, b.length);
            let offset = 0;
            // const _magic = dataView.getInt32(offset, true)
            offset += 4;
            const blockSize = dataView.getInt32(offset, true);
            offset += 4;
            const keySize = dataView.getInt32(offset, true);
            offset += 4;
            const valSize = dataView.getInt32(offset, true);
            offset += 4;
            // const _itemCount = Number(dataView.getBigUint64(offset, true))
            offset += 8;
            // the root node begins immediately after the 32-byte header
            return readBPlusTreeNode(this.bbi, offset2 + 32, blockSize, keySize, valSize, name, field, opts);
        });
        return filterUndef(await Promise.all(locs));
    }
    /*
     * retrieve the features from the bigbed data that were found through the
     * lookup of the extraIndex note that there can be multiple extraIndex, see
     * the BigBed specification and the -extraIndex argument to bedToBigBed
     *
     * @param name - the name to search for
     *
     * @param opts - options object with optional AbortSignal
     *
     * @return array of Feature
     */
    async searchExtraIndex(name, opts = {}) {
        const blocks = await this.searchExtraIndexBlocks(name, opts);
        if (blocks.length === 0) {
            return [];
        }
        const view = await this.getUnzoomedView(opts);
        // One observable per matched block: read its features, accumulate them
        // into a single array, and tag each feature with the block's field
        const res = blocks.map(block => {
            return new Observable(observer => {
                view.readFeatures(observer, [block], opts).catch((e) => {
                    observer.error(e);
                });
            }).pipe(reduce((acc, curr) => {
                acc.push(...curr);
                return acc;
            }, []), map(features => features.map(f => ({ ...f, field: block.field }))));
        });
        // NOTE(review): firstValueFrom resolves with the first observable to
        // emit, so when multiple extraIndex blocks match, only one block's
        // features are returned — confirm this first-match behavior is intended
        const ret = await firstValueFrom(merge(...res));
        // Filter to features where the indexed field matches the search name
        // field offset is adjusted by -3 to account for chrom, chromStart, chromEnd columns
        return ret.filter(f => {
            if (!f.rest) {
                return false;
            }
            const fieldIndex = (f.field || 0) - 3;
            return getTabField(f.rest, fieldIndex) === name;
        });
    }
}
//# sourceMappingURL=bigbed.js.map