genomic-reader
Version:
A TypeScript library for reading BigWig, BigBed, 2bit, and BAM files. Capable of streaming. For use in the browser or on Node.js.
349 lines • 16.6 kB
JavaScript
"use strict";
// Compiler-emitted helper that drives a generator function as an asynchronous routine.
// Reuses an existing `this.__awaiter` when one is present; otherwise defines the helper here.
// thisArg / _arguments: receiver and argument list the generator is applied with.
// P: promise constructor to use (falls back to the global Promise when falsy).
// generator: generator function whose yielded values are awaited one by one.
// Returns a promise that resolves with the generator's return value, or rejects
// when the generator throws or a yielded promise rejects without being caught.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P promise unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value of the previously yielded promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Raise the rejection reason inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done; otherwise wait for the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and kick off the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseBigBed = exports.BigWigReader = void 0;
const DataLoader_1 = require("../loader/DataLoader");
const BinaryParser_1 = require("../util/BinaryParser");
const BigWigHeaderReader_1 = require("./BigWigHeaderReader");
const TwoBitHeaderReader_1 = require("./TwoBitHeaderReader");
const pako_1 = require("pako");
const stream_1 = require("stream");
// Value the first unsigned int at an R+ tree offset must equal; loadData rejects the file otherwise.
const IDX_MAGIC = 0x2468ACE0;
// Number of bytes loaded for the R+ tree header; the root node is read at treeOffset + this size.
const RPTREE_HEADER_SIZE = 48;
// Bytes per item in a leaf node, as parsed by loadLeafNodesForRPNode: four ints followed by two longs.
const RPTREE_NODE_LEAF_ITEM_SIZE = 32;
// Bytes per item in a non-leaf node, as parsed by loadLeafNodesForRPNode: four ints followed by one long.
const RPTREE_NODE_CHILD_ITEM_SIZE = 24;
// Default bufferSize handed to BufferedDataLoader when the BigWigReader constructor is given none.
const DEFAULT_BUFFER_SIZE = 512000;
/**
 * Reader for BigWig, BigBed and 2bit files accessed through a data loader.
 *
 * The parsed file header and the 2bit sequence records are cached on the
 * instance after their first load. All public methods return promises.
 */
class BigWigReader {
    /**
     * @param dataLoader Loader handed to the header readers and to BufferedDataLoader.
     * @param bufferSize Buffer size passed to BufferedDataLoader for index/data reads.
     */
    constructor(dataLoader, bufferSize = DEFAULT_BUFFER_SIZE) {
        this.dataLoader = dataLoader;
        this.bufferSize = bufferSize;
        // Sequence records keyed by chromosome name, filled lazily by getSequenceRecord.
        this.cachedSequenceRecords = {};
    }
    /**
     * @returns Promise of the `fileType` field of the file header.
     */
    fileType() {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            return header.fileType;
        });
    }
    /**
     * Loads the file header on first use and returns the cached copy afterwards.
     *
     * @returns Promise of the header data.
     */
    getHeader() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.cachedHeader) {
                this.cachedHeader = yield BigWigHeaderReader_1.loadHeaderData(this.dataLoader);
            }
            return this.cachedHeader;
        });
    }
    /**
     * Loads (and caches per chromosome) the 2bit sequence record for `chrom`.
     *
     * @throws FileFormatError when the header's file type is not TwoBit.
     */
    getSequenceRecord(chrom) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (header.fileType !== BigWigHeaderReader_1.FileType.TwoBit)
                throw new DataLoader_1.FileFormatError("getSequenceRecord is not valid on " + header.fileType + " files.");
            if (!this.cachedSequenceRecords[chrom]) {
                this.cachedSequenceRecords[chrom] = yield TwoBitHeaderReader_1.loadSequenceRecord(this.dataLoader, header, chrom);
            }
            return this.cachedSequenceRecords[chrom];
        });
    }
    /**
     * Reads unzoomed wig entries for the given range into an array, using the
     * index located at the header's `common.fullIndexOffset`.
     */
    readBigWigData(startChrom, startBase, endChrom, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.readData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeWigData);
        });
    }
    /**
     * Same query as readBigWigData, delivered as an object-mode Readable.
     */
    streamBigWigData(startChrom, startBase, endChrom, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.streamData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeWigData);
        });
    }
    /**
     * Reads bed entries for the given range into an array.
     *
     * @param restParser Optional function (chrom, startBase, endBase, rest) => entry;
     *                   parseBigBed is used when it is falsy.
     */
    readBigBedData(startChrom, startBase, endChrom, endBase, restParser) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.readData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeBedData(restParser || parseBigBed));
        });
    }
    /**
     * Same query as readBigBedData, delivered as an object-mode Readable.
     */
    streamBigBedData(startChrom, startBase, endChrom, endBase, restParser) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.streamData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeBedData(restParser || parseBigBed));
        });
    }
    /**
     * Loads the sequence for `chrom` between startBase and endBase via
     * TwoBitHeaderReader.loadSequence.
     */
    readTwoBitData(chrom, startBase, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            // getSequenceRecord awaits getHeader, so this.cachedHeader is populated below.
            const sequence = yield this.getSequenceRecord(chrom);
            return TwoBitHeaderReader_1.loadSequence(this.dataLoader, this.cachedHeader, sequence, startBase, endBase);
        });
    }
    /**
     * Loads the range via TwoBitHeaderReader.loadOneHotEncodingFromSequence.
     */
    readTwoBitDataMatrix(chrom, startBase, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            const sequence = yield this.getSequenceRecord(chrom);
            return TwoBitHeaderReader_1.loadOneHotEncodingFromSequence(this.dataLoader, this.cachedHeader, sequence, startBase, endBase);
        });
    }
    /**
     * Streams the range via TwoBitHeaderReader.streamSequence.
     *
     * @param chunkSize         Forwarded to streamSequence (default 1024).
     * @param oneHotEncodedData Forwarded to streamSequence (default false).
     */
    streamTwoBitData(chrom, startBase, endBase, chunkSize = 1024, oneHotEncodedData = false) {
        return __awaiter(this, void 0, void 0, function* () {
            const sequence = yield this.getSequenceRecord(chrom);
            return TwoBitHeaderReader_1.streamSequence(this.dataLoader, this.cachedHeader, sequence, startBase, endBase, chunkSize, oneHotEncodedData);
        });
    }
    /**
     * Reads zoom records for the given range into an array.
     *
     * @param zoomLevelIndex Key into the header's zoomLevelHeaders.
     * @throws FileFormatError when the header has no such zoom level.
     */
    readZoomData(startChrom, startBase, endChrom, endBase, zoomLevelIndex) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (undefined == header.zoomLevelHeaders || !(zoomLevelIndex in header.zoomLevelHeaders)) {
                throw new DataLoader_1.FileFormatError("Given zoomLevelIndex not found in zoom level headers.");
            }
            const treeOffset = header.zoomLevelHeaders[zoomLevelIndex].indexOffset;
            return this.readData(startChrom, startBase, endChrom, endBase, treeOffset, decodeZoomData);
        });
    }
    /**
     * Same query as readZoomData, delivered as an object-mode Readable.
     */
    streamZoomData(startChrom, startBase, endChrom, endBase, zoomLevelIndex) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (undefined == header.zoomLevelHeaders || !(zoomLevelIndex in header.zoomLevelHeaders)) {
                throw new DataLoader_1.FileFormatError("Given zoomLevelIndex not found in zoom level headers.");
            }
            const treeOffset = header.zoomLevelHeaders[zoomLevelIndex].indexOffset;
            return this.streamData(startChrom, startBase, endChrom, endBase, treeOffset, decodeZoomData);
        });
    }
    /**
     * Walks the R+ tree at `treeOffset`, loads every overlapping leaf block,
     * inflates it when the header's uncompressBuffSize is positive, decodes it
     * and hands each decoded batch to `loadFunction`.
     *
     * @param streamMode     Forwarded to BufferedDataLoader.
     * @param decodeFunction (buffer, startChromIndex, startBase, endChromIndex, endBase, idToChrom) => entries.
     * @param loadFunction   Called once per leaf block with the decoded entries.
     * @throws FileFormatError  when the header has no chromTree or the tree magic does not match.
     * @throws DataMissingError when startChrom or endChrom is not in the chromosome tree.
     */
    loadData(startChrom, startBase, endChrom, endBase, treeOffset, streamMode, decodeFunction, loadFunction) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (undefined == header.chromTree) {
                throw new DataLoader_1.FileFormatError("No chromosome tree found in file header.");
            }
            const startChromIndex = header.chromTree.chromToId[startChrom];
            const endChromIndex = header.chromTree.chromToId[endChrom];
            if (undefined == startChromIndex) {
                throw new DataLoader_1.DataMissingError(startChrom);
            }
            if (undefined == endChromIndex) {
                throw new DataLoader_1.DataMissingError(endChrom);
            }
            const bufferedLoader = new DataLoader_1.BufferedDataLoader(this.dataLoader, this.bufferSize, streamMode);
            // The tree header must start with the index magic number.
            const magic = new BinaryParser_1.BinaryParser(yield bufferedLoader.load(treeOffset, RPTREE_HEADER_SIZE)).getUInt();
            if (IDX_MAGIC !== magic) {
                throw new DataLoader_1.FileFormatError(`R+ tree not found at offset ${treeOffset}`);
            }
            const rootNodeOffset = treeOffset + RPTREE_HEADER_SIZE;
            const leafNodes = yield loadLeafNodesForRPNode(bufferedLoader, header.littleEndian, rootNodeOffset, startChromIndex, startBase, endChromIndex, endBase);
            for (const leafNode of leafNodes) {
                let leafData = new Uint8Array(yield bufferedLoader.load(leafNode.dataOffset, leafNode.dataSize));
                if (header.common.uncompressBuffSize > 0) {
                    leafData = pako_1.inflate(leafData);
                }
                const leafDecodedData = decodeFunction(leafData.buffer, startChromIndex, startBase, endChromIndex, endBase, header.chromTree.idToChrom);
                loadFunction(leafDecodedData);
            }
        });
    }
    /**
     * Runs loadData in non-stream mode and collects every decoded entry.
     *
     * @returns Promise of an array with all decoded entries, in load order.
     */
    readData(startChrom, startBase, endChrom, endBase, treeOffset, decodeFunction) {
        return __awaiter(this, void 0, void 0, function* () {
            const data = [];
            const load = (d) => {
                // Append one by one: data.push(...d) passes every entry as a call
                // argument and throws RangeError once a batch is large enough.
                for (const el of d) {
                    data.push(el);
                }
            };
            yield this.loadData(startChrom, startBase, endChrom, endBase, treeOffset, false, decodeFunction, load);
            return data;
        });
    }
    /**
     * Runs loadData in stream mode and pushes every decoded entry into an
     * object-mode Readable, which is ended once loading finishes.
     *
     * Note: loadData is awaited before the stream is returned, so all entries
     * are already queued in the stream when the caller receives it, and any
     * loading error rejects the returned promise instead of being emitted on
     * the stream.
     */
    streamData(startChrom, startBase, endChrom, endBase, treeOffset, decodeFunction) {
        return __awaiter(this, void 0, void 0, function* () {
            const stream = new stream_1.Readable({ objectMode: true, read() { } });
            const load = (d) => {
                d.forEach((el) => stream.push(el));
            };
            yield this.loadData(startChrom, startBase, endChrom, endBase, treeOffset, true, decodeFunction, load);
            stream.push(null);
            return stream;
        });
    }
}
// Publish the reader class on the CommonJS exports object.
exports.BigWigReader = BigWigReader;
/**
 * Recursively collects the leaf items of the R+ tree node at `rpNodeOffset`
 * whose bounds overlap the query range.
 *
 * @param bufferedLoader Loader with a `load(offset, size)` method.
 * @param littleEndian   Byte-order flag forwarded to BinaryParser.
 * @param rpNodeOffset   File offset of the node to inspect.
 * @returns Promise of an array of
 *          { startChrom, startBase, endChrom, endBase, dataOffset, dataSize }.
 */
function loadLeafNodesForRPNode(bufferedLoader, littleEndian, rpNodeOffset, startChromIndex, startBase, endChromIndex, endBase) {
    return __awaiter(this, void 0, void 0, function* () {
        const NODE_HEADER_BYTES = 4;
        // Node header: leaf flag byte, one skipped byte, then the item count.
        const headerParser = new BinaryParser_1.BinaryParser(yield bufferedLoader.load(rpNodeOffset, NODE_HEADER_BYTES), littleEndian);
        const leafFlag = headerParser.getByte();
        headerParser.position += 1;
        const itemCount = headerParser.getUShort();
        const nodeIsLeaf = leafFlag === 1;
        const itemBytes = nodeIsLeaf ? RPTREE_NODE_LEAF_ITEM_SIZE : RPTREE_NODE_CHILD_ITEM_SIZE;
        const itemsParser = new BinaryParser_1.BinaryParser(yield bufferedLoader.load(rpNodeOffset + NODE_HEADER_BYTES, itemCount * itemBytes), littleEndian);
        // Both range tests are inclusive at the boundary bases.
        const queryReaches = (chr, base) => endChromIndex > chr || (endChromIndex == chr && endBase >= base);
        const queryBeginsBy = (chr, base) => startChromIndex < chr || (startChromIndex == chr && startBase <= base);
        const collected = [];
        for (let remaining = itemCount; remaining > 0; remaining--) {
            const itemStartChr = itemsParser.getInt();
            const itemStartBase = itemsParser.getInt();
            const itemEndChr = itemsParser.getInt();
            const itemEndBase = itemsParser.getInt();
            const intersects = queryReaches(itemStartChr, itemStartBase) && queryBeginsBy(itemEndChr, itemEndBase);
            if (!nodeIsLeaf) {
                // The child pointer is always consumed so the parser stays aligned.
                const childOffset = itemsParser.getLong();
                if (intersects) {
                    const descendants = yield loadLeafNodesForRPNode(bufferedLoader, littleEndian, childOffset, startChromIndex, startBase, endChromIndex, endBase);
                    collected.push(...descendants);
                }
                continue;
            }
            const dataOffset = itemsParser.getLong();
            const dataSize = itemsParser.getLong();
            if (intersects) {
                collected.push({
                    startChrom: itemStartChr,
                    startBase: itemStartBase,
                    endChrom: itemEndChr,
                    endBase: itemEndBase,
                    dataOffset: dataOffset,
                    dataSize: dataSize
                });
            }
        }
        return collected;
    });
}
/**
 * Default parser for the tab-separated remainder of a BigBed record.
 *
 * Fields are taken positionally from `rest`: name, score, strand, cdStart,
 * cdEnd, color, then exon count, exon sizes and exon starts. A field is only
 * set when its token is present. Exons are only parsed when all three
 * exon tokens are present.
 *
 * @param chrom     Chromosome name, stored as `chr`.
 * @param startBase Record start, stored as `start`; exon starts are offsets from it.
 * @param endBase   Record end, stored as `end`.
 * @param rest      Tab-separated string holding the remaining fields.
 * @returns The assembled entry object.
 */
function parseBigBed(chrom, startBase, endBase, rest) {
    const entry = {
        chr: chrom,
        start: startBase,
        end: endBase
    };
    const tokens = rest.split("\t");
    // split() always yields at least one token, so name is set even for an empty rest.
    if (tokens.length > 0) {
        entry.name = tokens[0];
    }
    if (tokens.length > 1) {
        entry.score = parseFloat(tokens[1]);
    }
    if (tokens.length > 2) {
        entry.strand = tokens[2];
    }
    if (tokens.length > 3) {
        entry.cdStart = parseInt(tokens[3], 10);
    }
    if (tokens.length > 4) {
        entry.cdEnd = parseInt(tokens[4], 10);
    }
    // "." and "0" are treated as "no color given".
    if (tokens.length > 5 && tokens[5] !== "." && tokens[5] !== "0") {
        const rawColor = tokens[5];
        // A bare comma list such as "255,0,0" is wrapped into "rgb(...)"; anything else passes through.
        entry.color = rawColor.includes(",") && !rawColor.startsWith("rgb")
            ? "rgb(" + rawColor + ")"
            : rawColor;
    }
    if (tokens.length > 8) {
        const exonCount = parseInt(tokens[6], 10);
        const exonSizes = tokens[7].split(",");
        const exonStarts = tokens[8].split(",");
        // Never read past the lists actually supplied, even if the declared count is larger.
        const usableCount = Math.min(exonCount, exonSizes.length, exonStarts.length);
        const exons = [];
        for (let i = 0; i < usableCount; i++) {
            const offset = parseInt(exonStarts[i], 10);
            const size = parseInt(exonSizes[i], 10);
            // Skip blank or malformed items (e.g. the empty token after a trailing comma)
            // rather than emitting exons with NaN coordinates.
            if (Number.isNaN(offset) || Number.isNaN(size)) {
                continue;
            }
            const eStart = startBase + offset;
            exons.push({ start: eStart, end: eStart + size });
        }
        entry.exons = exons;
    }
    return entry;
}
// Publish the default BigBed rest-field parser on the CommonJS exports object.
exports.parseBigBed = parseBigBed;
/**
 * Builds a decoder for an uncompressed BigBed data block.
 *
 * @param restParser (chrom, startBase, endBase, rest) => entry, applied to each kept record.
 * @returns Decoder (data, filterStartChromIndex, filterStartBase, filterEndChromIndex,
 *          filterEndBase, chromDict) => entries. Records lying before the filter
 *          range are skipped; decoding stops at the first record at or past its end.
 */
const decodeBedData = (restParser) => (data, filterStartChromIndex, filterStartBase, filterEndChromIndex, filterEndBase, chromDict) => {
    // Three ints plus at least one byte for the string that follows them.
    const MIN_RECORD_BYTES = 3 * 4 + 1;
    const parser = new BinaryParser_1.BinaryParser(data);
    const entries = [];
    while (parser.remLength() >= MIN_RECORD_BYTES) {
        const chromIndex = parser.getInt();
        const chromName = chromDict[chromIndex];
        const recordStart = parser.getInt();
        const recordEnd = parser.getInt();
        const restFields = parser.getString();
        const liesBeforeRange = chromIndex < filterStartChromIndex
            || (chromIndex === filterStartChromIndex && recordEnd < filterStartBase);
        if (liesBeforeRange) {
            continue;
        }
        const liesPastRange = chromIndex > filterEndChromIndex
            || (chromIndex === filterEndChromIndex && recordStart >= filterEndBase);
        if (liesPastRange) {
            break;
        }
        entries.push(restParser(chromName, recordStart, recordEnd, restFields));
    }
    return entries;
};
/**
 * Decodes one uncompressed BigWig data section into
 * { chr, start, end, value } entries restricted to the filter range.
 *
 * The section header supplies the chromosome index, start base, item step,
 * item span, item type and item count. Type 1 items carry start, end and
 * value; type 2 items carry start and value (end = start + span); any other
 * type carries only a value and advances the start by the item step.
 */
function decodeWigData(data, filterStartChromIndex, filterStartBase, filterEndChromIndex, filterEndBase, chromDict) {
    const parser = new BinaryParser_1.BinaryParser(data);
    // Section header, read in file order.
    const sectionChrom = parser.getInt();
    const chromName = chromDict[sectionChrom];
    let itemStart = parser.getInt();
    parser.getInt(); // section end base: every item computes its own end, so it is not needed
    const stepSize = parser.getInt();
    const spanSize = parser.getInt();
    const itemType = parser.getByte();
    parser.getByte(); // byte following the type is read and ignored
    const totalItems = parser.getUShort();
    const results = [];
    if (sectionChrom < filterStartChromIndex || sectionChrom > filterEndChromIndex) {
        return results;
    }
    // From here on the section chromosome is known to be neither below the
    // filter start chromosome nor above the filter end chromosome, so only
    // the equal-chromosome base comparisons can still exclude an item.
    const onFirstChrom = sectionChrom === filterStartChromIndex;
    const onLastChrom = sectionChrom === filterEndChromIndex;
    const hasExplicitStart = itemType === 1 || itemType === 2;
    for (let left = totalItems; left > 0; left--) {
        let itemEnd;
        let itemValue;
        switch (itemType) {
            case 1:
                itemStart = parser.getInt();
                itemEnd = parser.getInt();
                itemValue = parser.getFloat();
                break;
            case 2:
                itemStart = parser.getInt();
                itemValue = parser.getFloat();
                itemEnd = itemStart + spanSize;
                break;
            default:
                itemValue = parser.getFloat();
                itemEnd = itemStart + spanSize;
        }
        if (onLastChrom && itemStart >= filterEndBase) {
            break;
        }
        const endsBeforeRange = onFirstChrom && itemEnd < filterStartBase;
        if (!endsBeforeRange) {
            results.push({ chr: chromName, start: itemStart, end: itemEnd, value: itemValue });
        }
        if (!hasExplicitStart) {
            itemStart += stepSize;
        }
    }
    return results;
}
/**
 * Decodes one uncompressed zoom-level data block into zoom records
 * restricted to the filter range.
 *
 * Each record is eight 4-byte fields: chromosome index, start, end and valid
 * count (ints), followed by min, max, sum and sum of squares (floats).
 * Records lying before the filter range are skipped; decoding stops at the
 * first record at or past its end.
 *
 * @param data      Buffer handed to BinaryParser.
 * @param chromDict Lookup from chromosome index to chromosome name.
 * @returns Array of { chr, start, end, validCount, minVal, maxVal, sumData, sumSquares }.
 */
function decodeZoomData(data, filterStartChromIndex, filterStartBase, filterEndChromIndex, filterEndBase, chromDict) {
    const decodedData = [];
    const binaryParser = new BinaryParser_1.BinaryParser(data);
    const recordSize = 8 * 4;
    // ">=" (not ">"): a block holding exactly one more full record must still
    // yield it, otherwise the final record of every block is silently dropped.
    // NOTE(review): assumes remLength() reports the bytes left to read, as the
    // ">=" guard in decodeBedData also does.
    while (binaryParser.remLength() >= recordSize) {
        const chromIndex = binaryParser.getInt();
        // Property order matters: fields are consumed from the parser in file order.
        const decodedZoomData = {
            chr: chromDict[chromIndex],
            start: binaryParser.getInt(),
            end: binaryParser.getInt(),
            validCount: binaryParser.getInt(),
            minVal: binaryParser.getFloat(),
            maxVal: binaryParser.getFloat(),
            sumData: binaryParser.getFloat(),
            sumSquares: binaryParser.getFloat()
        };
        if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && decodedZoomData.end < filterStartBase)) {
            continue;
        }
        else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && decodedZoomData.start >= filterEndBase)) {
            break;
        }
        decodedData.push(decodedZoomData);
    }
    return decodedData;
}
//# sourceMappingURL=BigWigReader.js.map