UNPKG

genomic-reader

Version:

A TypeScript library for reading BigWig, BigBed, 2bit, and BAM files. Capable of streaming. For use in the browser or on Node.js.

349 lines 16.6 kB
"use strict";
// Compiled CommonJS output: reader for BigWig / BigBed / 2bit files on top of a
// pluggable data loader. The __awaiter helper below is the TypeScript
// generator-based async/await shim and is kept unchanged.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseBigBed = exports.BigWigReader = void 0;
const DataLoader_1 = require("../loader/DataLoader");
const BinaryParser_1 = require("../util/BinaryParser");
const BigWigHeaderReader_1 = require("./BigWigHeaderReader");
const TwoBitHeaderReader_1 = require("./TwoBitHeaderReader");
const pako_1 = require("pako");
const stream_1 = require("stream");

// Magic number expected in the first 4 bytes of an R+ tree index header.
const IDX_MAGIC = 0x2468ACE0;
// Number of bytes read for the index header; the root node starts right after it.
const RPTREE_HEADER_SIZE = 48;
// Leaf item: 4 ints (start chrom/base, end chrom/base) + 2 longs (data offset, data size).
const RPTREE_NODE_LEAF_ITEM_SIZE = 32;
// Child item: 4 ints (start chrom/base, end chrom/base) + 1 long (child node offset).
const RPTREE_NODE_CHILD_ITEM_SIZE = 24;
// Default buffer size handed to BufferedDataLoader.
const DEFAULT_BUFFER_SIZE = 512000;

/**
 * Reads BigWig, BigBed and 2bit data through a data loader.
 *
 * The file header and 2bit sequence records are loaded lazily and cached on
 * the instance.
 */
class BigWigReader {
    /**
     * @param dataLoader Loader used for every read against the underlying file.
     * @param bufferSize Buffer size passed to BufferedDataLoader when reading
     *     index and data blocks. Defaults to DEFAULT_BUFFER_SIZE.
     */
    constructor(dataLoader, bufferSize = DEFAULT_BUFFER_SIZE) {
        this.dataLoader = dataLoader;
        this.bufferSize = bufferSize;
        this.cachedSequenceRecords = {};
    }

    /**
     * @returns Promise resolving to the `fileType` recorded in the header.
     */
    fileType() {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            return header.fileType;
        });
    }

    /**
     * Loads the file header on first use and returns the cached copy afterwards.
     *
     * @returns Promise resolving to the header data.
     */
    getHeader() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.cachedHeader) {
                this.cachedHeader = yield BigWigHeaderReader_1.loadHeaderData(this.dataLoader);
            }
            return this.cachedHeader;
        });
    }

    /**
     * Loads (and caches per chromosome) the 2bit sequence record for `chrom`.
     *
     * @param chrom Chromosome / sequence name.
     * @returns Promise resolving to the sequence record.
     * @throws FileFormatError if the file is not a 2bit file.
     */
    getSequenceRecord(chrom) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (header.fileType !== BigWigHeaderReader_1.FileType.TwoBit) {
                throw new DataLoader_1.FileFormatError("getSequenceRecord is not valid on " + header.fileType + " files.");
            }
            if (!this.cachedSequenceRecords[chrom]) {
                this.cachedSequenceRecords[chrom] = yield TwoBitHeaderReader_1.loadSequenceRecord(this.dataLoader, header, chrom);
            }
            return this.cachedSequenceRecords[chrom];
        });
    }

    /**
     * Reads full-resolution wig records overlapping the given range.
     *
     * @returns Promise resolving to an array of `{ chr, start, end, value }`.
     */
    readBigWigData(startChrom, startBase, endChrom, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.readData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeWigData);
        });
    }

    /**
     * Same query as readBigWigData, delivered through an object-mode Readable.
     *
     * @returns Promise resolving to the stream.
     */
    streamBigWigData(startChrom, startBase, endChrom, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.streamData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeWigData);
        });
    }

    /**
     * Reads bed records overlapping the given range.
     *
     * @param restParser Optional `(chrom, startBase, endBase, rest) => entry`
     *     callback for the free-form part of each record; defaults to parseBigBed.
     * @returns Promise resolving to an array of parsed entries.
     */
    readBigBedData(startChrom, startBase, endChrom, endBase, restParser) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.readData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeBedData(restParser || parseBigBed));
        });
    }

    /**
     * Same query as readBigBedData, delivered through an object-mode Readable.
     *
     * @param restParser Optional record parser; defaults to parseBigBed.
     * @returns Promise resolving to the stream.
     */
    streamBigBedData(startChrom, startBase, endChrom, endBase, restParser) {
        return __awaiter(this, void 0, void 0, function* () {
            return this.streamData(startChrom, startBase, endChrom, endBase, (yield this.getHeader()).common.fullIndexOffset, decodeBedData(restParser || parseBigBed));
        });
    }

    /**
     * Reads 2bit sequence data for `chrom` between `startBase` and `endBase`.
     *
     * @throws FileFormatError (via getSequenceRecord) if the file is not 2bit.
     */
    readTwoBitData(chrom, startBase, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            // getSequenceRecord() awaits getHeader(), so cachedHeader is populated here.
            const sequence = yield this.getSequenceRecord(chrom);
            return TwoBitHeaderReader_1.loadSequence(this.dataLoader, this.cachedHeader, sequence, startBase, endBase);
        });
    }

    /**
     * Like readTwoBitData but delegates to loadOneHotEncodingFromSequence.
     *
     * @throws FileFormatError (via getSequenceRecord) if the file is not 2bit.
     */
    readTwoBitDataMatrix(chrom, startBase, endBase) {
        return __awaiter(this, void 0, void 0, function* () {
            const sequence = yield this.getSequenceRecord(chrom);
            return TwoBitHeaderReader_1.loadOneHotEncodingFromSequence(this.dataLoader, this.cachedHeader, sequence, startBase, endBase);
        });
    }

    /**
     * Streams 2bit sequence data by delegating to streamSequence.
     *
     * @param chunkSize Forwarded to streamSequence (default 1024).
     * @param oneHotEncodedData Forwarded to streamSequence (default false).
     * @throws FileFormatError (via getSequenceRecord) if the file is not 2bit.
     */
    streamTwoBitData(chrom, startBase, endBase, chunkSize = 1024, oneHotEncodedData = false) {
        return __awaiter(this, void 0, void 0, function* () {
            const sequence = yield this.getSequenceRecord(chrom);
            return TwoBitHeaderReader_1.streamSequence(this.dataLoader, this.cachedHeader, sequence, startBase, endBase, chunkSize, oneHotEncodedData);
        });
    }

    /**
     * Reads zoom (summary) records overlapping the given range.
     *
     * @param zoomLevelIndex Key into `header.zoomLevelHeaders`.
     * @returns Promise resolving to an array of zoom records.
     * @throws FileFormatError if the zoom level is not present in the header.
     */
    readZoomData(startChrom, startBase, endChrom, endBase, zoomLevelIndex) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (header.zoomLevelHeaders == null || !(zoomLevelIndex in header.zoomLevelHeaders)) {
                throw new DataLoader_1.FileFormatError("Given zoomLevelIndex not found in zoom level headers.");
            }
            const treeOffset = header.zoomLevelHeaders[zoomLevelIndex].indexOffset;
            return this.readData(startChrom, startBase, endChrom, endBase, treeOffset, decodeZoomData);
        });
    }

    /**
     * Same query as readZoomData, delivered through an object-mode Readable.
     *
     * @param zoomLevelIndex Key into `header.zoomLevelHeaders`.
     * @returns Promise resolving to the stream.
     * @throws FileFormatError if the zoom level is not present in the header.
     */
    streamZoomData(startChrom, startBase, endChrom, endBase, zoomLevelIndex) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (header.zoomLevelHeaders == null || !(zoomLevelIndex in header.zoomLevelHeaders)) {
                throw new DataLoader_1.FileFormatError("Given zoomLevelIndex not found in zoom level headers.");
            }
            const treeOffset = header.zoomLevelHeaders[zoomLevelIndex].indexOffset;
            return this.streamData(startChrom, startBase, endChrom, endBase, treeOffset, decodeZoomData);
        });
    }

    /**
     * Walks the R+ tree at `treeOffset`, loads every overlapping leaf data
     * block, decodes it and hands the decoded records to `loadFunction`.
     *
     * @param treeOffset File offset of the R+ tree index header.
     * @param streamMode Forwarded to BufferedDataLoader.
     * @param decodeFunction `(buffer, startChromIndex, startBase, endChromIndex,
     *     endBase, idToChrom) => records[]`.
     * @param loadFunction Called once per leaf block with the decoded records.
     * @throws FileFormatError if the header has no chromosome tree or the index
     *     magic number does not match.
     * @throws DataMissingError if either chromosome is absent from the chrom tree.
     */
    loadData(startChrom, startBase, endChrom, endBase, treeOffset, streamMode, decodeFunction, loadFunction) {
        return __awaiter(this, void 0, void 0, function* () {
            const header = yield this.getHeader();
            if (header.chromTree == null) {
                throw new DataLoader_1.FileFormatError("No chromosome tree found in file header.");
            }
            const startChromIndex = header.chromTree.chromToId[startChrom];
            const endChromIndex = header.chromTree.chromToId[endChrom];
            if (startChromIndex == null) {
                throw new DataLoader_1.DataMissingError(startChrom);
            }
            if (endChromIndex == null) {
                throw new DataLoader_1.DataMissingError(endChrom);
            }
            const bufferedLoader = new DataLoader_1.BufferedDataLoader(this.dataLoader, this.bufferSize, streamMode);
            // NOTE(review): the magic is parsed with BinaryParser's default byte
            // order, while tree nodes below use header.littleEndian - confirm intended.
            const magic = new BinaryParser_1.BinaryParser(yield bufferedLoader.load(treeOffset, RPTREE_HEADER_SIZE)).getUInt();
            if (IDX_MAGIC !== magic) {
                throw new DataLoader_1.FileFormatError(`R+ tree not found at offset ${treeOffset}`);
            }
            const rootNodeOffset = treeOffset + RPTREE_HEADER_SIZE;
            const leafNodes = yield loadLeafNodesForRPNode(bufferedLoader, header.littleEndian, rootNodeOffset, startChromIndex, startBase, endChromIndex, endBase);
            for (const leafNode of leafNodes) {
                let leafData = new Uint8Array(yield bufferedLoader.load(leafNode.dataOffset, leafNode.dataSize));
                // A positive uncompressBuffSize marks the data blocks as compressed.
                if (header.common.uncompressBuffSize > 0) {
                    leafData = pako_1.inflate(leafData);
                }
                const leafDecodedData = decodeFunction(leafData.buffer, startChromIndex, startBase, endChromIndex, endBase, header.chromTree.idToChrom);
                loadFunction(leafDecodedData);
            }
        });
    }

    /**
     * Runs loadData and collects every decoded record into one array.
     *
     * @returns Promise resolving to the collected records.
     */
    readData(startChrom, startBase, endChrom, endBase, treeOffset, decodeFunction) {
        return __awaiter(this, void 0, void 0, function* () {
            const data = [];
            // Append one by one: spreading a large block into push() can exceed
            // the engine's argument-count limit.
            const load = (d) => {
                for (const item of d) {
                    data.push(item);
                }
            };
            yield this.loadData(startChrom, startBase, endChrom, endBase, treeOffset, false, decodeFunction, load);
            return data;
        });
    }

    /**
     * Runs loadData and pushes every decoded record into an object-mode Readable.
     *
     * Note: the returned promise resolves only after loadData has finished, so
     * the stream is already fully populated and ended when the caller gets it.
     *
     * @returns Promise resolving to the stream.
     */
    streamData(startChrom, startBase, endChrom, endBase, treeOffset, decodeFunction) {
        return __awaiter(this, void 0, void 0, function* () {
            const stream = new stream_1.Readable({ objectMode: true, read() { } });
            const load = (d) => {
                d.forEach((el) => stream.push(el));
            };
            yield this.loadData(startChrom, startBase, endChrom, endBase, treeOffset, true, decodeFunction, load);
            stream.push(null);
            return stream;
        });
    }
}
exports.BigWigReader = BigWigReader;

/**
 * Recursively collects the leaf items under the R+ tree node at `rpNodeOffset`
 * whose chromosome/base range overlaps the query range.
 *
 * Node layout as parsed here: 1 byte leaf flag, 1 skipped byte, an unsigned
 * short item count, then `count` fixed-size items.
 *
 * @returns Promise resolving to an array of
 *     `{ startChrom, startBase, endChrom, endBase, dataOffset, dataSize }`.
 */
function loadLeafNodesForRPNode(bufferedLoader, littleEndian, rpNodeOffset, startChromIndex, startBase, endChromIndex, endBase) {
    return __awaiter(this, void 0, void 0, function* () {
        const nodeHeaderData = yield bufferedLoader.load(rpNodeOffset, 4);
        const nodeHeaderParser = new BinaryParser_1.BinaryParser(nodeHeaderData, littleEndian);
        const isLeaf = 1 === nodeHeaderParser.getByte();
        nodeHeaderParser.position++; // skip the byte between the leaf flag and the count
        const count = nodeHeaderParser.getUShort();
        const nodeDataOffset = rpNodeOffset + 4;
        const bytesRequired = count * (isLeaf ? RPTREE_NODE_LEAF_ITEM_SIZE : RPTREE_NODE_CHILD_ITEM_SIZE);
        const nodeData = yield bufferedLoader.load(nodeDataOffset, bytesRequired);
        const leafNodes = [];
        const nodeDataParser = new BinaryParser_1.BinaryParser(nodeData, littleEndian);
        for (let i = 0; i < count; i++) {
            const nodeStartChr = nodeDataParser.getInt();
            const nodeStartBase = nodeDataParser.getInt();
            const nodeEndChr = nodeDataParser.getInt();
            const nodeEndBase = nodeDataParser.getInt();
            // Range overlap on (chrom, base) pairs, inclusive at both ends.
            const overlaps = ((endChromIndex > nodeStartChr) || (endChromIndex == nodeStartChr && endBase >= nodeStartBase)) &&
                ((startChromIndex < nodeEndChr) || (startChromIndex == nodeEndChr && startBase <= nodeEndBase));
            if (isLeaf) {
                // Always read both longs so the parser stays aligned on the next item.
                const leafNode = {
                    startChrom: nodeStartChr,
                    startBase: nodeStartBase,
                    endChrom: nodeEndChr,
                    endBase: nodeEndBase,
                    dataOffset: nodeDataParser.getLong(),
                    dataSize: nodeDataParser.getLong()
                };
                if (overlaps) {
                    leafNodes.push(leafNode);
                }
            }
            else {
                const childOffset = nodeDataParser.getLong();
                if (overlaps) {
                    const childLeaves = yield loadLeafNodesForRPNode(bufferedLoader, littleEndian, childOffset, startChromIndex, startBase, endChromIndex, endBase);
                    // Loop instead of push(...spread) to stay clear of argument-count limits.
                    for (const childLeaf of childLeaves) {
                        leafNodes.push(childLeaf);
                    }
                }
            }
        }
        return leafNodes;
    });
}

/**
 * Default parser for the tab-separated remainder of a bed record.
 *
 * Fields are consumed positionally: name, score, strand, cdStart, cdEnd,
 * color, exon count, exon sizes, exon starts. Color values "." and "0" are
 * ignored; a comma-separated color without an "rgb" prefix is wrapped in
 * "rgb(...)". Exon starts are offsets relative to `startBase`.
 *
 * @param chrom Chromosome name.
 * @param startBase Record start.
 * @param endBase Record end.
 * @param rest Tab-separated remainder of the record.
 * @returns Entry with `chr`, `start`, `end` and whichever optional fields were present.
 */
function parseBigBed(chrom, startBase, endBase, rest) {
    const entry = { chr: chrom, start: startBase, end: endBase };
    const tokens = rest.split("\t");
    if (tokens.length > 0) {
        entry.name = tokens[0];
    }
    if (tokens.length > 1) {
        entry.score = parseFloat(tokens[1]);
    }
    if (tokens.length > 2) {
        entry.strand = tokens[2];
    }
    if (tokens.length > 3) {
        entry.cdStart = parseInt(tokens[3], 10);
    }
    if (tokens.length > 4) {
        entry.cdEnd = parseInt(tokens[4], 10);
    }
    if (tokens.length > 5 && tokens[5] !== "." && tokens[5] !== "0") {
        let color;
        if (tokens[5].includes(",")) {
            color = tokens[5].startsWith("rgb") ? tokens[5] : "rgb(" + tokens[5] + ")";
        }
        else {
            color = tokens[5];
        }
        entry.color = color;
    }
    if (tokens.length > 8) {
        const exonCount = parseInt(tokens[6], 10);
        const exonSizes = tokens[7].split(',');
        const exonStarts = tokens[8].split(',');
        const exons = [];
        for (let i = 0; i < exonCount; i++) {
            const eStart = startBase + parseInt(exonStarts[i], 10);
            const eEnd = eStart + parseInt(exonSizes[i], 10);
            exons.push({ start: eStart, end: eEnd });
        }
        entry.exons = exons;
    }
    return entry;
}
exports.parseBigBed = parseBigBed;

/**
 * Builds a decoder for a block of bed records.
 *
 * Each record is parsed as three ints (chrom index, start, end) followed by a
 * string. Records before the filter range are skipped; decoding stops at the
 * first record past the end of the filter range.
 *
 * @param restParser `(chrom, startBase, endBase, rest) => entry`.
 * @returns Decoder with the signature expected by BigWigReader.loadData.
 */
const decodeBedData = (restParser) => (data, filterStartChromIndex, filterStartBase, filterEndChromIndex, filterEndBase, chromDict) => {
    const decodedData = [];
    const binaryParser = new BinaryParser_1.BinaryParser(data);
    // Three 4-byte ints plus at least one byte for the trailing string.
    const minSize = 3 * 4 + 1;
    while (binaryParser.remLength() >= minSize) {
        const chromIndex = binaryParser.getInt();
        const chrom = chromDict[chromIndex];
        const startBase = binaryParser.getInt();
        const endBase = binaryParser.getInt();
        const rest = binaryParser.getString();
        if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase)) {
            continue;
        }
        else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
            break;
        }
        const entry = restParser(chrom, startBase, endBase, rest);
        decodedData.push(entry);
    }
    return decodedData;
};

/**
 * Decodes one block of wig data into `{ chr, start, end, value }` records,
 * keeping only those inside the filter range.
 *
 * The block starts with a section header (chrom index, start, end, item step,
 * item span, type byte, reserved byte, unsigned short item count). Per-item
 * layout depends on `type`:
 *   1 - start, end, value
 *   2 - start, value (end = start + itemSpan)
 *   otherwise - value only (start advances by itemStep, end = start + itemSpan)
 *
 * @returns Array of decoded records; empty when the block's chromosome is
 *     outside the filter range.
 */
function decodeWigData(data, filterStartChromIndex, filterStartBase, filterEndChromIndex, filterEndBase, chromDict) {
    const decodedData = [];
    const binaryParser = new BinaryParser_1.BinaryParser(data);
    const chromIndex = binaryParser.getInt();
    const chrom = chromDict[chromIndex];
    let startBase = binaryParser.getInt();
    let endBase = binaryParser.getInt();
    const itemStep = binaryParser.getInt();
    const itemSpan = binaryParser.getInt();
    const type = binaryParser.getByte();
    binaryParser.getByte(); // reserved byte: value unused, read only to advance the parser
    let itemCount = binaryParser.getUShort();
    if (chromIndex < filterStartChromIndex || chromIndex > filterEndChromIndex) {
        // Operand must stay on the `return` line; a line break here would
        // return undefined through automatic semicolon insertion.
        return decodedData;
    }
    while (itemCount-- > 0) {
        let value;
        if (1 === type) {
            startBase = binaryParser.getInt();
            endBase = binaryParser.getInt();
            value = binaryParser.getFloat();
        }
        else if (2 === type) {
            startBase = binaryParser.getInt();
            value = binaryParser.getFloat();
            endBase = startBase + itemSpan;
        }
        else {
            value = binaryParser.getFloat();
            endBase = startBase + itemSpan;
        }
        if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && startBase >= filterEndBase)) {
            break;
        }
        else if (!(chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && endBase < filterStartBase))) {
            decodedData.push({
                chr: chrom,
                start: startBase,
                end: endBase,
                value: value
            });
        }
        if (1 !== type && 2 !== type) {
            startBase += itemStep;
        }
    }
    return decodedData;
}

/**
 * Decodes one block of zoom records, keeping only those inside the filter range.
 *
 * Each record is 32 bytes: chrom index, start, end, validCount (ints) followed
 * by minVal, maxVal, sumData, sumSquares (floats). Records before the filter
 * range are skipped; decoding stops at the first record past its end.
 *
 * @returns Array of
 *     `{ chr, start, end, validCount, minVal, maxVal, sumData, sumSquares }`.
 */
function decodeZoomData(data, filterStartChromIndex, filterStartBase, filterEndChromIndex, filterEndBase, chromDict) {
    const decodedData = [];
    const binaryParser = new BinaryParser_1.BinaryParser(data);
    const minSize = 8 * 4;
    // `>=` so a final record that exactly fills the remaining bytes is decoded too.
    while (binaryParser.remLength() >= minSize) {
        const chromIndex = binaryParser.getInt();
        const decodedZoomData = {
            chr: chromDict[chromIndex],
            start: binaryParser.getInt(),
            end: binaryParser.getInt(),
            validCount: binaryParser.getInt(),
            minVal: binaryParser.getFloat(),
            maxVal: binaryParser.getFloat(),
            sumData: binaryParser.getFloat(),
            sumSquares: binaryParser.getFloat()
        };
        if (chromIndex < filterStartChromIndex || (chromIndex === filterStartChromIndex && decodedZoomData.end < filterStartBase)) {
            continue;
        }
        else if (chromIndex > filterEndChromIndex || (chromIndex === filterEndChromIndex && decodedZoomData.start >= filterEndBase)) {
            break;
        }
        decodedData.push(decodedZoomData);
    }
    return decodedData;
}
//# sourceMappingURL=BigWigReader.js.map