UNPKG

genomic-reader

Version:

A TypeScript library for reading BigWig, BigBed, 2bit, and BAM files. Capable of streaming. For use in the browser or on Node.js.

216 lines 9.96 kB
"use strict";
// Compiler-emitted helper that drives a generator function as a promise chain
// (the down-levelled form of async/await). Kept as emitted.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.loadSequence = exports.loadOneHotEncodingFromSequence = exports.streamSequence = exports.loadSequenceRecord = exports.loadTwoBitHeaderData = void 0;
const DataLoader_1 = require("../loader/DataLoader");
const BinaryParser_1 = require("../util/BinaryParser");
const BigWigHeaderReader_1 = require("./BigWigHeaderReader");
const stream_1 = require("stream");

// Buffer size handed to the BufferedDataLoader used while reading the file header and sequence index.
const HEADER_BUFFER_SIZE = 32768;
// Buffer size handed to the BufferedDataLoader used while reading a single sequence record.
const BUFFER_SIZE = 3000000;
// Bytes occupied by the four unsigned ints at the start of the file (magic, version, sequence count, reserved).
const TWOBIT_HEADER_SIZE = 16;

/**
 * Builds the byte -> four-letter lookup used to unpack 2-bit encoded bases.
 *
 * Each byte holds four 2-bit codes, most significant pair first; the codes
 * 0, 1, 2, 3 map to "T", "C", "A", "G" respectively.
 *
 * @returns {function(number): string} lookup returning the four letters for a byte value (0-255).
 */
function chararray() {
    const CHARMAPPING = "TCAG";
    const CHARARRAY = [];
    // A byte has exactly 256 values; the previous `<= 256` bound produced a
    // bogus 257th entry ("undefinedTTT") because CHARMAPPING[4] is undefined.
    for (let i = 0; i < 256; ++i) {
        CHARARRAY.push(CHARMAPPING[i >> 6] +
            CHARMAPPING[(i >> 4) & 3] +
            CHARMAPPING[(i >> 2) & 3] +
            CHARMAPPING[i & 3]);
    }
    return (i) => CHARARRAY[i];
}

// One-hot rows in A, C, G, T column order; N/n map to the all-zero row.
// Upper- and lower-case letters share the same encoding.
const letters = {
    A: [1, 0, 0, 0], C: [0, 1, 0, 0], G: [0, 0, 1, 0], T: [0, 0, 0, 1], N: [0, 0, 0, 0],
    a: [1, 0, 0, 0], c: [0, 1, 0, 0], g: [0, 0, 1, 0], t: [0, 0, 0, 1], n: [0, 0, 0, 0]
};
const getBases = chararray();

/**
 * Reads the fixed-size file header and the sequence index of a 2bit file.
 *
 * The magic number is consumed but not validated here; `littleEndian` is
 * supplied by the caller and passed straight to the binary parser.
 *
 * @param dataLoaderR loader for the underlying file; wrapped in a BufferedDataLoader.
 * @param {boolean} littleEndian byte order used to parse the file.
 * @returns {Promise<object>} header `{ sequences, littleEndian, fileType }`, where
 *   `sequences` maps each sequence name to the unsigned int stored after it in
 *   the index (used by loadSequenceRecord as that record's file offset).
 * @throws FileFormatError if the version or reserved field is non-zero.
 */
function loadTwoBitHeaderData(dataLoaderR, littleEndian) {
    return __awaiter(this, void 0, void 0, function* () {
        const dataLoader = new DataLoader_1.BufferedDataLoader(dataLoaderR, HEADER_BUFFER_SIZE);
        const headerData = yield dataLoader.load(0, TWOBIT_HEADER_SIZE);
        const headerParser = new BinaryParser_1.BinaryParser(headerData, littleEndian);
        headerParser.getUInt(); // magic number: read only to advance the parser
        const version = headerParser.getUInt();
        const sequenceCount = headerParser.getUInt();
        const reserved = headerParser.getUInt();
        if (version !== 0 || reserved !== 0)
            throw new DataLoader_1.FileFormatError("Unable to determine file type: invalid version or reserved header byte.");
        const header = {
            sequences: {},
            littleEndian: littleEndian,
            fileType: BigWigHeaderReader_1.FileType.TwoBit
        };
        let offset = TWOBIT_HEADER_SIZE;
        for (let i = 0; i < sequenceCount; ++i) {
            // Each index entry is: 1-byte name length, the name, then a 4-byte unsigned int.
            const sizeData = yield dataLoader.load(offset, 4);
            const size = new BinaryParser_1.BinaryParser(sizeData, littleEndian).getByte();
            offset += 1;
            const entryData = yield dataLoader.load(offset, size + 4);
            const entryParser = new BinaryParser_1.BinaryParser(entryData, littleEndian);
            const name = entryParser.getString(size);
            header.sequences[name] = entryParser.getUInt();
            offset += size + 4;
        }
        return header;
    });
}
exports.loadTwoBitHeaderData = loadTwoBitHeaderData;

/**
 * Reads the record header of one sequence: its size, N-block table and
 * mask-block table.
 *
 * @param dataLoaderR loader for the underlying file; wrapped in a BufferedDataLoader.
 * @param header header object as returned by loadTwoBitHeaderData.
 * @param {string} sequence name of the sequence to look up in `header.sequences`.
 * @returns {Promise<object>} record with `dnaSize`, `nBlockCount`, `nBlockStarts`,
 *   `nBlockSizes`, `maskBlockCount`, `maskBlockStarts`, `maskBlockSizes`,
 *   `reserved`, and `offset` (file position immediately after the record header,
 *   which loadSequence uses as the start of the packed bases).
 * @throws DataMissingError if `sequence` is not present in the header.
 */
function loadSequenceRecord(dataLoaderR, header, sequence) {
    return __awaiter(this, void 0, void 0, function* () {
        const dataLoader = new DataLoader_1.BufferedDataLoader(dataLoaderR, BUFFER_SIZE);
        const recordStart = header.sequences[sequence];
        if (recordStart === undefined)
            throw new DataLoader_1.DataMissingError(sequence);
        let data = yield dataLoader.load(recordStart, 8);
        let binaryParser = new BinaryParser_1.BinaryParser(data, header.littleEndian);
        let offset = recordStart + 8;
        const r = {
            dnaSize: binaryParser.getUInt(),
            nBlockCount: binaryParser.getUInt(),
            nBlockStarts: [],
            nBlockSizes: [],
            maskBlockCount: 0,
            maskBlockStarts: [],
            maskBlockSizes: [],
            reserved: 0,
            offset: 0
        };
        // N-block table (all starts, then all sizes) plus the trailing mask-block count.
        const nTableBytes = r.nBlockCount * 8 + 4;
        data = yield dataLoader.load(offset, nTableBytes);
        offset += nTableBytes;
        binaryParser = new BinaryParser_1.BinaryParser(data, header.littleEndian);
        for (let i = 0; i < r.nBlockCount; ++i)
            r.nBlockStarts.push(binaryParser.getUInt());
        for (let i = 0; i < r.nBlockCount; ++i)
            r.nBlockSizes.push(binaryParser.getUInt());
        r.maskBlockCount = binaryParser.getUInt();
        // Mask-block table (all starts, then all sizes) plus the trailing reserved field.
        const maskTableBytes = r.maskBlockCount * 8 + 4;
        data = yield dataLoader.load(offset, maskTableBytes);
        offset += maskTableBytes;
        binaryParser = new BinaryParser_1.BinaryParser(data, header.littleEndian);
        for (let i = 0; i < r.maskBlockCount; ++i)
            r.maskBlockStarts.push(binaryParser.getUInt());
        for (let i = 0; i < r.maskBlockCount; ++i)
            r.maskBlockSizes.push(binaryParser.getUInt());
        r.reserved = binaryParser.getUInt();
        r.offset = offset;
        return r;
    });
}
exports.loadSequenceRecord = loadSequenceRecord;

/**
 * Returns a string of `i` 'N' characters (empty for non-positive `i`).
 */
function rn(i) {
    return "N".repeat(Math.max(0, i));
}

/**
 * Streams a region of a sequence in chunks, either as text or as one-hot matrices.
 *
 * Every chunk is loaded and pushed before the returned promise resolves; the
 * stream handed back has already been ended with `push(null)`.
 *
 * @param dataLoader loader for the underlying file.
 * @param header header object as returned by loadTwoBitHeaderData.
 * @param sequence sequence record as returned by loadSequenceRecord.
 * @param {number} start first position of the region (passed through to loadSequence).
 * @param {number} end last position of the region (passed through to loadSequence).
 * @param {number} [chunkSize=1024] number of positions requested per chunk.
 * @param {boolean} [oneHotEncodedData=false] when true, return an object-mode stream
 *   of one-hot matrices; otherwise a utf8 stream of sequence text.
 * @returns {Promise<Readable>} the populated stream.
 */
function streamSequence(dataLoader, header, sequence, start, end, chunkSize = 1024, oneHotEncodedData = false) {
    return __awaiter(this, void 0, void 0, function* () {
        const dataToBuffer = Math.ceil((end - start) / 4) + 1;
        const bufferedLoader = new DataLoader_1.BufferedDataLoader(dataLoader, dataToBuffer, true);
        const stream = new stream_1.Readable({ read() { }, encoding: 'utf8' });
        const matrixStream = new stream_1.Readable({ read() { }, objectMode: true });
        let currentStart = start;
        // NOTE(review): loadSequence treats (start, end) as 1-based inclusive, so this
        // `<` looks like it skips the final base whenever currentStart === end
        // (e.g. start=1, end=11, chunkSize=10). Left unchanged pending confirmation
        // of the intended coordinate convention.
        while (currentStart < end) {
            let currentEnd = currentStart + chunkSize - 1;
            if (currentEnd >= end)
                currentEnd = end;
            if (oneHotEncodedData) {
                const matrix = yield loadOneHotEncodingFromSequence(bufferedLoader, header, sequence, currentStart, currentEnd);
                matrixStream.push(matrix);
            }
            else {
                const seq = yield loadSequence(bufferedLoader, header, sequence, currentStart, currentEnd);
                stream.push(seq);
            }
            currentStart = currentEnd + 1;
        }
        if (oneHotEncodedData) {
            matrixStream.push(null);
            return matrixStream;
        }
        stream.push(null);
        return stream;
    });
}
exports.streamSequence = streamSequence;

/**
 * Loads a region with loadSequence and converts it to a one-hot matrix.
 *
 * @returns {Promise<number[][]>} one row per returned character, taken from
 *   `letters` (columns A, C, G, T; N/n is all zeros). Rows are the shared
 *   arrays held in `letters`, not copies.
 */
function loadOneHotEncodingFromSequence(dataLoader, header, sequence, start, end) {
    return __awaiter(this, void 0, void 0, function* () {
        const seq = yield loadSequence(dataLoader, header, sequence, start, end);
        const matrix = [];
        for (const c of seq) {
            matrix.push(letters[c]);
        }
        return matrix;
    });
}
exports.loadOneHotEncodingFromSequence = loadOneHotEncodingFromSequence;

/**
 * Selects the blocks from a (starts, sizes) table that touch the window [begin, end].
 * Stops at the first block starting after `end`, so the table is assumed to be
 * ordered by start position.
 */
function collectInterruptingBlocks(starts, sizes, begin, end) {
    const blocks = [];
    for (let i = 0; i < starts.length; ++i) {
        if (starts[i] > end)
            break;
        if (starts[i] + sizes[i] < begin)
            continue;
        blocks.push({ start: starts[i], size: sizes[i] });
    }
    return blocks;
}

/**
 * Clamps a block to the window [begin, end) and returns its [from, to) span as
 * offsets into the string that starts at `begin`. Guarantees 0 <= from <= to.
 */
function clipBlockToWindow(block, begin, end) {
    const from = Math.max(block.start, begin) - begin;
    const to = Math.max(from, Math.min(block.start + block.size, end) - begin);
    return [from, to];
}

/**
 * Loads the bases of a region, applying the record's N blocks and mask blocks.
 *
 * `start` is decremented by one (floored at 0) before use and `end` is used as
 * an exclusive 0-based bound, so the call returns `end - (start - 1)` characters
 * for `start >= 1`; `start = 0` behaves like `start = 1`.
 *
 * @param dataLoader loader exposing `load(offset, length)`.
 * @param header header object; only `littleEndian` is read here.
 * @param sequence sequence record as returned by loadSequenceRecord.
 * @param {number} start first position of the region.
 * @param {number} end last position of the region.
 * @returns {Promise<string>} the bases, with N-block positions replaced by 'N'
 *   and mask-block positions lower-cased.
 */
function loadSequence(dataLoader, header, sequence, start, end) {
    return __awaiter(this, void 0, void 0, function* () {
        const begin = start - 1 < 0 ? 0 : start - 1;
        const interruptingNBlocks = collectInterruptingBlocks(sequence.nBlockStarts, sequence.nBlockSizes, begin, end);
        // The mask table must be filtered by its own starts/sizes. Previously this
        // loop tested nBlockStarts/nBlockSizes, so an unrelated N block could cut
        // the scan short and leave overlapping mask blocks unapplied.
        const interruptingMaskBlocks = collectInterruptingBlocks(sequence.maskBlockStarts, sequence.maskBlockSizes, begin, end);
        // Four bases per byte; read one extra byte when the window does not start on a byte boundary.
        const n = Math.ceil((end - begin) / 4 + Math.ceil((begin % 4) / 4));
        const data = yield dataLoader.load(Math.floor(begin / 4) + sequence.offset, n);
        const binaryParser = new BinaryParser_1.BinaryParser(data, header.littleEndian);
        let csequence = "";
        for (let j = 0; j < n; ++j)
            csequence += getBases(binaryParser.getByte());
        // Drop the leading bases of the first byte that precede `begin`, and any trailing surplus.
        csequence = csequence.substring(begin % 4, begin % 4 + end - begin);
        for (const block of interruptingNBlocks) {
            const [from, to] = clipBlockToWindow(block, begin, end);
            csequence = csequence.substring(0, from) + rn(to - from) + csequence.substring(to);
        }
        for (const block of interruptingMaskBlocks) {
            const [from, to] = clipBlockToWindow(block, begin, end);
            csequence = csequence.substring(0, from) + csequence.substring(from, to).toLowerCase() + csequence.substring(to);
        }
        return csequence;
    });
}
exports.loadSequence = loadSequence;
//# sourceMappingURL=TwoBitHeaderReader.js.map