genomic-reader
Version:
A TypeScript library for reading BigWig, BigBed, 2bit, and BAM files, with streaming support, for use in the browser or on Node.js.
216 lines • 9.96 kB
JavaScript
"use strict";
/**
 * Downlevel async/await helper (the shape the TypeScript compiler emits): drives a
 * generator function, waiting on every yielded value before resuming it.
 * An already-present `this.__awaiter` is reused instead of defining a new one.
 *
 * @param thisArg    `this` value the generator function is applied with.
 * @param _arguments Arguments the generator function is applied with; `[]` when falsy.
 * @param P          Promise constructor to use; falls back to the global `Promise`.
 * @param generator  Generator function whose yielded values are awaited.
 * @returns A promise resolved with the generator's return value, or rejected with any
 *          error thrown while stepping the generator.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Values that are not already instances of P are wrapped in a P that resolves to them.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; a throw out of the generator rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-throw a rejection inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done, otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// `generator` is rebound from the generator function to the iterator it produces, then started.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.loadSequence = exports.loadOneHotEncodingFromSequence = exports.streamSequence = exports.loadSequenceRecord = exports.loadTwoBitHeaderData = void 0;
const DataLoader_1 = require("../loader/DataLoader");
const BinaryParser_1 = require("../util/BinaryParser");
const BigWigHeaderReader_1 = require("./BigWigHeaderReader");
const stream_1 = require("stream");
// Buffer size passed to BufferedDataLoader while reading the file header and sequence index
// (presumably bytes; confirm against BufferedDataLoader).
const HEADER_BUFFER_SIZE = 32768;
// Buffer size passed to BufferedDataLoader while reading a single sequence record
// (presumably bytes; confirm against BufferedDataLoader).
const BUFFER_SIZE = 3000000;
// Length of the fixed header loaded from offset 0. It is parsed as four getUInt() fields
// (magic, version, sequenceCount, reserved); the sequence index is read starting right after it.
const TWOBIT_HEADER_SIZE = 16;
/**
 * Builds the byte-to-bases decoder for packed 2bit DNA.
 *
 * Each byte holds four bases, two bits per base, most significant pair first.
 * The two-bit codes map to letters in the order "TCAG" (0 = T, 1 = C, 2 = A, 3 = G).
 *
 * @returns {(i: number) => string} Lookup function returning the four-letter string
 *          for a byte value 0..255, and `undefined` for any other index.
 */
function chararray() {
    const CHARMAPPING = "TCAG";
    const BYTE_VALUES = 256;
    const CHARARRAY = [];
    // Exactly one entry per byte value. An extra entry for 256 would decode to
    // "undefinedTTT" because 256 >> 6 is 4, which is outside CHARMAPPING.
    for (let i = 0; i < BYTE_VALUES; ++i) {
        CHARARRAY.push(CHARMAPPING[i >> 6] +
            CHARMAPPING[(i >> 4) & 3] +
            CHARMAPPING[(i >> 2) & 3] +
            CHARMAPPING[i & 3]);
    }
    return (i) => CHARARRAY[i];
}
;
// One-hot rows keyed by base letter. Columns are ordered A, C, G, T; upper- and
// lower-case letters encode identically, and N/n is the all-zero row.
// Every key owns its own row array.
const letters = (() => {
    const columnOrder = "ACGT";
    const table = {};
    for (const symbol of "ACGTNacgtn") {
        const hotColumn = columnOrder.indexOf(symbol.toUpperCase());
        table[symbol] = [0, 1, 2, 3].map((column) => (column === hotColumn ? 1 : 0));
    }
    return table;
})();
// Decoder built once at module load; loadSequence calls it with each packed byte it reads.
const getBases = chararray();
;
/**
 * Reads the fixed header and the sequence index of a 2bit file.
 *
 * @param dataLoaderR  Underlying loader; wrapped in a BufferedDataLoader sized HEADER_BUFFER_SIZE.
 * @param littleEndian Byte order handed to every BinaryParser and echoed in the result.
 * @returns Promise of `{ sequences, littleEndian, fileType }`, where `sequences` maps each
 *          sequence name to the unsigned integer stored next to it in the index
 *          (loadSequenceRecord uses that value as the record's load position).
 * @throws DataLoader_1.FileFormatError when the version or reserved header field is not 0.
 */
function loadTwoBitHeaderData(dataLoaderR, littleEndian) {
    return __awaiter(this, void 0, void 0, function* () {
        const bufferedLoader = new DataLoader_1.BufferedDataLoader(dataLoaderR, HEADER_BUFFER_SIZE);
        const fixedHeader = yield bufferedLoader.load(0, TWOBIT_HEADER_SIZE);
        const headerParser = new BinaryParser_1.BinaryParser(fixedHeader, littleEndian);
        // The first field (magic) is consumed to advance the parser but is not checked here.
        headerParser.getUInt();
        const version = headerParser.getUInt();
        const sequenceCount = headerParser.getUInt();
        const reserved = headerParser.getUInt();
        if (!(version === 0 && reserved === 0)) {
            throw new DataLoader_1.FileFormatError("Unable to determine file type: invalid version or reserved header byte.");
        }
        const header = {
            sequences: {},
            littleEndian: littleEndian,
            fileType: BigWigHeaderReader_1.FileType.TwoBit
        };
        // Each index entry is: one length byte, the name, then one unsigned integer.
        let cursor = TWOBIT_HEADER_SIZE;
        for (let entry = 0; entry < sequenceCount; entry += 1) {
            const lengthBytes = yield bufferedLoader.load(cursor, 4);
            const nameLength = new BinaryParser_1.BinaryParser(lengthBytes, littleEndian).getByte();
            cursor += 1;
            const entryBytes = yield bufferedLoader.load(cursor, nameLength + 4);
            const entryParser = new BinaryParser_1.BinaryParser(entryBytes, littleEndian);
            const name = entryParser.getString(nameLength);
            header.sequences[name] = entryParser.getUInt();
            cursor += nameLength + 4;
        }
        return header;
    });
}
exports.loadTwoBitHeaderData = loadTwoBitHeaderData;
/**
 * Reads the per-sequence record (everything that precedes the packed bases) for one sequence.
 *
 * @param dataLoaderR Underlying loader; wrapped in a BufferedDataLoader sized BUFFER_SIZE.
 * @param header      Object providing `sequences` (name -> record position) and `littleEndian`.
 * @param sequence    Name of the sequence to look up in `header.sequences`.
 * @returns Promise of the record: `dnaSize`, the N-block and mask-block counts with their
 *          start/size arrays, `reserved`, and `offset` (the position right after the record,
 *          which loadSequence uses as the base for reading packed data).
 * @throws DataLoader_1.DataMissingError when `header.sequences[sequence]` is undefined.
 */
function loadSequenceRecord(dataLoaderR, header, sequence) {
    return __awaiter(this, void 0, void 0, function* () {
        const bufferedLoader = new DataLoader_1.BufferedDataLoader(dataLoaderR, BUFFER_SIZE);
        const recordStart = header.sequences[sequence];
        if (recordStart === undefined) {
            throw new DataLoader_1.DataMissingError(sequence);
        }
        const takeUInts = (parser, count) => {
            const values = [];
            for (let k = 0; k < count; k += 1) {
                values.push(parser.getUInt());
            }
            return values;
        };
        // Section 1: dnaSize and the number of N blocks.
        const sizeBytes = yield bufferedLoader.load(recordStart, 8);
        const sizeParser = new BinaryParser_1.BinaryParser(sizeBytes, header.littleEndian);
        const dnaSize = sizeParser.getUInt();
        const nBlockCount = sizeParser.getUInt();
        let cursor = recordStart + 8;
        // Section 2: N-block starts, N-block sizes, then the mask-block count.
        const nSectionLength = nBlockCount * 8 + 4;
        const nBytes = yield bufferedLoader.load(cursor, nSectionLength);
        cursor += nSectionLength;
        const nParser = new BinaryParser_1.BinaryParser(nBytes, header.littleEndian);
        const nBlockStarts = takeUInts(nParser, nBlockCount);
        const nBlockSizes = takeUInts(nParser, nBlockCount);
        const maskBlockCount = nParser.getUInt();
        // Section 3: mask-block starts, mask-block sizes, then the reserved field.
        const maskSectionLength = maskBlockCount * 8 + 4;
        const maskBytes = yield bufferedLoader.load(cursor, maskSectionLength);
        cursor += maskSectionLength;
        const maskParser = new BinaryParser_1.BinaryParser(maskBytes, header.littleEndian);
        const maskBlockStarts = takeUInts(maskParser, maskBlockCount);
        const maskBlockSizes = takeUInts(maskParser, maskBlockCount);
        const reserved = maskParser.getUInt();
        return {
            dnaSize,
            nBlockCount,
            nBlockStarts,
            nBlockSizes,
            maskBlockCount,
            maskBlockStarts,
            maskBlockSizes,
            reserved,
            offset: cursor
        };
    });
}
exports.loadSequenceRecord = loadSequenceRecord;
/**
 * Builds a run of 'N' characters.
 *
 * @param i Requested length; one 'N' is produced for every integer counter value below `i`,
 *          so zero or negative values give an empty string.
 * @returns {string} The run of 'N' characters.
 */
function rn(i) {
    const pieces = [];
    let produced = 0;
    while (produced < i) {
        pieces.push('N');
        produced += 1;
    }
    return pieces.join("");
}
/**
 * Loads the range [start, end] of a sequence chunk by chunk and returns it as a Readable.
 *
 * Coordinates are passed straight through to loadSequence (or
 * loadOneHotEncodingFromSequence), so both ends are inclusive. Every chunk is loaded
 * and pushed before the returned promise resolves; the stream is already ended when
 * the caller receives it.
 *
 * @param dataLoader        Underlying loader; wrapped in a BufferedDataLoader.
 * @param header            File header, forwarded to the chunk loader.
 * @param sequence          Sequence record, forwarded to the chunk loader.
 * @param start             First position to stream.
 * @param end               Last position to stream (inclusive).
 * @param chunkSize         Number of positions requested per chunk (default 1024, minimum 1).
 * @param oneHotEncodedData When true, chunks are one-hot matrices on an objectMode stream;
 *                          otherwise chunks are strings on a utf8-encoded stream.
 * @returns Promise of the filled Readable.
 * @throws RangeError when chunkSize is not at least 1 (the loop could never advance).
 */
function streamSequence(dataLoader, header, sequence, start, end, chunkSize = 1024, oneHotEncodedData = false) {
    return __awaiter(this, void 0, void 0, function* () {
        if (!(chunkSize >= 1)) {
            throw new RangeError("chunkSize must be at least 1.");
        }
        const dataToBuffer = Math.ceil((end - start) / 4) + 1;
        const bufferedLoader = new DataLoader_1.BufferedDataLoader(dataLoader, dataToBuffer, true);
        // Only the stream that is actually returned gets created.
        const output = oneHotEncodedData
            ? new stream_1.Readable({ read() { }, objectMode: true })
            : new stream_1.Readable({ read() { }, encoding: 'utf8' });
        const loadChunk = oneHotEncodedData ? loadOneHotEncodingFromSequence : loadSequence;
        let currentStart = start;
        // `<=` because `end` is inclusive: with `<` the final position was silently dropped
        // whenever (end - start) was a multiple of chunkSize, including start === end.
        while (currentStart <= end) {
            const currentEnd = Math.min(currentStart + chunkSize - 1, end);
            const chunk = yield loadChunk(bufferedLoader, header, sequence, currentStart, currentEnd);
            output.push(chunk);
            currentStart = currentEnd + 1;
        }
        // Signal end-of-stream.
        output.push(null);
        return output;
    });
}
exports.streamSequence = streamSequence;
/**
 * Loads a sequence range via loadSequence and converts it to a one-hot matrix.
 *
 * @param dataLoader Loader forwarded to loadSequence.
 * @param header     File header forwarded to loadSequence.
 * @param sequence   Sequence record forwarded to loadSequence.
 * @param start      Range start forwarded to loadSequence.
 * @param end        Range end forwarded to loadSequence.
 * @returns Promise of an array with one entry per character of the loaded string, each
 *          being the matching row of `letters`. Rows are the table's own arrays (not
 *          copies), so callers must not mutate them; characters missing from the table
 *          yield `undefined`.
 */
function loadOneHotEncodingFromSequence(dataLoader, header, sequence, start, end) {
    return __awaiter(this, void 0, void 0, function* () {
        const bases = yield loadSequence(dataLoader, header, sequence, start, end);
        return Array.from(bases, (base) => letters[base]);
    });
}
exports.loadOneHotEncodingFromSequence = loadOneHotEncodingFromSequence;
/**
 * Collects the blocks that touch the half-open window [rangeStart, rangeEnd).
 * Like the scan it replaces, it stops at the first block starting past rangeEnd,
 * so `starts` is assumed to be in ascending order.
 */
function overlappingBlocks(starts, sizes, rangeStart, rangeEnd) {
    const hits = [];
    for (let i = 0; i < starts.length; ++i) {
        if (starts[i] > rangeEnd)
            break;
        if (starts[i] + sizes[i] < rangeStart)
            continue;
        hits.push({ start: starts[i], size: sizes[i] });
    }
    return hits;
}
/**
 * Rewrites the part of `text` covered by each block, with block coordinates clamped
 * to the window [rangeStart, rangeEnd) that `text` represents.
 */
function rewriteBlocks(text, blocks, rangeStart, rangeEnd, rewrite) {
    let result = text;
    for (const block of blocks) {
        const from = Math.max(block.start, rangeStart) - rangeStart;
        const to = Math.min(block.start + block.size, rangeEnd) - rangeStart;
        // Blocks that merely touch the window cover nothing.
        if (to <= from)
            continue;
        result = result.substring(0, from) + rewrite(result.substring(from, to)) + result.substring(to);
    }
    return result;
}
/**
 * Loads the bases of a sequence between `start` and `end`, applying N blocks and soft-masking.
 *
 * @param dataLoader Object whose `load(position, length)` yields the packed bytes.
 * @param header     File header; only `littleEndian` is read here.
 * @param sequence   Sequence record (see loadSequenceRecord): N-block and mask-block
 *                   start/size arrays plus `offset`, the position of the packed bases.
 * @param start      First base to return, 1-based; values below 1 are treated as 1.
 * @param end        Last base to return, 1-based inclusive.
 * @returns Promise of the sequence string: upper-case bases, 'N' inside N blocks,
 *          lower-case inside mask blocks.
 */
function loadSequence(dataLoader, header, sequence, start, end) {
    return __awaiter(this, void 0, void 0, function* () {
        // 0-based index of the first base; `end` then serves as the exclusive upper bound.
        const first = start - 1 < 0 ? 0 : start - 1;
        const interruptingNBlocks = overlappingBlocks(sequence.nBlockStarts, sequence.nBlockSizes, first, end);
        // Mask blocks are selected by their own coordinates. Selecting them by the N-block
        // arrays made soft-masking depend on where unrelated N blocks happened to lie.
        const interruptingMaskBlocks = overlappingBlocks(sequence.maskBlockStarts, sequence.maskBlockSizes, first, end);
        // Four bases per byte; one extra byte is requested when `first` is not byte-aligned.
        const n = Math.ceil((end - first) / 4 + Math.ceil((first % 4) / 4));
        const data = yield dataLoader.load(Math.floor(first / 4) + sequence.offset, n);
        const binaryParser = new BinaryParser_1.BinaryParser(data, header.littleEndian);
        let csequence = "";
        for (let j = 0; j < n; ++j)
            csequence += getBases(binaryParser.getByte());
        // Trim the leading bases of the first byte and anything past `end`.
        csequence = csequence.substring(first % 4, first % 4 + end - first);
        // N blocks go first so that masked N runs come out as lower-case 'n'.
        csequence = rewriteBlocks(csequence, interruptingNBlocks, first, end, (covered) => rn(covered.length));
        csequence = rewriteBlocks(csequence, interruptingMaskBlocks, first, end, (covered) => covered.toLowerCase());
        return csequence;
    });
}
exports.loadSequence = loadSequence;
//# sourceMappingURL=TwoBitHeaderReader.js.map