fastareader
Version:
FASTA DNA/RNA sequence reader
420 lines (335 loc) • 9.83 kB
JavaScript
var fs = require('fs');
var pth = require('path');
var AP = require('argparser');
var dna = require('dna');
/**
 * Print the command line help to stderr.
 * Each entry of the table below is written with its own console.error call,
 * so entries ending in '\n' are followed by an empty line.
 */
function usage() {
  var cmd = 'fastareader';
  var helpLines = [
    '[synopsis]',
    '\t[to get information json file]',
    '\t' + cmd + ' <fasta file>\n',
    '\t[to get sequences]',
    '\t' + cmd + ' <fasta file> <rname> <pos> <length>\n',
    '[options]',
    '\t--json|-j <json file>\t# import json summary file of the fasta file.\n',
    '\t--compl|-c\t# shows complementary strand of the result (5\' -> 3\').\n',
    '\t<rname>\tsequence id of the fasta file to get.\n',
    '\t<pos>\tstart position of the fasta file to get.',
    '\t<length>\tlength of the fasta file to get.',
    '[notice]',
    '\t1-based coordinate system'
  ];
  for (var k = 0; k < helpLines.length; k++) {
    console.error(helpLines[k]);
  }
}
/**
 * Command line entry point.
 *
 * With only a FASTA path, prints a JSON summary ({result, Ns}) to stdout.
 * With <rname> <pos> <length> as well, prints the fetched sequence.
 * Any other combination of positional arguments prints the usage text.
 *
 * @param array args : argv-style array; its first two entries are removed
 *                     (the array is modified in place)
 */
function main(args) {
  // discard the first two argv entries before option parsing
  args.shift();
  args.shift();

  var p;
  try {
    p = require('argparser')
      .nonvals('c', 'compl')
      .files(0, 'json', 'j')
      .parse(args);
  }
  catch (e) {
    console.error(e.message);
    usage();
    return;
  }

  var fpath = p.arg(0);

  // optional pre-computed summary given with --json / -j
  var json = p.opt('json', 'j');
  if (json) {
    json = JSON.parse(fs.readFileSync(json, "utf8"));
  }

  var fastas = new FASTAReader(fpath, json);

  var seq_id = p.arg(1);
  var start = p.arg(2);
  var length = p.arg(3);
  var rev = p.opt('compl', 'c');

  var allGiven = seq_id && start && length;
  var noneGiven = !seq_id && !start && !length;

  // get a sequence
  if (allGiven) {
    console.log(fastas.fetch(seq_id, start, length, rev));
    return;
  }

  // invalid args: only some of <rname> <pos> <length> were given
  if (!noneGiven) {
    usage();
    return;
  }

  // get JSON; fpath is removed from every entry before serializing
  Object.keys(fastas.result).forEach(function(chr) {
    delete fastas.result[chr].fpath;
  });
  console.log(JSON.stringify({result:fastas.result, Ns:fastas.Ns}));
}
/**
 * FASTAReader
 * Opens a FASTA file and prepares its per-sequence summary.
 *
 * The summary comes from, in this order:
 *   1. the json argument,
 *   2. the module/file loaded by require(fpath + '.json'), if that succeeds,
 *   3. a full scan of the file with fparse().
 *
 * @param string fpath : path of the FASTA file (opened synchronously, read only)
 * @param object json  : optional summary of the form {result: {...}, Ns: {...}}
 */
function FASTAReader(fpath, json) {
  this.fpath = fpath;
  this.fd = fs.openSync(fpath, 'r');

  var summary = json;
  if (!summary) {
    try {
      summary = require(fpath + '.json');
    }
    catch (e) {
      // no loadable summary next to the FASTA file: fall through to parsing
    }
  }

  if (!summary) {
    var parsed = fparse(fpath);
    this.result = parsed[0];
    this.Ns = parsed[1];
    return;
  }

  // wrap every plain summary entry in a FASTA object bound to this file
  var units = {};
  var names = Object.keys(summary.result);
  for (var k = 0; k < names.length; k++) {
    units[names[k]] = new FASTA(summary.result[names[k]], fpath);
  }
  this.result = units;
  this.Ns = summary.Ns;
}
/**
 * FASTAReader.create
 * Factory form of the constructor.
 * @param string fpath : path of the FASTA file
 * @param object json  : optional summary object
 * @return FASTAReader
 */
FASTAReader.create = function(fpath, json) {
  var reader = new FASTAReader(fpath, json);
  return reader;
};
/**
 * Look up the summary entry of one sequence.
 * @param string id : sequence id (rname)
 * @return object : the entry stored in this.result under id
 * @throws Error when no truthy entry is stored under id
 */
FASTAReader.prototype.getResult = function(id) {
  var found = this.result[id];
  if (!found) {
    throw new Error('['+ id +']: No such rname.');
  }
  return found;
};
/**
 * Close the file descriptor opened by the constructor.
 */
FASTAReader.prototype.close = function() {
  var descriptor = this.fd;
  fs.closeSync(descriptor);
};
/**
 * Fetch part of one sequence through this reader's open file descriptor.
 * @param string  id      : sequence id (rname)
 * @param number  start   : start position
 * @param number  length  : number of bases
 * @param boolean inverse : passed through to the entry's fetch()
 * @return string : whatever the entry's fetch() returns
 * @throws Error when id is unknown (see getResult)
 */
FASTAReader.prototype.fetch = function(id, start, length, inverse) {
  return this.getResult(id).fetch(start, length, this.fd, inverse);
};
/**
 * Fetch a sequence described by a location format.
 *
 * An object argument is first converted with dna.getFormat(); the result is
 * parsed with dna.parseFormat(format, true) and its chr / start / end / strand
 * fields drive the fetch. The exact format syntax is defined by the dna
 * module, not here.
 *
 * @param string|object format : location descriptor
 * @return string : fetched sequence; the inverse flag is set when strand is "-"
 */
FASTAReader.prototype.fetchByFormat = function(format) {
  var descriptor = (typeof format == 'object') ? dna.getFormat(format) : format;
  var location = dna.parseFormat(descriptor, true);
  var range = dna.getPosLen(location.start, location.end);
  var position = range[0];
  var span = range[1];
  var isMinus = location.strand == "-";
  return this.fetch(location.chr, position, span, isMinus);
};
/**
 * @param string id : sequence id (rname)
 * @return number : result of getStartIndex() on that sequence's entry
 * @throws Error when id is unknown
 */
FASTAReader.prototype.getStartIndex = function(id) {
  return this.getResult(id).getStartIndex();
};
/**
 * @param string id : sequence id (rname)
 * @return number : result of getEndIndex() on that sequence's entry
 * @throws Error when id is unknown
 */
FASTAReader.prototype.getEndIndex = function(id) {
  return this.getResult(id).getEndIndex();
};
/**
 * @param string id : sequence id (rname)
 * @return number : result of getEndPos() on that sequence's entry
 * @throws Error when id is unknown
 */
FASTAReader.prototype.getEndPos = function(id) {
  return this.getResult(id).getEndPos();
};
/**
 * @param string id  : sequence id (rname)
 * @param number pos : base position inside that sequence
 * @return number : result of getIndex(pos) on that sequence's entry
 * @throws Error when id is unknown
 */
FASTAReader.prototype.getIndex = function(id, pos) {
  return this.getResult(id).getIndex(pos);
};
/**
 * Tell whether the requested range may overlap a recorded run of N.
 *
 * this.Ns[id] is read as a list of {start, end} runs expressed as file
 * indexes. The stretches between consecutive runs (and between the sequence
 * boundaries and the first / last run) are treated as N-free gaps. The first
 * gap that does not end at or before the start index decides the answer: the
 * range is N-free only when it lies strictly inside that gap, so a range
 * touching a gap boundary is reported as containing N.
 *
 * @param string id     : sequence id (rname)
 * @param number start  : start position
 * @param number length : number of bases
 * @return boolean : false when no run is recorded or the range sits strictly
 *                   inside one gap, true otherwise
 * @throws Error when id is unknown (same error as the other accessors)
 */
FASTAReader.prototype.hasN = function(id, start, length) {
  // use the shared lookup so an unknown id gives the "No such rname" error
  // instead of a TypeError raised deep inside the index helpers
  var unit = this.getResult(id);

  var ns = this.Ns[id];
  if (!ns || ns.length == 0) {
    return false;
  }

  var startIdx = fgetIndex(unit, start);
  var endIdx = Math.min(fgetIndex(unit, Number(start) + Number(length)), fendIndex(unit));

  // gap i lies between run i-1 and run i; gap ns.length follows the last run
  for (var i = 0; i <= ns.length; i++) {
    var gapStart = (i >= 1) ? ns[i-1].end : unit.start;
    var gapEnd = (i == ns.length) ? unit.start + unit.length : ns[i].start;
    if (gapEnd <= startIdx) {
      // this gap ends before the requested range begins
      continue;
    }
    return !(gapStart < startIdx && endIdx < gapEnd);
  }
  return false;
};
/**
 * FASTA
 * One sequence entry of a FASTA file.
 * Copies id, desc, start, length and linelen from unit (other properties of
 * unit are ignored) and remembers the path of the file.
 *
 * @param object unit  : summary entry
 * @param string fpath : path of the FASTA file the entry belongs to
 */
function FASTA(unit, fpath) {
  var self = this;
  var copied = ['id', 'desc', 'start', 'length', 'linelen'];
  copied.forEach(function(field) {
    self[field] = unit[field];
  });
  self.fpath = fpath;
}
/**
 * @param number pos : base position inside this sequence
 * @return number : fgetIndex() of this entry at pos
 */
FASTA.prototype.getIndex = function(pos) {
  var index = fgetIndex(this, pos);
  return index;
};
/**
 * @return number : fstartIndex() of this entry
 */
FASTA.prototype.getStartIndex = function() {
  var index = fstartIndex(this);
  return index;
};
/**
 * @return number : idlen() of this entry
 */
FASTA.prototype.idlen = function() {
  var headerLength = idlen(this);
  return headerLength;
};
/**
 * @return number : fendIndex() of this entry
 */
FASTA.prototype.getEndIndex = function() {
  var index = fendIndex(this);
  return index;
};
/**
 * Fetch part of this sequence by delegating to ffetch().
 * @param number  start   : start position
 * @param number  length  : number of bases
 * @param number  fd      : file descriptor to read from
 * @param boolean inverse : passed through to ffetch()
 * @return string : whatever ffetch() returns
 */
FASTA.prototype.fetch = function(start, length, fd, inverse) {
  var sequence = ffetch(this.fpath, this, start, length, fd, inverse);
  return sequence;
};
/**
 * @return number : fendPos() of this entry
 */
FASTA.prototype.getEndPos = function() {
  var position = fendPos(this);
  return position;
};
/* FASTA function implementation (be static) */
/**
 * Convert a base position inside one entry to an index in the file.
 * @param object unit : entry with id, desc, linelen and start
 * @param number pos  : base position
 * @return number : pos2index() relative to the entry, shifted by unit.start
 */
function fgetIndex(unit, pos) {
  var headerLength = idlen(unit);
  var relativeIndex = pos2index(pos, headerLength, unit.linelen);
  return relativeIndex + Number(unit.start);
}
/**
 * Length used for the header part of an entry.
 * Computed as id length + 2, plus 1 + description length when a description
 * is present (desc neither null nor undefined).
 * @param object unit : entry with id and desc
 * @return number
 */
function idlen(unit) {
  var total = unit.id.length + 2;
  if (unit.desc == null) {
    return total + 0;
  }
  return total + (1 + unit.desc.length);
}
/**
 * Length of the id plus 2; the description is not taken into account.
 * @param object unit : entry with id
 * @return number
 */
function id_desc_len(unit) {
  var idLength = unit.id.length;
  return idLength + 2;
}
/**
 * Index just past the header of an entry: idlen(unit) added to unit.start.
 * @param object unit : entry with id, desc and start
 * @return number
 */
function fstartIndex(unit) {
  var headerLength = idlen(unit);
  var entryOffset = Number(unit.start);
  return headerLength + entryOffset;
}
/**
 * End index of an entry: unit.length added to the numeric value of unit.start.
 * @param object unit : entry with start and length
 * @return number
 */
function fendIndex(unit) {
  var entryOffset = Number(unit.start);
  return unit.length + entryOffset;
}
/**
 * Base position computed by idx2pos() for the index unit.length - 2,
 * using the entry's header length and line length.
 * @param object unit : entry with id, desc, length and linelen
 * @return number
 */
function fendPos(unit) {
  var lastIndex = unit.length - 2;
  var headerLength = idlen(unit);
  return idx2pos(lastIndex, headerLength, unit.linelen);
}
/**
 * FASTAReader.fetch
 * Read part of one sequence from an open file descriptor.
 *
 * The requested range is converted to file indexes, clipped to the end of
 * the entry, read in one call and stripped of newlines.
 *
 * The read goes through a Buffer: the string form
 * fs.readSync(fd, length, position) returning [data, bytesRead] is not
 * accepted by current Node.js, where the resulting exception was swallowed
 * and every fetch silently returned ''.
 *
 * @param string  fpath   : path of the FASTA file (not used for reading)
 * @param object  unit    : entry with id, desc, start, length and linelen
 * @param number  start   : start position
 * @param number  length  : number of bases
 * @param number  fd      : file descriptor to read from
 * @param boolean inverse : when truthy, the result is passed through
 *                          dna.complStrand(seq, true)
 * @return string : the bases, or '' when the range is empty or reading fails
 */
function ffetch(fpath, unit, start, length, fd, inverse) {
  var startIdx = fgetIndex(unit, start);
  var endIdx = Math.min(fgetIndex(unit, Number(start) + Number(length)), fendIndex(unit));
  var size = endIdx - startIdx;
  // also catches NaN coming from non numeric arguments
  if (!(size > 0)) {
    return '';
  }
  var raw;
  try {
    var buffer = Buffer.alloc(size);
    var bytesRead = fs.readSync(fd, buffer, 0, size, startIdx);
    raw = buffer.toString('utf8', 0, bytesRead);
  }
  catch(e) {
    // kept from the original contract: a failed read yields an empty result
    return '';
  }
  var ret = raw.split('\n').join('');
  return (inverse) ? dna.complStrand(ret, true) : ret;
}
/* static functions */
/**
 * FASTAReader.pos2index
 * convert DNA base position to character index
 * (one extra character is counted for every full line before the position)
 * @param number pos     : DNA base position
 * @param number prelen  : header data length
 * @param number linelen : one line length
 * @return number        : character index
 */
function pos2index(pos, prelen, linelen) {
  var fullLinesBefore = Math.floor((pos - 1) / linelen);
  var withoutLineBreaks = Number(prelen) + Number(pos) - 1;
  return withoutLineBreaks + fullLinesBefore;
}
/**
 * FASTAReader.idx2pos
 * convert character index to DNA base position
 * @param number idx     : character index (leftside)
 * @param number prelen  : header data length (0 when falsy)
 * @param number linelen : one line length (50 when falsy)
 * @return number        : DNA base position, never less than 1
 */
function idx2pos(idx, prelen, linelen) {
  var headerLength = prelen || 0;
  var lineLength = linelen || 50;
  var index = Number(idx);
  // characters after the header, line breaks still included
  var afterHeader = index - headerLength;
  // every full (lineLength + 1) characters contain one line break
  var lineBreaks = Math.floor(afterHeader / (lineLength + 1));
  return Math.max(0, afterHeader - lineBreaks) + 1;
}
/**
 * FASTAReader.parse
 * Scan a FASTA file and build the summary of every sequence in it.
 *
 * result format (one FASTA object per sequence id)
 *  id      : sequence id
 *  desc    : text after the first space of the header line, or null
 *  start   : start index
 *  linelen : length of one line (taken from the first sequence line)
 *  length  : total index length (including id length)
 *
 * Ns format (one array per sequence id)
 *  each element is {line, start, end} describing a run of consecutive
 *  sequence lines that contain the character 'N'
 *
 * Changes against the previous implementation:
 *  - the existence check really tests fpath and reports fpath
 *  - the file is read through a Buffer (the string form of fs.readSync that
 *    returned [data, bytesRead] is not accepted by current Node.js)
 *  - the descriptor is closed even when reading throws
 *  - a file without any header no longer crashes at the end of the scan
 *  - header lines are not scanned for 'N', so ids such as "NC_0001" do not
 *    create a bogus run
 *  - a run still open at the end of the file is stored
 *
 * NOTE: indexes are derived from string lengths, which assumes single byte
 * (ASCII) content. A last line without a trailing newline is counted in
 * length but is not scanned for 'N'.
 *
 * @param string fpath : path of the FASTA file
 * @return array : [result, Ns], or [false, false] when the file does not exist
 */
function fparse(fpath) {
  if (!fs.existsSync(fpath)) {
    console.error(fpath + ': No such file.');
    return [false, false];
  }
  var fd = fs.openSync(fpath, 'r');
  var pos = 0;
  var start = 0;
  var Ns = {};
  var currentNs = null;
  var currentN = null;
  var remnant = '';
  var result = {};
  var buffsize = 65535;
  var buffer = Buffer.alloc(buffsize);
  var bytesRead = 0;
  var summary = null;
  var length = 0;
  var emptyline = 0;

  function setAtEnd(_summary, _length, _emptyline) {
    // result, and fpath is in outer scope.
    _summary.length = _length - _emptyline;
    result[_summary.id] = new FASTA(_summary, fpath);
  }

  function handleLine(line, i) {
    var isHeader = line.charAt(0) == '>';

    if (!isHeader && line.match('N')) {
      if (!currentN) {
        currentN = {
          line: i,
          start: start + length,
          end : start + length+ line.length
        };
      }
      else {
        currentN.line++;
        currentN.end += line.length;
      }
    }
    else if (currentN) {
      // on a header line currentNs still points at the previous sequence,
      // which is the one the finished run belongs to
      currentNs.push(currentN);
      currentN = null;
    }

    if (line == '') {
      emptyline++;
    }
    else if (isHeader) {
      // register a previous summary
      if (summary) {
        setAtEnd(summary, length, emptyline);
      }
      start += length;
      emptyline = 0;
      length = 0;
      // make a new summary
      var id_desc = line.slice(1).split(' ');
      var id = id_desc.shift();
      var desc = (id_desc.length) ? id_desc.join(' ') : null;
      summary = {id: id, desc: desc, start: start, linelen: 0};
      // make a new Ns
      Ns[summary.id] = [];
      currentNs = Ns[summary.id];
    }
    else {
      if (!summary) {
        console.error(fpath +' does not seem to be FASTA format.');
        process.exit();
      }
      if (!summary.linelen) {
        summary.linelen = line.length;
      }
    }
    length += line.length + 1;
  }

  try {
    do {
      bytesRead = fs.readSync(fd, buffer, 0, buffsize, pos);
      var lines = (remnant + buffer.toString('utf8', 0, bytesRead)).split('\n');
      // the last piece may be an incomplete line; keep it for the next chunk
      remnant = lines.pop();
      lines.forEach(handleLine);
      pos = pos + bytesRead;
    }
    while (bytesRead > 0);
  }
  finally {
    fs.closeSync(fd);
  }

  // end
  if (currentN && currentNs) {
    currentNs.push(currentN);
    currentN = null;
  }
  length += remnant.replace('\n', '').length;
  if (summary) {
    setAtEnd(summary, length, emptyline);
  }
  return [result, Ns];
}
// expose the static helpers on the constructor
Object.assign(FASTAReader, {
  parse: fparse,
  fetch: ffetch,
  pos2index: pos2index,
  idx2pos: idx2pos,
  fstartIndex: fstartIndex,
  fendIndex: fendIndex,
  fgetIndex: fgetIndex,
  fendPos: fendPos,
  FASTA: FASTA,
  idlen: idlen,
  main: main
});

module.exports = FASTAReader;

// run the command line interface when this file is the executed script
if (__filename == process.argv[1]) {
  main(process.argv);
}