bioinformatics-parser
Version:
.fasta and .fastq parsers
222 lines (189 loc) • 6.25 kB
JavaScript
;
// Flag this CommonJS module as transpiled ES-module output (the conventional
// `__esModule` marker used for import interop).
Object.defineProperty(exports, "__esModule", {
value: true
});
// Public API. The right-hand sides are function declarations defined later in
// this file; declarations are hoisted, so they are already bound here.
exports.realignFasta = realignFasta;
exports.reverseComplement = reverseComplement;
exports.stringify = stringify;
exports.parse = parse;
exports.isAllowed = isAllowed;
/**
 * Lists the own keys of `object`: enumerable string keys first, then symbol
 * keys (when the runtime supports symbols).
 *
 * @param {Object} object
 * @param {boolean} [enumerableOnly] when truthy, non-enumerable symbol keys are left out
 * @returns {Array<string|symbol>}
 */
function ownKeys(object, enumerableOnly) {
  var keys = Object.keys(object);
  if (!Object.getOwnPropertySymbols) {
    return keys;
  }
  var symbols = Object.getOwnPropertySymbols(object);
  if (enumerableOnly) {
    symbols = symbols.filter(function (sym) {
      return Object.getOwnPropertyDescriptor(object, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
/**
 * Transpiler helper behind object-spread syntax: copies every source argument
 * after `target` onto `target` and returns `target`.
 *
 * Sources at odd argument positions have their enumerable own properties
 * copied by value (getters are read). Sources at even positions have their
 * own property descriptors copied as-is. A null/undefined source is treated
 * as an empty object.
 *
 * @param {Object} target object that receives the properties (mutated)
 * @returns {Object} the same `target`
 */
function _objectSpread(target) {
  // Copy enumerable own properties by value.
  var copyValues = function (from) {
    ownKeys(Object(from), true).forEach(function (key) {
      _defineProperty(target, key, from[key]);
    });
  };
  // Copy own property descriptors verbatim (accessors stay accessors).
  var copyDescriptors = function (from) {
    if (Object.getOwnPropertyDescriptors) {
      Object.defineProperties(target, Object.getOwnPropertyDescriptors(from));
      return;
    }
    ownKeys(Object(from)).forEach(function (key) {
      Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(from, key));
    });
  };
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i] != null ? arguments[i] : {};
    if (i % 2) {
      copyValues(source);
    } else {
      copyDescriptors(source);
    }
  }
  return target;
}
/**
 * Transpiler helper: sets `obj[key]` to `value` and returns `obj`.
 *
 * If `key` is already reachable on `obj` (own or inherited), the property is
 * (re)defined directly on `obj` as a writable, enumerable, configurable data
 * property instead of being assigned, so inherited setters are not invoked.
 *
 * @param {Object} obj
 * @param {string|symbol} key
 * @param {*} value
 * @returns {Object} the same `obj`
 */
function _defineProperty(obj, key, value) {
  if (!(key in obj)) {
    obj[key] = value;
    return obj;
  }
  Object.defineProperty(obj, key, {
    value: value,
    enumerable: true,
    configurable: true,
    writable: true
  });
  return obj;
}
/*
* fasta.js
*/
/** Maximum number of sequence characters stringifySequence writes per output line. */
var LINE_LENGTH = 70;
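/**
* A single FASTA record.
* @typedef {Object} Fasta
* @property {string} description header text that follows the leading ">"
* @property {string} data sequence characters (parse joins all data lines and upper-cases them)
* @property {boolean} [isReversed] set by realignFasta: true when the data was reverse-complemented
* @property {number} [conversions] set by realignFasta: number of characters replaced by "N" while reverse-complementing
*/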
/**
* Outcome wrapper; `result` is undefined when `ok` is false.
* @template T
* @typedef {Object} ValidationResult
* @property {boolean} ok
* @property {T|undefined} result
*/
/**
 * Rotates `fasta.data` so that it begins with `sequence`.
 *
 * The sequence is searched in the data as given first. If it is absent, its
 * reverse complement is searched instead; on a hit the whole data is
 * reverse-complemented and then rotated so that `sequence` comes first.
 *
 * @param {Fasta} fasta
 * @param {string} sequence
 * @returns {ValidationResult<Fasta>} `ok: false` (result undefined) when the
 *   sequence is found on neither strand; otherwise a copy of `fasta` with
 *   `data` replaced, `isReversed` set, and `conversions` holding the number of
 *   characters reverseComplement turned into "N".
 */
function realignFasta(fasta, sequence) {
  var counter = {
    value: 0
  };
  var strand = fasta.data;
  var reversed = false;
  var start = strand.indexOf(sequence);

  if (start === -1) {
    // Not present as given: try the opposite strand.
    var hit = strand.indexOf(reverseComplement(sequence));
    if (hit === -1) {
      return {
        ok: false,
        result: undefined
      };
    }
    reversed = true;
    strand = reverseComplement(fasta.data, counter);
    // Map the hit position into the coordinates of the reversed string.
    start = fasta.data.length - hit - sequence.length;
  }

  var rotated = strand.slice(start) + strand.slice(0, start);
  return {
    ok: true,
    result: _objectSpread(_objectSpread({}, fasta), {}, {
      data: rotated,
      isReversed: reversed,
      conversions: counter.value
    })
  };
}
/**
 * Returns the reverse complement of a nucleotide string.
 *
 * Only upper-case A, T, G and C are complemented. Every other character
 * matched by the `.` pattern becomes "N" (line terminators are not matched
 * and pass through unchanged).
 *
 * @param {string} sequence
 * @param {{value: number}|null} [conversionsRef=null] optional counter; its
 *   `value` is incremented once for each character replaced by "N"
 * @returns {string}
 */
function reverseComplement(sequence) {
  // Read from `arguments` so the declared arity stays 1.
  var conversionsRef = arguments[1] === undefined ? null : arguments[1];
  var bases = 'ATGC';
  var partners = 'TACG';
  var complemented = sequence.replace(/./g, function (base) {
    var slot = bases.indexOf(base);
    if (slot !== -1) {
      return partners.charAt(slot);
    }
    if (conversionsRef !== null) conversionsRef.value += 1;
    return 'N';
  });
  return complemented.split('').reverse().join('');
}
/**
 * Serializes one record, or an array of records, to FASTA text.
 * Records of an array are joined with a single "\n".
 *
 * @param {Fasta|Fasta[]} fasta
 * @returns {string} fasta file string
 */
function stringify(fasta) {
  // Treat a lone record as a one-element list so both cases share one path.
  var records = Array.isArray(fasta) ? fasta : [fasta];
  return records.map(function (record) {
    return stringifySequence(record);
  }).join('\n');
}
/**
 * Serializes one record: a ">" header line followed by the data wrapped at
 * LINE_LENGTH characters per line. The output has no trailing newline unless
 * the data is empty.
 *
 * @param {Fasta} fasta
 * @returns {string}
 */
function stringifySequence(fasta) {
  var rows = ['>' + fasta.description];
  var residues = fasta.data;
  var offset = 0;
  // Emit full-width rows while more than one row's worth of data remains.
  while (offset + LINE_LENGTH < residues.length) {
    rows.push(residues.slice(offset, offset + LINE_LENGTH));
    offset += LINE_LENGTH;
  }
  // The final row holds whatever is left (possibly an empty string).
  rows.push(residues.slice(offset));
  return rows.join('\n');
}
/**
 * Parses FASTA text into records.
 *
 * The first line must start with ">". Each ">" line opens a new record whose
 * description is the rest of that line; following lines are upper-cased and
 * appended to the record's data. Empty lines are ignored. Lines are split on
 * "\n", "\r\n" or a bare "\r", so files with Windows line endings parse the
 * same as Unix ones.
 *
 * @param {string} text
 * @returns {{ok: boolean, error: (Error|undefined), result: (Fasta[]|undefined)}}
 *   `ok: true` with the records in `result`, or `ok: false` with `error` set.
 *   Line numbers in error messages are zero-based line indexes.
 */
function parse(text) {
  var lines = text.split(/\r\n?|\n/);
  if (!lines[0] || !lines[0].startsWith('>')) return failWith('file does not start with ">"');
  var sequences = [];
  var sequence;
  // Index loop rather than for...in: only real elements are visited, even if
  // Array.prototype carries enumerable additions.
  for (var i = 0; i < lines.length; i++) {
    var line = lines[i];
    if (line.startsWith('>')) {
      if (sequence) sequences.push(sequence);
      sequence = {
        description: line.slice(1),
        data: ''
      };
    } else if (isAllowedOrEmpty(line)) {
      // A record is always open here: the first line was checked to be a header.
      sequence.data += line.toUpperCase();
    } else {
      return failWith("invalid input at line ".concat(i));
    }
  }
  if (sequence) sequences.push(sequence);
  return {
    ok: true,
    error: undefined,
    result: sequences
  };
}
/**
 * Builds the failure result returned by parse.
 *
 * @param {string} message detail appended to the "Invalid fasta file: " prefix
 * @returns {{ok: boolean, error: Error, result: undefined}}
 */
function failWith(message) {
  var error = new Error('Invalid fasta file: ' + message);
  return { ok: false, error: error, result: undefined };
}
/**
 * Tells whether every character of `text` is accepted by isAllowedCode.
 * An empty string is accepted.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isAllowed(text) {
  var total = text.length;
  var offset = 0;
  // Advance until the end or the first rejected character.
  while (offset < total && isAllowedCode(text.charCodeAt(offset))) {
    offset += 1;
  }
  return !(offset < total);
}
/**
 * Same check as isAllowed, with the empty-input case answered up front.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isAllowedOrEmpty(text) {
  return text.length === 0 || isAllowed(text);
}
function isAllowedCode(code) {
if (code >= 65
/* A */
&& code <= 78
/* N */
|| code >= 80
/* P */
&& code <= 90
/* Z */
|| code >= 97
/* a */
&& code <= 110
/* n */
|| code >= 112
/* p */
&& code <= 122
/* z */
|| code === 42
/* * */
|| code === 45
/* - */
) return true;
return false;
}
/*
Fasta format allowed chars:

Nucleic acid codes:
  A  adenosine        C  cytidine         G  guanine
  T  thymidine        N  A/G/C/T (any)    U  uridine
  K  G/T (keto)       S  G/C (strong)     Y  T/C (pyrimidine)
  M  A/C (amino)      W  A/T (weak)       R  G/A (purine)
  B  G/T/C            D  G/A/T            H  A/C/T
  V  G/C/A            -  gap of indeterminate length

For those programs that use amino acid query sequences (BLASTP and TBLASTN),
the accepted amino acid codes are:
  A  alanine                 P  proline
  B  aspartate/asparagine    Q  glutamine
  C  cysteine                R  arginine
  D  aspartate               S  serine
  E  glutamate               T  threonine
  F  phenylalanine           U  selenocysteine
  G  glycine                 V  valine
  H  histidine               W  tryptophan
  I  isoleucine              Y  tyrosine
  K  lysine                  Z  glutamate/glutamine
  L  leucine                 X  any
  M  methionine              *  translation stop
  N  asparagine              -  gap of indeterminate length
*/