gtf-nostream
Version:
utilities to read GTF data
319 lines • 9.39 kB
JavaScript
;
//@ts-nocheck
/** @module util */
// Flag this CommonJS module as transpiled from an ES module and publish the
// public helpers on `exports`. The functions themselves are declared further
// down in this file; function declarations are hoisted, so they can already be
// referenced here.
Object.defineProperty(exports, "__esModule", { value: true });
exports.unescape = unescape;
exports.escape = escape;
exports.escapeColumn = escapeColumn;
exports.parseAttributes = parseAttributes;
exports.parseFeature = parseFeature;
exports.parseDirective = parseDirective;
exports.formatAttributes = formatAttributes;
exports.formatFeature = formatFeature;
exports.formatDirective = formatDirective;
exports.formatComment = formatComment;
exports.formatSequence = formatSequence;
exports.formatItem = formatItem;
// Forks @gmod/gff-js and adapts it to parse and format GTF.
// Names of the nine tab-separated columns of a GTF feature line, in column
// order. parseFeature uses them as the property names of the object it
// returns, and _formatSingleFeature reads the first eight of them back when
// writing a line (the ninth, 'attributes', is formatted separately).
const fieldNames = [
'seq_name',
'source',
'featureType',
'start',
'end',
'score',
'strand',
'frame',
'attributes',
];
// TODO: check encoding/escaping rules for the GTF 9th (attributes) column
/**
 * Unescape a string/text value used in a GTF attribute.
 * Every `%XX` sequence (two hexadecimal digits) is replaced by the character
 * with that code; all other text, including surrounding double quotes, is
 * returned unchanged.
 *
 * Textual attributes should be surrounded by double quotes
 * source info:
 * https://mblab.wustl.edu/GTF22.html
 * https://en.wikipedia.org/wiki/Gene_transfer_format
 *
 * @param {String|null|undefined} s value to decode; other types are stringified
 * @returns {String|null} the decoded text, or null when `s` is null or undefined
 */
function unescape(s) {
    // `== null` matches both null and undefined, so a missing value is not
    // turned into the literal text "undefined" by String() below.
    if (s == null) {
        return null;
    }
    return String(s).replace(/%([0-9A-Fa-f]{2})/g, (_, seq) => String.fromCharCode(parseInt(seq, 16)));
}
/**
 * Percent-encode the parts of a value that match a pattern.
 * Each match is replaced by `%` followed by the upper-case hexadecimal code
 * of its first UTF-16 code unit, left-padded with `0` to at least two digits.
 *
 * @param {RegExp} regex pattern selecting the characters to encode
 * @param {String} s value to escape; it is converted with String() first
 * @returns {String}
 */
function _escape(regex, s) {
    const toPercentCode = ch => {
        const code = ch.charCodeAt(0).toString(16).toUpperCase();
        // single-digit codes get a leading zero so the result is %XX
        return code.length < 2 ? `%0${code}` : `%${code}`;
    };
    return String(s).replace(regex, toPercentCode);
}
/**
 * Escape a value for use in a GTF attribute value.
 * Newline, carriage return, tab, `;`, `=`, `%`, `&`, `,`, the control
 * characters \x00-\x1f and the characters \x7f-\xff are handed to _escape
 * for percent-encoding; everything else is left as is.
 *
 * @param {String} s
 * @returns {String}
 */
function escape(s) {
    // eslint-disable-next-line no-control-regex
    const attributeUnsafe = /[\n;\r\t=%&,\x00-\x1f\x7f-\xff]/g;
    return _escape(attributeUnsafe, s);
}
/**
 * Escape a value for use in a GTF column value.
 * Newline, carriage return, tab, `%`, the control characters \x00-\x1f and
 * the characters \x7f-\xff are handed to _escape for percent-encoding.
 *
 * @param {String} s
 * @returns {String}
 */
function escapeColumn(s) {
    // eslint-disable-next-line no-control-regex
    const columnUnsafe = /[\n\r\t%\x00-\x1f\x7f-\xff]/g;
    return _escape(columnUnsafe, s);
}
/**
 * Parse the 9th column (attributes) of a GTF feature line.
 *
 * The column is read as a `;`-separated list of `tag value` pairs. The tag is
 * the text up to the first run of whitespace and the value is everything
 * after it, so quoted values that contain spaces stay intact. Each value is
 * then split on `,`, trimmed and passed through unescape(); surrounding
 * double quotes are kept as they appear in the input. Repeated tags
 * accumulate their values in one array. Pairs without a value are ignored.
 *
 * @param {String} attrString raw text of the attributes column; may carry a
 *   trailing line terminator and may or may not end with `;`
 * @returns {Object} map from tag name to an array of value strings; empty
 *   when the column is missing, empty or `.`
 */
function parseAttributes(attrString) {
    if (!(attrString && attrString.length)) {
        return {};
    }
    // Trimming removes a trailing newline / carriage return as well as padding.
    const column = attrString.trim();
    if (!column.length || column === '.') {
        return {};
    }
    const attrs = {};
    // A terminating semicolon just yields an empty last piece, which the
    // pattern below rejects, so nothing has to be sliced off blindly.
    column.split(';').forEach(attribute => {
        const pair = /^(\S+)\s+(\S[\s\S]*)$/.exec(attribute.trim());
        if (!pair) {
            return;
        }
        const tag = pair[1];
        const rawValue = pair[2];
        // Own-property check so tags such as "constructor" or "toString" are
        // not confused with properties inherited from Object.prototype.
        if (!Object.prototype.hasOwnProperty.call(attrs, tag)) {
            attrs[tag] = [];
        }
        attrs[tag].push(...rawValue.split(',').map(s => unescape(s.trim())));
    });
    return attrs;
}
/**
 * Parse a GTF feature line.
 *
 * The line is split on tabs into the columns named by `fieldNames`. A column
 * that is `.` or absent (line shorter than nine columns) becomes null.
 * seq_name, source and featureType are passed through unescape(); the
 * attributes column is parsed with parseAttributes(), which yields an empty
 * object when the column is missing. start and end are converted with
 * parseInt (base 10) and score with parseFloat; strand and frame are left as
 * the strings found in the line.
 *
 * It is assumed that there are no comments at the end of the line.
 *
 * @param {String} line
 * @returns {Object} the parsed line, keyed by the names in `fieldNames`
 */
function parseFeature(line) {
    const columns = line.split('\t');
    // Normalise to exactly one entry per known column so that a short line
    // gives null for what is missing instead of undefined / NaN / "undefined".
    const f = fieldNames.map((_, i) => {
        const value = columns[i];
        return value === undefined || value === '.' ? null : value;
    });
    // unescape only the seq_name, source, and feature columns
    f[0] = unescape(f[0]);
    f[1] = unescape(f[1]);
    f[2] = unescape(f[2]);
    f[8] = parseAttributes(f[8]);
    const parsed = {};
    fieldNames.forEach((name, i) => {
        // a column that only became '.' after unescaping is also treated as empty
        parsed[name] = f[i] === '.' ? null : f[i];
    });
    if (parsed.start !== null) {
        parsed.start = parseInt(parsed.start, 10);
    }
    if (parsed.end !== null) {
        parsed.end = parseInt(parsed.end, 10);
    }
    if (parsed.score !== null) {
        // parseFloat takes no radix argument
        parsed.score = parseFloat(parsed.score);
    }
    return parsed;
}
/**
 * Parse a GTF directive/comment line of the form `##name contents`.
 *
 * The result always carries `directive` (the name). `value` is set to the
 * remaining text when there is any. For `sequence-region` the contents are
 * additionally split on whitespace into `seq_id`, `start` and `end` (start
 * and end with every non-digit character removed, still as strings); for
 * `genome-build` they are split into `source` and `buildname`.
 *
 * @param {String} line
 * @returns {Object} the information in the directive, or null when the line
 *   does not start with `##`
 */
function parseDirective(line) {
    const found = /^\s*##\s*(\S+)\s*(.*)/.exec(line);
    if (found === null) {
        return null;
    }
    const [, name, rest] = found;
    const parsed = { directive: name };
    let contents = rest;
    if (contents.length) {
        contents = contents.replace(/\r?\n$/, '');
        parsed.value = contents;
    }
    // do a little additional parsing for sequence-region and genome-build directives
    switch (name) {
        case 'sequence-region': {
            const parts = contents.split(/\s+/, 3);
            parsed.seq_id = parts[0];
            parsed.start = parts[1] && parts[1].replace(/\D/g, '');
            parsed.end = parts[2] && parts[2].replace(/\D/g, '');
            break;
        }
        case 'genome-build': {
            const parts = contents.split(/\s+/, 2);
            parsed.source = parts[0];
            parsed.buildname = parts[1];
            break;
        }
        default:
            break;
    }
    return parsed;
}
/**
 * Format an attributes object into a string suitable for the 9th column of GTF.
 *
 * Each own enumerable key becomes one `tag value` pair. The value text is
 * chosen as follows:
 *  - a value with its own `toString` method: the escaped result of toString()
 *  - a value with an array in `.values`: each entry escaped, joined with `,`
 *  - an array: each entry escaped, joined with `,`
 *  - anything else: the escaped value itself
 * Attributes whose value is null or undefined are left out. Pairs are joined
 * with `; ` and the list is terminated with `;`.
 *
 * @param {Object} attrs
 * @returns {String} the formatted column, or `.` when there is nothing to write
 */
function formatAttributes(attrs) {
    const pairs = [];
    Object.keys(attrs).forEach(tag => {
        const val = attrs[tag];
        // Nothing to write for an absent value; reading properties of it
        // below would throw a TypeError.
        if (val === null || val === undefined) {
            return;
        }
        let valstring;
        // Called through Object.prototype so values without that method
        // (e.g. prototype-less objects) are handled too.
        if (Object.prototype.hasOwnProperty.call(val, 'toString')) {
            valstring = escape(val.toString());
        }
        else if (Array.isArray(val.values)) {
            valstring = val.values.map(v => escape(v)).join(',');
        }
        else if (Array.isArray(val)) {
            valstring = val.map(v => escape(v)).join(',');
        }
        else {
            valstring = escape(val);
        }
        pairs.push(`${escape(tag)} ${valstring}`);
    });
    return pairs.length ? `${pairs.join('; ')};` : '.';
}
// GTF strand symbols, looked up with `strand + 1` in _formatSingleFeature.
const translateStrand = ['-', '.', '+'];
/**
 * Render one feature as a single tab-separated GTF line ending in a newline.
 *
 * The first eight columns are read from the feature using `fieldNames`;
 * null or undefined becomes `.`, the strand is mapped through
 * `translateStrand` (falling back to the value itself when the lookup finds
 * nothing) and every other column is stringified and passed to
 * escapeColumn(). The ninth column is produced by formatAttributes(), or is
 * `.` when the feature has no attributes.
 *
 * @param {Object} f the feature to format
 * @param {Object} seenFeature record of lines already produced; the new line
 *   is added to it as a key
 * @returns {String} the formatted line, or '' when the identical line is
 *   already recorded in `seenFeature`
 */
function _formatSingleFeature(f, seenFeature) {
    const isMissing = value => value === null || value === undefined;
    const attributeColumn = isMissing(f.attributes)
        ? '.'
        : formatAttributes(f.attributes);
    const columns = [];
    for (let col = 0; col < 8; col += 1) {
        const raw = f[fieldNames[col]];
        if (isMissing(raw)) {
            columns.push('.');
        }
        else if (col === 6) {
            // strand column: translate it to its symbol when the lookup succeeds
            columns.push(translateStrand[raw + 1] || raw);
        }
        else {
            columns.push(escapeColumn(String(raw)));
        }
    }
    columns.push(attributeColumn);
    const line = `${columns.join('\t')}\n`;
    // if we have already output this exact feature, skip it
    if (seenFeature[line]) {
        return '';
    }
    // eslint-disable-next-line no-param-reassign
    seenFeature[line] = true;
    return line;
}
/**
 * Format a feature, or an array of features, followed by everything listed in
 * their `child_features` and `derived_features`, as concatenated GTF lines.
 *
 * @param {Object|Array} feature a feature object or a (possibly nested) array of them
 * @param {Object} seenFeature record of already formatted lines, passed on to
 *   _formatSingleFeature so duplicates are emitted only once
 * @returns {String}
 */
function _formatFeature(feature, seenFeature) {
    const formatAll = list => list.reduce((text, member) => text + _formatFeature(member, seenFeature), '');
    if (Array.isArray(feature)) {
        return formatAll(feature);
    }
    let output = _formatSingleFeature(feature, seenFeature);
    for (const multiSlot of ['child_features', 'derived_features']) {
        if (feature[multiSlot]) {
            output += formatAll(feature[multiSlot]);
        }
    }
    return output;
}
/**
 * Format a feature object or array of
 * feature objects into one or more lines of GTF.
 * Identical lines are emitted only once per call.
 *
 * @param {Object|Array[Object]} featureOrFeatures
 * @returns {String}
 */
function formatFeature(featureOrFeatures) {
    // fresh duplicate tracker for every call
    return _formatFeature(featureOrFeatures, {});
}
/**
 * Format a directive into a line of GTF: `##` plus the directive name,
 * then a space and the value when the value is truthy, then a newline.
 *
 * @param {Object} directive object with `directive` and optionally `value`
 * @returns {String}
 */
function formatDirective(directive) {
    const head = `##${directive.directive}`;
    const tail = directive.value ? ` ${directive.value}` : '';
    return `${head}${tail}\n`;
}
/**
 * Format a comment into a GTF comment line: `# `, the comment text and a
 * newline.
 *
 * @param {Object} comment object whose `comment` property holds the text
 * @returns {String}
 */
function formatComment(comment) {
    const { comment: text } = comment;
    return `# ${text}\n`;
}
/**
 * Format a sequence object as FASTA: a `>` header line with the id and, when
 * present, the description, followed by the sequence on its own line.
 *
 * @param {Object} seq object with `id`, `sequence` and optionally `description`
 * @returns {String} formatted single FASTA sequence
 */
function formatSequence(seq) {
    const header = seq.description
        ? `>${seq.id} ${seq.description}`
        : `>${seq.id}`;
    return `${header}\n${seq.sequence}\n`;
}
/**
 * Format a directive, comment, sequence or feature,
 * or array of such items, into one or more lines of GTF.
 *
 * The kind of an item is decided by the first check that succeeds, in this
 * order: feature (has an element `0` or `attributes`), directive, sequence,
 * comment. Anything else produces an "invalid item" comment line.
 *
 * @param {Object|Array} itemOrItems
 * @returns {String|Array} the formatted text for a single item, or an array
 *   with one formatted string per entry when an array is given
 */
function formatItem(itemOrItems) {
    // Checked in order; the formatters are wrapped so they are only looked up
    // when an item of that kind is actually formatted.
    const formatters = [
        { applies: item => item[0] || item.attributes, format: item => formatFeature(item) },
        { applies: item => item.directive, format: item => formatDirective(item) },
        { applies: item => item.sequence, format: item => formatSequence(item) },
        { applies: item => item.comment, format: item => formatComment(item) },
    ];
    const formatSingleItem = item => {
        const chosen = formatters.find(candidate => candidate.applies(item));
        return chosen
            ? chosen.format(item)
            : '# (invalid item found during format)\n';
    };
    return Array.isArray(itemOrItems)
        ? itemOrItems.map(item => formatSingleItem(item))
        : formatSingleItem(itemOrItems);
}
//# sourceMappingURL=util.js.map