@gmod/gff
Version:
read and write GFF3 data as streams
264 lines • 8.88 kB
JavaScript
;
// Fast, low-level functions for parsing and formatting GFF3.
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
// Flag the CommonJS `exports` object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise every exported name to `undefined` (`void 0`) up front; each one
// is assigned its function further down, right after that function's definition.
exports.formatItem = exports.formatSequence = exports.formatComment = exports.formatDirective = exports.formatFeature = exports.formatAttributes = exports.parseDirective = exports.parseFeature = exports.parseAttributes = exports.escapeColumn = exports.escape = exports.unescape = void 0;
/**
 * Decode the percent-escapes (`%XX`) found in a GFF3 string value.
 *
 * Every `%` followed by exactly two hexadecimal digits is replaced by the
 * character whose UTF-16 code unit equals that hex value. Anything else,
 * including a lone `%` or a `%` followed by non-hex characters, is kept as-is.
 *
 * @param stringVal - Escaped GFF3 string value
 * @returns The string with every `%XX` sequence decoded
 */
function unescape(stringVal) {
  const decodeHexPair = (_whole, hexDigits) => {
    const codeUnit = Number.parseInt(hexDigits, 16);
    return String.fromCharCode(codeUnit);
  };
  return stringVal.replace(/%([0-9A-Fa-f]{2})/g, decodeHexPair);
}
exports.unescape = unescape;
/**
 * Percent-encode every match of `regex` in `s`.
 *
 * `s` is first coerced with `String()`. Each match is replaced by `%` followed
 * by the upper-case hexadecimal value of the match's first UTF-16 code unit,
 * left-padded with `0` to at least two digits.
 *
 * @param regex - Pattern selecting the characters to encode
 * @param s - Value to encode (coerced to a string)
 * @returns The encoded string
 */
function _escape(regex, s) {
  const text = String(s);
  const encodeChar = (matched) => {
    const codeUnit = matched.charCodeAt(0);
    const hexDigits = codeUnit.toString(16).toUpperCase();
    return '%' + hexDigits.padStart(2, '0');
  };
  return text.replace(regex, encodeChar);
}
/**
 * Escape a value so it can be written as a GFF3 attribute value.
 *
 * Passes the value to `_escape` with a pattern that selects newline, `;`,
 * carriage return, tab, `=`, `%`, `&`, `,`, and every character in the ranges
 * `\x00-\x1f` and `\x7f-\xff`.
 *
 * @param rawVal - Raw GFF3 attribute value
 * @returns An escaped string value
 */
function escape(rawVal) {
  const attributeReserved = /[\n;\r\t=%&,\x00-\x1f\x7f-\xff]/g;
  return _escape(attributeReserved, rawVal);
}
exports.escape = escape;
/**
 * Escape a value so it can be written as one of the GFF3 column values.
 *
 * Passes the value to `_escape` with a pattern that selects newline, carriage
 * return, tab, `%`, and every character in the ranges `\x00-\x1f` and
 * `\x7f-\xff`. Unlike `escape`, the characters `;`, `=`, `&` and `,` are not
 * selected.
 *
 * @param rawVal - Raw GFF3 column value
 * @returns An escaped column value
 */
function escapeColumn(rawVal) {
  const columnReserved = /[\n\r\t%\x00-\x1f\x7f-\xff]/g;
  return _escape(columnReserved, rawVal);
}
exports.escapeColumn = escapeColumn;
/**
 * Parse the 9th column (attributes) of a GFF3 feature line.
 *
 * The column is treated as a `;`-separated list of `tag=value[,value...]`
 * pairs. One trailing line ending (`\n` or `\r\n`) is removed first. Pairs
 * without an `=`, or with nothing after the first `=`, are skipped. Tags are
 * trimmed but not unescaped; each comma-separated value is trimmed and then
 * unescaped. Values of a repeated tag are appended to the same array.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes: a plain object mapping each tag to an array of
 *   unescaped values (`{}` for an empty, missing, or `.` column)
 */
function parseAttributes(attrString) {
  if (!(attrString && attrString.length) || attrString === '.') {
    return {};
  }
  const attrs = {};
  const hasOwn = Object.prototype.hasOwnProperty;
  for (const pair of attrString.replace(/\r?\n$/, '').split(';')) {
    // Split on the FIRST '=' only. `pair.split('=', 2)` would silently drop
    // everything after a second '=', truncating values such as `Note=a=b`.
    const eq = pair.indexOf('=');
    if (eq === -1) {
      continue;
    }
    const rawValues = pair.slice(eq + 1);
    if (!rawValues.length) {
      continue;
    }
    const tag = pair.slice(0, eq).trim();
    // Only own properties count as "already seen": a tag named like an
    // Object.prototype member (`toString`, `constructor`, ...) must not pick
    // up the inherited value. defineProperty (rather than assignment) makes
    // even `__proto__` an ordinary own key instead of replacing the prototype.
    if (!hasOwn.call(attrs, tag)) {
      Object.defineProperty(attrs, tag, {
        value: [],
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }
    attrs[tag].push(...rawValues.split(',').map((s) => unescape(s.trim())));
  }
  return attrs;
}
exports.parseAttributes = parseAttributes;
/**
 * Parse a GFF3 feature line into an object with one property per column.
 *
 * The line is split on tabs; a column that is `.` or empty becomes `null`.
 * Only `seq_id`, `source` and `type` are unescaped. `start` and `end` are
 * parsed as base-10 integers and `score` as a float; `strand` and `phase` are
 * passed through as strings. A non-null 9th column is handed to
 * `parseAttributes`.
 *
 * @param line - GFF3 feature line
 * @returns The parsed feature
 */
function parseFeature(line) {
  // '.' and the empty string both mean "no value" for a column
  const columns = line.split('\t').map((col) => {
    if (col === '.' || col === '') {
      return null;
    }
    return col;
  });
  const [seqId, source, type, start, end, score, strand, phase, attrs] = columns;
  const decodeText = (col) => col && unescape(col);
  const toInt = (col) => (col === null ? null : parseInt(col, 10));
  return {
    seq_id: decodeText(seqId),
    source: decodeText(source),
    type: decodeText(type),
    start: toInt(start),
    end: toInt(end),
    score: score === null ? null : parseFloat(score),
    strand,
    phase,
    attributes: attrs === null ? null : parseAttributes(attrs),
  };
}
exports.parseFeature = parseFeature;
/**
 * Parse a GFF3 directive line (`##name contents`).
 *
 * The result always carries `directive` (the name) and, when there is any
 * text after the name, `value` (that text). Two directives get extra fields:
 * - `sequence-region`: `seq_id`, `start`, `end`, taken from the first three
 *   whitespace-separated tokens; `start` and `end` stay strings with every
 *   non-digit character removed.
 * - `genome-build`: `source` and `buildName`, taken from the first two tokens.
 *
 * @param line - GFF3 directive line
 * @returns The parsed directive, or `null` if the line is not a directive
 */
function parseDirective(line) {
  const found = /^\s*##\s*(\S+)\s*(.*)/.exec(line);
  if (found === null) {
    return null;
  }
  const name = found[1];
  let contents = found[2];
  const base = { directive: name };
  if (contents.length > 0) {
    contents = contents.replace(/\r?\n$/, '');
    base.value = contents;
  }
  // a little additional parsing for sequence-region and genome-build
  switch (name) {
    case 'sequence-region': {
      const [seqId, rawStart, rawEnd] = contents.split(/\s+/, 3);
      return Object.assign({}, base, {
        seq_id: seqId,
        start: rawStart && rawStart.replace(/\D/g, ''),
        end: rawEnd && rawEnd.replace(/\D/g, ''),
      });
    }
    case 'genome-build': {
      const [source, buildName] = contents.split(/\s+/, 2);
      return Object.assign({}, base, { source, buildName });
    }
    default:
      return base;
  }
}
exports.parseDirective = parseDirective;
/**
 * Format an attributes object into a string suitable for the 9th column of GFF3.
 *
 * Entries are emitted in `Object.entries` order as `tag=value`, joined with
 * `;`. Entries whose value is falsy are skipped. A value that has its OWN
 * `toString` property is rendered through that method; an array is rendered
 * as its escaped elements joined with `,`; anything else is escaped directly.
 * Tags are escaped as well.
 *
 * @param attrs - Attributes
 * @returns GFF3 9th column string, or `.` when no entry produced output
 */
function formatAttributes(attrs) {
  // Borrow hasOwnProperty from Object.prototype: calling `val.hasOwnProperty`
  // directly throws for values without Object.prototype in their chain
  // (e.g. created with Object.create(null)).
  const hasOwn = Object.prototype.hasOwnProperty;
  const attrOrder = [];
  for (const [tag, val] of Object.entries(attrs)) {
    if (!val) {
      continue;
    }
    let valstring;
    if (hasOwn.call(val, 'toString')) {
      valstring = escape(val.toString());
    } else if (Array.isArray(val)) {
      valstring = val.map((item) => escape(item)).join(',');
    } else {
      valstring = escape(val);
    }
    attrOrder.push(`${escape(tag)}=${valstring}`);
  }
  return attrOrder.length ? attrOrder.join(';') : '.';
}
exports.formatAttributes = formatAttributes;
/**
 * Format one feature (without its children) as a single GFF3 line.
 *
 * Each of the eight leading columns is written as `.` when it is `null` or
 * `undefined`, and otherwise passed through `escapeColumn`. The attributes
 * column follows the same rule, using `formatAttributes`.
 *
 * `seenFeature` records every line produced so far: if the exact same line
 * was already produced, an empty string is returned instead.
 *
 * @param f - Feature line object
 * @param seenFeature - Object keyed by already-emitted lines; mutated here
 * @returns The tab-separated line ending in `\n`, or `''` for a repeat
 */
function _formatSingleFeature(f, seenFeature) {
  // Treat undefined like null, as the attributes column already does;
  // otherwise a missing column is written as the literal text "undefined".
  const column = (value) =>
    value === null || value === undefined ? '.' : escapeColumn(value);
  const attrString =
    f.attributes === null || f.attributes === undefined
      ? '.'
      : formatAttributes(f.attributes);
  const fields = [
    column(f.seq_id),
    column(f.source),
    column(f.type),
    column(f.start),
    column(f.end),
    column(f.score),
    column(f.strand),
    column(f.phase),
    attrString,
  ];
  const formattedString = `${fields.join('\t')}\n`;
  // if we have already output this exact feature, skip it
  if (seenFeature[formattedString]) {
    return '';
  }
  seenFeature[formattedString] = true;
  return formattedString;
}
/**
 * Recursively format a feature, or an array of features, as GFF3 lines.
 *
 * An array is formatted element by element and concatenated. A single feature
 * yields its own line first; when it has both `child_features` and
 * `derived_features`, the lines of every child follow, then the lines of
 * every derived feature. `seenFeature` is passed through all calls.
 *
 * @param feature - A feature object or a (possibly nested) array of them
 * @param seenFeature - Object keyed by already-emitted lines
 * @returns The concatenated GFF3 lines
 */
function _formatFeature(feature, seenFeature) {
  const recurse = (item) => _formatFeature(item, seenFeature);
  if (Array.isArray(feature)) {
    return feature.map(recurse).join('');
  }
  const ownLine = _formatSingleFeature(feature, seenFeature);
  if (!_isFeatureLineWithRefs(feature)) {
    return [ownLine].join('');
  }
  const childLines = feature.child_features.map(recurse);
  const derivedLines = feature.derived_features.map(recurse);
  return [ownLine].concat(childLines, derivedLines).join('');
}
/**
 * Format a feature object or array of feature objects into one or more lines
 * of GFF3.
 *
 * Each call starts with a fresh, empty record of emitted lines and hands it,
 * together with the input, to `_formatFeature`.
 *
 * @param featureOrFeatures - A feature object or array of feature objects
 * @returns A string of one or more GFF3 lines
 */
function formatFeature(featureOrFeatures) {
  return _formatFeature(featureOrFeatures, {});
}
exports.formatFeature = formatFeature;
/**
 * Format a directive into a line of GFF3.
 *
 * Produces `##` followed by the directive name, then a space and the value
 * when the value is truthy, then a newline.
 *
 * @param directive - A directive object
 * @returns A directive line string
 */
function formatDirective(directive) {
  const head = `##${directive.directive}`;
  const tail = directive.value ? ` ${directive.value}` : '';
  return `${head}${tail}\n`;
}
exports.formatDirective = formatDirective;
/**
 * Format a comment into a GFF3 comment line.
 * The whole job is prefixing `# ` and appending a newline.
 *
 * @param comment - A comment object
 * @returns A comment line string
 */
function formatComment(comment) {
  const { comment: text } = comment;
  return `# ${text}\n`;
}
exports.formatComment = formatComment;
/**
 * Format a sequence object as FASTA.
 *
 * The header line is `>` plus the id, followed by a space and the description
 * when the description is truthy. The sequence goes on the next line, and the
 * result ends with a newline.
 *
 * @param seq - A sequence object
 * @returns Formatted single FASTA sequence string
 */
function formatSequence(seq) {
  let header = `>${seq.id}`;
  if (seq.description) {
    header += ` ${seq.description}`;
  }
  return `${header}\n${seq.sequence}\n`;
}
exports.formatSequence = formatSequence;
/**
 * Format a directive, comment, sequence, or feature, or array of such items,
 * into one or more lines of GFF3.
 *
 * The kind of an item is decided by the first of these keys it has, in this
 * order: `attributes` (feature), `directive`, `sequence`, `comment`. An item
 * with none of them is rendered as a fixed "invalid item" comment line.
 *
 * @param itemOrItems - A comment, sequence, or feature, or array of such items
 * @returns A formatted string, or an array of strings when given an array
 */
function formatItem(itemOrItems) {
  // Checked top to bottom; the wrappers keep each formatter lookup lazy.
  const formatters = [
    ['attributes', (item) => formatFeature(item)],
    ['directive', (item) => formatDirective(item)],
    ['sequence', (item) => formatSequence(item)],
    ['comment', (item) => formatComment(item)],
  ];
  const formatOne = (item) => {
    for (const [key, format] of formatters) {
      if (key in item) {
        return format(item);
      }
    }
    return '# (invalid item found during format)\n';
  };
  return Array.isArray(itemOrItems)
    ? itemOrItems.map((item) => formatOne(item))
    : formatOne(itemOrItems);
}
exports.formatItem = formatItem;
/**
 * Tell whether a feature line object carries both reference lists.
 *
 * @param featureLine - Feature line object to inspect
 * @returns `true` only when neither `child_features` nor `derived_features`
 *   is `undefined`
 */
function _isFeatureLineWithRefs(featureLine) {
  if (featureLine.child_features === undefined) {
    return false;
  }
  return featureLine.derived_features !== undefined;
}
//# sourceMappingURL=util.js.map