@gmod/gff
Version:
read and write GFF3 data as streams
270 lines • 8.3 kB
JavaScript
;
// Fast, low-level functions for parsing and formatting GFF3.
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
// Define an `__esModule` property (value `true`) on the CommonJS exports
// object. NOTE(review): presumably the usual transpiler interop marker for
// code compiled from ES-module syntax — confirm against the build setup.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. The functions assigned here are declared further
// down in the file; function declarations are hoisted, so referencing them
// before their definitions is valid.
exports.unescape = unescape;
exports.escape = escape;
exports.escapeColumn = escapeColumn;
exports.parseAttributes = parseAttributes;
exports.parseFeature = parseFeature;
exports.parseDirective = parseDirective;
exports.formatAttributes = formatAttributes;
exports.formatFeature = formatFeature;
exports.formatDirective = formatDirective;
exports.formatComment = formatComment;
exports.formatSequence = formatSequence;
exports.formatItem = formatItem;
/**
 * Unescape a string value used in a GFF3 attribute.
 *
 * Well-formed input is decoded with `decodeURIComponent`, so multi-byte
 * UTF-8 escapes such as `%E4%B8%AD` are handled. If the input as a whole is
 * malformed (for example it contains a bare `%` or a truncated escape), the
 * function falls back to decoding each run of `%XX` escapes on its own and
 * leaves anything that still cannot be decoded exactly as written, instead of
 * throwing a `URIError`.
 *
 * @param stringVal - Escaped GFF3 string value
 * @returns An unescaped string value
 */
function unescape(stringVal) {
  try {
    return decodeURIComponent(stringVal);
  } catch (error) {
    // Only malformed escape sequences are recoverable; anything else
    // (e.g. a value that cannot be converted to a string) is re-thrown.
    if (!(error instanceof URIError)) {
      throw error;
    }
  }
  // Lenient path: decode every maximal run of %XX escapes independently so a
  // single stray '%' does not prevent the valid escapes from being decoded.
  return String(stringVal).replaceAll(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      // The run is not valid UTF-8 (e.g. a truncated sequence): keep it literal.
      return run;
    }
  });
}
/**
 * Percent-encode every match of `regex` within the string form of `s`.
 *
 * Each matched substring is passed through `encodeURIComponent` and the
 * result is upper-cased. `regex` must carry the global flag, because
 * `String.prototype.replaceAll` rejects non-global regular expressions.
 *
 * @param regex - Global regular expression selecting the text to escape
 * @param s - Value to escape; converted with `String()` first
 * @returns The escaped string
 */
function _escape(regex, s) {
  const percentEncode = (matched) => encodeURIComponent(matched).toUpperCase();
  const text = String(s);
  return text.replaceAll(regex, percentEncode);
}
/**
 * Escape a value for use in a GFF3 attribute value.
 *
 * The characters percent-encoded are: `;`, `=`, `%`, `&`, `,`, and the
 * control characters U+0000 through U+001F (which includes tab, newline and
 * carriage return) plus U+007F.
 *
 * @param rawVal - Raw GFF3 attribute value
 * @returns An escaped string value
 */
function escape(rawVal) {
  const attributeSpecials = /[\n;\r\t=%&,\u0000-\u001f\u007f]/g;
  const escaped = _escape(attributeSpecials, rawVal);
  return escaped;
}
/**
 * Escape a value for use in a GFF3 column value.
 *
 * Only `%` and the control characters U+0000 through U+001F (which includes
 * tab, newline and carriage return) plus U+007F are percent-encoded.
 *
 * @param rawVal - Raw GFF3 column value
 * @returns An escaped column value
 */
function escapeColumn(rawVal) {
  const columnSpecials = /[\n\r\t%\u0000-\u001f\u007f]/g;
  const escaped = _escape(columnSpecials, rawVal);
  return escaped;
}
/**
 * Parse the 9th column (attributes) of a GFF3 feature line.
 *
 * The column is split on `;` into `tag=value` pairs. Each pair is split at
 * its FIRST `=` only, so a value that itself contains `=` is kept intact.
 * The value is then split on `,`; every piece is trimmed and unescaped.
 * Pairs without `=` or with an empty value are ignored, and a tag that occurs
 * more than once accumulates all of its values in order of appearance.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes: an object mapping each tag to an array of values
 */
function parseAttributes(attrString) {
  if (!attrString?.length || attrString === '.') {
    return {};
  }
  const attrs = {};
  const hasOwn = Object.prototype.hasOwnProperty;
  const pairs = attrString.replace(/\r\n|[\r\n]$/, '').split(';');
  for (const pair of pairs) {
    // Locate the first '=' by hand: `split('=', 2)` would silently drop
    // everything after a second '='.
    const eqIndex = pair.indexOf('=');
    if (eqIndex === -1) {
      continue;
    }
    const rawValue = pair.slice(eqIndex + 1);
    if (!rawValue.length) {
      continue;
    }
    const tag = pair.slice(0, eqIndex).trim();
    // Only consult own properties, so tags such as "constructor" or
    // "toString" are not confused with members inherited from Object.prototype.
    let values = hasOwn.call(attrs, tag) ? attrs[tag] : undefined;
    if (!values) {
      values = [];
      if (tag === '__proto__') {
        // Plain assignment to "__proto__" would replace the object's prototype
        // instead of creating a property, so define it explicitly.
        Object.defineProperty(attrs, tag, {
          value: values,
          writable: true,
          enumerable: true,
          configurable: true,
        });
      } else {
        attrs[tag] = values;
      }
    }
    for (const piece of rawValue.split(',')) {
      values.push(unescape(piece.trim()));
    }
  }
  return attrs;
}
/**
 * Parse a GFF3 feature line.
 *
 * The line is trimmed and split on tabs into the nine GFF3 columns. A column
 * that is `.`, empty, or missing altogether (the line has fewer than nine
 * columns) is reported as `null`, so numeric fields never become `NaN` and
 * text fields never become `undefined` merely because the line is short.
 * Columns beyond the ninth are ignored.
 *
 * @param line - GFF3 feature line
 * @returns The parsed feature, with `start`/`end` parsed as base-10 integers,
 * `score` parsed as a float, and `attributes` parsed by `parseAttributes`
 */
function parseFeature(line) {
  const COLUMN_COUNT = 9;
  const rawColumns = line.trim().split('\t');
  // Normalise to exactly nine entries, mapping '.', '' and absent columns to null.
  const f = Array.from({ length: COLUMN_COUNT }, (_unused, i) => {
    const column = rawColumns[i];
    return column === undefined || column === '.' || column === ''
      ? null
      : column;
  });
  // unescape only the ref, source, and type columns
  return {
    seq_id: f[0] === null ? null : unescape(f[0]),
    source: f[1] === null ? null : unescape(f[1]),
    type: f[2] === null ? null : unescape(f[2]),
    start: f[3] === null ? null : parseInt(f[3], 10),
    end: f[4] === null ? null : parseInt(f[4], 10),
    score: f[5] === null ? null : parseFloat(f[5]),
    strand: f[6],
    phase: f[7],
    attributes: f[8] === null ? null : parseAttributes(f[8]),
  };
}
/**
 * Parse a GFF3 directive line.
 *
 * A directive line is optional whitespace, `##`, a name made of
 * non-whitespace characters, and optional contents after it. Lines that do
 * not have this shape produce `null`. `sequence-region` and `genome-build`
 * directives get their contents broken out into extra fields.
 *
 * @param line - GFF3 directive line
 * @returns The parsed directive, or `null` if the line is not a directive
 */
function parseDirective(line) {
  const found = /^\s*##\s*(\S+)\s*(.*)/.exec(line);
  if (found === null) {
    return null;
  }
  const directiveName = found[1];
  let body = found[2];
  const result = { directive: directiveName };
  if (body.length > 0) {
    body = body.replace(/\r\n|[\r\n]$/, '');
    result.value = body;
  }
  // do a little additional parsing for sequence-region and genome-build directives
  switch (directiveName) {
    case 'sequence-region': {
      const [seqId, startText, endText] = body.split(/\s+/, 3);
      return {
        ...result,
        seq_id: seqId,
        // keep only the digits of the coordinates (they stay strings)
        start: startText?.replaceAll(/\D/g, ''),
        end: endText?.replaceAll(/\D/g, ''),
      };
    }
    case 'genome-build': {
      const pieces = body.split(/\s+/, 2);
      return {
        ...result,
        source: pieces[0],
        buildName: pieces[1],
      };
    }
    default:
      return result;
  }
}
/**
 * Format an attributes object into a string suitable for the 9th column of GFF3.
 *
 * Each entry becomes `tag=value1,value2,...` with the tag and every value
 * escaped; entries are joined with `;`. Entries whose value is `null` or
 * `undefined` are skipped rather than causing a `TypeError`, and a value that
 * is not an array is treated as a single-element list.
 *
 * @param attrs - Attributes
 * @returns GFF3 9th column string, or `.` when there is nothing to output
 */
function formatAttributes(attrs) {
  const pairs = [];
  for (const [tag, val] of Object.entries(attrs)) {
    if (val === null || val === undefined) {
      continue;
    }
    const values = Array.isArray(val) ? val : [val];
    // Wrap the call so only the value (not map's index/array) is passed on.
    const valueString = values.map((v) => escape(v)).join(',');
    pairs.push(`${escape(tag)}=${valueString}`);
  }
  return pairs.length ? pairs.join(';') : '.';
}
/**
 * Format one feature object as a single tab-separated GFF3 line.
 *
 * Any of the eight leading fields, or the attributes, that is `null` or
 * `undefined` is written as `.`; a missing field is never written as the
 * literal text "undefined". `seenFeature` records every line already
 * produced, and an exact repeat yields the empty string so the same line is
 * not output twice.
 *
 * @param f - Feature object
 * @param seenFeature - Object used as a set of lines already emitted; it is
 * mutated by this function
 * @returns The formatted line including its trailing newline, or `''` if an
 * identical line was already emitted
 */
function _formatSingleFeature(f, seenFeature) {
  const formatColumn = (value) =>
    value === null || value === undefined ? '.' : escapeColumn(value);
  const attrString =
    f.attributes === null || f.attributes === undefined
      ? '.'
      : formatAttributes(f.attributes);
  const fields = [
    formatColumn(f.seq_id),
    formatColumn(f.source),
    formatColumn(f.type),
    formatColumn(f.start),
    formatColumn(f.end),
    formatColumn(f.score),
    formatColumn(f.strand),
    formatColumn(f.phase),
    attrString,
  ];
  const formattedString = `${fields.join('\t')}\n`;
  // if we have already output this exact feature, skip it
  if (seenFeature[formattedString]) {
    return '';
  }
  seenFeature[formattedString] = true;
  return formattedString;
}
/**
 * Recursively format a feature, or an array of features, into GFF3 lines.
 *
 * For a single feature its own line comes first; if the feature carries both
 * `child_features` and `derived_features`, the lines of the children follow
 * and then those of the derived features.
 *
 * @param feature - A feature object or a (possibly nested) array of them
 * @param seenFeature - Shared record of lines already emitted
 * @returns The concatenated GFF3 lines
 */
function _formatFeature(feature, seenFeature) {
  const recurse = (item) => _formatFeature(item, seenFeature);
  if (Array.isArray(feature)) {
    return feature.map(recurse).join('');
  }
  const ownLine = _formatSingleFeature(feature, seenFeature);
  if (!_isFeatureLineWithRefs(feature)) {
    return [ownLine].join('');
  }
  const childLines = feature.child_features.map(recurse);
  const derivedLines = feature.derived_features.map(recurse);
  return [ownLine].concat(childLines, derivedLines).join('');
}
/**
 * Format a feature object or array of feature objects into one or more lines
 * of GFF3. Every call starts with a fresh record of emitted lines, so
 * duplicate suppression applies within one call only.
 *
 * @param featureOrFeatures - A feature object or array of feature objects
 * @returns A string of one or more GFF3 lines
 */
function formatFeature(featureOrFeatures) {
  return _formatFeature(featureOrFeatures, {});
}
/**
 * Format a directive into a line of GFF3.
 *
 * The line is `##` followed by the directive name and, when the directive has
 * a truthy `value`, a space and that value.
 *
 * @param directive - A directive object
 * @returns A directive line string, terminated by a newline
 */
function formatDirective(directive) {
  const head = `##${directive.directive}`;
  const tail = directive.value ? ` ${directive.value}` : '';
  return `${head}${tail}\n`;
}
/**
 * Format a comment into a GFF3 comment line: `# `, the comment text, and a
 * trailing newline.
 *
 * @param comment - A comment object
 * @returns A comment line string
 */
function formatComment(comment) {
  const { comment: text } = comment;
  return `# ${text}\n`;
}
/**
 * Format a sequence object as FASTA.
 *
 * The header line is `>` plus the id, followed by a space and the description
 * when one is present. The sequence is wrapped into lines of at most 80
 * characters.
 *
 * @param seq - A sequence object
 * @returns Formatted single FASTA sequence string
 */
function formatSequence(seq) {
  const descriptionPart = seq.description ? ` ${seq.description}` : '';
  const header = `>${seq.id}${descriptionPart}\n`;
  // embedded FASTA is wrapped at 80 characters per line
  const width = 80;
  const residues = seq.sequence;
  const wrapped = [];
  for (let offset = 0; offset < residues.length; offset += width) {
    wrapped.push(residues.slice(offset, offset + width));
  }
  return `${header}${wrapped.join('\n')}\n`;
}
/**
 * Format one item by dispatching on which marker property it has.
 *
 * The properties are checked in the order `attributes` (feature),
 * `directive`, `sequence`, `comment`; the first one present decides the
 * formatter. An item with none of them yields a placeholder comment line.
 *
 * @param item - A feature, directive, sequence or comment object
 * @returns The formatted GFF3 text for the item
 */
function formatSingleItem(item) {
  const markerKeys = ['attributes', 'directive', 'sequence', 'comment'];
  const kind = markerKeys.find((key) => key in item);
  switch (kind) {
    case 'attributes':
      return formatFeature(item);
    case 'directive':
      return formatDirective(item);
    case 'sequence':
      return formatSequence(item);
    case 'comment':
      return formatComment(item);
    default:
      return '# (invalid item found during format)\n';
  }
}
/**
 * Format an item, or an array of items, as GFF3 text.
 *
 * @param itemOrItems - A single item or an array of items
 * @returns A string for a single item, or an array with one formatted string
 * per element when given an array
 */
function formatItem(itemOrItems) {
  return Array.isArray(itemOrItems)
    ? itemOrItems.map(formatSingleItem)
    : formatSingleItem(itemOrItems);
}
/**
 * Tell whether a feature line has both of its reference lists.
 *
 * @param featureLine - Feature object to inspect
 * @returns `true` only when neither `child_features` nor `derived_features`
 * is `undefined`
 */
function _isFeatureLineWithRefs(featureLine) {
  const lacksRefs =
    featureLine.child_features === undefined ||
    featureLine.derived_features === undefined;
  return !lacksRefs;
}
//# sourceMappingURL=util.js.map