UNPKG

@gmod/gff

Version:

read and write GFF3 data as streams

264 lines 8.88 kB
"use strict"; // Fast, low-level functions for parsing and formatting GFF3. // JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module. Object.defineProperty(exports, "__esModule", { value: true }); exports.formatItem = exports.formatSequence = exports.formatComment = exports.formatDirective = exports.formatFeature = exports.formatAttributes = exports.parseDirective = exports.parseFeature = exports.parseAttributes = exports.escapeColumn = exports.escape = exports.unescape = void 0; /** * Unescape a string value used in a GFF3 attribute. * * @param stringVal - Escaped GFF3 string value * @returns An unescaped string value */ function unescape(stringVal) { return stringVal.replace(/%([0-9A-Fa-f]{2})/g, (_match, seq) => String.fromCharCode(parseInt(seq, 16))); } exports.unescape = unescape; function _escape(regex, s) { return String(s).replace(regex, (ch) => { const hex = ch.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0'); return `%${hex}`; }); } /** * Escape a value for use in a GFF3 attribute value. * * @param rawVal - Raw GFF3 attribute value * @returns An escaped string value */ function escape(rawVal) { return _escape(/[\n;\r\t=%&,\x00-\x1f\x7f-\xff]/g, rawVal); } exports.escape = escape; /** * Escape a value for use in a GFF3 column value. * * @param rawVal - Raw GFF3 column value * @returns An escaped column value */ function escapeColumn(rawVal) { return _escape(/[\n\r\t%\x00-\x1f\x7f-\xff]/g, rawVal); } exports.escapeColumn = escapeColumn; /** * Parse the 9th column (attributes) of a GFF3 feature line. * * @param attrString - String of GFF3 9th column * @returns Parsed attributes */ function parseAttributes(attrString) { if (!(attrString && attrString.length) || attrString === '.') return {}; const attrs = {}; attrString .replace(/\r?\n$/, '') .split(';') .forEach((a) => { const nv = a.split('=', 2); if (!(nv[1] && nv[1].length)) return; nv[0] = nv[0].trim(); let arec = attrs[nv[0].trim()]; if (!arec) { arec = []; attrs[nv[0]] = arec; } arec.push(...nv[1] .split(',') .map((s) => s.trim()) .map(unescape)); }); return attrs; } exports.parseAttributes = parseAttributes; /** * Parse a GFF3 feature line * * @param line - GFF3 feature line * @returns The parsed feature */ function parseFeature(line) { // split the line into columns and replace '.' with null in each column const f = line.split('\t').map((a) => (a === '.' || a === '' ? null : a)); // unescape only the ref, source, and type columns const parsed = { seq_id: f[0] && unescape(f[0]), source: f[1] && unescape(f[1]), type: f[2] && unescape(f[2]), start: f[3] === null ? null : parseInt(f[3], 10), end: f[4] === null ? null : parseInt(f[4], 10), score: f[5] === null ? null : parseFloat(f[5]), strand: f[6], phase: f[7], attributes: f[8] === null ? null : parseAttributes(f[8]), }; return parsed; } exports.parseFeature = parseFeature; /** * Parse a GFF3 directive line. * * @param line - GFF3 directive line * @returns The parsed directive */ function parseDirective(line) { const match = /^\s*##\s*(\S+)\s*(.*)/.exec(line); if (!match) return null; const [, name] = match; let [, , contents] = match; const parsed = { directive: name }; if (contents.length) { contents = contents.replace(/\r?\n$/, ''); parsed.value = contents; } // do a little additional parsing for sequence-region and genome-build directives if (name === 'sequence-region') { const c = contents.split(/\s+/, 3); return Object.assign(Object.assign({}, parsed), { seq_id: c[0], start: c[1] && c[1].replace(/\D/g, ''), end: c[2] && c[2].replace(/\D/g, '') }); } else if (name === 'genome-build') { const [source, buildName] = contents.split(/\s+/, 2); return Object.assign(Object.assign({}, parsed), { source, buildName }); } return parsed; } exports.parseDirective = parseDirective; /** * Format an attributes object into a string suitable for the 9th column of GFF3. * * @param attrs - Attributes * @returns GFF3 9th column string */ function formatAttributes(attrs) { const attrOrder = []; Object.entries(attrs).forEach(([tag, val]) => { if (!val) return; let valstring; if (val.hasOwnProperty('toString')) { valstring = escape(val.toString()); // } else if (Array.isArray(val.values)) { // valstring = val.values.map(escape).join(',') } else if (Array.isArray(val)) { valstring = val.map(escape).join(','); } else { valstring = escape(val); } attrOrder.push(`${escape(tag)}=${valstring}`); }); return attrOrder.length ? attrOrder.join(';') : '.'; } exports.formatAttributes = formatAttributes; function _formatSingleFeature(f, seenFeature) { const attrString = f.attributes === null || f.attributes === undefined ? '.' : formatAttributes(f.attributes); const fields = [ f.seq_id === null ? '.' : escapeColumn(f.seq_id), f.source === null ? '.' : escapeColumn(f.source), f.type === null ? '.' : escapeColumn(f.type), f.start === null ? '.' : escapeColumn(f.start), f.end === null ? '.' : escapeColumn(f.end), f.score === null ? '.' : escapeColumn(f.score), f.strand === null ? '.' : escapeColumn(f.strand), f.phase === null ? '.' : escapeColumn(f.phase), attrString, ]; const formattedString = `${fields.join('\t')}\n`; // if we have already output this exact feature, skip it if (seenFeature[formattedString]) { return ''; } seenFeature[formattedString] = true; return formattedString; } function _formatFeature(feature, seenFeature) { if (Array.isArray(feature)) { return feature.map((f) => _formatFeature(f, seenFeature)).join(''); } const strings = [_formatSingleFeature(feature, seenFeature)]; if (_isFeatureLineWithRefs(feature)) { strings.push(...feature.child_features.map((f) => _formatFeature(f, seenFeature)), ...feature.derived_features.map((f) => _formatFeature(f, seenFeature))); } return strings.join(''); } /** * Format a feature object or array of feature objects into one or more lines of * GFF3. * * @param featureOrFeatures - A feature object or array of feature objects * @returns A string of one or more GFF3 lines */ function formatFeature(featureOrFeatures) { const seen = {}; return _formatFeature(featureOrFeatures, seen); } exports.formatFeature = formatFeature; /** * Format a directive into a line of GFF3. * * @param directive - A directive object * @returns A directive line string */ function formatDirective(directive) { let str = `##${directive.directive}`; if (directive.value) str += ` ${directive.value}`; str += '\n'; return str; } exports.formatDirective = formatDirective; /** * Format a comment into a GFF3 comment. * Yes I know this is just adding a # and a newline. * * @param comment - A comment object * @returns A comment line string */ function formatComment(comment) { return `# ${comment.comment}\n`; } exports.formatComment = formatComment; /** * Format a sequence object as FASTA * * @param seq - A sequence object * @returns Formatted single FASTA sequence string */ function formatSequence(seq) { return `>${seq.id}${seq.description ? ` ${seq.description}` : ''}\n${seq.sequence}\n`; } exports.formatSequence = formatSequence; /** * Format a directive, comment, sequence, or feature, or array of such items, * into one or more lines of GFF3. * * @param itemOrItems - A comment, sequence, or feature, or array of such items * @returns A formatted string or array of strings */ function formatItem(itemOrItems) { function formatSingleItem(item) { if ('attributes' in item) return formatFeature(item); if ('directive' in item) return formatDirective(item); if ('sequence' in item) return formatSequence(item); if ('comment' in item) return formatComment(item); return '# (invalid item found during format)\n'; } if (Array.isArray(itemOrItems)) { return itemOrItems.map(formatSingleItem); } return formatSingleItem(itemOrItems); } exports.formatItem = formatItem; function _isFeatureLineWithRefs(featureLine) { return (featureLine.child_features !== undefined && featureLine.derived_features !== undefined); } //# sourceMappingURL=util.js.map