UNPKG

@gmod/gff

Version:

read and write GFF3 data as streams

270 lines 8.3 kB
"use strict";
// Fast, low-level functions for parsing and formatting GFF3.
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
Object.defineProperty(exports, "__esModule", { value: true });
exports.unescape = unescape;
exports.escape = escape;
exports.escapeColumn = escapeColumn;
exports.parseAttributes = parseAttributes;
exports.parseFeature = parseFeature;
exports.parseDirective = parseDirective;
exports.formatAttributes = formatAttributes;
exports.formatFeature = formatFeature;
exports.formatDirective = formatDirective;
exports.formatComment = formatComment;
exports.formatSequence = formatSequence;
exports.formatItem = formatItem;
// Matches a single line terminator (\r\n, \r or \n) at the very end of a
// string. The group matters: without it the `$` anchor would apply only to the
// last alternative and a "\r\n" in the middle of the string would be removed.
const TRAILING_LINE_END = /(?:\r\n|[\r\n])$/;
/**
 * Unescape a string value used in a GFF3 attribute.
 *
 * Well-formed input is decoded with `decodeURIComponent`. If the input holds a
 * malformed escape (for example a bare `%`), no `URIError` is thrown: every
 * run of `%XX` escapes that decodes cleanly is decoded, and everything else is
 * returned unchanged.
 *
 * @param stringVal - Escaped GFF3 string value
 * @returns An unescaped string value
 */
function unescape(stringVal) {
    try {
        return decodeURIComponent(stringVal);
    }
    catch (error) {
        if (!(error instanceof URIError)) {
            throw error;
        }
        return String(stringVal).replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
            try {
                return decodeURIComponent(run);
            }
            catch {
                // The run is not valid UTF-8; keep the original text.
                return run;
            }
        });
    }
}
/**
 * Percent-encode every character of `s` matched by `regex`.
 *
 * @param regex - Global regular expression selecting the characters to encode
 * @param s - Value to escape; it is converted to a string first
 * @returns The escaped string
 */
function _escape(regex, s) {
    return String(s).replaceAll(regex, (ch) => {
        return encodeURIComponent(ch).toUpperCase();
    });
}
/**
 * Escape a value for use in a GFF3 attribute value.
 *
 * @param rawVal - Raw GFF3 attribute value
 * @returns An escaped string value
 */
function escape(rawVal) {
    return _escape(/[\n;\r\t=%&,\u0000-\u001f\u007f]/g, rawVal);
}
/**
 * Escape a value for use in a GFF3 column value.
 *
 * @param rawVal - Raw GFF3 column value
 * @returns An escaped column value
 */
function escapeColumn(rawVal) {
    return _escape(/[\n\r\t%\u0000-\u001f\u007f]/g, rawVal);
}
/**
 * Parse the 9th column (attributes) of a GFF3 feature line.
 *
 * Each `tag=value[,value...]` pair is split on its FIRST `=` only, so a value
 * that contains an unescaped `=` is kept whole. Pairs without a value are
 * skipped. Values of a repeated tag are appended in order of appearance.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes, as a map from tag to an array of unescaped values
 */
function parseAttributes(attrString) {
    if (!attrString?.length || attrString === '.') {
        return {};
    }
    const attrs = {};
    const pairs = attrString.replace(TRAILING_LINE_END, '').split(';');
    for (const pair of pairs) {
        const eq = pair.indexOf('=');
        if (eq === -1) {
            continue;
        }
        const rawValues = pair.slice(eq + 1);
        if (!rawValues.length) {
            continue;
        }
        const tag = pair.slice(0, eq).trim();
        // Look at own properties only, so tags such as "constructor" or
        // "toString" do not pick up members inherited from Object.prototype.
        let values = Object.prototype.hasOwnProperty.call(attrs, tag)
            ? attrs[tag]
            : undefined;
        if (!values) {
            values = [];
            // defineProperty (not assignment) so that a tag named "__proto__"
            // becomes a plain own property rather than replacing the prototype.
            Object.defineProperty(attrs, tag, {
                value: values,
                enumerable: true,
                writable: true,
                configurable: true,
            });
        }
        values.push(...rawValues.split(',').map((s) => unescape(s.trim())));
    }
    return attrs;
}
/**
 * Parse a GFF3 feature line
 *
 * Columns equal to `.` or the empty string become `null`. Only the seq_id,
 * source and type columns are unescaped; start and end are parsed as base-10
 * integers and score as a float.
 *
 * @param line - GFF3 feature line
 * @returns The parsed feature
 */
function parseFeature(line) {
    // split the line into columns and replace '.' with null in each column
    const [seqId, source, type, start, end, score, strand, phase, attributes] = line
        .trim()
        .split('\t')
        .map((column) => (column === '.' || column === '' ? null : column));
    // unescape only the ref, source, and type columns
    return {
        seq_id: seqId && unescape(seqId),
        source: source && unescape(source),
        type: type && unescape(type),
        start: start === null ? null : parseInt(start, 10),
        end: end === null ? null : parseInt(end, 10),
        score: score === null ? null : parseFloat(score),
        strand,
        phase,
        attributes: attributes === null ? null : parseAttributes(attributes),
    };
}
/**
 * Parse a GFF3 directive line.
 *
 * @param line - GFF3 directive line
 * @returns The parsed directive, or `null` if the line is not a directive
 */
function parseDirective(line) {
    const match = /^\s*##\s*(\S+)\s*(.*)/.exec(line);
    if (!match) {
        return null;
    }
    const name = match[1];
    let contents = match[2];
    const parsed = { directive: name };
    if (contents.length) {
        contents = contents.replace(TRAILING_LINE_END, '');
        parsed.value = contents;
    }
    // do a little additional parsing for sequence-region and genome-build directives
    if (name === 'sequence-region') {
        const [seqId, start, end] = contents.split(/\s+/, 3);
        return {
            ...parsed,
            seq_id: seqId,
            // keep only the digits of the coordinates
            start: start?.replaceAll(/\D/g, ''),
            end: end?.replaceAll(/\D/g, ''),
        };
    }
    if (name === 'genome-build') {
        const [source, buildName] = contents.split(/\s+/, 2);
        return {
            ...parsed,
            source,
            buildName,
        };
    }
    return parsed;
}
/**
 * Format an attributes object into a string suitable for the 9th column of GFF3.
 *
 * Tags whose value is `null` or `undefined` are skipped, and a single
 * non-array value is treated as a one-element list.
 *
 * @param attrs - Attributes
 * @returns GFF3 9th column string, or `.` when there is nothing to write
 */
function formatAttributes(attrs) {
    const pairs = [];
    for (const [tag, val] of Object.entries(attrs)) {
        if (val === null || val === undefined) {
            continue;
        }
        const values = Array.isArray(val) ? val : [val];
        const valstring = values.map((v) => escape(v)).join(',');
        pairs.push(`${escape(tag)}=${valstring}`);
    }
    return pairs.length ? pairs.join(';') : '.';
}
/**
 * Format one column of a feature line.
 *
 * @param value - Column value
 * @returns `.` for a missing (`null` or `undefined`) value, else the escaped value
 */
function _formatColumn(value) {
    return value === null || value === undefined ? '.' : escapeColumn(value);
}
/**
 * Format a single feature line, without its child or derived features.
 *
 * @param f - Feature line object
 * @param seenFeature - Set of lines already emitted; updated in place
 * @returns The formatted line, or '' if this exact line was already emitted
 */
function _formatSingleFeature(f, seenFeature) {
    const attrString = f.attributes === null || f.attributes === undefined
        ? '.'
        : formatAttributes(f.attributes);
    const fields = [
        _formatColumn(f.seq_id),
        _formatColumn(f.source),
        _formatColumn(f.type),
        _formatColumn(f.start),
        _formatColumn(f.end),
        _formatColumn(f.score),
        _formatColumn(f.strand),
        _formatColumn(f.phase),
        attrString,
    ];
    const formattedString = `${fields.join('\t')}\n`;
    // if we have already output this exact feature, skip it
    if (seenFeature.has(formattedString)) {
        return '';
    }
    seenFeature.add(formattedString);
    return formattedString;
}
/**
 * Recursively format a feature line, or an array of them, followed by its
 * child features and then its derived features.
 *
 * @param feature - Feature line object or (possibly nested) array of them
 * @param seenFeature - Set of lines already emitted; updated in place
 * @returns Zero or more GFF3 lines joined into one string
 */
function _formatFeature(feature, seenFeature) {
    if (Array.isArray(feature)) {
        return feature.map((f) => _formatFeature(f, seenFeature)).join('');
    }
    const strings = [_formatSingleFeature(feature, seenFeature)];
    if (_isFeatureLineWithRefs(feature)) {
        strings.push(...feature.child_features.map((f) => _formatFeature(f, seenFeature)), ...feature.derived_features.map((f) => _formatFeature(f, seenFeature)));
    }
    return strings.join('');
}
/**
 * Format a feature object or array of feature objects into one or more lines of
 * GFF3. Lines that are exact duplicates within one call are written only once.
 *
 * @param featureOrFeatures - A feature object or array of feature objects
 * @returns A string of one or more GFF3 lines
 */
function formatFeature(featureOrFeatures) {
    return _formatFeature(featureOrFeatures, new Set());
}
/**
 * Format a directive into a line of GFF3.
 *
 * @param directive - A directive object
 * @returns A directive line string
 */
function formatDirective(directive) {
    let str = `##${directive.directive}`;
    if (directive.value) {
        str += ` ${directive.value}`;
    }
    str += '\n';
    return str;
}
/**
 * Format a comment into a GFF3 comment.
 * Yes I know this is just adding a # and a newline.
 *
 * @param comment - A comment object
 * @returns A comment line string
 */
function formatComment(comment) {
    return `# ${comment.comment}\n`;
}
/**
 * Format a sequence object as FASTA
 *
 * @param seq - A sequence object
 * @returns Formatted single FASTA sequence string
 */
function formatSequence(seq) {
    const header = `>${seq.id}${seq.description ? ` ${seq.description}` : ''}\n`;
    // split sequence chunks into lines of length 80 for embedded FASTA
    const lineLength = 80;
    const numChunks = Math.ceil(seq.sequence.length / lineLength);
    const chunks = new Array(numChunks);
    for (let i = 0; i < numChunks; i += 1) {
        const start = i * lineLength;
        chunks[i] = seq.sequence.slice(start, start + lineLength);
    }
    return `${header}${chunks.join('\n')}\n`;
}
/**
 * Format one item, choosing the formatter by which key the item carries
 * (`attributes`, `directive`, `sequence` or `comment`, checked in that order).
 *
 * @param item - A feature line, directive, sequence or comment object
 * @returns The formatted text, or a placeholder comment for anything else
 */
function formatSingleItem(item) {
    // The `in` operator throws on null and on primitives, so guard first and
    // report them through the same placeholder as any other unrecognised item.
    if (item !== null && typeof item === 'object') {
        if ('attributes' in item) {
            return formatFeature(item);
        }
        if ('directive' in item) {
            return formatDirective(item);
        }
        if ('sequence' in item) {
            return formatSequence(item);
        }
        if ('comment' in item) {
            return formatComment(item);
        }
    }
    return '# (invalid item found during format)\n';
}
/**
 * Format a single item or an array of items.
 *
 * @param itemOrItems - An item, or an array of items
 * @returns A string for a single item, or an array of strings (one per
 * element) when given an array
 */
function formatItem(itemOrItems) {
    if (Array.isArray(itemOrItems)) {
        return itemOrItems.map((item) => formatSingleItem(item));
    }
    return formatSingleItem(itemOrItems);
}
/**
 * Tell whether a feature line carries both reference arrays.
 *
 * @param featureLine - Feature line object
 * @returns true when both `child_features` and `derived_features` are defined
 */
function _isFeatureLineWithRefs(featureLine) {
    return (featureLine.child_features !== undefined &&
        featureLine.derived_features !== undefined);
}
//# sourceMappingURL=util.js.map