UNPKG

phylojs

Version:

A simple TypeScript library for phylogenetic trees

307 lines (306 loc) 12.9 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseNewickAnnotations = exports.parseNHXAnnotations = exports.parseBeastAnnotations = exports.parseHybridLabels = exports.readTreesFromNewick = exports.readNewick = void 0;
const __1 = require("../../");
const error_1 = require("../../utils/error");

// Characters accepted while scanning a branch length after ':'.
const BRANCH_LENGTH_CHAR = /[0-9eE+.-]/;

/**
 * Parse a string in the New Hampshire (Newick) format and return a tree object.
 *
 * This function reads a Newick string from left to right. It is based on
 * the `kn_parse` function by Heng Li for jstreeview (https://github.com/lh3/jstreeview/blob/main/knhx.js),
 * modified for compatibility with our Tree object and to prevent ';' assignment as root label.
 *
 * Supports node annotations in both BEAST [&...] and NHX [&&NHX:...] formats through the annotationParser parameter.
 * Users can provide their own annotation parser function to handle custom formats.
 * Also handles hybrid nodes marked with # notation following Cardona et al. 2008.
 *
 * Leading and trailing whitespace is ignored. If the remaining string contains multiple
 * trees (separated by newlines), only the first tree is parsed and a warning is issued.
 * Use readTreesFromNewick() to parse multiple trees.
 *
 * Node IDs are unique and allocated in order of parsing. Specifically, leaf node IDs are numbered
 * according to the order in which they are encountered. Where a close parenthesis ')' is
 * encountered, an internal node is added with an incremented ID. This means that leaf node IDs
 * are not guaranteed to be contiguous. For example, for `(A,B);`, leaf A is created first,
 * leaf B second, and the root node last. In general the root will have the highest index.
 *
 * To renumber nodes, one could use `.preorderTraversal()` or `.postorderTraversal()` methods.
 *
 * @param {string} newick - The string in Newick format to parse
 * @param {function(string): Object} [annotationParser=parseNewickAnnotations] - Called with the text
 *   between '[' and ']' of each annotation; its return value is stored on the node
 * @returns {Tree} - The constructed phylogenetic tree
 * @throws {Error} On an unmatched ')' or an unclosed '[' annotation bracket
 */
function readNewick(newick, annotationParser = parseNewickAnnotations) {
    const stack = []; // Indices of pending nodes; -1 marks an opening parenthesis
    const nodes = []; // Every node created so far, in creation order

    // Trim first so that a trailing (or leading) newline around a single tree
    // is not mistaken for a multi-tree input.
    let text = newick.trim();
    const newlineIndex = text.indexOf('\n');
    if (newlineIndex !== -1) {
        // Trim again so a '\r' left over from CRLF line endings does not end up
        // behind the terminating ';' and leak into the root label.
        text = text.slice(0, newlineIndex).trim();
        console.warn('Multiple trees in Newick string. Only reading the first tree. Use readTreesFromNewick() to read all trees.');
    }

    let position = 0;
    while (position < text.length) {
        // Skip every character outside the printable ASCII range '!'..'~'
        // (spaces, control characters, DEL). Note that this also skips
        // non-ASCII characters that appear at the start of a token.
        while (position < text.length &&
            (text.charAt(position) < '!' || text.charAt(position) > '~')) {
            position++;
        }
        if (position === text.length) {
            break;
        }

        const currentChar = text.charAt(position);
        if (currentChar === ',') {
            // Comma separates nodes at the same level
            position++;
        }
        else if (currentChar === '(') {
            // Opening parenthesis starts a new set of sister nodes
            stack.push(-1);
            position++;
        }
        else if (currentChar === ')') {
            // Closing parenthesis: everything on the stack above the most
            // recent marker is a child of the internal node created here.
            const newNodeIndex = nodes.length;
            const markerIndex = stack.lastIndexOf(-1);
            if (markerIndex < 0) {
                throw new Error('Unmatched closing parenthesis in Newick string');
            }
            // Create the parent, parsing its label/branch length/annotation
            position = kn_add_node(text, position + 1, nodes, newNodeIndex, annotationParser);
            const parent = nodes[newNodeIndex];
            stack.slice(markerIndex + 1).forEach((nodeIndex, childIndex) => {
                parent.children[childIndex] = nodes[nodeIndex];
                nodes[nodeIndex].parent = parent;
            });
            // Drop the children and their marker, then leave the parent pending
            stack.length = markerIndex;
            stack.push(newNodeIndex);
        }
        else {
            // Leaf. Its parent is attached when the matching ')' is reached.
            stack.push(nodes.length);
            position = kn_add_node(text, position, nodes, nodes.length, annotationParser);
        }
    }

    if (stack.length > 1) {
        console.warn('Multiple unconnected trees found in Newick string');
    }
    // The last node created is used as the root
    return new __1.Tree(nodes[nodes.length - 1]);
}
exports.readNewick = readNewick;

/**
 * Reads several Newick trees from one string and returns them as an array.
 *
 * The input is split wherever a ';' is followed (after optional whitespace) by a
 * newline; each non-empty piece is parsed with readNewick(). Trees that raise a
 * SkipTreeException are logged and skipped; any other error is rethrown.
 *
 * @param {string} newick - One or more Newick trees, one per line
 * @param {function(string): Object} [annotationParser=parseNewickAnnotations] - Annotation
 *   parser forwarded to readNewick()
 * @returns {Tree[]} The successfully parsed trees, in input order
 */
function readTreesFromNewick(newick, annotationParser = parseNewickAnnotations) {
    const trees = [];
    for (const chunk of newick.split(/;\s*\n/)) {
        const treeString = chunk.trim();
        if (treeString.length === 0) {
            continue;
        }
        try {
            trees.push(readNewick(treeString, annotationParser));
        }
        catch (e) {
            if (e instanceof error_1.SkipTreeException) {
                console.log(`Skipping Newick tree: ${e.message}`);
            }
            else {
                throw e;
            }
        }
    }
    return trees;
}
exports.readTreesFromNewick = readTreesFromNewick;

/**
 * Parses a node from a Newick string and adds it to the nodes array.
 *
 * Starting at `position`, this scans up to (but not including) the next ',' or ')'
 * outside an annotation, or the end of the string. It extracts the label, an optional
 * `[...]` annotation and an optional `:length` branch length, stores them on a new
 * Node and pushes that node onto `nodes`.
 *
 * @param {string} str - The Newick format string being parsed
 * @param {number} position - The starting position in the string to parse from
 * @param {Node[]} nodes - The array where all created nodes are stored
 * @param {number} newNodeIndex - The index to assign to the new node
 * @param {function(string): Object} annotationParser - Called with the text between '[' and ']'
 * @returns {number} - The position in the string where parsing for this node ended
 * @throws {Error} If an annotation bracket is opened but never closed
 */
function kn_add_node(str, position, nodes, newNodeIndex, annotationParser) {
    const labelStart = position;
    // Index one past the label; -1 until the first '[' or ':' is seen. A
    // dedicated sentinel is needed because 0 is a legitimate index when the
    // node starts at the very beginning of the string.
    let labelEnd = -1;
    const node = new __1.Node(newNodeIndex);
    let i;
    for (i = position; i < str.length && str.charAt(i) !== ',' && str.charAt(i) !== ')'; ++i) {
        const c = str.charAt(i);
        if (c === '[') {
            // Annotation: consume through the matching ']'
            const annotationStart = i;
            if (labelEnd === -1) {
                labelEnd = i;
            }
            do {
                ++i;
            } while (i < str.length && str.charAt(i) !== ']');
            if (i === str.length) {
                throw new Error('Unclosed annotation bracket in Newick string');
            }
            node.annotation = annotationParser(str.slice(annotationStart + 1, i));
        }
        else if (c === ':') {
            // Branch length: consume the run of numeric characters after ':'
            if (labelEnd === -1) {
                labelEnd = i;
            }
            const lengthStart = ++i;
            while (i < str.length && BRANCH_LENGTH_CHAR.test(str.charAt(i))) {
                ++i;
            }
            node.branchLength = parseFloat(str.slice(lengthStart, i));
            --i; // the loop's ++i re-examines the character that ended the number
        }
        // Any other character, including whitespace, belongs to the label.
    }
    if (labelEnd === -1) {
        labelEnd = i;
    }
    if (labelEnd > labelStart) {
        const label = str
            .slice(labelStart, labelEnd)
            .replace(/;$/g, '') // the tree terminator is not part of the label
            .replace(/^"|"$/g, '') // remove quotes
            .replace(/^'|'$/g, ''); // remove quotes
        if (label.includes('#')) {
            // Hybrid case
            const parsedLabel = parseHybridLabels(label);
            node.label = parsedLabel.label;
            node.hybridID = parsedLabel.hybridID;
        }
        else {
            node.label = label.length > 0 ? label : undefined;
        }
    }
    nodes.push(node);
    return i;
}

/**
 * Function parses hybrid id labels, which are assumed to contain '#'.
 * Following Cardona et al. 2008, (https://doi.org/10.1186/1471-2105-9-532).
 * Function expects unparsed labels to be of the form [label]#[type]i
 * where '#' and i (the hybrid node ID) are mandatory. PhyloJS ignores the type annotation
 * (H for hybridisation, LGT for lateral gene transfer, R for recombination) and extracts only
 * the label and hybridID, following icyTREE.
 *
 * @param {string} label - The raw node label containing '#'
 * @returns {HybridInformation} Object with `label` (undefined when nothing precedes '#') and `hybridID`
 * @throws {Error} If the label has no '#', or the hybrid ID is missing or not an integer
 */
function parseHybridLabels(label) {
    if (!label.includes('#')) {
        throw new Error('No hash(#), in hybrid label.');
    }
    const splitLabel = label.split('#');
    const parsedLabel = splitLabel[0].length > 0 ? splitLabel[0] : undefined;
    const idText = splitLabel[1].replace(/H|LGT|R/g, ''); // remove hybridisation types
    const hybridID = Number(idText);
    // Number('') is 0, so an absent ID must be rejected explicitly.
    if (idText.trim().length === 0 || !Number.isInteger(hybridID)) {
        throw new Error('Hybrid ID is not an integer!');
    }
    return {
        label: parsedLabel,
        hybridID: hybridID,
    };
}
exports.parseHybridLabels = parseHybridLabels;

/**
 * Splits one `key=value` annotation into its trimmed key and raw value.
 * Only the first '=' separates key from value, so values may contain '='.
 *
 * @param {string} pair - A single annotation entry
 * @returns {[string, string] | undefined} `[key, value]`, or undefined when there is no '='
 */
function splitAnnotationPair(pair) {
    const separatorIndex = pair.indexOf('=');
    if (separatorIndex === -1) {
        return undefined;
    }
    return [pair.slice(0, separatorIndex).trim(), pair.slice(separatorIndex + 1)];
}

/**
 * Parses BEAST-type annotations in format [&...] to object for storage
 * in `Node` object. Annotations in arrays are expected to be stored in braces,
 * and separated by ',' or ':'. For example ...Type={Blue,Red} or ...Type={Blue:Red}
 *
 * @param {string} annotations - The annotation text, with or without the leading '&'
 * @returns {typeof Node.prototype.annotation} - The parsed annotations as key value pairs;
 *   braced values become arrays of strings, all other values stay strings
 */
function parseBeastAnnotations(annotations) {
    // Remove the '&' at the start
    const body = annotations.startsWith('&') ? annotations.slice(1) : annotations;
    const annotation_object = {};
    // Split on all ',' and ':' not in braces '{}'
    for (const pair of body.split(/[,:](?![^{]*\})/g)) {
        const keyValue = splitAnnotationPair(pair);
        if (keyValue === undefined) {
            continue;
        }
        const [key, value] = keyValue;
        // Handling array-like values enclosed in {}
        if (value.includes('{') && value.includes('}')) {
            annotation_object[key] = value.replace(/{|}/g, '').split(/,|:/g);
        }
        else {
            annotation_object[key] = value;
        }
    }
    return annotation_object;
}
exports.parseBeastAnnotations = parseBeastAnnotations;

/**
 * Parses NHX-type annotations in format [&&NHX:...] to object for storage
 * in `Node` object. Entries are separated by ':'. Annotations in arrays are
 * expected to be stored in braces and separated by ','.
 *
 * @param {string} annotations - The annotation text, with or without the leading '&&NHX:'
 * @returns {typeof Node.prototype.annotation} - The parsed annotations as key value pairs;
 *   braced values become arrays of strings, all other values stay strings
 */
function parseNHXAnnotations(annotations) {
    // Remove the '&&NHX:' at the start
    const body = annotations.startsWith('&&NHX:') ? annotations.slice(6) : annotations;
    const annotation_object = {};
    // Split on all ':'
    for (const pair of body.split(/:/g)) {
        const keyValue = splitAnnotationPair(pair);
        if (keyValue === undefined) {
            continue;
        }
        const [key, value] = keyValue;
        // Handling array-like values enclosed in {}
        if (value.includes('{') && value.includes('}')) {
            annotation_object[key] = value.replace(/{|}/g, '').split(/,/g);
        }
        else {
            annotation_object[key] = value;
        }
    }
    return annotation_object;
}
exports.parseNHXAnnotations = parseNHXAnnotations;

/**
 * Default annotation parser that checks for both BEAST and NHX formats.
 * Text starting with '&&NHX:' is handed to parseNHXAnnotations(); everything
 * else is handed to parseBeastAnnotations().
 *
 * @param {string} annotations - The annotation text found between '[' and ']'
 * @returns {typeof Node.prototype.annotation} - The parsed annotations as key value pairs
 */
function parseNewickAnnotations(annotations) {
    return annotations.startsWith('&&NHX:')
        ? parseNHXAnnotations(annotations)
        : parseBeastAnnotations(annotations);
}
exports.parseNewickAnnotations = parseNewickAnnotations;