phylojs
Version:
A simple typescript library for phylogenetic trees
307 lines (306 loc) • 12.9 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseNewickAnnotations = exports.parseNHXAnnotations = exports.parseBeastAnnotations = exports.parseHybridLabels = exports.readTreesFromNewick = exports.readNewick = void 0;
const __1 = require("../../");
const error_1 = require("../../utils/error");
/**
 * Parse a string in the New Hampshire (Newick) format and return a tree object.
 *
 * This function reads a Newick string from left to right. It is based on
 * the `kn_parse` function by Heng Li for jstreeview (https://github.com/lh3/jstreeview/blob/main/knhx.js),
 * modified for compatibility with our Tree object and to prevent ';' assignment as root label.
 *
 * Supports node annotations in both BEAST [&...] and NHX [&&NHX:...] formats through the annotationParser parameter.
 * Users can provide their own annotation parser function to handle custom formats.
 * Also handles hybrid nodes marked with # notation following Cardona et al. 2008.
 *
 * Only the first non-blank line of the input is parsed. Blank lines before or after
 * the tree are ignored; if more than one non-blank line is present, a warning is
 * issued and the remaining lines are dropped. Use readTreesFromNewick() to parse
 * multiple trees.
 *
 * Node indices are allocated in order of creation, starting at 0: a leaf receives the
 * next free index when it is encountered, and an internal node receives the next free
 * index when its closing parenthesis ')' is encountered. Leaf indices are therefore
 * not guaranteed to be contiguous. For example, for `(A,B);` the index passed to the
 * Node constructor is 0 for leaf A, 1 for leaf B and 2 for the root. The root is always
 * the last node created and so has the highest index.
 *
 * To renumber nodes, one could use `.preorderTraversal()` or `.postorderTraversal()` methods.
 *
 * @param {string} newick - The string in Newick format to parse
 * @param {function(string): Object} [annotationParser=parseNewickAnnotations] - Parser
 *   forwarded to kn_add_node for every node that is created
 * @returns {Tree} - The constructed phylogenetic tree
 * @throws {Error} If a ')' has no matching '('
 */
function readNewick(newick, annotationParser = parseNewickAnnotations) {
    const stack = []; // Indices of nodes still waiting for a parent; -1 marks an open '('
    const nodes = []; // Every node created so far, in creation order
    // Keep only non-blank lines so that leading/trailing newlines neither hide the
    // tree nor trigger a spurious "multiple trees" warning.
    const treeLines = newick
        .split('\n')
        .map(line => line.trim())
        .filter(line => line.length > 0);
    if (treeLines.length > 1) {
        console.warn('Multiple trees in Newick string. Only reading the first tree. Use readTreesFromNewick() to read all trees.');
    }
    const text = treeLines.length > 0 ? treeLines[0] : '';
    // Parse the string character by character
    let position = 0;
    while (position < text.length) {
        // Skip whitespace and anything outside printable ASCII ('!'..'~')
        while (position < text.length &&
            (text.charAt(position) < '!' || text.charAt(position) > '~')) {
            position++;
        }
        if (position === text.length) {
            break;
        }
        const currentChar = text.charAt(position);
        if (currentChar === ',') {
            // Comma separates nodes at the same level
            position++;
        }
        else if (currentChar === '(') {
            // Opening parenthesis starts a new set of sister nodes
            stack.push(-1);
            position++;
        }
        else if (currentChar === ')') {
            // Closing parenthesis: everything above the most recent '(' marker
            // on the stack becomes a child of a newly created internal node.
            const newNodeIndex = nodes.length;
            const markerIndex = stack.lastIndexOf(-1);
            if (markerIndex < 0) {
                throw new Error('Unmatched closing parenthesis in Newick string');
            }
            const childNodeIndices = stack.slice(markerIndex + 1);
            // Create the parent, parse its label/branch length, and advance
            position = kn_add_node(text, position + 1, nodes, newNodeIndex, annotationParser);
            const parentNode = nodes[newNodeIndex];
            childNodeIndices.forEach((nodeIndex, childIndex) => {
                parentNode.children[childIndex] = nodes[nodeIndex];
                nodes[nodeIndex].parent = parentNode;
            });
            // Replace the marker and its children by the new parent
            stack.length = markerIndex;
            stack.push(newNodeIndex);
        }
        else {
            // Leaf: its parent is attached when the enclosing ')' is reached
            stack.push(nodes.length);
            position = kn_add_node(text, position, nodes, nodes.length, annotationParser);
        }
    }
    if (stack.length > 1) {
        console.warn('Multiple unconnected trees found in Newick string');
    }
    // The last node created is the root. For input without any node this is
    // undefined and is handed to Tree unchanged.
    return new __1.Tree(nodes[nodes.length - 1]);
}
exports.readNewick = readNewick;
/**
 * Reads every tree from a multi-tree Newick string and returns them as an array.
 *
 * The input is split wherever a ';' (optionally followed by whitespace) ends a
 * line. Each non-empty piece is trimmed and parsed with readNewick(). A piece
 * whose parsing throws a SkipTreeException is logged and skipped; any other
 * error is rethrown.
 *
 * @param {string} newick - One or more Newick trees, each terminated by ';' and a newline
 * @param {function(string): Object} [annotationParser] - Optional annotation parser
 *   passed to readNewick() for every tree; when omitted, readNewick() applies its own default
 * @returns {Tree[]} The trees that were parsed, in input order
 */
function readTreesFromNewick(newick, annotationParser) {
    const trees = [];
    for (const chunk of newick.split(/;\s*\n/)) {
        const treeText = chunk.trim();
        if (treeText.length === 0) {
            continue;
        }
        try {
            trees.push(readNewick(treeText, annotationParser));
        }
        catch (e) {
            // Only the dedicated "skip" signal is swallowed; real failures propagate
            if (!(e instanceof error_1.SkipTreeException)) {
                throw e;
            }
            console.log('Skipping Newick tree: ' + e.message);
        }
    }
    return trees;
}
exports.readTreesFromNewick = readTreesFromNewick;
/**
 * Parses a node from a Newick string and adds it to the nodes array.
 *
 * Starting at `position`, this function scans up to (not including) the next
 * ',' or ')' or the end of the string. Within that span:
 *  - text before the first '[' or ':' is the node label (a trailing ';' and one
 *    pair of surrounding single or double quotes are stripped; whitespace inside
 *    the label is kept);
 *  - `[...]` is handed, without the brackets, to `annotationParser`;
 *  - `:` is followed by the branch length, read with parseFloat.
 * A label containing '#' is treated as a hybrid-node label and split by
 * parseHybridLabels().
 *
 * @param {string} str - The Newick format string being parsed
 * @param {number} position - The starting position in the string to parse from
 * @param {Node[]} nodes - The array where all created nodes are stored
 * @param {number} newNodeIndex - The index to assign to the new node
 * @param {function(string): Object} annotationParser - Converts the text inside
 *   an annotation bracket into the value stored on `node.annotation`
 * @returns {number} - The position in the string where parsing for this node ended
 * @throws {Error} If an annotation bracket '[' is never closed
 */
function kn_add_node(str, position, nodes, newNodeIndex, annotationParser) {
    const labelStart = position;
    // -1 means "label end not found yet". 0 cannot be the sentinel because it is
    // a valid string index (a node that starts at position 0 with an empty label).
    let labelEnd = -1;
    const branchLengthChar = /[0-9eE+.-]/;
    const node = new __1.Node(newNodeIndex);
    let i;
    for (i = position; i < str.length && str.charAt(i) !== ',' && str.charAt(i) !== ')'; ++i) {
        const c = str.charAt(i);
        if (c === '[') {
            // Annotation block: the label (if any) ends where the bracket opens
            const annotationStart = i;
            if (labelEnd < 0) {
                labelEnd = i;
            }
            do {
                ++i;
            } while (i < str.length && str.charAt(i) !== ']');
            if (i === str.length) {
                throw new Error('Unclosed annotation bracket in Newick string');
            }
            node.annotation = annotationParser(str.slice(annotationStart + 1, i));
        }
        else if (c === ':') {
            // Branch length: the label (if any) ends at the colon
            if (labelEnd < 0) {
                labelEnd = i;
            }
            const lengthStart = i + 1;
            let lengthEnd = lengthStart;
            while (lengthEnd < str.length && branchLengthChar.test(str.charAt(lengthEnd))) {
                ++lengthEnd;
            }
            node.branchLength = parseFloat(str.slice(lengthStart, lengthEnd));
            // Step back one so the loop's ++i lands on the first unconsumed character
            i = lengthEnd - 1;
        }
        // Any other character, whitespace included, belongs to the label.
    }
    if (labelEnd < 0) {
        labelEnd = i;
    }
    if (labelEnd > labelStart) {
        const label = str
            .slice(labelStart, labelEnd)
            .replace(/;$/g, '') // the tree terminator is not part of the root label
            .replace(/^"|"$/g, '') // remove quotes
            .replace(/^'|'$/g, ''); // remove quotes
        if (label.includes('#')) {
            // Hybrid case
            const parsedLabel = parseHybridLabels(label);
            node.label = parsedLabel['label'];
            node.hybridID = parsedLabel['hybridID'];
        }
        else {
            node.label = label.length > 0 ? label : undefined;
        }
    }
    nodes.push(node);
    return i;
}
/**
 * Function parses hybrid id labels, which are assumed to contain '#'.
 * Following Cardona et al. 2008, (https://doi.org/10.1186/1471-2105-9-532).
 * Function expects unparsed labels to be of the form [label]#[type]i
 * where '#' and i (the hybrid node ID) are mandatory. PhyloJS ignores the type annotation
 * (H for hybridisation, LGT for lateral gene transfer, R for recombination) and extracts only
 * the label and hybridID, following icyTREE.
 *
 * Only the text between the first and second '#' is considered for the ID.
 *
 * @param {string} label - Raw node label containing '#'
 * @returns {HybridInformation} Object with `label` (undefined when nothing precedes
 *   the '#') and the integer `hybridID`
 * @throws {Error} If the label has no '#', or if the hybrid ID is missing or not an integer
 */
function parseHybridLabels(label) {
    if (!label.includes('#')) {
        throw new Error('No hash(#), in hybrid label.');
    }
    const [labelPart, idPart] = label.split('#');
    const idText = idPart.replace(/H|LGT|R/g, ''); // remove hybridisation types
    // Number('') is 0, so a missing ID must be rejected explicitly
    const hybridID = idText.trim().length > 0 ? Number(idText) : NaN;
    if (!Number.isInteger(hybridID)) {
        throw new Error('Hybrid ID is not an integer!');
    }
    return {
        label: labelPart.length > 0 ? labelPart : undefined,
        hybridID: hybridID,
    };
}
exports.parseHybridLabels = parseHybridLabels;
/**
 * Parses BEAST-type annotations in format [&...] to object for storage
 * in `Node` object. Annotations in arrays are expected to be stored in braces,
 * and separated by ',' or ':'. For example ...Type={Blue,Red} or ...Type={Blue:Red}
 *
 * The input is the text between the brackets. A leading '&' is dropped, the rest
 * is split into `key=value` pairs on every ',' or ':' that is not inside braces,
 * and each pair is split at its FIRST '=' so that values may themselves contain '='.
 * Pairs without '=' are ignored. Keys are trimmed; values are stored as strings,
 * or as arrays of strings for brace-enclosed values.
 *
 * @param {string} annotations - The string containing annotations in Newick format
 * @returns {typeof Node.prototype.annotation} - The parsed annotations as key value pairs
 * @property {string} [key] - The key of the annotation
 * @property {any} [value] - The value of the annotation
 */
function parseBeastAnnotations(annotations) {
    // Remove the '&' at the start
    const body = annotations.startsWith('&') ? annotations.slice(1) : annotations;
    const annotation_object = {};
    // Split on all ',' and ':' not in braces '{}'
    for (const pair of body.split(/[,:](?![^{]*\})/g)) {
        const separatorIndex = pair.indexOf('=');
        if (separatorIndex < 0) {
            continue;
        }
        const key = pair.slice(0, separatorIndex).trim();
        const value = pair.slice(separatorIndex + 1);
        // Handling array-like values enclosed in {}
        if (value.includes('{') && value.includes('}')) {
            annotation_object[key] = value.replace(/{|}/g, '').split(/,|:/g);
        }
        else {
            annotation_object[key] = value;
        }
    }
    return annotation_object;
}
exports.parseBeastAnnotations = parseBeastAnnotations;
/**
 * Parses NHX-type annotations in format [&&NHX:...] to object for storage
 * in `Node` object. Annotations in arrays are expected to be stored in braces.
 *
 * The input is the text between the brackets. A leading '&&NHX:' is dropped, the
 * rest is split into `key=value` pairs on every ':', and each pair is split at its
 * FIRST '=' so that values may themselves contain '='. Empty pairs and pairs
 * without '=' are ignored. Keys are trimmed; values are stored as strings, or as
 * arrays of strings (split on ',') for brace-enclosed values.
 *
 * @param {string} annotations - The string containing annotations in Newick format
 * @returns {typeof Node.prototype.annotation} - The parsed annotations as key value pairs
 * @property {string} [key] - The key of the annotation
 * @property {any} [value] - The value of the annotation
 */
function parseNHXAnnotations(annotations) {
    // Remove the '&&NHX:' at the start
    const body = annotations.startsWith('&&NHX:') ? annotations.slice(6) : annotations;
    const annotation_object = {};
    // Split on all ':'
    for (const pair of body.split(/:/g)) {
        const separatorIndex = pair.indexOf('=');
        if (separatorIndex < 0) {
            continue; // also covers the empty pair produced by '::'
        }
        const key = pair.slice(0, separatorIndex).trim();
        const value = pair.slice(separatorIndex + 1);
        // Handling array-like values enclosed in {}
        if (value.includes('{') && value.includes('}')) {
            annotation_object[key] = value.replace(/{|}/g, '').split(/,/g);
        }
        else {
            annotation_object[key] = value;
        }
    }
    return annotation_object;
}
exports.parseNHXAnnotations = parseNHXAnnotations;
/**
 * Default annotation parser: dispatches to the NHX parser when the annotation
 * text begins with '&&NHX:' and to the BEAST parser for everything else.
 * @param {string} annotations - The string containing annotations in Newick format
 * @returns {typeof Node.prototype.annotation} - The parsed annotations as key value pairs
 * @property {string} [key] - The key of the annotation
 * @property {any} [value] - The value of the annotation
 */
function parseNewickAnnotations(annotations) {
    const isNHX = annotations.startsWith('&&NHX:');
    const parse = isNHX ? parseNHXAnnotations : parseBeastAnnotations;
    return parse(annotations);
}
exports.parseNewickAnnotations = parseNewickAnnotations;