UNPKG

auspice

Version:

Web app for visualizing pathogen evolution

330 lines (307 loc) 12.1 kB
import { isValueValid, strainSymbol } from "./globals";

/* --- TO IMPROVE -----
These "getter" functions for node-related data require knowledge of
the semantics of how data is stored on a node. For instance, you need
to know that `num_date` is stored in a different structure to `div`.
This logic should be encapsulated within `getTraitFromNode` so we
don't need separate `getDivFromNode` functions etc.
                                      james hadfield, nov 2019.
*/

/**
 * Split a mutation string such as "A123T" into its three components.
 * The first character is the ancestral state, the last character is the
 * derived state and everything in between is the position.
 * @param {String} mut mutation string, e.g. "A123T"
 * @returns {Array} `[from, pos, to]` -- note that `pos` is a string
 */
function parseMutation(mut) {
  return [mut.slice(0, 1), mut.slice(1, -1), mut.slice(-1)];
}

/**
 * Given a coloring key or a geographic resolution key
 * (sometimes referred to as a "trait")
 * e.g. "author", "country" etc, extract its value from a node.
 *
 * If `entropy` is truthy, then extract the entropy value instead
 * If `confidence` is truthy, then extract the confidence value instead
 * If `raw` is truthy, then extract the `raw_value` instead
 * (if more than one flag is set the precedence is entropy, confidence, raw).
 *
 * Returns `undefined` if not set OR (when no flag is set) if the value is not valid.
 *
 * NOTE: this only accesses `node_attrs` -- if you want the name or a branch
 * attr then this function is not the one you are looking for. The single
 * exception is `strainSymbol`: when no flag is set and that key is not present
 * in `node_attrs`, `node.name` is returned.
 *
 * NOTE: do not use this for "div", "vaccine" or other traits set on `node_attrs`
 * which don't share the same structure as traits. See the JSON spec for more details.
 */
export const getTraitFromNode = (node, trait, {entropy=false, confidence=false, raw=false}={}) => {
  if (!node.node_attrs) return undefined;
  const attr = node.node_attrs[trait];

  if (entropy) return attr ? attr.entropy : undefined;
  if (confidence) return attr ? attr.confidence : undefined;
  if (raw) return attr ? attr.raw_value : undefined;

  if (!attr) {
    return trait === strainSymbol ? node.name : undefined;
  }
  const value = attr.value;
  return isValueValid(value) ? value : undefined;
};

/**
 * Return `node.node_attrs.div`, or `undefined` if it is not set.
 */
export const getDivFromNode = (node) => {
  /* see comment at top of this file */
  if (node.node_attrs && node.node_attrs.div !== undefined) {
    return node.node_attrs.div;
  }
  return undefined;
};

/**
 * Return `node.node_attrs.vaccine` if it is truthy, otherwise `undefined`.
 */
export const getVaccineFromNode = (node) => {
  /* see comment at top of this file */
  if (node.node_attrs && node.node_attrs.vaccine) {
    return node.node_attrs.vaccine;
  }
  return undefined;
};

/**
 * Return the entire `author` object stored on the node (not just its value),
 * or `undefined` if there is no author with a truthy `value`.
 */
export const getFullAuthorInfoFromNode = (node) =>
  (node.node_attrs && node.node_attrs.author && node.node_attrs.author.value) ?
    node.node_attrs.author :
    undefined;

/**
 * Extract the accession & associated URL from a node.
 * @returns {Object} `{accession, url}` where either property may be `undefined`
 */
export const getAccessionFromNode = (node) => {
  /* see comment at top of this file */
  const attrs = node.node_attrs;
  if (!attrs) return {accession: undefined, url: undefined};
  const accession = isValueValid(attrs.accession) ? attrs.accession : undefined;
  const url = validateUrl(attrs.url);
  return {accession, url};
};

/**
 * Return the (validated) URL associated with a trait on a node,
 * or `undefined` if the trait is not set or its URL doesn't seem valid.
 */
/* see comment at top of this file */
export const getUrlFromNode = (node, trait) => {
  if (!node.node_attrs || !node.node_attrs[trait]) return undefined;
  return validateUrl(node.node_attrs[trait].url);
};

/**
 * Check if a URL seems valid & return it.
 * For historical reasons, we allow URLs to be defined as `http[s]_` and coerce these into `http[s]:`
 * URLs are interpreted by `new URL()` and thus may be returned with a trailing slash
 * @param {String} url URL string to validate
 * @returns {String|undefined} potentially modified URL string or `undefined` (if it doesn't seem valid)
 */
function validateUrl(url) {
  if (url===undefined) return undefined; // urls are optional, so return early to avoid the console warning
  let candidate = url;
  try {
    if (typeof candidate !== "string") throw new Error();
    if (candidate.startsWith("http_")) candidate = candidate.replace("http_", "http:");
    if (candidate.startsWith("https_")) candidate = candidate.replace("https_", "https:");
    return new URL(candidate).href;
  } catch (err) {
    /* the (possibly coerced) value is reported so the warning reflects what we tried to parse */
    console.warn(`Dataset provided the invalid URL ${candidate}`);
    return undefined;
  }
}

/**
 * Traverses the tree and returns a set of genotype states such as
 * {"nuc 123A", "S 418K"} -- i.e. `<gene> <position><state>`, space separated.
 * Note 1: Only variable sites are considered.
 * Note 2: Basal states are included in the returned value.
 */
export function collectGenotypeStates(nodes) {
  const observedStates = new Set();
  nodes.forEach((n) => {
    const branchMutations = n.branch_attrs && n.branch_attrs.mutations;
    if (!branchMutations) return;
    Object.entries(branchMutations).forEach(([gene, mutations]) => {
      mutations.forEach((m) => {
        const [from, pos, to] = parseMutation(m);
        observedStates.add(`${gene} ${pos}${to}`);
        observedStates.add(`${gene} ${pos}${from}`); // ancestral state, relative to this node
      });
    });
  });
  return observedStates;
}

/**
 * Walk from the provided node back to the root, collecting all mutations as we go.
 * Multiple mutations (e.g. root -> A<pos>B -> B<pos>C -> fromNode) will be collapsed to as A<pos>C
 * Reversions to root (e.g. root -> A<pos>B -> B<pos>A -> fromNode) will be reported as A<pos>A
 * Returned structure is <returnedObject>.<geneName>.<position> = [<from>, <to>]
 *
 * The walk stops when a node has no parent, or when the parent's `arrayIdx`
 * equals the node's own `arrayIdx` (which is how the root refers to itself).
 * It is implemented as a loop rather than recursion so that very deep trees
 * cannot exhaust the call stack.
 */
export const getSeqChanges = (fromNode) => {
  const mutations = {};

  /* fold the mutations on a single branch into the accumulated `mutations` */
  const collectBranch = (n) => {
    const branchMutations = n.branch_attrs && n.branch_attrs.mutations;
    if (!branchMutations) return;
    Object.entries(branchMutations).forEach(([gene, muts]) => {
      if (!mutations[gene]) mutations[gene] = {};
      muts.forEach((m) => {
        /* 'from' is the base closer to the root, 'to' is the more derived base (closer to the tip) */
        const [from, pos, to] = parseMutation(m); // note: `pos` is a string
        if (mutations[gene][pos]) {
          mutations[gene][pos][0] = from; // mutation already seen => update ancestral state.
        } else {
          mutations[gene][pos] = [from, to];
        }
      });
    });
  };

  let node = fromNode;
  let reachedRoot = false;
  while (!reachedRoot) {
    collectBranch(node);
    const parent = node.parent;
    if (parent && parent.arrayIdx !== node.arrayIdx) {
      node = parent;
    } else {
      reachedRoot = true;
    }
  }
  return mutations;
};

/**
 * Categorise each mutation into one or more of the following categories:
 * (1) undeletions (probably bioinformatics errors, but not always)
 * (2) gaps
 * (3) Ns (only applicable for nucleotides)
 * (4) homoplasies (mutation observed elsewhere on the tree)
 * (5) unique mutations (those which are only observed once)
 * (6) reversions to root
 * Categories 1-5 are mutually exclusive, with the first matching category used.
 * (e.g. an undeletion is never a homoplasy, even if it occurs multiple times).
 * Entries in category 6 will also appear in a previous group.
 *
 * @param {Object} mutations            <gene> -> array of mutation strings (e.g. "A123T")
 * @param {Object} observedMutations    "<gene>:<mutation>" -> number of times observed
 * @param {Object} seqChangesToRoot     structure as returned by `getSeqChanges`
 * @returns {Object} <gene> -> {unique, homoplasies, gaps, reversionsToRoot, undeletions[, ns]}
 */
export const categoriseMutations = (mutations, observedMutations, seqChangesToRoot) => {
  const categorisedMutations = {};
  for (const gene of Object.keys(mutations)) {
    const categories = { unique: [], homoplasies: [], gaps: [], reversionsToRoot: [], undeletions: []};
    const isNuc = gene==="nuc";
    if (isNuc) categories.ns = [];

    mutations[gene].forEach((mut) => {
      const [oldChar, pos, newChar] = parseMutation(mut);
      /* "N" only means "unknown" for nucleotides; in a translation it is asparagine */
      const isAmbiguous = isNuc && newChar==="N";

      if (oldChar==="-") {
        /* undeletions are most probably bioinformatics errors, so collect them into a separate category */
        categories.undeletions.push(mut);
      } else if (newChar==="-") {
        categories.gaps.push(mut);
      } else if (isAmbiguous) {
        categories.ns.push(mut);
      } else if (observedMutations[`${gene}:${mut}`] > 1) {
        categories.homoplasies.push(mut);
      } else {
        categories.unique.push(mut);
      }

      // check to see if this mutation is a reversion to root
      const changeAtPos = seqChangesToRoot[gene] && seqChangesToRoot[gene][pos];
      if (oldChar!=="-" && newChar!=="-" && !isAmbiguous &&
        changeAtPos && changeAtPos[0]===changeAtPos[1]) {
        categories.reversionsToRoot.push(mut);
      }
    });
    categorisedMutations[gene]=categories;
  }
  return categorisedMutations;
};

/**
 * Categorise seq changes (i.e. the accumulated changes between a tip and the (subtree) root)
 * into the following categories (first matching category used):
 * (1) gaps
 * (2) Ns (nucleotides only)
 * (3) Reversions to root
 * (4) Base Changes
 *
 * TODO: This function shares a lot of logic with `categoriseMutations()` and is thus prone to drift
 *
 * @param {Object} seqChangesToRoot structure as returned by `getSeqChanges`
 * @returns {Object} <gene> -> {changes, gaps, reversionsToRoot[, ns]}
 */
export const categoriseSeqChanges = (seqChangesToRoot) => {
  const categorisedSeqChanges = {};
  for (const gene of Object.keys(seqChangesToRoot)) {
    const categories = { changes: [], gaps: [], reversionsToRoot: []};
    const isNuc = gene==="nuc";
    if (isNuc) categories.ns = [];

    for (const [pos, [from, to]] of Object.entries(seqChangesToRoot[gene])) {
      const mut = `${from}${pos}${to}`;
      if (to==="-") {
        categories.gaps.push(mut);
      } else if (isNuc && to==="N") {
        categories.ns.push(mut);
      } else if (from===to) {
        categories.reversionsToRoot.push(mut);
      } else {
        categories.changes.push(mut);
      }
    }
    categorisedSeqChanges[gene]=categories;
  }
  return categorisedSeqChanges;
};

/**
 * Return the mutations on the branch split into (potentially overlapping) categories
 * @param {Object} branchNode
 * @param {Object} observedMutations all observed mutations on the tree
 * @returns {Object} categorised mutations, or `{}` if the branch has no mutations object
 */
export const getBranchMutations = (branchNode, observedMutations) => {
  const mutations = branchNode.branch_attrs && branchNode.branch_attrs.mutations;
  /* `typeof null` is "object", so falsy values must be excluded explicitly */
  if (!mutations || typeof mutations !== "object") return {};
  /* a node which is its own parent is the root, so there are no changes to walk back over */
  const seqChangesToRoot = branchNode.parent===branchNode ? {} : getSeqChanges(branchNode);
  return categoriseMutations(mutations, observedMutations, seqChangesToRoot);
};

/**
 * Return the changes between the terminal node and the root, split into (potentially overlapping) categories
 * @param {Object} tipNode
 * @returns {Object}
 */
export const getTipChanges = (tipNode) => {
  const seqChanges = getSeqChanges(tipNode);
  return categoriseSeqChanges(seqChanges);
};

/**
 * Returns a function which will sort a list, where each element in the list
 * is a gene name. Sorted by start position of the gene, with "nuc" first.
 * Names which are not in the genome map are sorted after all known names.
 * If no `genomeMap` is provided the returned function treats all names as equal.
 */
export const sortByGeneOrder = (genomeMap) => {
  if (!genomeMap) return () => 0;

  /* Sort CDSs based on the genome position of the start codon */
  const cdsPos = genomeMap[0].genes.flatMap((gene) =>
    gene.cds.map((cds) => [cds.name, cds.segments[0].rangeGenome[cds.strand==='+' ? 0 : 1]])
  );
  /* the comparator must return 0 for ties, otherwise the sort order is implementation-defined */
  cdsPos.sort((a, b) => {
    if (a[1] < b[1]) return -1;
    if (a[1] > b[1]) return 1;
    return 0;
  });

  const order = {};
  cdsPos.forEach(([name], idx) => {order[name] = idx+1;});
  order.nuc=0; // Nuc is always first

  /* Returned function takes two CDS names so it can be used as the sort function for an array of CDS names */
  return (a, b) => {
    const aUnknown = order[a]===undefined;
    const bUnknown = order[b]===undefined;
    if (aUnknown && bUnknown) return 0;
    if (aUnknown) return 1;
    if (bUnknown) return -1;
    return order[a] - order[b];
  };
};

/**
 * Add extra per-node attrs into the `nodes` data structure. Key clashes will result in the
 * new data overwriting the existing data
 * @param {Array} nodes
 * @param {Object} newAttrs keyed by node name; each value is an object of attr name -> attr data
 * @returns undefined - modifies the `nodes` param in-place
 */
export const addNodeAttrs = (nodes, newAttrs) => {
  nodes.forEach((node) => {
    const attrsForNode = newAttrs[node.name];
    if (!attrsForNode) return;
    if (!node.node_attrs) node.node_attrs = {};
    for (const [attrName, attrData] of Object.entries(attrsForNode)) {
      node.node_attrs[attrName] = attrData;
    }
  });
};

/**
 * Remove attrs from the `nodes` data structure.
 * @param {Array} nodes
 * @param {Array} attrsToRemove names of the attrs to delete from each node's `node_attrs`
 * @returns undefined - modifies the `nodes` param in-place
 */
export const removeNodeAttrs = (nodes, attrsToRemove) => {
  nodes.forEach((node) => {
    if (!node.node_attrs) return;
    attrsToRemove.forEach((attrName) => {
      delete node.node_attrs[attrName];
    });
  });
};