auspice
Version:
Web app for visualizing pathogen evolution
265 lines (237 loc) • 11 kB
JavaScript
import { rgb } from "d3-color";
import { errorNotification, successNotification, warningNotification } from "../notifications";
import { ADD_EXTRA_METADATA } from "../types";
import { parseCsv } from "./parseCsv";
/**
 * Read a dropped metadata file and dispatch the colorings / geographic
 * resolution it defines so they can be added to the current dataset.
 *
 * The file is read as a binary string, converted to CSV via `xlsx` (for an
 * Excel workbook only the first sheet is used), parsed with `parseCsv`, and
 * then turned into node attributes by the helper functions in this file.
 * Success and every failure mode are reported to the user as notifications.
 *
 * Processing happens inside the `FileReader` callbacks, so the promise returned
 * here settles as soon as the read has been *started*, not when it has finished.
 *
 * @param {Function} dispatch redux dispatch
 * @param {Function} getState redux getState (forwarded to `checkDataForErrors`)
 * @param {File} file the dropped file; `file.name` is used in user-facing messages
 * @returns {Promise<undefined>} resolves with the (void) result of `readAsBinaryString`
 */
const handleMetadata = async (dispatch, getState, file) => {
  const fileName = file.name;
  const reader = new FileReader();

  reader.onload = async (event) => {
    try {
      const XLSX = (await import("xlsx/xlsx.mini")).default;
      /* Convert accepted dropped file to CSV string */
      /* If dropped file is Excel workbook, only reads in the data from the first sheet */
      const workbook = XLSX.read(event.target.result, { type: 'binary', raw: true });
      const firstSheet = workbook.Sheets[workbook.SheetNames[0]];
      const sheetAsCsv = XLSX.utils.sheet_to_csv(firstSheet);

      /* All accepted file formats have been converted to CSV string by xlsx */
      /* Use papaparse to parse & interrogate the CSV string */
      const {errors, data, meta} = await parseCsv(sheetAsCsv);
      if (errors.length) {
        console.error(errors);
        throw new Error(errors.map((e) => e.message).join(", "));
      }

      const {coloringInfo, strainKey, latLongKeys, ignoredFields} = processHeader(meta.fields);
      /* Index the rows by the value in the first (strain) column */
      const rows = {};
      data.forEach((d) => {rows[d[strainKey]] = d;});

      /* For each coloring, extract values defined in each row etc */
      const newNodeAttrs = {};
      const newColorings = processColorings(newNodeAttrs, coloringInfo, rows, fileName); // modifies `newNodeAttrs`
      const newGeoResolution = latLongKeys ? processLatLongs(newNodeAttrs, latLongKeys, rows, fileName) : undefined;

      /* Fix errors in data & dispatch warnings here, as we cannot dispatch in the reducers */
      const ok = checkDataForErrors(dispatch, getState, newNodeAttrs, newColorings, ignoredFields, fileName);
      if (!ok) return undefined;

      dispatch({type: ADD_EXTRA_METADATA, newColorings, newGeoResolution, newNodeAttrs});
      const nColorings = Object.keys(newColorings).length;
      const nNodes = Object.keys(newNodeAttrs).length;
      return dispatch(successNotification({
        message: `Adding metadata from ${fileName}`,
        details: `${nColorings} new coloring${nColorings > 1 ? "s" : ""} for ${nNodes} node${nNodes > 1 ? "s" : ""}`
      }));
    } catch (err) {
      return dispatch(errorNotification({
        message: `Parsing of ${fileName} failed`,
        details: err.message
      }));
    }
  };

  /* If the read itself fails, `onload` never fires. Without this handler the
     drop would silently do nothing, so surface the failure to the user. */
  reader.onerror = () => {
    const reason = reader.error && reader.error.message ? reader.error.message : "The file could not be read";
    dispatch(errorNotification({
      message: `Reading of ${fileName} failed`,
      details: reason
    }));
  };

  return reader.readAsBinaryString(file);
};
export default handleMetadata;
/* ---------------------- helper functions to parse data ---------------------------- */
function processHeader(fields) {
const strainKey = fields[0];
/* There are a number of "special case" columns we currently ignore */
const fieldsToIgnore = new Set(["name", "div", "vaccine", "labels", "hidden", "mutations", "url", "authors", "accession", "traits", "children"]);
fieldsToIgnore.add("num_date").add("year").add("month").add("date"); /* TODO - implement date parsing */
const latLongFields = new Set(["__latitude", "__longitude", "latitude", "longitude"]);
const ignoredFields = new Set();
const coloringInfo = fields.slice(1)
.map((fieldName) => {
if (fieldsToIgnore.has(fieldName)) {
ignoredFields.add(fieldName);
return null;
}
if (latLongFields.has(fieldName)) {
return null;
}
let name = fieldName;
const lookupKey = fieldName;
const scaleType = "categorical"; // TODO
/* interpret column names using microreact-style syntax */
if (fieldName.includes("__")) {
const [prefix, suffix] = fieldName.split("__");
if (["shape", "colour", "color"].includes(suffix)) {
ignoredFields.add(fieldName);
return null;
}
if (suffix === "autocolour") {
name = prefix; /* MicroReact uses this to colour things, but we do this by default */
}
}
return {name, lookupKey, scaleKey: undefined, scaleType};
})
.filter((x) => !!x)
.map((data) => {
if (fields.includes(`${data.name}__colour`)) {
data.scaleKey = `${data.name}__colour`;
} else if (fields.includes(`${data.name}__color`)) {
data.scaleKey = `${data.name}__color`;
}
return data;
});
/* check for the presense of lat/long fields */
const latLongKeys = (fields.includes("latitude") && fields.includes("longitude")) ?
{latitude: "latitude", longitude: "longitude"} :
(fields.includes("__latitude") && fields.includes("__longitude")) ?
{latitude: "__latitude", longitude: "__longitude"} :
undefined;
return {coloringInfo, header: fields, strainKey, latLongKeys, ignoredFields};
}
/**
 * Build the colorings described by the CSV header and copy each row's values
 * onto the corresponding node.
 *
 * Besides one coloring per `coloringInfo` entry, a boolean coloring named after
 * the file is added to flag every strain which appeared in the file.
 *
 * @param {Object} newNodeAttrs strain name -> attrs; MODIFIED in place
 * @param {Object[]} coloringInfo entries of `{name, lookupKey, scaleKey, scaleType}`
 * @param {Object} rows strain name -> parsed CSV row
 * @param {string} fileName used as the name of the presence/absence coloring
 * @returns {Object} `newColorings`, keyed by coloring name
 */
function processColorings(newNodeAttrs, coloringInfo, rows, fileName) {
  const hexColourPattern = /^#[A-Fa-f0-9]{6}$/;
  const rowEntries = Object.entries(rows);

  /* fetch (creating if necessary) the attrs object for a strain */
  const attrsFor = (strain) => {
    if (!newNodeAttrs[strain]) newNodeAttrs[strain] = {};
    return newNodeAttrs[strain];
  };

  const newColorings = {};
  for (const {name, lookupKey, scaleKey, scaleType} of coloringInfo) {
    /* trait value -> list of hex colours seen for it; stays empty when there is no `scaleKey` */
    const coloursByValue = new Map();

    rowEntries.forEach(([strain, row]) => {
      const value = row[lookupKey];
      if (!value) return; // ignore empty strings (which arise from an empty CSV field)
      attrsFor(strain)[name] = {value};

      const colour = scaleKey ? row[scaleKey] : undefined;
      if (!colour || !colour.match(hexColourPattern)) return;
      const seen = coloursByValue.get(value);
      if (seen) {
        seen.push(colour);
      } else {
        coloursByValue.set(value, [colour]);
      }
    });

    const coloring = {
      title: name,
      type: scaleType /* TODO - attempt to guess this if no info supplied */
    };
    if (coloursByValue.size) coloring.scale = makeScale(name, coloursByValue);
    newColorings[name] = coloring;
  }

  /* Add a boolean scale for presence/absence in this file */
  newColorings[fileName] = {title: fileName, type: 'boolean'};
  for (const strain of Object.keys(rows)) {
    /* Ideally the value here would be `true` but this causes UI issues in <Info> */
    attrsFor(strain)[fileName] = {value: `Strains in ${fileName}`};
  }
  return newColorings;
}
/**
 * Turn the colours collected for each trait value into a scale: a list of
 * `[traitValue, colour]` pairs.
 *
 * When every colour recorded for a value is the same, that colour is used as
 * is. Otherwise the colours are averaged channel-by-channel (same algorithm
 * as `getAverageColorFromNodes`) and the result of `rgb(...).toString()` is used.
 *
 * @param {string} colorBy name of the coloring (currently unused)
 * @param {Map<string, string[]>} scaleMap trait value -> colours seen for it
 * @returns {Array<[string, string]>} the scale
 */
function makeScale(colorBy, scaleMap) {
  const averageColour = (colours) => {
    const sums = colours.reduce((acc, colour) => {
      const parsed = rgb(colour);
      return {r: acc.r + parsed.r, g: acc.g + parsed.g, b: acc.b + parsed.b};
    }, {r: 0, g: 0, b: 0});
    const count = colours.length;
    return rgb(sums.r / count, sums.g / count, sums.b / count).toString();
  };

  return Array.from(scaleMap, ([traitValue, colours]) => {
    const allIdentical = new Set(colours).size === 1;
    return [traitValue, allIdentical ? colours[0] : averageColour(colours)];
  });
}
/**
 * Metadata defines lat-longs _per-sample_ which is orthogonal to Nextstrain's approach
 * (where we associate coords to a metadata trait). The approach here is to create a new
 * `node_attr` (named `<fileName>_geo`) whose values (`deme_0`, `deme_1`, ...) each
 * represent one unique lat/long pair provided in the file.
 *
 * Rows are skipped when either coordinate is missing / blank, not numeric, or
 * outside the valid range (latitude ±90, longitude ±180). Rows are grouped by
 * their _numeric_ coordinates, so e.g. "1.0" and "1" end up in the same deme.
 *
 * @param {Object} newNodeAttrs strain name -> attrs; MODIFIED in place
 * @param {{latitude: string, longitude: string}} latLongKeys column names holding the coords
 * @param {Object} rows strain name -> parsed CSV row
 * @param {string} fileName used to build the (user-visible) trait name
 * @returns {{key: string, demes: Object}} geo resolution: deme name -> `{latitude, longitude}`
 */
function processLatLongs(newNodeAttrs, latLongKeys, rows, fileName) {
  /* `Number("")` and `Number(null)` are both 0, which would silently place
     samples with no coordinates at (0, 0), so treat blanks as missing. */
  const toCoordinate = (raw) => {
    if (raw === null || raw === undefined) return NaN;
    if (typeof raw === "string" && raw.trim() === "") return NaN;
    return Number(raw);
  };

  /* Collect groups of strains with identical lat/longs */
  const coordsStrains = new Map();
  for (const [strain, row] of Object.entries(rows)) {
    const latitude = toCoordinate(row[latLongKeys.latitude]);
    const longitude = toCoordinate(row[latLongKeys.longitude]);
    if (!Number.isFinite(latitude) || !Number.isFinite(longitude)) continue;
    if (latitude > 90 || latitude < -90 || longitude > 180 || longitude < -180) continue;
    /* The separator matters: without it (1, 23) and (12, 3) would share a key */
    const coordsKey = `${latitude},${longitude}`;
    if (!coordsStrains.has(coordsKey)) {
      coordsStrains.set(coordsKey, {latitude, longitude, strains: new Set()});
    }
    coordsStrains.get(coordsKey).strains.add(strain);
  }

  /* invert map to link each strain to a dummy value with lat/longs */
  const traitName = `${fileName}_geo`; /* dummy trait name, but will be visible to the user! */
  const newGeoResolution = {key: traitName, demes: {}};
  let counter = 0;
  /* `for...of` rather than `.values().forEach(...)`: Map iterators only gained
     `forEach` with Iterator Helpers, which not every runtime provides */
  for (const {latitude, longitude, strains} of coordsStrains.values()) {
    const traitValue = `deme_${counter++}`; /* dummy variable, but will be visible to the user! */
    newGeoResolution.demes[traitValue] = {latitude, longitude};
    for (const strain of strains) {
      if (!newNodeAttrs[strain]) newNodeAttrs[strain] = {};
      newNodeAttrs[strain][traitName] = {value: traitValue};
    }
  }
  return newGeoResolution;
}
/**
 * Prune the parsed metadata so that it only contains information which can be
 * added to the tree(s) in state, and tell the user about anything that was left out.
 *
 * `newNodeAttrs` and `newColorings` are MODIFIED in place:
 *  - nodes whose name is not in `tree` (or `treeToo`) are removed
 *  - colorings with no remaining values, or which are already present on the
 *    tree, are removed (and stripped from the remaining nodes)
 *
 * @param {Function} dispatch redux dispatch, used for the notifications
 * @param {Function} getState redux getState; `controls`, `tree` and `treeToo` are read
 * @param {Object} newNodeAttrs strain name -> attrs
 * @param {Object} newColorings coloring name -> coloring definition
 * @param {Set<string>} [ignoredFields] columns skipped while processing the header
 * @param {string} fileName used in the error message
 * @returns {boolean} `false` (after an error notification) when nothing is left to add, else `true`
 */
function checkDataForErrors(dispatch, getState, newNodeAttrs, newColorings, ignoredFields, fileName) {
  const {controls, tree, treeToo} = getState();
  const droppedColorings = new Set();
  const droppedNodes = new Set();

  /* restrict the newNodeAttrs to nodes which are actually in the tree(s) */
  const nodeNamesInTree = new Set(tree.nodes.map((n) => n.name)); // can be internal nodes
  if (treeToo && Array.isArray(treeToo.nodes)) {
    treeToo.nodes.forEach((node) => nodeNamesInTree.add(node.name));
  }
  for (const name of Object.keys(newNodeAttrs)) {
    if (!nodeNamesInTree.has(name)) {
      droppedNodes.add(name);
      delete newNodeAttrs[name];
    }
  }

  /* restrict added colorings to those which have at least one valid value in the tree! */
  const remainingAttrs = Object.values(newNodeAttrs);
  for (const colorName of Object.keys(newColorings)) {
    if (!remainingAttrs.some((attrs) => attrs[colorName])) {
      droppedColorings.add(colorName);
      delete newColorings[colorName];
    }
  }

  /* restrict added colorings to those _not_ currently present on the tree. This could be relaxed. TODO. */
  for (const colorName of Object.keys(newColorings)) {
    if (controls.coloringsPresentOnTree.has(colorName)) {
      droppedColorings.add(colorName);
      delete newColorings[colorName];
    }
  }

  /* strip those droppedColorings out of `newNodeAttrs` */
  for (const colorName of droppedColorings) {
    for (const nodeAttr of remainingAttrs) {
      delete nodeAttr[colorName]; // Note that this works even if `colorName` is not a property
    }
  }

  if (!Object.keys(newNodeAttrs).length || !Object.keys(newColorings).length) {
    dispatch(errorNotification({
      message: `${fileName} had no (relevent) information`,
      details: Object.keys(newNodeAttrs).length ? "No columns to add as colorings" : "No taxa which match those in the tree"
    }));
    return false;
  }

  /* The warning below talks about "special" cases, i.e. the columns skipped while
     processing the header, so include them; previously they were never reported. */
  const skippedColumns = new Set([...(ignoredFields || []), ...droppedColorings]);
  if (skippedColumns.size) {
    dispatch(warningNotification({
      message: `Ignoring ${skippedColumns.size} columns as they are already set as colorings or are "special" cases to be ignored`,
      details: [...skippedColumns].join(", ")
    }));
  }
  if (droppedNodes.size) {
    dispatch(warningNotification({
      message: `Ignoring ${droppedNodes.size} taxa (CSV rows) as they don't appear in the tree`,
      details: [...droppedNodes].join(", ")
    }));
  }
  return true;
}