UNPKG

scran.js

Version:

Single cell RNA-seq analysis in Javascript

199 lines (180 loc) 7.12 kB
import * as utils from "./utils.js"; /** * Guess the identity of the features from their names. * * @param {Array} features - Array of strings containing feature identifiers, typically Ensembl IDs or gene symbols. * Elements may also be `null` or undefined if an identifier is missing. * @param {object} [options={}] - Optional parameters. * @param {boolean} [options.forceTaxonomy=false] - Whether to force the use of taxonomy IDs for human and mouse. * This is `false` for back compatibility. * * @return {object} An object containing: * * - `species`, the inferred species as a string. * This can be either `"human"` or `"mouse"`, or an NCBI taxonomy ID (one of 6239, 10116, 9541, 7227, 7955, 9598). * If `forceTaxonomy = true`, human and mouse are replaced with 9606 and 10090, respectively. * - `type`: the feature identifier type. * This can either be `"ensembl"` or `"symbol"`. * - `confidence`: the percentage of entries in `x` that are consistent with the inferred identity. */ export function guessFeatures(features, options = {}) { const { forceTaxonomy = false, ...others } = options; utils.checkOtherOptions(others); let ntotal = features.length; let early_threshold = Math.ceil(ntotal / 2); let format = payload => { payload.confidence /= ntotal; return payload; }; // Duplicated entries only count as one match, so as to avoid problems with // chromosome positions, feature type specifications, etc. Note that we // still need to use the full length to compute 'ntotal', otherwise we // wouldn't be penalizing the duplicates properly. let unique_features = new Set; for (const f of features) { if (typeof f == "string") { unique_features.add(f); } } let ensembl_human = 0; let ensembl_mouse = 0; let ensembl_6239 = 0; let ensembl_10116 = 0; // Ensembl only, Rat symbols are indistiguishable from mice. let ensembl_9541 = 0; // Ensembl only, Mfac symbols are indistiguishable from human. let ensembl_7227 = 0; // Ensembl only, fly symbols are crazy. let ensembl_7955 = 0; let ensembl_9598 = 0; // Ensembl only, Chimp symbols are indistinguishable from human. let symbol_human = 0; let symbol_mouse = 0; let symbol_6239 = 0; let symbol_7955 = 0; let hsid = (forceTaxonomy ? "9606" : "human"); let mmid = (forceTaxonomy ? "10090" : "mouse"); let collected = []; // Checking if it's any type of Ensembl. let any_ens = 0; for (const x of unique_features) { if (x && x.match(/^ENS[A-Z]*G[0-9]{11}$/)) { any_ens++; } } if (any_ens) { for (const x of unique_features) { if (x) { if (x.startsWith("ENSG")) { ensembl_human++; } else if (x.startsWith("ENSMUSG")) { ensembl_mouse++; } else if (x.startsWith("ENSRNOG")) { ensembl_10116++; } else if (x.startsWith("ENSMFAG")) { ensembl_9541++; } else if (x.startsWith("ENSDARG")) { ensembl_7955++; } else if (x.startsWith("ENSPTRG")) { ensembl_9598++; } } } collected.push({ species: hsid, type: "ensembl", confidence: ensembl_human }); collected.push({ species: mmid, type: "ensembl", confidence: ensembl_mouse }); collected.push({ species: "10116", type: "ensembl", confidence: ensembl_10116 }); collected.push({ species: "9541", type: "ensembl", confidence: ensembl_9541 }); collected.push({ species: "7955", type: "ensembl", confidence: ensembl_7955 }); collected.push({ species: "9598", type: "ensembl", confidence: ensembl_9598 }); // See if we can quit early and avoid the other checks. for (const x of collected) { if (x.confidence >= early_threshold) { return format(x); } } } // Human symbol; starts with upper case, no lower case, and not an Ensembl of any kind. // We also ignore VEGA gene identifiers, as these are antiquated; and MGI identifiers, // which are all-caps and thus confusing. for (const x of unique_features) { if (x && x.match(/^[A-Z][^a-z]+$/) && !x.match(/^ENS[A-Z]+[0-9]{11}/) && !x.match(/^OTT.{4}[0-9]{11}/) && !x.match(/^MGI:[0-9]+/)) { symbol_human++; } } { let payload = { species: hsid, type: "symbol", confidence: symbol_human }; if (payload.confidence >= early_threshold) { return format(payload); } collected.push(payload); } // Mouse symbol; starts with upper case, but no upper case after that. for (const x of unique_features) { if (x && x.match(/^[A-Z][^A-Z]+$/)) { symbol_mouse++; } } { let payload = { species: mmid, type: "symbol", confidence: symbol_mouse }; if (payload.confidence >= early_threshold) { return format(payload); } collected.push(payload); } // Worm Ensembl (WormBase). for (const x of unique_features) { if (x && x.match(/^WBGene[0-9]+$/)) { ensembl_6239++; } } { let payload = { species: "6239", type: "ensembl", confidence: ensembl_6239 }; if (payload.confidence >= early_threshold) { return format(payload); } collected.push(payload); } // Fly Ensembl (FlyBase). for (const x of unique_features) { if (x && x.match(/^FBgn[0-9]+$/)) { ensembl_7227++; } } { let payload = { species: "7227", type: "ensembl", confidence: ensembl_7227 }; if (payload.confidence >= early_threshold) { return format(payload); } collected.push(payload); } // Worm symbols; at least three lower case with a dash and numbers. for (const x of unique_features) { if (x && x.match(/^[a-z]{3,}-[0-9]+$/)) { symbol_6239++; } } { let payload = { species: "6239", type: "symbol", confidence: symbol_6239 }; if (payload.confidence >= early_threshold) { return format(payload); } collected.push(payload); } // Zebrafish symbols; at least three lower case letters, no dash, followed by numbers and/or more lower case. for (const x of unique_features) { if (x && x.match(/^[a-z]{3,}[0-9a-z]+$/)) { symbol_7955++; } } { let payload = { species: "7955", type: "symbol", confidence: symbol_7955 }; if (payload.confidence >= early_threshold) { return format(payload); } collected.push(payload); } // Picking the best. let highest = collected[0]; for (var i = 1; i < collected.length; i++) { if (collected[i].confidence > highest.confidence) { highest = collected[i]; } } return format(highest); }