UNPKG

bakana

Version:

Backend for kana's single-cell analyses. This supports single or multiple samples, execution in Node.js or the browser, in-memory caching of results for iterative analyses, and serialization to/from file for redistribution.

384 lines (337 loc) 16.6 kB
import * as scran from "scran.js"; import * as bioc from "bioconductor"; import * as utils from "./utils/general.js"; import * as inputs_module from "./inputs.js"; import * as rutils from "../readers/index.js"; const baseUrl = "https://github.com/kanaverse/kana-special-features/releases/download/v1.0.0"; export const step_name = "rna_quality_control"; /** * Results of computing per-cell RNA-derived QC metrics, * see [here](https://kanaverse.github.io/scran.js/PerCellRnaQcMetricsResults.html) for details. * * @external PerCellRnaQcMetricsResults */ /** * Suggested filters for the RNA-derived QC metrics, * see [here](https://kanaverse.github.io/scran.js/SuggestRnaQcFiltersResults.html) for details. * * @external SuggestRnaQcFiltersResults */ /** * This step applies quality control on the RNA count matrix. * Specifically, it computes the QC metrics and filtering thresholds, * wrapping the [`perCellRnaQcMetrics`](https://kanaverse.github.io/scran.js/global.html#perCellRnaQcMetrics) * and [`suggestRnaQcFilters`](https://kanaverse.github.io/scran.js/global.html#suggestRnaQcFilters) functions * from [**scran.js**](https://github.com/kanaverse/scran.js). * Note that the actual filtering is done by {@linkplain CellFilteringState}. * * Methods not documented here are not part of the stable API and should not be used by applications. * @hideconstructor */ export class RnaQualityControlState { #inputs; #cache; #parameters; constructor(inputs, parameters = null, cache = null) { if (!(inputs instanceof inputs_module.InputsState)) { throw new Error("'inputs' should be an InputsState object"); } this.#inputs = inputs; this.#parameters = (parameters === null ? {} : parameters); this.#cache = (cache === null ? {} : cache); this.changed = false; } free() { utils.freeCache(this.#cache.metrics); utils.freeCache(this.#cache.filters); utils.freeCache(this.#cache.metrics_buffer); utils.freeCache(this.#cache.keep_buffer); } /*************************** ******** Getters ********** ***************************/ valid() { let input = this.#inputs.fetchCountMatrix(); return input.has("RNA"); } /** * @return {object} Object containing the parameters. */ fetchParameters() { let output = { ...this.#parameters }; // avoid pass-by-reference links. output.species = bioc.CLONE(output.species); return output; } /** * @return {Uint8WasmArray} Buffer containing a vector of length equal to the number of cells, * where each element is truthy if the corresponding cell is to be retained after filtering. * This is available after running {@linkcode RnaQualityControlState#compute compute}. */ fetchKeep() { return this.#cache.keep_buffer; } /** * @return {external:SuggestRnaQcFiltersResults} Result of filtering on the RNA-derived QC metrics. * This is available after running {@linkcode RnaQualityControlState#compute compute}. */ fetchFilters() { return this.#cache.filters; } /** * @return {external:PerCellRnaQcMetricsResults} RNA-derived QC metrics, * available after running {@linkcode RnaQualityControlState#compute compute}. */ fetchMetrics() { return this.#cache.metrics; } /**************************** ******** Defaults ********** ****************************/ /** * @return {object} Object containing default parameters, * see the `parameters` argument in {@linkcode RnaQualityControlState#compute compute} for details. */ static defaults () { return { guess_ids: true, gene_id_column: null, use_reference_mito: true, species: [], gene_id_type: "ENSEMBL", mito_prefix: "mt-", filter_strategy: "automatic", nmads: 3, sum_threshold: 100, detected_threshold: 100, mito_threshold: 0.2 }; } static configureFeatureParameters(use_reference_mito, guesses) { let best_key = null; let best = { type: "symbol", species: "9606", confidence: 0 }; if ("row_names" in guesses) { let val = guesses.row_names; if (val.confidence > best.confidence && (use_reference_mito || val.type == "symbol")) { best = val; } } for (const [key, val] of Object.entries(guesses.columns)) { if (val.confidence > best.confidence && (use_reference_mito || val.type == "symbol")) { best = val; best_key = key; } } return { gene_id_column: best_key, species: [best.species], gene_id_type: best.type.toUpperCase() }; } /** * Array of strings containing the taxonomy IDs for species where mitochondrial gene lists are available. * @type {Array} */ static mitochondriaSpecies = [ "9606", // Mouse "10090", // Human "6239", // C. elegans "10116", // Rat "9541", // M. fascicularis "7227", // Fly "7955", // Zebrafish "9598" // Chimp ]; /*************************** ******** Remotes ********** ***************************/ async #acquire_reference(species, feature_type) { let output = new Set; let mito_lists = RnaQualityControlState.#mito_lists; for (const s of species) { let target = s + "-mito-" + feature_type.toLowerCase() + ".txt.gz"; if (!(target in mito_lists)) { let contents = await RnaQualityControlState.#downloadFun(baseUrl + "/" + target); let lines = await rutils.readLines2(contents, { compression: "gz" }); mito_lists[target] = lines; } mito_lists[target].forEach(x => { output.add(x); }); } return output; } static #mito_lists = {}; /** * Flush all cached lists of mitochondrial genes. * * By default, {@linkcode RnaQualityControlState#compute compute} will cache the mitochondrial gene lists in a static member for re-use across {@linkplain RnaQualityControlState} instances. * These cached lists are not tied to any single instance and will not be removed by garbage collectors or by {@linkcode freeAnalysis}. * Rather, this function should be called to release the relevant memory. */ static flush() { RnaQualityControlState.#mito_lists = {}; return; } static #downloadFun = utils.defaultDownload; /** * Specify a function to download the reference mitochondrial gene lists. * * @param {function} fun - Function that accepts a single string containing a URL and returns any value that can be used in the {@linkplain SimpleFile} constructor. * This is most typically a Uint8Array of that URL's contents, but it can also be a path to a locally cached file on Node.js. * * @return `fun` is set as the global downloader for this step. * The _previous_ value of the downloader is returned. */ static setDownload(fun) { let previous = RnaQualityControlState.#downloadFun; RnaQualityControlState.#downloadFun = fun; return previous; } /*************************** ******** Compute ********** ***************************/ /** * This method should not be called directly by users, but is instead invoked by {@linkcode runAnalysis}. * * @param {object} parameters - Parameter object, equivalent to the `rna_quality_control` property of the `parameters` of {@linkcode runAnalysis}. * @param {boolean} [parameters.guess_ids] - Automatically choose feature-based parameters based on the feature annotation for the RNA modality. * If set to `true`, the following logic is applied: * * - If `use_reference_mito = true`, the annotation column that best matches human/mouse Ensembl/symbols is set as `gene_id_column`. * Based on the identified species and feature type, `species` and `gene_id_type` are also set. * - If `use_reference_mito = false`, the annotation column that best matches human/mouse symbols is set as `gene_id_column`. * * @param {?(string|number)} [parameters.gene_id_column] - Name or index of the column of the feature annotations that contains the gene identifiers for the RNA modality. * If `null`, the row names are used. * Ignored if `guess_ids = true`. * @param {boolean} [parameters.use_reference_mito] - Whether to use the reference lists of mitochondrial genes. * If `false`, mitochondrial genes are instead identified from their prefix. * @param {Array} [parameters.species] - Array of strings specifying zero, one or more species to use to obtain a reference list of mitochondrial genes. * Each entry should be a taxonomy ID (e.g. `"9606"`, `"10090"`) as specified in {@linkcode RnaQualityControlState#mitochondriaSpecies mitochondriaSpecies}). * Ignored if `guess_ids = true`. * @param {string} [parameters.gene_id_type] - Name of the feature type in the reference list of mitochondrial genes. * This can be any one of `"ENSEMBL"`, `"SYMBOL"`, or `"ENTREZ"`. * Ignored if `guess_ids = true`. * @param {?string} [parameters.mito_prefix] - Case-insensitive prefix to use to identify mitochondrial genes from the dataset. * Only used when `use_reference_mito = false`; in such cases, `gene_id_column` should point to symbols. * If `null`, no prefix-based identification is performed. * @param {string} [parameters.filter_strategy] - Strategy for defining a filter threshold for the QC metrics. * This can be `"automatic"` or `"manual"`. * @param {number} [parameters.nmads] - Number of MADs to use for automatically selecting the filter threshold for each metric. * Only used when `filter_strategy = "automatic"`. * @param {number} [parameters.sum_threshold] - Manual threshold on the sum of counts for each cell. * Cells are only retained if their sums are equal to or greater than this threshold. * Only used when `filter_strategy = "manual"`. * @param {number} [parameters.detected_threshold] - Manual threshold on the detected number of features for each cell. * Cells are only retained if the detected number is equal to or greater than this threshold. * Only used when `filter_strategy = "manual"`. * @param {number} [parameters.mito_threshold] - Manual threshold on the mitochondrial proportion for each cell. * Cells are only retained if their totals are less than or equal to this threshold. * Only used when `filter_strategy = "manual"`. * * @return The object is updated with the new results. * @async */ async compute(parameters) { parameters = utils.defaultizeParameters(parameters, RnaQualityControlState.defaults(), ["automatic"]); this.changed = false; // Some back-compatibility here. if (typeof parameters.guess_ids == "undefined") { if ("automatic" in parameters) { parameters.guess_ids = parameters.automatic; } else { parameters.guess_ids = true; } } if ( this.#inputs.changed || parameters.guess_ids !== this.#parameters.guess_ids || parameters.use_reference_mito !== this.#parameters.use_reference_mito || ( !parameters.guess_ids && ( parameters.gene_id_column !== this.#parameters.gene_id_column || (!parameters.use_reference_mito && parameters.mito_prefix !== this.#parameters.mito_prefix) || ( parameters.use_reference_mito && ( utils.changedParameters(parameters.species, this.#parameters.species) || parameters.gene_id_type !== this.#parameters.gene_id_type ) ) ) ) ) { utils.freeCache(this.#cache.metrics); if (this.valid()) { let gene_id_column2 = parameters.gene_id_column; let species2 = parameters.species; let gene_id_type2 = parameters.gene_id_type; if (parameters.guess_ids) { let guesses = this.#inputs.guessRnaFeatureTypes(); let backcomp = RnaQualityControlState.configureFeatureParameters(parameters.use_reference_mito, guesses); gene_id_column2 = backcomp.gene_id_column; species2 = backcomp.species; gene_id_type2 = backcomp.gene_id_type; } var gene_info = this.#inputs.fetchFeatureAnnotations()["RNA"]; let val = (gene_id_column2 == null ? gene_info.rowNames() : gene_info.column(gene_id_column2)); var subsets = utils.allocateCachedArray(gene_info.numberOfRows(), "Uint8Array", this.#cache, "metrics_buffer"); subsets.fill(0); if (val !== null) { if (parameters.use_reference_mito) { let lists = await this.#acquire_reference(species2, gene_id_type2); var sub_arr = subsets.array(); val.forEach((x, i) => { if (lists.has(x)) { sub_arr[i] = 1; } }); } else if (parameters.mito_prefix !== null) { var lower_mito = parameters.mito_prefix.toLowerCase(); var sub_arr = subsets.array(); val.forEach((x, i) => { if(x.toLowerCase().startsWith(lower_mito)) { sub_arr[i] = 1; } }); } } var mat = this.#inputs.fetchCountMatrix().get("RNA"); this.#cache.metrics = scran.perCellRnaQcMetrics(mat, [subsets]); this.changed = true; } else { delete this.#cache.metrics; } } if (this.changed || parameters.filter_strategy !== this.#parameters.filter_strategy || parameters.nmads !== this.#parameters.nmads || parameters.sum_threshold !== this.#parameters.sum_threshold || parameters.detected_threshold !== this.#parameters.detected_threshold || parameters.mito_threshold !== this.#parameters.mito_threshold ) { utils.freeCache(this.#cache.filters); if (this.valid()) { let block = this.#inputs.fetchBlock(); if (parameters.filter_strategy === "automatic") { this.#cache.filters = scran.suggestRnaQcFilters(this.#cache.metrics, { numberOfMADs: parameters.nmads, block: block }); } else if (parameters.filter_strategy === "manual") { let block_levels = this.#inputs.fetchBlockLevels(); this.#cache.filters = scran.emptySuggestRnaQcFiltersResults(1, block_levels === null ? 1 : block_levels.length); this.#cache.filters.sum({ copy: false }).fill(parameters.sum_threshold); this.#cache.filters.detected({ copy: false }).fill(parameters.detected_threshold); this.#cache.filters.subsetProportion(0, { copy: false }).fill(parameters.mito_threshold); } else { throw new Error("unknown RNA QC filtering strategy '" + filter_strategy + "'"); } var keep = utils.allocateCachedArray(this.#cache.metrics.numberOfCells(), "Uint8Array", this.#cache, "keep_buffer"); this.#cache.filters.filter(this.#cache.metrics, { block: block, buffer: keep }); this.changed = true; } else { delete this.#cache.filters; } } this.#parameters = parameters; this.#parameters.species = bioc.CLONE(this.#parameters.species); // avoid pass-by-reference behavior. return; } }