@plantinformatics/vcf-genotype-brapi
Version:
Client and server functions to access genotype data from VCF via a custom web API and BrAPI
709 lines (647 loc) • 27 kB
JavaScript
// import { promisifyFn as promisify } from './promisify';
// const util = require('util');
// const util = {promisify}; // require('util');
import { promisify } from 'util';
import intervalTree1d from "interval-tree-1d";
const createIntervalTree = intervalTree1d;
// import createIntervalTree from "interval-tree-1d";
console.log('createIntervalTree', createIntervalTree, intervalTree1d);
//------------------------------------------------------------------------------
import { parseStringFields } from './parseStringFields';
import * as childProcessProgressive from '@plantinformatics/child-process-progressive/dist/child-process-progressive.mjs';
console.log('childProcessProgressive', childProcessProgressive);
const { ErrorStatus } = childProcessProgressive.default.errorStatus; // = require('./errorStatus.js');
/*
function childProcess() { }
function dataOutReplyClosureLimit() { }
function dataReduceClosure() { }
function stringCountString() { }
*/
const /*import*/ {
childProcess, dataOutReplyClosureLimit, dataOutReplyClosure, dataReduceClosure,
stringCountString,
} = childProcessProgressive.default.childProcess;
// from '@plantinformatics/child-process-progressive/dist/child-process-progressive.mjs'; // ../utilities/child-process
import intervalBins from 'interval-bins/dist/interval-bins.mjs';
console.log('interval-bins', intervalBins);
const /* import*/ { binEvenLengthRound, binBoundaries } = intervalBins; // from 'interval-bins';
//------------------------------------------------------------------------------
function callOut(command, datasetId, scope, preArgs, cb) {
childProcess(
'vcfGenotypeLookup.bash',
/* postData */ '',
/* useFile */ false,
/* fileName */ undefined,
/* moreParams */ [command, datasetId, scope, /*isecFlags*/ '', /*isecDatasetIds*/''].concat(preArgs || []),
dataOutReplyClosure(cb), cb, /*progressive*/ false);
}
const callOutP = promisify(callOut);
/** Request samples, filtered by genotype values at selected SNPs, i.e. filter by required haplotype
* handle multiple SNPs :
* group SNPs by feature Symbol matchRef into alt and ref; for both issue a samples request with filter
* sample is OK if it is in all results, i.e. appears once for each SNP
* (allow some number of missing : filter.allowMissing)
*
* @param datasetId name of parent or view dataset, or vcf directory name
* @param scope e.g. '1A'; identifies the vcf file, i.e. datasetId/scope.vcf.gz
* @param filter optional filter e.g. haplotype
* {features : array of SNP {position, matchRef}, matchHet : true/false, allowMissing : not yet defined or used }
* The caller will place the feature which filters out the most samples first,
* and this will be used in the first request.
* @return promise yielding array of sample names
*/
export { vcfGenotypeSamplesFiltered }
function vcfGenotypeSamplesFiltered(datasetId, scope, filter) {
const fnName = 'vcfGenotypeSamplesFiltered';
/** result */
let promise;
if (filter) {
parseStringFields(filter, ['matchHet']);
const matchHet = filter.matchHet;
filter.features.forEach(f => parseStringFields(f, ['position', 'matchRef']));
/** The purpose of allowing the caller to nominate the first SNP to filter
* on, by listing it first, is that the 2nd query can be limited to the
* result of the first, i.e. filteredSamples; see comment below. */
const refFirst = filter.features[0].matchRef;
/** array of feature positions. index is matchRef, i.e. [0] is Alt, [1] is Ref */
const groupedFilters = filter.features.reduce((grouped, feature) => {
grouped[feature.matchRef].push(feature.position);
return grouped;
}, {true : [], false : []});
const first = groupedFilters[refFirst];
/** @return regexp to be used by grep. '.' will match | / etc */
function refToGenotype(matchRef, matchHet) {
const
/** map {false,true} -> {1,0} */
value = + ! matchRef,
/** vcfGenotypeLookup.bash uses pattern as gtMatch in : '\t'"$gtMatch"'$' */
pattern = matchHet ?
'.*' + value + '.*' :
value + '.' + value;
return pattern;
}
function groupCall(group, matchRef) {
const
/** GT= is recognised by vcfGenotypeLookup.bash to set gtMatch.
* similar to --include 'GT="1/1"' but that filters SNPs not samples. */
include = 'GT=' + refToGenotype(matchRef, matchHet),
regions = group.map(position => scope + ':' + position).join(','),
preArgs = ['-r'].concat(regions).concat([include]),
p = callOutP('filter_samples', datasetId, scope, preArgs);
console.log(fnName, preArgs.join(' '));
return p;
}
const counts = {};
function countSamples(a) {
a.forEach(s => {
if (counts[s] == undefined) {
counts[s] = 1;
} else {
counts[s]++;
}
});
}
/** Number of SNPs queried so far; filter requires counts to match this. */
let nSNPs = first.length;
function query(group, matchRef) {
const
promise =
groupCall(group, matchRef)
.then(samplesValues => {
// samplesValues.replaceAll(/\t.../g, '');
/* split on (tab genotype newline), then
* trim off the '' created from trailing (... newline).
*/
/** Sample names which matched in the first query. */
const matchedSamples = samplesValues.split(/\t...\n/g);
if (matchedSamples.at(-1) === '') {
matchedSamples.pop();
}
/* count and filter for those with count === #SNPs in first query */
countSamples(matchedSamples);
/** Sample names will appear multiple times, once for each SNP genotype which they match.
* Array.from(new Set( )) preserves order, which is preferable for GUI consistency. */
const uniqSamples = Array.from(new Set(matchedSamples));
/* first.length is the number of SNPs in the first query,
* i.e. groupedFilters[refFirst] */
// next : >= nSNPs - allowMissing
const filteredSamples = uniqSamples.filter(s => counts[s] === nSNPs);
console.log(fnName, filteredSamples.length, nSNPs, uniqSamples.length, matchedSamples.length);
return filteredSamples;
});
return promise;
}
/** Map the array of sample names to the result format of the existing
* genotypeSamples endpoint. */
function samplesToResult(samples) {
return samples.join('\n');
}
promise =
query(first, refFirst)
.then(firstSamples => {
/** if filteredSamples.length < 100 it might be a good optimisation to
* narrow the 2nd query to filteredSamples. See comment re. refFirst. */
const
secondMatch = ! refFirst,
second = groupedFilters[secondMatch];
nSNPs += second.length;
let result;
if (! second.length) {
result = samplesToResult(firstSamples);
} else {
result = query(second, secondMatch)
.then(samples => samplesToResult(samples));
}
return result;
});
} else {
// -l, --list-samples: list sample names and exit
promise = callOutP('query', datasetId, scope, /*preArgs*/ ['-l']);
}
return promise;
}
//------------------------------------------------------------------------------
/** Given a list of selected SNPs, request the list of unique haplotype values
* across those positions.
* The output includes the count and list of samples which have each haplotype value.
*
* @param datasetId name of parent or view dataset, or vcf directory name
* @param scope e.g. '1A'; identifies the vcf file, i.e. datasetId/scope.vcf.gz
* @param positions required array of positions of selected SNPs
*
* @return promise yielding array of sample names
*/
export { vcfGenotypeHaplotypesSamples }
function vcfGenotypeHaplotypesSamples(datasetId, scope, positions) {
const fnName = 'vcfGenotypeHaplotypesSamples';
/** The positions are received as strings, and can be used in that form;
* JSON.parse() sanitises the values somewhat. */
positions = positions.map(p => JSON.parse(p));
function positionsCall(positions) {
/** Extract from vcfGenotypeSamplesFiltered() : groupCall, with group -> positions */
const
regions = positions.map(position => scope + ':' + position).join(','),
preArgs = ['-r'].concat(regions),
p = callOutP('haplotypes_samples', datasetId, scope, preArgs);
console.log(fnName, preArgs.join(' '));
return p;
}
/** If handling multiple Blocks (chromosomes) of a dataset then this might use
* 1 call per chr and combine results. (related : vcfGenotypeSamplesFiltered()
* combines results from multiple calls).
*
* Also, this function could get the list of sample names and map the sample
* numbers in the result to names, but that is easily done in the frontend,
* which keeps the reply small.
*/
const promise = positionsCall(positions);
return promise;
}
//------------------------------------------------------------------------------
/**
* @param datasetDir name of directory containing the VCF dataset
* @param scope e.g. '1A'; identifies the vcf file, i.e. datasetId/scope.vcf.gz
* scope===undefined or null signifies that all scopes of the dataset should be searched.
* @param preArgs args to be inserted in command line, additional to the datasetDir / vcf dir name.
* See comment in frontend/app/services/auth.js : vcfGenotypeLookup()
* @param nLines if defined, limit the output to nLines.
* @param dataOutCb passed to childProcess() - see comment there.
* If undefined, then dataOutReplyClosureLimit(cb, lineFilter, nLines) is used.
* @param cb
*/
function vcfGenotypeLookup(datasetDir, scope, preArgs_, nLines, dataOutCb, cb) {
/** Split out the optional parameters which are passed as separate params for
* processing separately to the remainder of preArgs, which are inserted as a
* list into the command. */
let {isecFlags, isecDatasetIds, ... preArgs} = preArgs_ || {};
const
fnName = 'vcfGenotypeLookup',
headerOnly = preArgs.headerOnly,
/** snpPolymorphismFilter is not applicable if SNPList because if the
* number of samples requested is <=1 then every row appears homozygous.
*/
snpPolymorphismFilter = ! preArgs.SNPList && preArgs.snpPolymorphismFilter,
/** These parameters are supported by view only, not query, so if
* present then view | query will be used.
* In that case moreParams will be passed to view, and paramsForQuery
* will be passed to query.
*/
viewRequired = snpPolymorphismFilter || preArgs.mafThreshold ||
preArgs.featureCallRateThreshold ||
preArgs.minAlleles !== undefined || preArgs.maxAlleles !== undefined ||
preArgs.typeSNP !== undefined,
command = headerOnly ? 'view' : preArgs.SNPList ?
(viewRequired ? 'counts_view' : 'counts_query') :
preArgs.requestFormat ? (viewRequired ? 'view_query' : 'query') : 'view';
/* isec is only meaningful with >1 datasets. The caller
* vcfGenotypeLookupDataset() only passes isecDatasetIds when
* isecDatasetIds.length > 1
*/
let isecDatasetIdsText = isecDatasetIds;
if (Array.isArray(isecDatasetIds) /*&& (isecDatasetIds.length > 1)*/) {
/** this is split in vcfGenotypeLookup.bash with tr '!' ' ' */
const datasetIdsSeparator = '!';
isecDatasetIdsText = isecDatasetIds.join(datasetIdsSeparator);
}
/** The params passed to spawn (node:child_process) are passed as options.args
* to ChildProcess.spawn (node:internal/child_process) which calls
* spawn(options) which converts non-strings to strings, e.g. arrays are
* joined with ',' into a single string. undefined -> 'undefined'.
*
* If scope is undefined then preArgs.datasetVcfFile is expected.
*/
let moreParams = [
command, datasetDir, scope || preArgs.datasetVcfFile,
isecFlags || '', isecDatasetIdsText || ''],
regionParams = scope ? ['-r', preArgs.region] : ['', ''];
moreParams = moreParams.concat(regionParams);
/** from BCFTOOLS(1) :
bcftools view [OPTIONS] file.vcf.gz [REGION [...]]
-h, --header-only
output the VCF header only
-H, --no-header
suppress the header in VCF output
bcftools query [OPTIONS] file.vcf.gz [file.vcf.gz [...]]
-H, --print-header
print header
* headerOnly implies command==='view' i.e. -h
* When ! headerOnly, the header is required;
* * for view : --with-header is default
* * for query : use -H
*/
const
headerOption = headerOnly ? /*command===view*/'-h' :
(command === 'view') ? '' : '-H';
if (preArgs.requestFormat) {
const
/** from BCFTOOLS(1) :
* %GT Genotype (e.g. 0/1)
* %TGT Translated genotype (e.g. C/A)
*/
formatGT = (preArgs.requestFormat === 'CATG') ? '%TGT' : '%GT',
/** now INFO/MAF is added if not present, by
* vcfGenotypeLookup.{bash,Makefile} : dbName2Vcf() / %.MAF.vcf.gz
* So requestInfo means just 'request INFO/tSNP' - no longer needed because
* to enable SNP filters to be applied in frontend also, request all of INFO/
* (until eb969a33 just INFO/MAF and INFO/tSNP were requested)
* Note that %INFO produces a column header '(null)' instead of 'INFO';
* this is handled in addFeaturesJson() in frontend/app/utils/data/vcf-feature.js.
*/
requestInfo = preArgs.requestInfo,
formatChromosome = scope ? '' : '%CHROM\t',
format = formatChromosome + '%ID\t%POS' + '\t%REF\t%ALT' +
'\t%INFO' +
'[\t' + formatGT + ']\n';
/** Params passed to query if view|query is used, otherwise to command. */
const paramsForQuery = ['-queryStart', headerOption, '-f', format, '-queryEnd'];
moreParams = moreParams.concat(paramsForQuery);
if (preArgs.snpNames?.length) {
const snpNames = ['-snpsStart'].concat(preArgs.snpNames).concat(['-snpsEnd']);
moreParams = moreParams.concat(snpNames);
}
if (headerOnly) {
moreParams.push('--force-samples');
}
/** default is no het filter, i.e. false */
if (snpPolymorphismFilter) {
moreParams.push('--genotype');
moreParams.push('het');
}
/** Just 1 --include or --exclude is permitted, so combine these
* mafThreshold and featureCallRateThreshold into 1 condition. */
const includeConditions = [];
const mafThresholdMax = 0.5;
/** default is no MAF filter, i.e. >= 0, (0 <= MAF <= 0.5)
* Also omit when condition is <= 0.5 (i.e. .mafUpper && .mafThreshold === mafThresholdMax).
*/
if ((preArgs.mafThreshold !== undefined) &&
(preArgs.mafThreshold !== (preArgs.mafUpper ? mafThresholdMax : 0))) {
const
/** --min-af and --max-af uses "INFO/AC and INFO/AN when
* available or FORMAT/GT" quoting BCFTOOLS(1), whereas
* --include MAF< / > may utilise INFO/MAF for example ? not clear so using INFO/MAF.
* Related : mafThresholdText() (components/panel/manage-genotype.js)
*/
afOption = 'INFO/MAF' + (preArgs.mafUpper ? '<=' : '>=') + preArgs.mafThreshold;
includeConditions.push(afOption);
}
if (preArgs.featureCallRateThreshold) {
const
/** equivalent to INFO/CR :
* N_PASS(GT!="./.")/N_SAMPLES
* F_PASS(GT!="./.")
* INFO/F_MISSING is converse of INFO/CR, so the following expression is
* equivalent to : INFO/CR >= .featureCallRateThreshold
*/
fcrOption = 'INFO/F_MISSING < ' + (1 - preArgs.featureCallRateThreshold);
includeConditions.push(fcrOption);
}
if (includeConditions.length) {
moreParams.push('--include'); // aka. -i
moreParams.push(includeConditions.join(' && '));
}
if (preArgs.minAlleles !== undefined) {
moreParams.push('--min-alleles');
moreParams.push(preArgs.minAlleles);
}
if (preArgs.maxAlleles !== undefined) {
moreParams.push('--max-alleles');
moreParams.push(preArgs.maxAlleles);
}
if (preArgs.typeSNP) {
moreParams.push("--types");
moreParams.push("snps");
}
}
const samples = preArgs.samples;
if (samples?.length) {
const
samplesJoined = samples
.trimEnd(/\n/)
.replaceAll('\n', ',');
moreParams = moreParams.concat('-s', samplesJoined);
} else if (preArgs.requestSamplesAll) {
// bcftools default is All samples, no option required.
} else {
// There is not an option for 0 samples, except via using an empty file :
moreParams = moreParams.concat('-S', '/dev/null');
}
/** avoid tracing samples, and moreParams[9] which is the samples. */
console.log(fnName, datasetDir, preArgs.region, preArgs.requestFormat, samples?.length, moreParams.slice(0, 9+3));
if (! dataOutCb) {
const lineFilter = false && preArgs.snpPolymorphismFilter ? snpPolymorphismFilter : null;
dataOutCb = dataOutReplyClosureLimit(cb, lineFilter, nLines);
}
childProcess(
'vcfGenotypeLookup.bash',
/* postData */ '',
/* useFile */ false,
/* fileName */ undefined,
moreParams,
dataOutCb, cb, /*progressive*/ true);
}
export { vcfGenotypeLookup }
//------------------------------------------------------------------------------
/** Count features of the given block in bins.
*
* @param block object instance of Block model
* @param interval bin boundaries within this range
* @param nBins number of bins to group block's features into
* @param isZoomed
* @param userOptions optional. user settings : {mafThreshold, snpPolymorphismFilter}
*
* @return Promise yielding : Array : binned feature counts
* in the same format as block-features.js :
* blockFeaturesCounts(), @see vcfGenotypeFeaturesCounts()
* $bucket :
* { "_id" : 33000000, "count" : 38201, "idWidth" : [ 1000000 ] }
* { "_id" : 34000000, "count" : 47323, "idWidth" : [ 1000000 ] }
*/
export { vcfGenotypeFeaturesCounts }
function vcfGenotypeFeaturesCounts(
block, interval, nBins = 10, isZoomed, userOptions, cb) {
// header comment copied from block-features.js : blockFeaturesCounts()
const fnName = 'vcfGenotypeFeaturesCounts';
let result;
// default interval can be the whole domain of the block
if (! interval || interval.length !== 2) {
const
errorText = 'Interval is required. ' + JSON.stringify(interval),
error = new ErrorStatus(400, errorText);
result = error;
} else {
if (interval[0] > interval[1]) {
console.warn(fnName, 'reverse interval', interval, block.id);
let swap = interval[0];
interval[0] = interval[1];
interval[1] = swap;
}
const
scope = block.name,
datasetDir = block.datasetId,
// may be able to omit domainInteger if ! isZoomed
domainInteger = interval.map((d) => d.toFixed(0)),
region = scope + ':' + domainInteger.join('-'),
preArgs = {region, samples : null, requestFormat : 'CATG', SNPList : true},
// arguments 1-3 are used : block, interval, nBins
summary = new vcfToSummary(...arguments);
if (userOptions) {
Object.entries(userOptions).forEach(([key, value]) =>
{ if (value !== undefined) { preArgs[key] = value; } });
}
function sumCb(error, text) {
let result;
if (error) {
throw error;
} else if (text === undefined) {
result = summary.summarise();
} else {
summary.accumulateChunk(text);
}
return result;
}
const [blockArg, ...intervalArgs] = arguments;
const dataOutCb = dataReduceClosure(sumCb);
vcfGenotypeLookup(
datasetDir, scope,
preArgs, /*nLines*/undefined, dataOutCb, cb
);
/* vcfGenotypeLookup() includes %REF\t%ALT, which could be omitted in this case. */
}
return result;
};
const symbolCount = Symbol.for('count');
class vcfToSummary {
/**
* @param interval domainInteger
*/
constructor(block, interval, nBins) {
const
fnName = 'vcfToSummary',
lengthRounded = binEvenLengthRound(interval, nBins),
boundaries = binBoundaries(interval, lengthRounded),
/** map the boundaries into interval [start, end] pairs. */
intervals = boundaries.map((b, i, a) => (i ? [a[i-1], b] : undefined))
.slice(1, boundaries.length-1);
intervals.forEach((interval) => interval[Symbol.for('count')] = 0);
// console.log(fnName, block.id, lengthRounded, boundaries, intervals);
// set up bins and interval tree
this.summaryTree = createIntervalTree(intervals);
}
}
vcfToSummary.prototype.accumulateChunk = function (text) {
/** text has \n and \t, column format e.g. :
* # [1]ID [2]POS [3]REF [4]ALT
* scaffold38755_1190119 1190119 C T
*/
text.split('\n')
.forEach((line, i) => {
/* first line of first chunk is header line, for subsequent chunks match /^#/
* last line of chunk may be incomplete - save it to prepend to first line of next chunk.
*/
// skip header line
if (i) {
// add line to interval of summaryTree;
const
cols = line.split('\t'),
position = +cols[1];
this.summaryTree.queryInterval(position, position, addToInterval);
function addToInterval(interval) {
interval[symbolCount]++;
}
}
});
};
/**
* @return summary array, in the same format as block-features.js :
* blockFeaturesCounts(), @see vcfGenotypeFeaturesCounts()
*/
vcfToSummary.prototype.summarise = function() {
const
summaryArray = this.summaryTree.intervals
.sort((a, b) => a[0] - b[0])
.map(
(interval) =>
({
_id : interval[0],
count : interval[symbolCount],
idWidth : [interval[1] - interval[0]]
}));
return summaryArray;
};
//------------------------------------------------------------------------------
/** Count sample genotype values 0 and 2 (number of copies of Alt).
* Filter the line out if it is monomorphic, i.e. either the number of 0's or
* the number of 2's is 0.
* @param line result of split('\n'), expected to be a string
* @return undefined or null if the line should be filtered out,
* otherwise return truthy (returning the line because lineFilter signature
* could be changed to filter&map).
*/
function snpPolymorphismFilter(line) {
if (line.startsWith('#')) {
return line;
}
const
/** e.g. # [1]ID\t[2]POS\t[3]REF\t[4]ALT\t[5]tSNP\t[6]MAF\t[7]Exo
* values are genotype call values of the samples
*/
[/*chr,*/ name, position, ref, alt, tSNP, MAF, ...values] = line.split('\t');
let
counts = values.reduce((result, value) => {
/* Number of columns before sample genotype values may vary, so skip values
* which don't match the expected format for genotype values. */
if (value.match(/[012ACTG]\/[012ACTG]/)) {
/* if (requestFormat === 'CATG') {
altCopies = stringCountString(value, alt);
}
*/
const altCopies = stringCountString(value, '1');
result[altCopies]++;
}
return result;
}, [0, 0, 0]),
monomorphic = ! counts[0] || ! counts[2];
return ! monomorphic && line;
}
//------------------------------------------------------------------------------
/** Get the status of .vcf.gz files for this dataset.
* Related : vcfGenotypeFeaturesCounts().
*/
function vcfGenotypeFeaturesCountsStatus(datasetDir, cb) {
const
fnName = 'vcfGenotypeFeaturesCountsStatus',
command = 'status',
moreParams = [
command, datasetDir, /*scope*/'',
/*isecFlags*/'', /*isecDatasetIds*/''];
/** Receive the combined result (progressive===false).
* For non-progressive (expect that the result is in a single chunk) could use
* dataReduceClosure() to catenate chunks.
* @param combined Buffer
*/
function dataOutCb(combined, cb) {
// console.log(fnName, 'dataOutCb', combined);
const text = combined.toString();
cb(null, text);
}
childProcess(
'vcfGenotypeLookup.bash',
/* postData */ '',
/* useFile */ false,
/* fileName */ undefined,
moreParams,
dataOutCb, cb, /*progressive*/ false);
};
export { vcfGenotypeFeaturesCountsStatus }
const vcfGenotypeFeaturesCountsStatusP = promisify(vcfGenotypeFeaturesCountsStatus);
//------------------------------------------------------------------------------
export { checkVCFsAreInstalled }
/** Check if base VCF and SNPLists are installed for any VCF datasets in datasets.
* The requirement for SNPLists is only applied if the base VCF is large.
* vcfGenotypeLookup.{bash,Makefile} will automatically generate
* .MAF.SNPList.vcf.gz if it is not present.
* If the size of the base .vcf.gz is such that this will take > ~5mins then
* require the user to install this .MAF.SNPList.vcf.gz before uploading the VCF
* worksheet.
* @return a promise yielding datasets status, with VCF datasets which are not
* installed having status falsey
*/
function checkVCFsAreInstalled(datasets, status) {
const
fnName = 'checkVCFsAreInstalled',
checkPs = datasets.map(dataset => {
console.log(fnName, dataset.name, dataset.tags);
const
isVCF = dataset.tags?.includes('VCF'),
checkP = ! isVCF ? Promise.resolve(true) :
vcfGenotypeFeaturesCountsStatusP(dataset.name)
.then(vcfStatus => {
const
status = statusToObj(vcfStatus),
notInstalled = dataset.blocks.filter(block => {
const
chrName = block.name,
s = status[chrName],
/** size and time of chr base .vcf.gz e.g. ' 354566 Sep 12 16:20' */
sizeTime = s?.[''] ,
sizeMatch = sizeTime?.match(/^ *([0-9]+)/),
small = ! sizeMatch || (+sizeMatch[1] < 100e6),
ok = small || s['.MAF.SNPList'];
return ! ok;
});
console.log(dataset.name, notInstalled, status, vcfStatus);
return ! notInstalled.length;
});
return checkP;
});
return checkPs;
}
//------------------------------------------------------------------------------
/** Construct a mapping from chr name to a list of suffixes of available .vcf.gz
* files for that chromosome.
*/
function statusToObj(vcfStatus) {
const
fnName = 'statusToObj',
/** extract from frontend/app/utils/data/vcf-files.js : statusToMatrix() */
a = vcfStatus.split('\n'),
/** collated into a summary object[chrName][colName] -> sizeTime
* This has the same information as map; combined with cols[] this enables
* producing a matrix with sorted column names.
*/
summary = a.reduce((s, line) => {
const
m = line.match(/(.*) ([^.]+)(.*).vcf.gz(.*)/);
if (m) {
const
[whole, sizeTime, chrName, suffix, csi] = m,
colName = (suffix + csi), // .replaceAll('.', unicodeDot),
chr = s[chrName] || (s[chrName] = {});
s[chrName][colName] = sizeTime;
}
return s;
}, {});
return summary;
}
//------------------------------------------------------------------------------