/* @plantinformatics/vcf-genotype-brapi
 * Version:
 * Client and server functions to access genotype data from VCF via a custom web API and BrAPI
 * 1,090 lines (965 loc) • 41.3 kB
 * JavaScript
 */
import { pick } from 'lodash/object.js';
import { intervalMerge } from './interval-merge.js';
// import { get as Ember_get, set as Ember_set } from '@ember/object';
/*
function Ember_get(object, fieldName) { return object[fieldName]; }
function Ember_set(object, fieldName, value) { object[fieldName] = value; }
*/
/** Object field accessors supplied by the host framework.
 * Both are undefined until setFrameworkFunctions() has been called.
 */
let Ember_get, Ember_set;
export { setFrameworkFunctions };
/** Record the get / set accessor functions which this file uses to read and
 * write (possibly dotted) field paths of objects.
 * @param functions object with fields Ember_get and Ember_set
 */
function setFrameworkFunctions(functions) {
  ({ Ember_get, Ember_set } = functions);
  console.log('setFrameworkFunctions', 'Ember_get', Ember_get, 'Ember_set', Ember_set);
}
//------------------------------------------------------------------------------
/** Shorthand for debug-level console logging, used for tracing in this file. */
const dLog = console.debug;
/** Trace verbosity level. It is compared against thresholds in this file
 * (e.g. `trace > 2`) to decide whether a dLog() call is made; 0 is falsy and
 * disables the traces which test `trace &&`. */
const trace = 1;
//------------------------------------------------------------------------------
/** Number of columns in the vcf output before the first sample column.
 * Used to slice the sample names off the column names of a '#CHROM' header row. */
const nColumnsBeforeSamples = 9;
/** Symbol registered under the key 'callRate' in the global symbol registry.
 * Copied from components/panel/manage-genotype.js */
const callRateSymbol = Symbol.for('callRate');
//------------------------------------------------------------------------------
/** Map from vcf column name to Feature field name.
 * A dotted name, e.g. 'values.ref', addresses a field nested within the
 * Feature field named by the part before the dot.
 * Columns which are not listed here (e.g. INFO and the sample columns) are
 * given a 'values.' field name by the switch in addFeaturesJson().
*/
const vcfColumn2Feature = {
'CHROM' : 'blockId',
'POS' : 'value',
'ID' : '_name',
'REF' : 'values.ref',
'ALT' : 'values.alt',
};
//------------------------------------------------------------------------------
/** Map the column name '(null)' to 'INFO'
 *
 * Using --format %INFO outputs the whole of the INFO value; with the column
 * header name '(null)'
 * (when using e.g. %INFO/MAF, the column header name is the sub-field name, i.e. 'MAF')
 *
 * @param columnNames array of column header names (strings)
 * @return a new array in which each name exactly equal to '(null)' is replaced
 * by 'INFO'; other names are unchanged. The given array is not modified.
 */
function columnNameINFOFix(columnNames) {
  // Strict comparison : only the literal header name '(null)' is renamed.
  return columnNames.map((name) => (name === '(null)' ? 'INFO' : name));
}
//------------------------------------------------------------------------------
/**
 * Test whether a genotype value is in numerical form,
 * e.g. 0, 1, 2, or 0/0, 0/1, 1/0, 1/1.
 * @param value is defined, and is a string
 * It is assumed to be well-formed - only the first char is checked.
 * @return true if the first character of value is '0', '1' or '2'
 */
function gtValueIsNumeric(value) {
  const firstChar = value[0];
  return (firstChar === '0') || (firstChar === '1') || (firstChar === '2');
}
//------------------------------------------------------------------------------
/** Sanitize datasetId for use as a CSS class name, by replacing punctuation
 * characters with underscore.
 *
 * The characters replaced are the ASCII ranges space to ',', '.' to '/',
 * ':' to '?', '[' to '^', '`', and '{' to '~'; i.e. '-', '_' and '@' are
 * retained, as are letters and digits.
 *
 * Used in genotype table column headers for the dataset colour rectangle (border-left).
 * This is in support of selecting dataset colour using datasetId instead of
 * hard-wiring it onto every element; this will support future plans for user
 * editing of dataset colour.
 *
 * @param datasetId string
 * @return datasetId with each matched character replaced by '_'
 */
function datasetId2Class(datasetId) {
  const punctuationPattern = /[ -,.-/:-?\[-^`{-~]/g;
  return datasetId.replace(punctuationPattern, '_');
}
// -----------------------------------------------------------------------------
/** Copy the Germinate linkage group name of block into requestOptions.
 *
 * If block has tag 'Germinate' and block._meta.linkageGroupName is defined,
 * it is assigned to requestOptions.linkageGroupName, for use with URL path
 * parameter /chromosome/ by utils/data/germinate.js : callsetsCalls(), via
 * germinate-genotype.js : germinateGenotypeLookup()
 *
 * @param requestOptions object, modified in place
 * @param block may be undefined, in which case requestOptions is unchanged
 * @return requestOptions (the same object)
 */
function addGerminateOptions(requestOptions, block) {
  const linkageGroupName =
    block?.hasTag('Germinate') && block._meta?.linkageGroupName;
  if (linkageGroupName) {
    requestOptions.linkageGroupName = linkageGroupName;
  }
  return requestOptions;
}
//------------------------------------------------------------------------------
/** Request featuresCounts (histograms) for all blocks (chromosomes) of the
 * given dataset.
 * This delegates to auth.getDatasetFeaturesCounts().
 * @param auth service for sending API requests
 * @param datasetId
 * @param genotypeSNPFilters current user-controlled thresholds for SNP filters
 * controls.genotypeSNPFilters
 * @return the value returned by auth.getDatasetFeaturesCounts() (named
 * 'promise' in the original code)
 */
function getDatasetFeaturesCounts(auth, datasetId, genotypeSNPFilters) {
  return auth.getDatasetFeaturesCounts(datasetId, genotypeSNPFilters);
}
//------------------------------------------------------------------------------
/** Lookup the genotype for the selected samples in the interval of the brushed block.
 * The server store to add the features to is derived from
 * vcfGenotypeLookupDataset() param blockV, from brushedOrViewedVCFBlocksVisible,
 * which matches vcfDatasetId : scope
 * @param auth auth service for ajax; this function calls
 * auth.vcfGenotypeLookup() and auth.apiStatsCount()
 * @param samples to request, may be undefined or []
 * Not used if requestSamplesAll
 * @param domainInteger [start,end] of interval, where start and end are integer values
 * domainInteger is not applicable if scope is undefined, so this parameter is
 * used in that case to carry {datasetVcfFiles, snpNames} from genotype-search.
 * @param requestOptions :
 * {requestFormat, requestSamplesAll, headerOnly},
 * . requestFormat 'CATG' (%TGT) or 'Numerical' (%GT for 01)
 * . headerOnly true means -h (--header-only), otherwise -H (--no-header)
 * . linkageGroupName defined if isGerminate
 * The fields of requestOptions are copied into the request arguments, and
 * override region, samples, requestInfo if they have the same name.
 *
 * @param vcfDatasetId id of VCF dataset to lookup
 * @param scope chromosome, e.g. 1A, or chr1A - match %CHROM chromosome in .vcf.gz file
 * scope===undefined signifies to search across all scopes of the dataset;
 * in this case preArgs.region is passed undefined.
 * @param rowLimit
 * @return promise yielding the .text of the API result if it has that field
 * (including the empty string), otherwise the result itself (e.g. the array
 * from Germinate). If the request is rejected, the returned promise is
 * rejected with the same error.
 */
function vcfGenotypeLookup(auth, samples, domainInteger, requestOptions, vcfDatasetId, scope, rowLimit) {
  const fnName = 'vcfGenotypeLookup';
  const region = scope && (scope + ':' + domainInteger.join('-'));
  const requestFormat = requestOptions.requestFormat;
  /** this dataset has tSNP in INFO field */
  const requestInfo = requestFormat && (vcfDatasetId === 'Triticum_aestivum_IWGSC_RefSeq_v1.0_vcf_data');
  const preArgs = Object.assign({ region, samples, requestInfo }, requestOptions);
  if (! scope) {
    const searchScope = domainInteger;
    // preArgs.datasetVcfFiles = searchScope.datasetVcfFiles;
    preArgs.snpNames = searchScope.snpNames; // actually genotype-search.selectedFeaturesText
  }
  /** Noted in vcfGenotypeLookup.bash : When requestOptions.isecDatasetIds is given,
   * -R is used, so -r is not given, i.e. preArgs.region is not used.
   */

  // parent is .referenceDatasetName
  /* reply time is generally too quick to see the non-zero count, so to see the
   * count in operation use +2 here. */
  auth.apiStatsCount(fnName, +1);
  /** Currently passing datasetId as param 'parent', until requirements evolve.
   * The VCF dataset directories are just a single level in $vcfDir;
   * it may be desirable to interpose a parent level, e.g.
   * vcf/
   *   Triticum_aestivum_IWGSC_RefSeq_v1.0/
   *     Triticum_aestivum_IWGSC_RefSeq_v1.0_vcf_data
   * It's not necessary because datasetId is unique.
   * (also the directory name could be e.g. lookupDatasetId ._meta.vcfFilename instead of the default datasetId).
   */
  const textP = auth.vcfGenotypeLookup(vcfDatasetId, scope, preArgs, rowLimit, {} )
    .then(
      (textObj) => {
        auth.apiStatsCount(fnName, -1);
        /* Result from the Pretzel API endpoint vcfGenotypeLookup is {text};
         * result from Germinate is an array, recognised by vcf-feature.js : resultIsGerminate().
         * ?? is used instead of || so that an empty .text is returned as '',
         * not as the enclosing object. */
        return textObj?.text ?? textObj;
      },
      (error) => {
        // Balance the +1 above when the request fails, then pass the failure on.
        auth.apiStatsCount(fnName, -1);
        throw error;
      });
  return textP;
}
//------------------------------------------------------------------------------
/** Inspect the header text of a VCF file, and report whether it declares a
 * FORMAT field whose ID starts with NU (Null Genotype values).
 * @param headerText string; the FORMAT line must be preceded by a newline,
 * i.e. it is not matched as the very first line of the text.
 * @return true if headerText contains '\n##FORMAT=<ID=NU'
 */
export function vcfHeaderIndicatesGenotypeHasNull(headerText) {
  /** copied from manage-genotype.js : headerTextP(). */
  const formatNUPattern = /\n##FORMAT=<ID=NU/;
  return headerText.match(formatNUPattern) !== null;
}
//------------------------------------------------------------------------------
/** Get the header of a VCF Genotype file.
 * The focus in this case is the configuration information in comments in the header.
 * The list of samples in the #CHROM line are not required here.
 *
 * Side effects : the request promise is stored in the dataset under the name
 * 'vcfGenotypeHeader', and if the header indicates Null Genotype values then
 * dataset '_meta.genotypeHasNull' is set to true.
 *
 * @param auth auth service, for sending API requests
 * @param block a VCF block which is viewed
 * vcfDatasetId + scope can be used to find block, and block contains .datasetId
 * and .scope, so it would be possible to use either block param or {datasetId,
 * scope}. Currently vcfDatasetId is block.datasetId.id and scope is block.scope.
 * @return a promise yielding the header text.
 */
export function vcfGenotypeHeader(auth, block /*, vcfDatasetId, scope*/) {
  /* based on extracts from manage-genotype.js : headerTextP() */
  const fnName = 'vcfGenotypeHeader';
  const vcfDatasetId = block.datasetId.id;
  const scope = block.scope;

  //------------------------------------
  /** Request the VCF file header for block / vcfDatasetId / scope.
   * A promise is recorded in dataset[fnName] and the result flag is recorded in
   * dataset._meta.genotypeHasNull; this function can be split and those actions
   * pushed from this library to the application.
   * @return the promise, yielding the header text
   */
  const requestHeader = () => {
    /** requestFormat and requestSamplesAll are not used for a header request. */
    const requestOptions = {
      requestFormat : 'Numerical',
      requestSamplesAll : false,
      headerOnly : true,
    };
    /** Possibly not required - Null genotype data is not added to Germinate yet. */
    addGerminateOptions(requestOptions, block);

    /** Not used.
     * -s '' gets :
     *   Warn: subset called for sample that does not exist in header: ""... skipping
     *   Warn: subsetting has removed all samples
     * -S /dev/null gets that last warning also.
     */
    const samples = null;
    /** Could use block.domain.map(locn => locn.toFixed()), but region /
     * interval is not used when requesting the header. */
    const domainInteger = [0, 1];
    /** if rowLimit aka nLines is 0 then finished() is true and the text is not returned */
    const rowLimit = 1e4;
    /** The above settings are not applicable to header request. */

    /** Request header. */
    const lookupP = vcfGenotypeLookup(
      auth, samples, domainInteger,
      requestOptions, vcfDatasetId, scope, rowLimit);
    const headerP = lookupP.then((text) => {
      const hasNull = text && vcfHeaderIndicatesGenotypeHasNull(text);
      /** copied from manage-genotype.js : headerTextP(). */
      dLog(fnName, 'hasNull', hasNull);
      if (hasNull) {
        block.datasetId.set('_meta.genotypeHasNull', true);
      }
      /** result is not used (in getSummary()). It contains the #CHROM line
       * with the sample list, so it could be used. */
      return text;
    });
    block.datasetId.set(fnName, headerP);
    return headerP;
  };

  //------------------------------------
  /* The promise is stored in the dataset;
   * the request is sent if there is no stored promise, or its .readyState is
   * not 4 (which indicates OK); otherwise the stored promise is returned, and
   * a retry is attached to its failure.
   * NOTE(review): the stored value is the result of .then() above, and nothing
   * in this function sets .readyState on it, so possibly this test is always
   * true and the request is sent on every call - confirm.
   */
  const storedP = block.datasetId.get(fnName);
  if (! storedP || (storedP.readyState !== 4)) {
    return requestHeader();
  }
  /** Retry via requestHeader() updates dataset .fnName; the promise returned
   * here is the stored one, not the retry. */
  storedP.catch(() => requestHeader());
  return storedP;
}
//------------------------------------------------------------------------------
/* sample data :
* -------------------------------------
* default output format :
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##phasing=none
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype as 0/1">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ExomeCapture-DAS5-003227 ExomeCapture-DAS5-002775 ExomeCapture-DAS5-002986
chr1A 327382120 scaffold22435_31704476 G A 100 PASS AC=3;AN=6;NS=616;MAF=0.418019;AC_Het=233;tSNP=.;pass=no;passRelaxed=no;selected=no GT:GL:DP 1/0:-7.65918,-2.74391e-08,-7.48455:6 1/0:-5.41078,-0.00397816,-2.1981:3 1/0:-4.50477,-1.46346e-05,-10.5809:6
* -------------------------------------
* requestFormat === 'CATG' : formatArgs = '-H -f "%ID\t%POS[\t%TGT]\n"' :
# [1]ID [2]POS [3]ExomeCapture-DAS5-002978:GT [4]ExomeCapture-DAS5-003024:GT [5]ExomeCapture-DAS5-003047:GT [6]ExomeC
scaffold38755_709316 709316 C/C C/T C/C C/C C/C ./. C/C C/C C/C C/T C/C C/C C/C C/C C/T C/C C/C C/C C/C C/T C/C C/C C
* -------------------------------------
* requestFormat === 'Numerical' : formatArgs = '-H -f "%ID\t%POS[\t%GT]\n"' :
# [1]ID [2]POS [3]ExomeCapture-DAS5-002978:GT [4]ExomeCapture-DAS5-003024:GT [5]ExomeCapture-DAS5-003047:GT [6]ExomeC
scaffold38755_709316 709316 0/0 0/1 0/0 0/0 0/0 ./. 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0
*/
/** Parse VCF output and add features to block.
 *
 * Three kinds of line are recognised in text :
 * . '##name=value' header lines, which are collected (and not otherwise used here);
 * . the column header row, either '#CHROM ...' (bcftools view) or
 *   '# [1]ID ...' / '#[1]ID ...' (bcftools query);
 * . data rows, after a column header row has been seen; each is converted to a
 *   Feature, which is merged with an existing Feature of the same id in the
 *   store, or created.
 *
 * @return
 * { createdFeatures : array of created Features,
 *   sampleNames : array of sample names,
 *   resultBlocks : blocks of the result rows, in the case of [genotype-search], otherwise undefined
 * }
 *
 * @param block view dataset block for corresponding scope (chromosome)
 * In the case of [genotype-search] all scopes (chromosomes) of the dataset are searched,
 * and block is dataset
 * @param requestFormat 'CATG', 'Numerical', ...
 * @param replaceResults true means remove previous results for this block from block.features[] and selectedFeatures.
 * @param selectedService if defined then update selectedFeatures
 * @param text result from bcftools request
 */
function addFeaturesJson(block, requestFormat, replaceResults, selectedService, text) {
  /* Declared first : fnName is used in the trace in the following block, and a
   * const cannot be referenced before its declaration. */
  const fnName = 'addFeaturesJson';
  /** true if block is given; otherwise determine block of each row, from CHROM column. */
  const blockGiven = block.constructor.modelName === 'block';
  /** If ! blockGiven, collate the blocks of the result rows. */
  const resultBlocks = blockGiven ? undefined : new Map();
  let dataset;
  if (! blockGiven) {
    if (block.constructor.modelName !== 'dataset') {
      dLog(fnName, blockGiven, block.constructor.modelName, block?.id);
    } else {
      dataset = block;
      block = undefined;
    }
  }
  dLog(fnName, blockGiven, block?.id, block?.mapName, text.length);
  /** optional : add fileformat, FILTER, phasing, INFO, FORMAT to block meta
   * read #CHROM or '# [1]ID' column headers as feature field names
   * parse /^[^#]/ (chr) lines into features, add to block
   */
  let
  createdFeatures = [],
  /** if the output is truncated by rowLimit aka nLines, the last line will not
   * have a trailing \n, and is discarded. If incomplete lines were not
   * discarded, values.length may be < 4, and feature.value may be undefined.
   */
  lines = text.split('\n'),
  meta = {},
  /** true if column is genotype format value. */
  columnIsGT,
  /** true if column header ends with ':NU'; defined only for the
   * bcftools query header format. */
  columnIsNU,
  columnNames,
  sampleNames,
  nFeatures = 0;
  dLog(fnName, lines.length);
  if (text && text.length && (text.charAt(text.length-1) !== '\n')) {
    dLog(fnName, 'discarding incomplete last line', lines[lines.length-1]);
    lines.splice(-1, 1);
  }

  /* If block is not given, could remove its feature and selected features when it is seen in the results.
   * i.e. factor this to a function and call it when a new block is seen in the results.
   */
  if (replaceResults && blockGiven) {
    if (selectedService) {
      const selectedFeatures = selectedService.selectedFeatures;
      // let mapChrName = Ember_get(block, 'brushName');
      /* remove features of block from createdFeatures, i.e. matching Chromosome : mapChrName
       * If the user has renewed the axis brush, then selectedFeatures will not
       * contain any features from selectionFeature in previous result; in that
       * case this has no effect and none is required.
       * If the user send a new request with e.g. changed samples, then this would apply.
       * This can also be moved to selectedService.
       */
      let blockSelectedFeatures = selectedFeatures.filter((f) => f.feature.get('blockId.id') === block.id);
      if (blockSelectedFeatures.length) {
        selectedFeatures.removeObjects(blockSelectedFeatures);
      }
    }

    if (block.get('features.length')) {
      // alternative : block.set('features', Ember_A());
      block.features.removeAt(0, block.get('features.length'));
    }
  }
  if (selectedService) {
    selectedService.selectedFeaturesUpdateIndex();
  }

  lines.forEach((l, lineNum) => {
    if (l.startsWith('##')) {
      const nameVal = l.match(/^##([^=]+)=(.*)/);
      // nameVal is null for a ## line which does not contain '='.
      if (nameVal && (nameVal.length > 2)) {
        /** ##INFO and ##FORMAT are duplicated : could .match(/.*ID=(.+),(.+)>/) and use ID to store [2] in meta.{INFO,FORMAT}.<ID>
         * ##bcftools_{viewVersion,viewCommand} are also duplicated, the last pair generated this output so it is of more interest.
         */
        meta[nameVal[1]] = nameVal[2];
      }
    } else if (l.startsWith('#CHROM')) {
      // Column header row output by bcftools view
      columnNames = l.slice(1).split('\t');
      columnNames = columnNameINFOFix(columnNames);
      sampleNames = columnNames.slice(nColumnsBeforeSamples);
      // from columnNames.slice(0,9), appended tSNP.
      const nonSampleFields = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'tSNP'];
      /* The genotype columns are those which are not one of the fixed fields.
       * (Marking the fixed fields as genotype columns would apply the genotype
       * string conversion below to e.g. the parsed INFO object.) */
      columnIsGT = columnNames.map(c => ! nonSampleFields.includes(c));
    } else if (l.startsWith('# [1]') || l.startsWith('#[1]')) {	// expect ID or CHROM
      // Column header row output by bcftools query
      // # [1]ID	[2]POS	[3]ExomeCapture-DAS5-002978:GT	[4]ExomeCapture-DAS5-003024:GT	[5]ExomeCapture-DAS5-003047:GT	[6]ExomeC
      /* between versions 1.9 and 1.19 of bcftools, this changed '# [1]ID' to '#[1]ID'
       * 1.9 is current on centos (2024Jan).
       */
      const
      columnHeaders = l
        .split(/\t\[[0-9]+\]/);
      columnIsNU = columnHeaders.map(ch => ch.endsWith(':NU'));
      const
      /** reduce :GT...:NU to :GT form */
      withoutNU = columnHeaders.map((ch, i) => {
        // e.g. ch === 'AGG47808CHIC1-B00001-10-02:GT:[7]AGG47808CHIC1-B00001-10-02:NU'
        // i.e. ch.endsWith(':NU')
        if (columnIsNU[i]) {
          /** if true, check that the sample name part matches,
           * i.e. sampleName:GT:[7]sampleName:NU */
          const verify = true;
          if (verify) {
            const match = ch.match(/(.+):GT:\[[0-9]+\](.+):NU/);
            // match is null if ch does not have the expected form.
            if (! match || (match.length !== 3) || (match [1] !== match[2])) {
              dLog(fnName, match, ch, i);
            }
          }
          const match = ch.match(/(.+):\[[0-9]+\].+:NU/);
          if (! match || (match.length !== 2)) {
            dLog(fnName, match, ch, i);
          } else {
            ch = match[1];
          }
        }
        return ch;
      });
      columnIsGT = withoutNU
        .map((name) => name.endsWith(':GT'));
      // trim off :GT, and split at 'tab[num]'
      columnNames = withoutNU
        .map(ch => ch.replaceAll(':GT', ''));
      columnNames[0] = columnNames[0].replace(/^# ?\[1\]/, '');
      columnNames = columnNameINFOFix(columnNames);
      // nColumnsBeforeSamples is 2 or 3 in this case : skip (CHROM,) ID, POS.
      const posColumn = columnNames.indexOf('POS');
      sampleNames = columnNames.slice(posColumn + 1);
      // skip the (null) / INFO column name
      // (2 for REF, ALT)
      /* NOTE(review): this removes the 3rd name after POS unconditionally, i.e.
       * it presumes the columns after POS are REF, ALT, INFO - confirm that
       * this holds for every format which is requested. */
      sampleNames.splice(2, 1);
    } else if (columnNames && l.length) {
      const values = l.split('\t');

      let feature = values.reduce((f, value, i) => {
        const fieldName = columnNames[i];

        let fieldNameF;
        /* vcfColumn2Feature[] provides Feature field name corresponding to the
         * column name, for the common columns; for other cases this is
         * overridden in the switch default.
         */
        fieldNameF = vcfColumn2Feature[fieldName];
        /** maybe handle samples differently, e.g. Feature.values.samples: []
         * if (i > nColumnsBeforeSamples) { ... } else
         */
        switch (fieldName) {
        case 'CHROM' : {
          // Update required : now Block.name may be e.g. 'Chr1A' - can compare value with .name instead of trimming off /^chr/.
          const scope = value.replace(/^chr/, '');
          if (! blockGiven) {
            block = dataset.blocks.findBy('name', value);
            if (! block) {
              dLog(fnName, i, value, 'not in', dataset.blocks.mapBy('name'), dataset.blocks.mapBy('scope'));
            } else {
              resultBlocks.has(block) || resultBlocks.set(block, []);
              value = block;
            }
          } else if (scope !== block.scope) {
            dLog(fnName, value, scope, block.scope, fieldName, i);
            value = null;
          } else {
            value = block;
          }
          break;
        }

        case 'POS' :
          value = parseNumber(value);
          f['value_0'] = value;
          value = [ value ];
          break;

        case 'ID' :
        case 'REF' :
        case 'ALT' :
          break;

        case 'INFO' : { // (null)
          fieldNameF = 'values.' + fieldName;
          const infoEntries = value.split(';').map(kv => kv.split('='));
          value = Object.fromEntries(infoEntries);
          if (value.MAF) {
            value.MAF = parseNumber(value.MAF);
          }
          if (value.tSNP) {
            if ((value.tSNP === '.') || (value.tSNP === '')) {
              delete value.tSNP;
            } else {
              value.tSNP = parseNumber(value.tSNP);
            }
          }
          parseNumberFields(value);
          break;
        }

        default :
          fieldNameF = 'values.' + fieldName;
          value = parseNumber(value);
        }
        if (! fieldNameF) {
          dLog(fnName, fieldName, value, i);
        } else {
          /** match values. and meta. */
          let prefix = fieldNameF.match(/^([^.]+)\..*/);
          prefix = prefix && prefix[1];
          if (prefix) {
            // columnIsNU is undefined when the header row was '#CHROM ...'
            if (columnIsNU?.[i]) {
              /** The original test datasets were created with :1 meaning the
               * data is valid (may be 0/1/2/N) i.e. not missing, ./.:1 meant N.
               * Up to this commit, this function interpreted :1 to mean null (N).
               * The data format is planned to switch to :1 to mean N, so
               * digitMeansNull is added to enable working with the current test
               * data, and switch this interpretation when the new data is
               * received.
               * Update (2026Apr16) : the first data release is available,
               * so this switch is now changed from false to true.
               */
              const digitMeansNull = true;
              /** :0 means not null, so discard it and keep just the genotype value
               * :1 means null, so discard the genotype value (./.) and show as N.
               * i.e. map x:0 to x, and x:1 to N */
              const match = (typeof value === 'string') && value.match(/^(.+):([01])$/);
              if (match) {
                const digitIsZero = match[2] === '0';
                if (digitMeansNull) {
                  if (digitIsZero) {
                    value = match[1];
                  } else {
                    value = 'N';
                  }
                } else {
                  value = (value === './.:0') ? 'N' : match[1];
                }
              } else {
                dLog(fnName, value, i);
              }
            }

            /** replace A/A with A, 1/1 with 2 (i.e. x/y -> x+y).
             * The value may be a non-string after parseNumber(), in which case
             * it is not in x/y form and is left unchanged. */
            if ((value !== 'N') && columnIsGT[i] && (typeof value === 'string')) {
              const match = value.match(/^(\w)[|/](\w)$/);
              if (match && (requestFormat === 'Numerical')) {
                // +"0" + "0" is "00", so the + + is required.
                value = '' + (+match[1] + +match[2]);
              } else if (match && (match[1] === match[2])) {
                /* CATG */
                value = match[1];
              }
            }

            if (! f[prefix]) {
              f[prefix] = {};
            }
            if (fieldName.match(/\./)) {
              // Ember_set() interprets dot in field name, so use [] =
              f[prefix][fieldName] = value;
            } else {
              /* could also use Ember_set() when ! prefix. */
              Ember_set(f, fieldNameF, value);
            }
            /* These will not be needed after changing references to e.g.
             * feature.values.MAF to feature.values.INFO.MAF, which is
             * equivalent and replaces it. */
            if (value?.MAF !== undefined) {
              f.values.MAF = value.MAF;
            }
            if (value?.tSNP !== undefined) {
              f.values.tSNP = value.tSNP;
            }
          } else {
            f[fieldNameF] = value;
          }
        }
        return f;
      }, {});
      // or EmberObject.create({value : []});

      if (! blockGiven && block) {
        const featuresDomain = resultBlocks.get(block);
        intervalMerge(featuresDomain, feature.value);
      }

      /* CHROM column is present in default format, and omitted when -f is used
       * i.e. 'CATG', 'Numerical', so in this case set .blockId here. */
      if (requestFormat) {
        feature.blockId = block;
      }

      /** based on similar : components/table-brushed.js : afterPaste()  */

      /** If it is required for vcfFeatures2MatrixView() to create displayData
       * without creating model:Feature in the Ember data store, the following
       * part can factor out as a separate function, returning an array of
       * native JS objects at this point, and passing those to the 2nd function
       * for creation of model:Feature
       */
      if (feature.blockId && feature.value?.length && feature._name) {
        // trace level is e.g. 0,1,2,3; the number of rows displayed will be e.g. 0,2,4,8.
        if (trace && (lineNum < (1 << trace))) {
          dLog(fnName, 'newFeature', feature);
        }

        // in this case feature.blockId is block
        let store = feature.blockId.get('store');

        /** name is used in CSS selector, e.g. in utils/draw/axis.js :
         * axisFeatureCircles_selectOne{,InAxis}(), and . and : are not valid
         * for that use. */
        const separator = '_';
        if (feature._name === '.') {
          // Use chr:position:ref:alt, with separator in place of ':'
          feature._name = block.name + separator + feature.value[0];
          ['ref', 'alt'].forEach(a => {
            const value = feature.values?.[a];
            if (value) {
              feature._name += separator + value;
            }
          });
        }
        /* Previously sanitized feature._name using datasetId2Class(), but it is
         * desired to retain the '.' which may appear in SNP names in VCF files.
         * Before use DOM element id / class, they are sanitized via
         * eltClassName() in axisFeatureCircles_eltId().
         */

        // .id is used by axisFeatureCircles_eltId().
        // ._name may be also added to other blocks.
        /* append .value[0] to handle datasets with duplicate .name in 1 chr
         * This could be optional - done just when
         *   (existingFeature.get('value.0') !== feature.value[0])
         */
        feature.id = block.id + '_' + feature._name + '_' + feature.value[0];
        let existingFeature = store.peekRecord('feature', feature.id);
        if (existingFeature) {
          mergeFeatureValues(existingFeature, feature);
          feature = existingFeature;
          // this is included in createdFeatures, since it is a result from the current request.
        } else {
          // Replace Ember.Object() with models/feature.
          feature = store.createRecord('feature', feature);
          /** fb is a Proxy */
          let fb = feature.get('blockId');
          if (fb.then) {
            fb.then((b) => feature.set('blockId', b));
          }
        }
        nFeatures++;

        let mapChrName = Ember_get(feature, 'blockId.brushName');
        if (selectedService) {
          selectedService.selectedFeaturesMergeFeature(mapChrName, feature);
        }

        /* vcfFeatures2MatrixView() uses createdFeatures to populate
         * displayData; it could be renamed to resultFeatures; the
         * feature is added to createdFeatures regardless of
         * existingFeature.
         */
        createdFeatures.push(feature);
        // block may be undefined if CHROM is not in dataset.blocks[]
        // If existingFeature then addObject(feature) is a no-op.
        if (block && (replaceResults || ! existingFeature)) {
          block.features.addObject(feature);
        }
      }

    }
  });
  /* in the case of [genotype-search], this is just the block of the last row.
   * - collate blocks and update each
   */
  if (block) {
    blockEnsureFeatureCount(block);
    block.addFeaturePositions(createdFeatures);
  }

  if (! columnNames || ! sampleNames) {
    dLog(fnName, lines.length, text.length);
  }

  let result = {createdFeatures, sampleNames, resultBlocks};
  return result;
}
//------------------------------------------------------------------------------
/** Raise block.featureCount to block.features.length if it is undefined or smaller.
 * This is used when features are added from genotype calls received from VCF or Germinate.
 * The features received are likely only a small part of the chromosome, so the
 * count is just a lower bound. Also it is likely that block.featureCount will
 * be defined from received blockFeaturesCounts; this is just a fall-back.
 * (possibly the first vcf result may arrive before blockFeaturesCounts if
 * blocks are viewed from URL)
 * @param block provides .get() and .set() of 'featureCount' and .get() of 'features.length'
 */
function blockEnsureFeatureCount(block) {
  const loadedCount = block.get('features.length');
  const recordedCount = block.get('featureCount') ?? 0;
  if (recordedCount < loadedCount) {
    block.set('featureCount', loadedCount);
  }
}
// -----------------------------------------------------------------------------
/** Merge feature.values into existingFeature.values
 * Each field of feature.values whose value differs (!==) from the field of the
 * same name in existingFeature.values is assigned to it, in place.
 * @param existingFeature its .values is modified
 * @param feature source of .values; .id is used only in the trace
 */
function mergeFeatureValues(existingFeature, feature) {
  for (const entry of Object.entries(feature.values)) {
    const [name, value] = entry;
    if (existingFeature.values[name] === value) {
      continue;
    }
    if (trace > 2) {
      dLog(feature.id, existingFeature.values[name] ? 'setting' : 'adding', entry);
    }
    existingFeature.values[name] = value;
  }
}
//------------------------------------------------------------------------------
/** Distinguish a Germinate result from a VCF (bcftools) result of the
 * genotypeLookup API : the Germinate result is an array.
 * @param data API result
 * @return true if data is an array
 */
function resultIsGerminate(data) {
  const isArray = Array.isArray(data);
  return isArray;
}
/** Parse Germinate genotype calls result and add features to block.
 * Each call is converted to a feature with 1 sample value, which is merged
 * into, or created in, the store via featureMergeOrCreate().
 * @return
 * { createdFeatures : array of created Features,
 *   sampleNames : array of sample names }
 *
 * @param block view dataset block for corresponding scope (chromosome)
 * @param requestFormat 'CATG', 'Numerical', ...
 * Unlike bcftools, Germinate probably sends results only in CATG (nucleotide)
 * format, which is the format it uses for upload and storage in HDF.
 * Not used in this function.
 * @param replaceResults true means remove previous results for this block from block.features[] and selectedFeatures.
 * Not implemented; it is only traced.
 * @param selectedService if defined then update selectedFeatures
 * @param data result from Germinate callsets/<datasetDbId>/calls request
 * The array is expected to provide .mapBy(), whose result provides .uniq().
 * @param options { nSamples }
 * Only used by the truncation of data, which is currently disabled.
 */
function addFeaturesGerminate(block, requestFormat, replaceResults, selectedService, data, options) {
  const fnName = 'addFeaturesGerminate';
  dLog(fnName, block.id, block.mapName, data.length);

  if (replaceResults) {
    dLog(fnName, 'replaceResults not implemented');
  }

  /** Truncation of data to options.nSamples is disabled. */
  const truncateEnabled = false;
  if (truncateEnabled && (options.nSamples !== undefined) && (data.length > options.nSamples)) {
    dLog(fnName, 'truncate data', data.length, options.nSamples);
    data = data.slice(0, options.nSamples);
  }

  const store = block.get('store');
  /** The column names of the result are the sample names. */
  const sampleNames = data.mapBy('callSetName').uniq();

  /** Convert 1 call to a feature, merged into or created in store.
   * Tracing of unmatched variantName is enabled for the first 5 calls.
   */
  const callToFeature = (call, callIndex) => {
    /* Will lookup f.value in block.features interval tree,
     * and if found, merge with existing feature - factor out use of
     * mergeFeatureValues() in addFeaturesJson().
     * Using createdFeatures.push(feature) instead of =data.map()
     */
    const feature = {values : {}};
    // call.callSetDbId identifies sample name : callSetName
    // previously seeing in results : 'CnullT' - this is now fixed in java.
    feature.values[call.callSetName] = call.genotypeValue;

    const nameParts = variantNameSplit(call.variantName, callIndex < 5);
    const position = +nameParts.positionText;
    let markerName = nameParts.markerName;
    if (isNaN(position)) {
      // handle Oct19 format : dbid_mapid_ exome SNP name e.g. 6_20_6_scaffold77480, or? scaffold72661_85293-85293.0
      markerName = nameParts.positionText;
    } else {
      feature.value_0 = position;
      feature.value = [position];
    }

    /** The part of callSetDbId after '-' is the sample ID, which corresponds
     * to callSetName, so it is excluded from the feature name/id */
    const datasetID = call.callSetDbId.split('-')[0];
    /* .id is unique per genotype table row; for 1 feature per cell, append : + '_' + call.callSetName */
    feature._name = markerName;
    feature.id = block.id + '_' + datasetID + '_' + markerName;

    return featureMergeOrCreate(store, block, feature);
  };
  const createdFeatures = data.map(callToFeature);
  dLog(fnName, data.length, sampleNames.length);

  featureUpdateSelectedAndBlock(selectedService, block, createdFeatures);

  return {createdFeatures, sampleNames};
}
/** Merge feature into the record with the same .id in store if there is one,
 * otherwise create a record from it.
 * Used in addFeaturesGerminate() and addFeaturesBrapi()
 * @param store provides .peekRecord() and .createRecord()
 * @param block assigned to .blockId of a created feature; block.server is
 * passed to brapiGetVariantPosition()
 * @param feature native object with .id and .values
 * @return the existing record, with feature.values merged into it, or the
 * created record
 */
function featureMergeOrCreate(store, block, feature) {
  const existingFeature = store.peekRecord('feature', feature.id);
  if (existingFeature) {
    mergeFeatureValues(existingFeature, feature);
    // this is included in createdFeatures, since it is a result from the current request.
    // as noted in addFeaturesJson(), can rename to resultFeatures.
    return existingFeature;
  }

  // addFeaturesJson() uses feature.blockId - not sure if that is applicable
  feature.blockId = block;
  // Replace Ember.Object() with models/feature.
  const createdFeature = store.createRecord('feature', feature);
  const server = block.server;
  /* The created feature has no position; brapiGetVariantPosition() is defined
   * outside this section - presumably it requests and sets the position. */
  if (! createdFeature.value) {
    brapiGetVariantPosition(server, createdFeature);
  }
  return createdFeature;
}
/** Add the created features to selectedFeatures (if selectedService is
 * given), and update the feature count and feature positions of block.
 * Used in addFeaturesGerminate() and addFeaturesBrapi()
 * @param selectedService if defined then update selectedFeatures
 * @param block
 * @param createdFeatures array of features; 'blockId.brushName' of the first
 * feature is used as the name for all of them
 */
function featureUpdateSelectedAndBlock(selectedService, block, createdFeatures) {
  if (selectedService) {
    const firstFeature = createdFeatures[0];
    const mapChrName = firstFeature?.get('blockId.brushName');
    // selectedService = feature?.get('blockId.axis.selected');
    selectedService.selectedFeaturesUpdateIndex();
    for (const feature of createdFeatures) {
      selectedService.selectedFeaturesMergeFeature(mapChrName, feature);
    }
  }

  // createRecord() connects to block OK, so this is not required :
  // createdFeatures.forEach(feature => {
  // block.features.addObject(feature);
  blockEnsureFeatureCount(block);
  block.addFeaturePositions(createdFeatures);
}
//------------------------------------------------------------------------------
/** Split the variantName from either Germinate or Spark server into component elements.
 *
 * Germinate :
 *   "variantName": "m2-23.0"
 *   m2-23.0 => m2 is marker name and 23.0 is its position
 *   Some of the marker names contain '-', e.g. 'scaffold77480-1_24233-24233.0'
 *   so instead of split('-'), use .match(/(.+) ... ) which is greedy, i.e. the
 *   split is at the last '-'.
 *
 * Spark server : e.g. "variantName":"Chr1A_4188418"
 *   The name is split at the last '_'; it is tried only if the name contains no '-'.
 *
 * @param variantName string
 * @param traceUnmatched enable tracing of failure to parse variantName
 * @return {markerName, positionText}
 * For a Spark server name, markerName is the same as positionText.
 * Both are undefined if variantName matches neither form.
 */
function variantNameSplit(variantName, traceUnmatched) {
  const fnName = 'variantNameSplit';

  const germinateMatch = variantName.match(/(.+)-(.+)/);
  if (germinateMatch) {
    const [, markerName, positionText] = germinateMatch;
    return {markerName, positionText};
  }

  const sparkMatch = variantName.match(/(.+)_(.+)/);
  if (sparkMatch) {
    // sparkMatch[1] is the chromosome name, which is not used.
    const positionText = sparkMatch[2];
    // markerName is used to make feature .id and ._name unique
    return {markerName : positionText, positionText};
  }

  if (traceUnmatched) {
    dLog(fnName, variantName, 'not matched');
  }
  return {markerName : undefined, positionText : undefined};
}
//------------------------------------------------------------------------------
export { resultIsBrapi }
/** @return true if the genotypeLookup API result is from Brapi,
 * false if VCF, from bcftools, or Germinate.
 * null is also reported as false : typeof null is 'object', so it is excluded
 * explicitly.
 * Related : resultIsGerminate().
 * @param data  result of the genotypeLookup API request
 */
function resultIsBrapi(data) {
  return ! resultIsGerminate(data) && (typeof data === 'object') && (data !== null);
}
export { addFeaturesBrapi }
/** Parse Brapi genotype calls result and add features to block.
 * Params are the same as addFeaturesGerminate(), except for data.
 *
 * One feature is created (via featureMergeOrCreate()) for each of
 * data.variantDbIds; its .values maps sample name to the call in the
 * corresponding column of the variant's row of data.dataMatrices[0].dataMatrix.
 *
 * @param block  the features are added to this block. block.datasetId.samples
 * ([{sampleDbId, sampleName, ...}]) is used to map callSetDbId to sampleName.
 * If a callSetDbId has no matching sample (or samples is not defined), the
 * callSetDbId itself is used as the sample name, instead of throwing.
 * @param requestFormat 'CATG', 'Numerical', ...
 * Not used; BrAPI "GT" returns Numerical format.
 * refn : dataMatrixAbbreviations and dataMatrixNames in https://brapigenotyping21.docs.apiary.io/#/reference/allele-matrix/get-allelematrix
 * @param replaceResults  not implemented; it is only traced.
 * @param selectedService  passed to featureUpdateSelectedAndBlock()
 * @param data result from BrAPI allelematrix request
 * {callSetDbIds, dataMatrices, variantDbIds, ... }
 * @param options  not used
 * @return
 * { createdFeatures : array of created Features,
 *   sampleNames : data.callSetDbIds, i.e. the column identifiers of the result }
 */
function addFeaturesBrapi(block, requestFormat, replaceResults, selectedService, data, options) {
  const fnName = 'addFeaturesBrapi';
  dLog(fnName, block.id, block.mapName, data.callSetDbIds?.length,
       data.variantDbIds?.length, data.dataMatrices?.length);
  if (replaceResults) {
    dLog(fnName, 'replaceResults not implemented');
  }
  /** BrAPI is different to the other 2 flows - it requires an extra lookup
   * (POST /search/variants/) for position of variants which are returned in
   * data.variantDbIds, so by default the position is not parsed from the
   * variant name.
   */
  const usePositionFromName = false;
  const
  store = block.get('store'),
  columnNames = data.callSetDbIds,
  sampleNames = columnNames,
  dataset = block.get('datasetId'),
  samples = dataset.get('samples') ?? [],
  /** A plain object is used so that sampleDbId and callSetDbId are compared as
   * text, as in the original lookup. */
  samplesById = Object.fromEntries(samples.map(s => [s.sampleDbId, s])),
  /** Sample name of each column. This is the same for every variant, so it is
   * calculated once here instead of once per variant. */
  columnSampleNames = columnNames.map(callSetDbId => {
    const sampleName = samplesById[callSetDbId]?.sampleName;
    if (sampleName == null) {
      dLog(fnName, 'no sample name for callSetDbId', callSetDbId);
    }
    return sampleName ?? callSetDbId;
  }),
  createdFeatures = data.variantDbIds.map((variantDbId, variantIndex) => {
    const
    /** only .dataMatrices[0] is handled; [0] should be the data type requested
     * by .dataMatrixAbbreviations / dataMatrixNames, and this function will
     * request 'GT'.
     */
    row = data.dataMatrices[0].dataMatrix[variantIndex],
    entries = columnSampleNames.map((sampleName, sampleIndex) => [sampleName, row[sampleIndex]]),
    values = Object.fromEntries(entries),
    feature = {values, _name : variantDbId, id : variantDbId};
    let match;
    if (usePositionFromName && (match = variantDbId.match(/(.+)_(.+)/))) {
      /** match[1] is database_scaffoldNumber, match[2] is the scaffold offset text. */
      let position = +match[2];
      if (isNaN(position)) {
        position = 0;
      }
      feature.value_0 = position;
      feature.value = [position];
    }
    return featureMergeOrCreate(store, block, feature);
  });
  dLog(fnName, data.dataMatrices?.length, data.callSetDbIds?.length, columnNames.length);
  featureUpdateSelectedAndBlock(selectedService, block, createdFeatures);
  const result = {createdFeatures, sampleNames};
  return result;
}
// export { brapiGetVariantPosition }
/** Request the description of a BrAPI variant from server, and copy its
 * position, alleles and additional info into feature.
 *
 * @param server  provides .variants([variantDbId]), which returns a promise
 * yielding an array of variant descriptions (response.result.data[]).
 * @param feature  .id is the variantDbId to request. When the response
 * arrives these are updated :
 *   .value : [start] or [start, end],   .value_0 : start
 *   .values.ref, .values.alt  (alt is alternateBases joined with ',')
 *   .values.INFO, .values.MAF, .values.tSNP  from additionalInfo
 *   [callRateSymbol] : AC / AN, if both are given and AN is non-zero.
 * feature is not changed if the response contains no variant.
 * @return promise which resolves, with no value, after feature is updated.
 */
function brapiGetVariantPosition(server, feature) {
  const
  fnName = 'brapiGetVariantPosition',
  variantDbId = feature.id,
  variantsP = server.variants([variantDbId]).then(data => {
    /** data is response.result.data[] */
    const d = data?.[0];
    if (! d) {
      // Without this check, an empty response caused a TypeError and hence a
      // rejected promise.
      dLog(fnName, variantDbId, 'no variant data received');
      return;
    }
    const
    values = feature.values,
    info = d.additionalInfo;
    feature.value = [+d.start];
    feature.value_0 = feature.value[0];
    // != null : end may be null as well as absent; +null would be 0.
    if (d.end != null) {
      feature.value[1] = +d.end;
    }
    if (d.referenceBases) {
      values.ref = d.referenceBases;
    }
    if (d.alternateBases) {
      values.alt = d.alternateBases.join(',');
    }
    // != null : additionalInfo may be null as well as absent.
    if (info != null) {
      values.INFO = info;
      const
      valuesAdd = pick(info, ['MAF', 'tSNP']);
      Object.assign(values, valuesAdd);
      if (info.AC && info.AN && +info.AN) {
        feature[callRateSymbol] = +info.AC / +info.AN;
      }
    }
    // dLog(fnName, feature);
  });
  return variantsP;
}
// -----------------------------------------------------------------------------
/** Convert numeric value from string to number.
 * If given value is not numeric, return the param.
 *
 * Only a string is converted, and only if it is not blank : Number() maps '',
 * whitespace and null to 0, and true to 1, which are not numeric text, so those
 * values are returned unchanged.
 * Related : parseNumberFields(), parseBooleanFields();
 * @param text
 * @return the number which text represents, or text unchanged if it is not a
 * numeric string
 */
function parseNumber(text) {
  if ((typeof text !== 'string') || (text.trim() === '')) {
    return text;
  }
  const number = Number(text);
  return Number.isNaN(number) ? text : number;
}
/** Convert numeric string values in object to number.
 * obj is modified in place; nothing is returned.
 *
 * Only values which are non-blank strings are considered : Number() maps '',
 * whitespace, null and [] to 0, and true to 1, so without this check those
 * values would be overwritten with a number.
 * @param obj  object whose own enumerable string fields are converted if they
 * are numeric
 */
function parseNumberFields(obj) {
  Object.entries(obj).forEach(([key, value]) => {
    if ((typeof value !== 'string') || (value.trim() === '')) {
      return;
    }
    const number = Number(value);
    if (! Number.isNaN(number)) {
      obj[key] = number;
    }
  });
}
//------------------------------------------------------------------------------
export {
// vcfColumn2Feature
// columnNameINFOFix
gtValueIsNumeric,
datasetId2Class,
addGerminateOptions,
getDatasetFeaturesCounts,
vcfGenotypeLookup,
addFeaturesJson,
// blockEnsureFeatureCount
// mergeFeatureValues
resultIsGerminate,
addFeaturesGerminate,
variantNameSplit,
parseNumber,
parseNumberFields,
};