molstar
Version:
A comprehensive macromolecular library.
461 lines (460 loc) • 18.8 kB
JavaScript
/**
* Copyright (c) 2017-2022 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author Alexander Rose <alexander.rose@weirdbyte.de>
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.getFieldType = getFieldType;
exports.generateSchema = generateSchema;
const schema_1 = require("./schema");
const helper_1 = require("./helper");
/** mmCIF type codes mapped to a string column, or to a case-sensitive enum when values are enumerated. */
const STR_OR_ENUM_TYPE_CODES = new Set(['code', 'line', 'text', 'char', 'boolean']);
/** mmCIF type codes mapped to a string column, or to a lower-cased enum when values are enumerated. */
const STR_OR_LOWERCASE_ENUM_TYPE_CODES = new Set(['ucode', 'uline', 'uchar3', 'uchar1', 'uchar5']);
/** mmCIF type codes that are always mapped to a plain string column. */
const PLAIN_STR_TYPE_CODES = new Set([
    'aliasname',
    'name',
    'idname',
    'any',
    'atcode',
    'fax',
    'phone',
    'email',
    'code30',
    'seq-one-letter-code',
    'author',
    'orcid_id',
    'pdbx_PDB_obsoleted_db_id',
    'pdbx_related_db_id',
    'sequence_dep',
    'pdb_id',
    'pdb_id_u', // should be case insensitive, but can't express that
    'emd_id',
    // todo, consider adding specialised fields
    'yyyy-mm-dd',
    'yyyy-mm-dd:hh:mm',
    'yyyy-mm-dd:hh:mm-flex',
    'int-range',
    'float-range',
    'binary',
    'operation_expression',
    'point_symmetry',
    '4x3_matrix',
    '3x4_matrix',
    '3x4_matrices',
    'point_group',
    'point_group_helical',
    'symmetry_operation',
    'date_dep',
    'url',
    'symop',
    'exp_data_doi',
    'asym_id',
    'uniprot_ptm_id',
]);
/** mmCIF type codes mapped to an int column, or to an int enum when values are enumerated. */
const INT_TYPE_CODES = new Set(['int', 'non_negative_int', 'positive_int']);
/** mmCIF type codes mapped to a comma separated list of strings. */
const COMMA_LIST_TYPE_CODES = new Set(['ec-type', 'ucode-alphanum-csv', 'id_list', 'entity_id_list']);
/** cif type names whose values are handled as strings (optionally wrapped in a list container). */
const CIF_STR_TYPE_NAMES = new Set([
    'Text',
    'Code',
    'Complex',
    'Symop',
    'List',
    'List(Real,Real)',
    'List(Real,Real,Real,Real)',
    'Date',
    'DateTime',
    'Tag',
    'Implied',
    'Word',
    'Uri',
]);
/**
 * Map a dictionary type code to a schema column.
 *
 * @param type type code as given in the dictionary (mmCIF or cif flavour)
 * @param description text attached to the created column
 * @param values enumerated values, if any; only honoured for string-like and int type codes
 * @param container container kind, only forwarded for the cif type names
 * @returns the column created by the matching `schema_1` factory; unknown type
 *     codes are logged and fall back to a string column
 */
function getFieldType(type, description, values, container) {
    // mmCIF
    if (STR_OR_ENUM_TYPE_CODES.has(type)) {
        if (values && values.length) {
            return (0, schema_1.EnumCol)(values, 'str', description);
        }
        return (0, schema_1.StrCol)(description);
    }
    if (STR_OR_LOWERCASE_ENUM_TYPE_CODES.has(type)) {
        // only force lower-case for enums
        if (values && values.length) {
            const lowerCased = values.map(x => x.toLowerCase());
            return (0, schema_1.EnumCol)(lowerCased, 'lstr', description);
        }
        return (0, schema_1.StrCol)(description);
    }
    if (PLAIN_STR_TYPE_CODES.has(type)) {
        return (0, schema_1.StrCol)(description);
    }
    if (INT_TYPE_CODES.has(type)) {
        if (values && values.length) {
            return (0, schema_1.EnumCol)(values, 'int', description);
        }
        return (0, schema_1.IntCol)(description);
    }
    if (type === 'float') {
        return (0, schema_1.FloatCol)(description);
    }
    if (COMMA_LIST_TYPE_CODES.has(type)) {
        return (0, schema_1.ListCol)('str', ',', description);
    }
    if (type === 'id_list_spc') {
        return (0, schema_1.ListCol)('str', ' ', description);
    }
    // cif
    if (CIF_STR_TYPE_NAMES.has(type)) {
        return wrapContainer('str', ',', description, container);
    }
    if (type === 'Real') {
        return wrapContainer('float', ',', description, container);
    }
    if (type === 'Integer') {
        return wrapContainer('int', ',', description, container);
    }
    console.log(`unknown type '${type}'`);
    return (0, schema_1.StrCol)(description);
}
/**
 * Create a scalar column for a primitive value kind.
 *
 * @param type one of 'int', 'str', 'float' or 'coord'
 * @param description text attached to the created column
 * @returns the column built by the matching `schema_1` factory, or `undefined`
 *     when `type` is none of the four supported kinds
 */
function ColFromType(type, description) {
    if (type === 'int') {
        return (0, schema_1.IntCol)(description);
    }
    if (type === 'str') {
        return (0, schema_1.StrCol)(description);
    }
    if (type === 'float') {
        return (0, schema_1.FloatCol)(description);
    }
    if (type === 'coord') {
        return (0, schema_1.CoordCol)(description);
    }
    return undefined;
}
/**
 * Create a column for `type`, as a list column when the container is 'List'.
 *
 * @param type primitive value kind forwarded to the column factory
 * @param separator list separator, only used for the 'List' container
 * @param description text attached to the created column
 * @param container container kind; anything other than 'List' yields a scalar column
 */
function wrapContainer(type, separator, description, container) {
    if (container === 'List') {
        return (0, schema_1.ListCol)(type, separator, description);
    }
    return ColFromType(type, description);
}
/**
 * Resolve the save frames referenced by the `import.get` entry of a frame.
 *
 * @param d frame to inspect; read via `d.categories` and `d.header`
 * @param imports map from imported file name to the list of frames of that file
 * @returns the referenced frames that could be resolved, in the order they are
 *     listed; entries that cannot be resolved are skipped with a warning
 */
function getImportFrames(d, imports) {
    const frames = [];
    if (!('import' in d.categories)) {
        return frames;
    }
    // guard against an 'import' category without a 'get' field instead of
    // failing with a TypeError on `.str(0)`
    const getEntry = d.categories['import'].getField('get');
    if (!getEntry) {
        console.warn(`missing 'get' field for import in '${d.header}'`);
        return frames;
    }
    const importGet = (0, helper_1.parseImportGet)(getEntry.str(0));
    for (const { file, save } of importGet) {
        if (!file || !save) {
            console.warn(`missing 'save' or 'file' for import in '${d.header}'`);
            continue;
        }
        const importFrames = imports.get(file);
        if (!importFrames) {
            console.warn(`missing '${file}' entry in imports`);
            continue;
        }
        // save frame names are matched case-insensitively
        const saveName = save.toLowerCase();
        const importSave = importFrames.find(frame => frame.header.toLowerCase() === saveName);
        if (!importSave) {
            console.warn(`missing '${save}' save frame in '${file}'`);
            continue;
        }
        frames.push(importSave);
    }
    return frames;
}
/**
 * Get field from given, linked or imported category.
 *
 * Lookup order:
 * 1. the category of the frame itself,
 * 2. otherwise, if the frame is registered as a link child, the linked frame,
 * 3. otherwise, the frames imported by the frame, in order, taking the first
 *    one that provides the field.
 *
 * @param category name of the category to look for
 * @param field name of the field within that category
 * @param d frame to start the lookup from
 * @param imports map of imported frames, forwarded to `getImportFrames`
 * @param ctx lookup context with `categories` (frames by name) and `links` (child name to parent name)
 * @returns the field, or `undefined` when it cannot be found
 */
function getField(category, field, d, imports, ctx) {
    const { categories, links } = ctx;
    const cat = d.categories[category];
    if (cat) {
        return cat.getField(field);
    }
    if (d.header in links) {
        const linkName = links[d.header];
        if (linkName in categories) {
            return getField(category, field, categories[linkName], imports, ctx);
        }
        // link target is unknown, nothing to look up
        return undefined;
    }
    // keep searching the remaining import frames when one does not provide
    // the field, instead of stopping at the first frame unconditionally
    for (const importFrame of getImportFrames(d, imports)) {
        const imported = getField(category, field, importFrame, imports, ctx);
        if (imported) {
            return imported;
        }
    }
    return undefined;
}
/**
 * Collect the enumerated values (`item_enumeration.value`) of an item.
 *
 * @returns one string per row of the field, or `undefined` when the field is not found
 */
function getEnums(d, imports, ctx) {
    const enumField = getField('item_enumeration', 'value', d, imports, ctx);
    if (!enumField) {
        return undefined;
    }
    const collected = [];
    let row = 0;
    while (row < enumField.rowCount) {
        collected.push(enumField.str(row));
        row += 1;
    }
    return collected;
}
/**
 * Read the container kind (`type.container`) of an item.
 *
 * @returns the first value of the field, or `undefined` when the field is not found
 */
function getContainer(d, imports, ctx) {
    const containerField = getField('type', 'container', d, imports, ctx);
    if (!containerField) {
        return undefined;
    }
    return containerField.str(0);
}
/**
 * Gather the type information of an item.
 *
 * The type code is read from `item_type.code`, falling back to `type.contents`.
 *
 * @returns a `[code, enums, container]` triple, or `undefined` (after logging)
 *     when no type code can be found
 */
function getCode(d, imports, ctx) {
    const codeField = getField('item_type', 'code', d, imports, ctx)
        || getField('type', 'contents', d, imports, ctx);
    if (!codeField) {
        console.log(`item_type.code or type.contents not found for '${d.header}'`);
        return undefined;
    }
    const typeCode = codeField.str(0);
    const enumValues = getEnums(d, imports, ctx);
    const container = getContainer(d, imports, ctx);
    return [typeCode, enumValues, container];
}
/**
 * Read the sub category id (`item_sub_category.id`) of an item.
 *
 * @returns the first value of the field, or `undefined` when the field is not found
 */
function getSubCategory(d, imports, ctx) {
    const idField = getField('item_sub_category', 'id', d, imports, ctx);
    return idField ? idField.str(0) : undefined;
}
/**
 * Read and tidy the description of an item.
 *
 * The text is taken from `item_description.description`, falling back to
 * `description.text`.
 *
 * @returns the cleaned-up first value of the field, or `undefined` when neither field is found
 */
function getDescription(d, imports, ctx) {
    const textField = getField('item_description', 'description', d, imports, ctx)
        || getField('description', 'text', d, imports, ctx);
    if (!textField) {
        return undefined;
    }
    const trimmed = textField.str(0).trim();
    // drop the padding that follows a newline
    const unpadded = trimmed.replace(/(\r\n|\r|\n)([ \t]+)/g, '\n');
    // first "[i]... element" reference becomes the plural "elements"
    const pluralised = unpadded.replace(/(\[[1-3]\])+ element/, 'elements');
    // drop the first remaining run of square bracket references
    return pluralised.replace(/(\[[1-3]\])+/, '');
}
/**
 * Collect the alias names of an item.
 *
 * The names are read from `item_aliases.alias_name`, falling back to
 * `alias.definition_id`. The first character of every name is dropped
 * (presumably the leading '_' of a data name - confirm against the dictionaries).
 *
 * @returns the alias names without their first character, or `undefined` when
 *     neither field is found
 */
function getAliases(d, imports, ctx) {
    const value = getField('item_aliases', 'alias_name', d, imports, ctx)
        || getField('alias', 'definition_id', d, imports, ctx);
    if (!value) {
        return undefined;
    }
    // `slice` instead of the deprecated `substr`; identical result for a start index of 1
    return value.toStringArray().map(name => name.slice(1));
}
const reMatrixField = /\[[1-3]\]\[[1-3]\]/;
const reVectorField = /\[[1-3]\]/;
const FORCE_INT_FIELDS = [
'_atom_site.id',
'_atom_site.auth_seq_id',
'_atom_site_anisotrop.id',
'_atom_site_anisotrop.pdbx_auth_seq_id',
'_pdbx_struct_mod_residue.auth_seq_id',
'_pdbx_unobs_or_zero_occ_residues.auth_seq_id',
'_struct_conf.beg_auth_seq_id',
'_struct_conf.end_auth_seq_id',
'_struct_conn.ptnr1_auth_seq_id',
'_struct_conn.ptnr2_auth_seq_id',
'_struct_sheet_range.beg_auth_seq_id',
'_struct_sheet_range.end_auth_seq_id',
'_struct_site.pdbx_auth_seq_id',
'_struct_site_gen.auth_seq_id',
'_struct_mon_prot_cis.auth_seq_id',
'_struct_mon_prot_cis.pdbx_auth_seq_id_2',
];
/**
* Note that name and mapped name must share a prefix. This is not always the case in
* the cifCore dictionary, but for downstream code to work a container field with the
* same prefix as the member fields must be given here and in the field names filter
* list.
*/
const FORCE_MATRIX_FIELDS_MAP = {
'atom_site_aniso.u_11': 'u', // is matrix_u in the the dic
'atom_site_aniso.u_22': 'u',
'atom_site_aniso.u_33': 'u',
'atom_site_aniso.u_23': 'u',
'atom_site_aniso.u_13': 'u',
'atom_site_aniso.u_12': 'u',
};
const FORCE_MATRIX_FIELDS = Object.keys(FORCE_MATRIX_FIELDS_MAP);
const EXTRA_ALIASES = {
'atom_site_aniso.matrix_u': [
'atom_site_anisotrop_U',
'atom_site_aniso.U'
],
};
const COMMA_SEPARATED_LIST_FIELDS = [
'_atom_site.pdbx_struct_group_id',
'_chem_comp.mon_nstd_parent_comp_id',
'_diffrn_radiation.pdbx_wavelength_list',
'_diffrn_source.pdbx_wavelength_list',
'_em_diffraction.tilt_angle_list', // 20,40,50,55
'_em_entity_assembly.entity_id_list',
'_entity.pdbx_description', // Endolysin,Beta-2 adrenergic receptor
'_entity.pdbx_ec',
'_entity_poly.pdbx_strand_id', // A,B
'_entity_src_gen.pdbx_gene_src_gene', // ADRB2, ADRB2R, B2AR
'_pdbx_depui_entry_details.experimental_methods',
'_pdbx_depui_entry_details.requested_accession_types',
'_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
'_pdbx_soln_scatter_model.software_author_list', // MSI
'_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
'_pdbx_struct_assembly_gen.entity_inst_id',
'_pdbx_struct_assembly_gen.asym_id_list',
'_pdbx_struct_assembly_gen.auth_asym_id_list',
'_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
'_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
'_pdbx_struct_group_list.group_enumeration_type',
'_reflns.pdbx_diffrn_id',
'_refine.pdbx_diffrn_id',
'_reflns_shell.pdbx_diffrn_id',
'_struct_keywords.text',
];
const SPACE_SEPARATED_LIST_FIELDS = [
'_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
'_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
'_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM
];
const SEMICOLON_SEPARATED_LIST_FIELDS = [
'_chem_comp.pdbx_synonyms' // GLYCERIN; PROPANE-1,2,3-TRIOL
];
/**
* Useful when a dictionary extension will add enum values to an existing dictionary.
* By adding them here, the dictionary extension can be tested before the added enum
* values are available in the existing dictionary.
*/
const EXTRA_ENUM_VALUES = {};
/**
 * Build a schema description from the save frames of a dictionary.
 *
 * Works in three passes over the frames:
 * 1. create a table entry (description, key names) per category definition,
 * 2. register item definitions by name and record parent/child links between them,
 * 3. create a typed column per item definition.
 *
 * @param frames save frames of the dictionary; each frame is read via `header` and `categories`
 * @param imports map from imported file name to the frames of that file, used to
 *     resolve imports of a frame (defaults to an empty map)
 * @returns `{ tables, aliases }`: `tables` maps a category name to
 *     `{ description, key, columns }`, `aliases` maps `category.item` to alias names
 */
function generateSchema(frames, imports = new Map()) {
    const tables = {};
    const aliases = { ...EXTRA_ALIASES };
    const categories = {};
    const links = {};
    const ctx = { categories, links };

    // get category metadata
    frames.forEach(d => {
        // only category definitions are handled in this pass; frames whose name
        // starts with '_' or contains a '.' are item definitions (handled below)
        if (d.header[0] === '_' || d.header.includes('.')) {
            return;
        }
        const categoryName = d.header.toLowerCase();
        let descriptionField;
        const categoryKeyNames = new Set();
        if ('category' in d.categories && 'category_key' in d.categories) {
            const category = d.categories['category'];
            const categoryKey = d.categories['category_key'];
            if (categoryKey) {
                const categoryKey_names = categoryKey.getField('name');
                if (categoryKey_names) {
                    for (let i = 0, il = categoryKey_names.rowCount; i < il; ++i) {
                        categoryKeyNames.add(categoryKey_names.str(i));
                    }
                }
            }
            descriptionField = category.getField('description');
            if (categoryKeyNames.size === 0) {
                console.log(`no key given for category '${categoryName}'`);
            }
        }
        // a 'description' category takes precedence over 'category.description'
        if ('description' in d.categories) {
            descriptionField = d.categories['description'].getField('text');
        }
        let description = '';
        if (descriptionField) {
            description = descriptionField.str(0).trim()
                .replace(/(\r\n|\r|\n)([ \t]+)/g, '\n'); // remove padding after newlines
        }
        else {
            console.log(`no description given for category '${categoryName}'`);
        }
        tables[categoryName] = { description, key: categoryKeyNames, columns: {} };
    });

    // build list of links between categories
    frames.forEach(d => {
        // skip category definitions, only item definitions are registered here
        if (d.header[0] !== '_' && !d.header.includes('.')) {
            return;
        }
        categories[d.header] = d;
        const item_linked = d.categories['item_linked'];
        if (item_linked) {
            const child_name = item_linked.getField('child_name');
            const parent_name = item_linked.getField('parent_name');
            if (child_name && parent_name) {
                for (let i = 0; i < item_linked.rowCount; ++i) {
                    const childName = child_name.str(i);
                    const parentName = parent_name.str(i);
                    if (childName in links && links[childName] !== parentName) {
                        console.log(`${childName} linked to ${links[childName]}, ignoring link to ${parentName}`);
                    }
                    links[childName] = parentName;
                }
            }
        }
    });

    // get field data
    Object.keys(categories).forEach(fullName => {
        const d = categories[fullName];
        if (!d) {
            console.log(`'${fullName}' not found, moving on`);
            return;
        }
        // split '[_]category.item' into its two parts
        const categoryName = d.header.substring(d.header[0] === '_' ? 1 : 0, d.header.indexOf('.'));
        const itemName = d.header.substring(d.header.indexOf('.') + 1);
        let fields;
        if (categoryName in tables) {
            fields = tables[categoryName].columns;
            tables[categoryName].key.add(itemName);
        }
        else if (categoryName.toLowerCase() in tables) {
            // take case from category name in 'field' data as it is better if data is from cif dictionaries
            tables[categoryName] = tables[categoryName.toLowerCase()];
            fields = tables[categoryName].columns;
        }
        else {
            console.log(`category '${categoryName}' has no metadata`);
            fields = {};
            tables[categoryName] = {
                description: '',
                key: new Set(),
                columns: fields
            };
        }
        const itemAliases = getAliases(d, imports, ctx);
        if (itemAliases) {
            aliases[`${categoryName}.${itemName}`] = itemAliases;
        }
        const description = getDescription(d, imports, ctx) || '';
        // need to use regex to check for matrix or vector items
        // as sub_category assignment is missing for some entries
        const subCategory = getSubCategory(d, imports, ctx);
        if (subCategory === 'cartesian_coordinate' || subCategory === 'fractional_coordinate') {
            fields[itemName] = (0, schema_1.CoordCol)(description);
        }
        else if (FORCE_INT_FIELDS.includes(d.header)) {
            fields[itemName] = (0, schema_1.IntCol)(description);
            console.log(`forcing int: ${d.header}`);
        }
        else if (FORCE_MATRIX_FIELDS.includes(d.header)) {
            // keep the member as float column and add the containing matrix column
            fields[itemName] = (0, schema_1.FloatCol)(description);
            fields[FORCE_MATRIX_FIELDS_MAP[d.header]] = (0, schema_1.MatrixCol)(3, 3, description);
            console.log(`forcing matrix: ${d.header}`);
        }
        else if (subCategory === 'matrix') {
            fields[itemName.replace(reMatrixField, '')] = (0, schema_1.MatrixCol)(3, 3, description);
        }
        else if (subCategory === 'vector') {
            fields[itemName.replace(reVectorField, '')] = (0, schema_1.VectorCol)(3, description);
        }
        else if (itemName.match(reMatrixField)) {
            fields[itemName.replace(reMatrixField, '')] = (0, schema_1.MatrixCol)(3, 3, description);
            console.log(`${d.header} should have 'matrix' _item_sub_category.id`);
        }
        else if (itemName.match(reVectorField)) {
            fields[itemName.replace(reVectorField, '')] = (0, schema_1.VectorCol)(3, description);
            console.log(`${d.header} should have 'vector' _item_sub_category.id`);
        }
        else {
            const code = getCode(d, imports, ctx);
            if (code) {
                let fieldType = getFieldType(code[0], description, code[1], code[2]);
                // plain string columns may be overridden by one of the list field tables
                if (fieldType.type === 'str') {
                    if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) {
                        fieldType = (0, schema_1.ListCol)('str', ',', description);
                        console.log(`forcing comma separated: ${d.header}`);
                    }
                    else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) {
                        fieldType = (0, schema_1.ListCol)('str', ' ', description);
                        console.log(`forcing space separated: ${d.header}`);
                    }
                    else if (SEMICOLON_SEPARATED_LIST_FIELDS.includes(d.header)) {
                        fieldType = (0, schema_1.ListCol)('str', ';', description);
                        console.log(`forcing semicolon separated: ${d.header}`);
                    }
                }
                if (d.header in EXTRA_ENUM_VALUES) {
                    if (fieldType.type === 'enum') {
                        fieldType.values.push(...EXTRA_ENUM_VALUES[d.header]);
                    }
                    else {
                        console.warn(`expected enum: ${d.header}`);
                    }
                }
                fields[itemName] = fieldType;
            }
            else {
                // type code could not be determined, fall back to a string column
                fields[itemName] = (0, schema_1.StrCol)(description);
            }
        }
    });
    return { tables, aliases };
}