UNPKG

iham-parsers

Version:

orthoxml, phyloxml and newick parsers for the iHam widget

1,436 lines (1,292 loc) 48.2 kB
/** * Copyright (C) 2017 Christian M. Zmasek * Copyright (C) 2017 J. Craig Venter Institute * All rights reserved * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA * * Created by czmasek on 7/7/2016. */ /** * * Version 0.912 20171013 * * This requires sax-js from https://github.com/isaacs/sax-js * * Usage * ----- * * Synchronous parsing of phyloXML-formatted string * ------------------------------------------------ * * var px = require('./phyloxml').phyloXml; * * var phys = px.parse(phyloxmlText, {trim: true, normalize: true}); * * console.log(px.toPhyloXML(phys[0], 6)); * * * * Asynchronous parsing of phyloXML-formatted stream * ------------------------------------------------- * * var fs = require('fs'); * var px = require('./phyloxml').phyloXml; * * var stream = fs.createReadStream(xmlFile, {encoding: 'utf8'}); * * px.parseAsync(stream, {trim: true, normalize: true}); * * * */ (function phyloXml() { "use strict"; var sax = require('sax'); // -------------------------------------------------------------- // phyloXML constants // -------------------------------------------------------------- // Accession var ACCESSION = 'accession'; var ACCESSION_SOURCE_ATTR = 'source'; var ACCESSION_COMMENT_ATTR = 'comment'; // Annotation var ANNOTATION = 'annotation'; var ANNOTATION_REF_ATTR = 'ref'; var ANNOTATION_SOURCE_ATTR = 'source'; var ANNOTATION_EVIDENCE_ATTR = 'evidence'; var ANNOTATION_TYPE_ATTR = 'type'; var ANNOTATION_DESC = 'desc'; // Clade var CLADE = 'clade'; var CLADE_BRANCH_LENGTH = 'branch_length'; var CLADE_ID_SOURCE_ATTR = 'id_source'; var CLADE_COLLAPSE_ATTR = 'collapse'; var CLADE_NAME = 'name'; var CLADE_WIDTH = 'width'; // Clade Relation var CLADE_RELATION = 'clade_relation'; var CLADE_RELATION_DISTANCE_ATTR = 'distance'; var CLADE_RELATION_ID_REF_0_ATTR = 'id_ref_0'; var CLADE_RELATION_ID_REF_1_ATTR = 'id_ref_1'; var CLADE_RELATION_TYPE_ATTR = 'type'; // Sequence Relation var SEQUENCE_RELATION = 'sequence_relation'; var SEQUENCE_RELATION_DISTANCE_ATTR = 'distance'; var SEQUENCE_RELATION_ID_REF_0_ATTR = 'id_ref_0'; var SEQUENCE_RELATION_ID_REF_1_ATTR = 'id_ref_1'; var SEQUENCE_RELATION_TYPE_ATTR = 'type'; // Color var COLOR = 'color'; var COLOR_RED = 'red'; var COLOR_GREEN = 'green'; var COLOR_BLUE = 'blue'; var COLOR_ALPHA = 'alpha'; // Confidence var CONFIDENCE = 'confidence'; var CONFIDENCE_TYPE_ATTR = 'type'; var CONFIDENCE_STDDEV_ATTR = 'stddev'; var CONFIDENCES = 'confidences'; // Cross References var CROSS_REFERENCES = 'cross_references'; // Date var DATE = 'date'; var DATE_UNIT_ATTR = 'unit'; var DATE_DESC = 'desc'; var DATE_VALUE = 'value'; var DATE_MINIMUM = 'minimum'; var DATE_MAXIMUM = 'maximum'; // Distribution var DISTRIBUTION = 'distribution'; var DISTRIBUTION_DESC = 'desc'; // Domain Architecture var DOMAIN_ARCHITECTURE = 'domain_architecture'; var DOMAIN_ARCHITECTURE_LENGTH_ATTR = 'length'; // Events var EVENTS = 'events'; var EVENTS_TYPE = 'type'; var EVENTS_DUPLICATIONS = 'duplications'; var EVENTS_SPECIATIONS = 'speciations'; var EVENTS_LOSSES = 'losses'; // Id var ID = 'id'; var ID_PROVIDER_ATTR = 'provider'; // Mol Seq var MOLSEQ = 'mol_seq'; var MOLSEQ_IS_ALIGNED_ATTR = 'is_aligned'; // Phylogeny var PHYLOGENY = 'phylogeny'; // Phyloxml var PHYLOXML = 'phyloxml'; // Point var POINT = 'point'; var POINT_ALT_UNIT_ATTR = 'alt_unit'; var POINT_GEODETIC_DATUM_ATTR = 'geodetic_datum'; var POINT_LAT = 'lat'; var POINT_LONG = 'long'; var POINT_ALT = 'alt'; // Property var PROPERTY = 'property'; var PROPERTY_REF_ATTR = 'ref'; var PROPERTY_ID_REF_ATTR = 'id_ref'; var PROPERTY_UNIT_ATTR = 'unit'; var PROPERTY_DATATYPE_ATTR = 'datatype'; var PROPERTY_APPLIES_TO_ATTR = 'applies_to'; var PROPERTIES = 'properties'; // Protein Domain var PROTEINDOMAIN = 'domain'; var PROTEINDOMAIN_FROM_ATTR = 'from'; var PROTEINDOMAIN_TO_ATTR = 'to'; var PROTEINDOMAIN_CONFIDENCE_ATTR = 'confidence'; var PROTEINDOMAIN_ID_ATTR = 'id'; // Reference var REFERENCE = 'reference'; var REFERENCE_DOI_ATTR = 'doi'; var REFERENCE_DESC = 'desc'; // Sequence var SEQUENCE = 'sequence'; var SEQUENCE_ID_SOURCE_ATTR = 'id_source'; var SEQUENCE_ID_REF_ATTR = 'id_ref'; var SEQUENCE_TYPE_ATTR = 'type'; var SEQUENCE_SYMBOL = 'symbol'; var SEQUENCE_NAME = 'name'; var SEQUENCE_GENE_NAME = 'gene_name'; var SEQUENCE_LOCATION = 'location'; var SEQUENCES = 'sequences'; // Taxonomy var TAXONOMY = 'taxonomy'; var TAXONOMY_ID_SOURCE_ATTR = 'id_source'; var TAXONOMY_CODE = 'code'; var TAXONOMY_SCIENTIFIC_NAME = 'scientific_name'; var TAXONOMY_AUTHORITY = 'authority'; var TAXONOMY_COMMON_NAME = 'common_name'; var TAXONOMY_SYNONYM = 'synonym'; var TAXONOMY_RANK = 'rank'; var TAXONOMIES = 'taxonomies'; var TAXONOMY_SYNONYMS = 'synonyms'; // Uri var URI = 'uri'; var URI_TYPE_ATTR = 'type'; var URI_DESC_ATTR = 'desc'; // Phylogeny var PHYLOGENY_ROOTED_ATTR = 'rooted'; var PHYLOGENY_REROOTABLE_ATTR = 'rerootable'; var PHYLOGENY_BRANCH_LENGTH_UNIT_ATTR = 'branch_length_unit'; var PHYLOGENY_TYPE_ATTR = 'type'; var PHYLOGENY_NAME = 'name'; var PHYLOGENY_DESCRIPTION = 'description'; var PHYLOGENY_DATE = 'date'; // Simple_Characteristics (to be deprecated!) var X_SIMPLE_CHARACTERISTICS = 'Simple_Characteristics'; var X_SIMPLE_CHARACTERISTIC_COUNTRY = 'Country'; var X_SIMPLE_CHARACTERISTIC_YEAR = 'Year'; var X_SIMPLE_CHARACTERISTIC_HOST = 'Host'; var X_SIMPLE_CHARACTERISTIC_HA = 'HA'; var X_SIMPLE_CHARACTERISTIC_NA = 'NA'; // appType (special for Virus BRC) var APPTYPE = 'flu_type'; // Unknown source, id, confidence type: var UNKNOWN = 'unknown'; // -------------------------------------------------------------- // Instance variables // -------------------------------------------------------------- var _phylogenies = null; var _phylogeny = null; var _cladeStack = null; var _tagStack = null; var _objectStack = null; // -------------------------------------------------------------- // Others // -------------------------------------------------------------- var PROPERTY_REF_RE = /[a-zA-Z0-9_]+:\S+/; var PROPERTY_UNIT_RE = /[a-zA-Z0-9_]+:\S+/; var PROPERTY_DATATYPE_RE = /xsd:\S+/; // -------------------------------------------------------------- // Functions for object creation // -------------------------------------------------------------- function newAccession(tag) { var parent = _tagStack.get(1); if (!(parent === SEQUENCE || parent === CROSS_REFERENCES)) { throw new PhyloXmlError("found accession outside of sequence or cross-references"); } var acc = {}; acc.value = null; acc.source = getAttribute(ACCESSION_SOURCE_ATTR, tag.attributes); acc.comment = getAttribute(ACCESSION_COMMENT_ATTR, tag.attributes); if (!acc.source) { acc.source = UNKNOWN; } if (parent === SEQUENCE) { getCurrentObject().accession = acc; } else { addToArrayInCurrentObjectUnnamed(acc); } _objectStack.push(acc); } function newAnnotation(tag) { var parent = _tagStack.get(1); if (parent != SEQUENCE) { throw new PhyloXmlError("found annotation outside of sequence"); } var ann = {}; ann.evidence = getAttribute(ANNOTATION_EVIDENCE_ATTR, tag.attributes); ann.ref = getAttribute(ANNOTATION_REF_ATTR, tag.attributes); ann.source = getAttribute(ANNOTATION_SOURCE_ATTR, tag.attributes); ann.type = getAttribute(ANNOTATION_TYPE_ATTR, tag.attributes); addToArrayInCurrentObject('annotations', ann); _objectStack.push(ann); } function newBranchColor() { var parent = _tagStack.get(1); if (parent != CLADE) { throw new PhyloXmlError("found branch color outside of clade"); } var col = {}; col.red = 0; col.green = 0; col.blue = 0; getCurrentObject().color = col; _objectStack.push(col); } function newClade(tag) { var newClade = {}; newClade.branch_length = getAttributeAsFloat(CLADE_BRANCH_LENGTH, tag.attributes); newClade.collapse = getAttributeAsBoolean(CLADE_COLLAPSE_ATTR, tag.attributes); if (CLADE_ID_SOURCE_ATTR in tag.attributes) { newClade.id_source = tag.attributes[CLADE_ID_SOURCE_ATTR]; } if (_phylogeny === null) { var phylogeny_data = _objectStack.pop(); if (!_objectStack.isEmpty()) { throw new PhyloXmlError('severe phyloXML format error'); } _phylogeny = phylogeny_data; _phylogeny.children = [newClade]; } else { var currClade = getCurrentClade(); if (currClade.children === undefined) { currClade.children = [newClade]; } else { currClade.children.push(newClade); } } _cladeStack.push(newClade); _objectStack.push(newClade); } function newCladeRelation(tag) { var cr = {}; cr.distance = getAttributeAsFloat(CLADE_RELATION_DISTANCE_ATTR, tag.attributes); cr.id_ref_0 = getAttribute(CLADE_RELATION_ID_REF_0_ATTR, tag.attributes); cr.id_ref_1 = getAttribute(CLADE_RELATION_ID_REF_1_ATTR, tag.attributes); cr.type = getAttribute(CLADE_RELATION_TYPE_ATTR, tag.attributes); addToArrayInCurrentObject('clade_relations', cr); _objectStack.push(cr); } function newSequenceRelation(tag) { var sr = {}; sr.distance = getAttributeAsFloat(SEQUENCE_RELATION_DISTANCE_ATTR, tag.attributes); sr.id_ref_0 = getAttribute(SEQUENCE_RELATION_ID_REF_0_ATTR, tag.attributes); sr.id_ref_1 = getAttribute(SEQUENCE_RELATION_ID_REF_1_ATTR, tag.attributes); sr.type = getAttribute(SEQUENCE_RELATION_TYPE_ATTR, tag.attributes); addToArrayInCurrentObject('sequence_relations', sr); _objectStack.push(sr); } function newConfidence(tag) { var conf = {}; conf.value = null; conf.type = getAttribute(CONFIDENCE_TYPE_ATTR, tag.attributes); conf.stddev = getAttributeAsFloat(CONFIDENCE_STDDEV_ATTR, tag.attributes); var parent = _tagStack.get(1); if (parent === CLADE || parent === PHYLOGENY) { addToArrayInCurrentObject(CONFIDENCES, conf); } else if (parent === ANNOTATION || parent === EVENTS || parent === CLADE_RELATION || parent === SEQUENCE_RELATION) { getCurrentObject().confidence = conf; } _objectStack.push(conf); } function newCrossReferences() { var parent = _tagStack.get(1); if (parent != SEQUENCE) { throw new PhyloXmlError("found cross-reference outside of sequence"); } var xrefs = []; getCurrentObject().cross_references = xrefs; _objectStack.push(xrefs); } function newDate(tag) { var date = {}; date.unit = getAttribute(DATE_UNIT_ATTR, tag.attributes); getCurrentObject().date = date; _objectStack.push(date); } function newDistribution(tag) { var dist = {}; dist.desc = null; dist.unit = getAttribute(DATE_UNIT_ATTR, tag.attributes); addToArrayInCurrentObject('distributions', dist); _objectStack.push(dist); } function newDomainArchitecture(tag) { var da = {}; da.domains = null; da.length = getAttributeAsInt(DOMAIN_ARCHITECTURE_LENGTH_ATTR, tag.attributes); getCurrentObject().domain_architecture = da; _objectStack.push(da); } function newEvents() { var events = {}; getCurrentObject().events = events; _objectStack.push(events); } function newId(tag) { var i = {}; i.value = null; i.provider = getAttribute(ID_PROVIDER_ATTR, tag.attributes); getCurrentObject().id = i; _objectStack.push(i); } function newMolecularSequence(tag) { var mol_seq = {}; mol_seq.is_aligned = getAttributeAsBoolean(MOLSEQ_IS_ALIGNED_ATTR, tag.attributes); getCurrentObject().mol_seq = mol_seq; _objectStack.push(mol_seq); } function newPoint(tag) { var p = {}; p.alt_unit = getAttribute(POINT_ALT_UNIT_ATTR, tag.attributes); p.geodetic_datum = getAttribute(POINT_GEODETIC_DATUM_ATTR, tag.attributes); var parent = _tagStack.get(1); if (parent === DISTRIBUTION) { addToArrayInCurrentObject('points', p); } _objectStack.push(p); } function newProperty(tag) { var prop = {}; prop.ref = getAttribute(PROPERTY_REF_ATTR, tag.attributes); prop.unit = getAttribute(PROPERTY_UNIT_ATTR, tag.attributes); prop.datatype = getAttribute(PROPERTY_DATATYPE_ATTR, tag.attributes); prop.applies_to = getAttribute(PROPERTY_APPLIES_TO_ATTR, tag.attributes); prop.id_ref = getAttribute(PROPERTY_ID_REF_ATTR, tag.attributes); if (!prop.ref) { throw new PhyloXmlError('property ref is missing'); } if (!prop.datatype) { throw new PhyloXmlError('property data-type is missing'); } if (!prop.applies_to) { throw new PhyloXmlError('property applies-to is missing'); } if (!PROPERTY_REF_RE.test(prop.ref)) { throw new PhyloXmlError('property ref is ill-formatted: ' + prop.ref); } if (!PROPERTY_DATATYPE_RE.test(prop.datatype)) { throw new PhyloXmlError('property data-type is ill-formatted: ' + prop.datatype); } if (prop.unit && !PROPERTY_UNIT_RE.test(prop.unit)) { throw new PhyloXmlError('property unit is ill-formatted: ' + prop.unit); } addToArrayInCurrentObject(PROPERTIES, prop); _objectStack.push(prop); } function newProteinDomain(tag) { var pd = {}; pd.name = null; pd.from = getAttributeAsInt(PROTEINDOMAIN_FROM_ATTR, tag.attributes); pd.to = getAttributeAsInt(PROTEINDOMAIN_TO_ATTR, tag.attributes); pd.confidence = getAttributeAsFloat(PROTEINDOMAIN_CONFIDENCE_ATTR, tag.attributes); pd.id = getAttribute(PROTEINDOMAIN_ID_ATTR, tag.attributes); addToArrayInCurrentObject('domains', pd); _objectStack.push(pd); } function newReference(tag) { var reference = {}; reference.doi = getAttribute(REFERENCE_DOI_ATTR, tag.attributes); addToArrayInCurrentObject('references', reference); _objectStack.push(reference); } function newSequence(tag) { var seq = {}; seq.type = getAttribute(SEQUENCE_TYPE_ATTR, tag.attributes); seq.id_source = getAttribute(SEQUENCE_ID_SOURCE_ATTR, tag.attributes); seq.id_ref = getAttribute(SEQUENCE_ID_REF_ATTR, tag.attributes); addToArrayInCurrentObject(SEQUENCES, seq); _objectStack.push(seq); } function newTaxonomy(tag) { var tax = {}; tax.id_source = getAttribute(TAXONOMY_ID_SOURCE_ATTR, tag.attributes); addToArrayInCurrentObject(TAXONOMIES, tax); _objectStack.push(tax); } function newUri(tag) { var uri = {}; uri.value = null; uri.desc = getAttribute(URI_DESC_ATTR, tag.attributes); uri.type = getAttribute(URI_TYPE_ATTR, tag.attributes); addToArrayInCurrentObject('uris', uri); _objectStack.push(uri); } function newPhylogeny(tag) { var phy = {}; phy.rooted = getAttributeAsBoolean(PHYLOGENY_ROOTED_ATTR, tag.attributes); if (phy.rooted === undefined) { phy.rooted = true; } phy.rerootable = getAttributeAsBoolean(PHYLOGENY_REROOTABLE_ATTR, tag.attributes); if (phy.rerootable === undefined) { phy.rerootable = true; } phy.branch_length_unit = getAttribute(PHYLOGENY_BRANCH_LENGTH_UNIT_ATTR, tag.attributes); phy.type = getAttribute(PHYLOGENY_TYPE_ATTR, tag.attributes); _objectStack.push(phy); } function newSimpleCharacteristics() { //To be deprecated. var sc = {}; getCurrentObject().simple_characteristics = sc; _objectStack.push(sc); } // -------------------------------------------------------------- // Functions for processing text // -------------------------------------------------------------- function inAccession(text) { getCurrentObject().value = text; } function inAnnotation(text) { if (getCurrentTag() === ANNOTATION_DESC) { getCurrentObject().desc = text; } } function inAppType(text) { if (getCurrentTag() === APPTYPE) { getCurrentObject().desc = text; } } function inBranchColor(text) { if (getCurrentTag() === COLOR_RED) { getCurrentObject().red = parseIntNumber(text); } else if (getCurrentTag() === COLOR_GREEN) { getCurrentObject().green = parseIntNumber(text); } else if (getCurrentTag() === COLOR_BLUE) { getCurrentObject().blue = parseIntNumber(text); } if (getCurrentTag() === COLOR_ALPHA) { getCurrentObject().alpha = parseIntNumber(text); } } function inClade(text) { if (getCurrentTag() === CLADE_NAME) { getCurrentClade().name = text; } else if (getCurrentTag() === CLADE_BRANCH_LENGTH) { getCurrentClade().branch_length = parseFloatNumber(text); } else if (getCurrentTag() === CLADE_WIDTH) { getCurrentClade().width = parseFloatNumber(text); } } function inConfidence(text) { getCurrentObject().value = parseFloatNumber(text); } function inDate(text) { if (getCurrentTag() === DATE_DESC) { getCurrentObject().desc = text; } else if (getCurrentTag() === DATE_VALUE) { getCurrentObject().value = parseFloatNumber(text); } else if (getCurrentTag() === DATE_MINIMUM) { getCurrentObject().minimum = parseFloatNumber(text); } else if (getCurrentTag() === DATE_MAXIMUM) { getCurrentObject().maximum = parseFloatNumber(text); } } function inDistribution(text) { if (getCurrentTag() === DISTRIBUTION_DESC) { getCurrentObject().desc = text; } } function inEvents(text) { if (getCurrentTag() === EVENTS_TYPE) { getCurrentObject().type = text; } else if (getCurrentTag() === EVENTS_DUPLICATIONS) { getCurrentObject().duplications = parseIntNumber(text); } else if (getCurrentTag() === EVENTS_SPECIATIONS) { getCurrentObject().speciations = parseIntNumber(text); } else if (getCurrentTag() === EVENTS_LOSSES) { getCurrentObject().losses = parseIntNumber(text); } } function inId(text) { getCurrentObject().value = text; } function inMolecularSequence(text) { getCurrentObject().value = text; } function inPoint(text) { if (getCurrentTag() === POINT_LAT) { getCurrentObject().lat = text; } else if (getCurrentTag() === POINT_LONG) { getCurrentObject().long = text; } else if (getCurrentTag() === POINT_ALT) { getCurrentObject().alt = text; } } function inProperty(text) { getCurrentObject().value = text; } function inProteinDomain(text) { getCurrentObject().name = text; } function inPhylogeny(text) { if (getCurrentTag() === PHYLOGENY_NAME) { getCurrentObject().name = text; } else if (getCurrentTag() === PHYLOGENY_DESCRIPTION) { getCurrentObject().description = text; } else if (getCurrentTag() === PHYLOGENY_DATE) { getCurrentObject().date = text; } } function inReference(text) { if (getCurrentTag() === REFERENCE_DESC) { getCurrentObject().desc = text; } } function inSequence(text) { if (getCurrentTag() === SEQUENCE_SYMBOL) { getCurrentObject().symbol = text; } else if (getCurrentTag() === SEQUENCE_NAME) { getCurrentObject().name = text; } else if (getCurrentTag() === SEQUENCE_GENE_NAME) { getCurrentObject().gene_name = text; } else if (getCurrentTag() === SEQUENCE_LOCATION) { getCurrentObject().location = text; } } function inTaxonomy(text) { if (getCurrentTag() === TAXONOMY_CODE) { getCurrentObject().code = text; } else if (getCurrentTag() === TAXONOMY_SCIENTIFIC_NAME) { getCurrentObject().scientific_name = text; } else if (getCurrentTag() === TAXONOMY_AUTHORITY) { getCurrentObject().authority = text; } else if (getCurrentTag() === TAXONOMY_COMMON_NAME) { getCurrentObject().common_name = text; } else if (getCurrentTag() === TAXONOMY_SYNONYM) { addToArrayInCurrentObject(TAXONOMY_SYNONYMS, text); } else if (getCurrentTag() === TAXONOMY_RANK) { getCurrentObject().rank = text; } } function inUri(text) { getCurrentObject().value = text; } function inSimpleCharacteristics(text) { //To be deprecated. if (getCurrentTag() === X_SIMPLE_CHARACTERISTIC_COUNTRY) { getCurrentObject().country = text; } else if (getCurrentTag() === X_SIMPLE_CHARACTERISTIC_HOST) { getCurrentObject().host = text; } else if (getCurrentTag() === X_SIMPLE_CHARACTERISTIC_YEAR) { getCurrentObject().year = text; } else if (getCurrentTag() === X_SIMPLE_CHARACTERISTIC_HA) { getCurrentObject().ha = text; } else if (getCurrentTag() === X_SIMPLE_CHARACTERISTIC_NA) { getCurrentObject().na = text; } } // -------------------------------------------------------------- // Functions for SAX parser // -------------------------------------------------------------- function phyloxmlOnopentag(tag) { _tagStack.push(tag.name); switch (tag.name) { case CLADE: newClade(tag); break; case ACCESSION: newAccession(tag); break; case ANNOTATION: newAnnotation(tag); break; case CLADE_RELATION: if (_tagStack.get(1) === PHYLOGENY) { newCladeRelation(tag); } break; case COLOR: newBranchColor(); break; case CONFIDENCE: newConfidence(tag); break; case CROSS_REFERENCES: newCrossReferences(); break; case DATE: if (_tagStack.get(1) === CLADE) { newDate(tag); } break; case DISTRIBUTION: newDistribution(tag); break; case DOMAIN_ARCHITECTURE: newDomainArchitecture(tag); break; case EVENTS: newEvents(); break; case ID: newId(tag); break; case MOLSEQ: newMolecularSequence(tag); break; case POINT: newPoint(tag); break; case PROTEINDOMAIN: newProteinDomain(tag); break; case PHYLOGENY: newPhylogeny(tag); break; case PROPERTY: newProperty(tag); break; case REFERENCE: newReference(tag); break; case SEQUENCE: newSequence(tag); break; case SEQUENCE_RELATION: if (_tagStack.get(1) === PHYLOGENY) { newSequenceRelation(tag); } break; case TAXONOMY: newTaxonomy(tag); break; case URI: newUri(tag); break; case X_SIMPLE_CHARACTERISTICS: //To be deprecated. newSimpleCharacteristics(); break; default: } } function phyloxmlOnclosetag(tag) { if (tag === CLADE) { _tagStack.pop(); _objectStack.pop(); _cladeStack.pop(); } else if ( tag === ACCESSION || tag === ANNOTATION || ( tag === CLADE_RELATION && (_tagStack.get(1) === PHYLOGENY) ) || tag === COLOR || tag === CONFIDENCE || tag === CROSS_REFERENCES || ( tag === DATE && (_tagStack.get(1) === CLADE) ) || tag === DISTRIBUTION || tag === TAXONOMY || tag === ID || tag === EVENTS || tag === MOLSEQ || tag === REFERENCE || tag === DOMAIN_ARCHITECTURE || tag === PROTEINDOMAIN || tag === SEQUENCE || ( tag === SEQUENCE_RELATION && (_tagStack.get(1) === PHYLOGENY) ) || tag === PROPERTY || tag === POINT || tag === URI || tag === X_SIMPLE_CHARACTERISTICS) { _tagStack.pop(); _objectStack.pop(); } else if (!(tag === PHYLOGENY || tag === PHYLOXML)) { _tagStack.pop(); } else if (tag === PHYLOGENY) { phyloxmlOnClosetagSanityCheck(); _phylogenies.push(_phylogeny); startNewPhylogeny(); } } function phyloxmlOntext(text) { var parentTag = _tagStack.get(1); var currentTag = _tagStack.peek(); if (parentTag === CLADE) { inClade(text); } else if (parentTag === ANNOTATION) { inAnnotation(text); } else if (parentTag === COLOR) { inBranchColor(text); } else if (parentTag === DATE) { inDate(text); } else if (parentTag === DISTRIBUTION) { inDistribution(text); } else if (parentTag === EVENTS) { inEvents(text); } else if (parentTag === REFERENCE) { inReference(text); } else if (parentTag === PHYLOGENY) { inPhylogeny(text); } else if (parentTag === POINT) { inPoint(text); } else if (parentTag === SEQUENCE) { inSequence(text); } else if (parentTag === TAXONOMY) { inTaxonomy(text); } if (currentTag === ACCESSION) { inAccession(text); } else if (currentTag === APPTYPE) { inAppType(text); } else if (currentTag === CONFIDENCE) { inConfidence(text); } else if (currentTag === ID) { inId(text); } else if (currentTag === MOLSEQ) { inMolecularSequence(text); } else if (currentTag === PROTEINDOMAIN) { inProteinDomain(text); } else if (currentTag === PROPERTY) { inProperty(text); } else if (currentTag === URI) { inUri(text); } else if (parentTag === X_SIMPLE_CHARACTERISTICS) { inSimpleCharacteristics(text); } } function phyloxmlOnerror(error) { throw error; } function addPhyloxmlParserEvents(sax_parser) { sax_parser.onopentag = phyloxmlOnopentag; sax_parser.onclosetag = phyloxmlOnclosetag; sax_parser.ontext = phyloxmlOntext; sax_parser.onerror = phyloxmlOnerror; // Ignoring: oncdata, oncomment, ondoctype } // -------------------------------------------------------------- // Helper functions // -------------------------------------------------------------- function getCurrentClade() { return _cladeStack.peek(); } function getCurrentTag() { return _tagStack.peek(); } function getCurrentObject() { return _objectStack.peek(); } function getAttribute(attribute_name, attributes) { if (attribute_name in attributes) { return attributes[attribute_name]; } return undefined; } function getAttributeAsInt(attribute_name, attributes) { if (attribute_name in attributes) { return parseIntNumber(attributes[attribute_name]); } return undefined; } function getAttributeAsFloat(attribute_name, attributes) { if (attribute_name in attributes) { return parseFloatNumber(attributes[attribute_name]); } return undefined; } function getAttributeAsBoolean(attribute_name, attributes) { if (attribute_name in attributes) { return parseBoolean(attributes[attribute_name]); } return undefined; } function addToArrayInCurrentObject(name, value) { var parent = null; if (getCurrentObject()) { parent = getCurrentObject(); } else { parent = _phylogeny; } var ary = parent[name]; if (ary) { ary.push(value); } else { parent[name] = [value]; } } function addToArrayInCurrentObjectUnnamed(value) { var obj = getCurrentObject(); obj.push(value); } function parseFloatNumber(text) { var f = parseFloat(text); if (isNaN(f)) { throw new PhyloXmlError("could not parse floating point number from '" + text + "'"); } return f; } function parseIntNumber(text) { var i = parseInt(text); if (isNaN(i)) { throw new PhyloXmlError("could not parse integer number from '" + text + "'"); } return i; } function parseBoolean(text) { if (text === 'true') { return true; } else if (text === 'false') { return false; } else { throw new PhyloXmlError("could not parse boolean from '" + text + "'"); } } function startNewPhylogeny() { _phylogeny = null; _cladeStack = new Stack(); _tagStack = new Stack(); _objectStack = new Stack(); } function phyloxmlOnClosetagSanityCheck() { if (!(_cladeStack.isEmpty() && _objectStack.isEmpty() )) { throw new PhyloXmlError('severe phyloXML format error'); } } function finalSanityCheck() { if (!_tagStack.isEmpty()) { throw new PhyloXmlError('severe phyloXML format error'); } } // -------------------------------------------------------------- // Stack // -------------------------------------------------------------- function Stack() { this._stack = []; this.pop = function () { var p = this._stack.pop(); if (p === undefined) { throw new Error('severe phyloXML format error'); } return p; }; this.push = function (item) { this._stack.push(item); }; this.peek = function () { return this._stack[this._stack.length - 1]; }; this.get = function (i) { return this._stack[this._stack.length - (1 + i)]; }; this.length = function () { return this._stack.length; }; this.isEmpty = function () { return this._stack.length < 1; }; } // -------------------------------------------------------------- // phyloXML error // -------------------------------------------------------------- function PhyloXmlError(message) { this.name = 'phyloXmlError'; this.message = message || 'phyloXML format error'; } PhyloXmlError.prototype = Object.create(Error.prototype); // -------------------------------------------------------------- // To phyloXML // -------------------------------------------------------------- phyloXml.toPhyloXML_ = function (phy, dec) { var x = ''; var ind = ''; openPhyloXml(); openPhylogeny(phy, [PHYLOGENY_ROOTED_ATTR, PHYLOGENY_REROOTABLE_ATTR, PHYLOGENY_BRANCH_LENGTH_UNIT_ATTR, PHYLOGENY_TYPE_ATTR]); addSingleElement(PHYLOGENY_NAME, phy.name); addSingleElement(PHYLOGENY_DESCRIPTION, phy.description); addSingleElement(PHYLOGENY_DATE, phy.date); if (phy.children && phy.children.length === 1) { toPhyloXMLhelper(phy.children[0]); } closePhylogeny(); closePhyloXml(); return x; function toPhyloXMLhelper(node) { var l; var i; openClade(node, [CLADE_ID_SOURCE_ATTR, CLADE_COLLAPSE_ATTR]); addSingleElement(CLADE_NAME, node.name); if (node[CLADE_BRANCH_LENGTH]) { addSingleElement(CLADE_BRANCH_LENGTH, (dec && dec > 0) ? roundNumber(node[CLADE_BRANCH_LENGTH], dec) : node[CLADE_BRANCH_LENGTH]); } if (node[CONFIDENCES] && node[CONFIDENCES].length > 0) { l = node[CONFIDENCES].length; for (i = 0; i < l; ++i) { var conf = node[CONFIDENCES][i]; if (!conf[CONFIDENCE_TYPE_ATTR]) { conf[CONFIDENCE_TYPE_ATTR] = UNKNOWN; } addSingleElement(CONFIDENCE, conf.value, conf, [CONFIDENCE_TYPE_ATTR, CONFIDENCE_STDDEV_ATTR]); } } addSingleElement(CLADE_WIDTH, node[CLADE_WIDTH]); if (node[COLOR]) { var col = node[COLOR]; open(COLOR); addSingleElement(COLOR_RED, col[COLOR_RED]); addSingleElement(COLOR_GREEN, col[COLOR_GREEN]); addSingleElement(COLOR_BLUE, col[COLOR_BLUE]); addSingleElement(COLOR_ALPHA, col[COLOR_ALPHA]); close(COLOR); } if (node[TAXONOMIES] && node[TAXONOMIES].length > 0) { l = node[TAXONOMIES].length; for (i = 0; i < l; ++i) { var tax = node[TAXONOMIES][i]; open(TAXONOMY, tax, [TAXONOMY_ID_SOURCE_ATTR]); if (tax[ID]) { if (!tax[ID][ID_PROVIDER_ATTR]) { tax[ID][ID_PROVIDER_ATTR] = UNKNOWN; } addSingleElement(ID, tax[ID].value, tax[ID], [ID_PROVIDER_ATTR]); } addSingleElement(TAXONOMY_CODE, tax[TAXONOMY_CODE]); addSingleElement(TAXONOMY_SCIENTIFIC_NAME, tax[TAXONOMY_SCIENTIFIC_NAME]); addSingleElement(TAXONOMY_AUTHORITY, tax[TAXONOMY_AUTHORITY]); addSingleElement(TAXONOMY_COMMON_NAME, tax[TAXONOMY_COMMON_NAME]); if (tax[TAXONOMY_SYNONYMS] && tax[TAXONOMY_SYNONYMS].length > 0) { var ll = tax[TAXONOMY_SYNONYMS].length; for (var ii = 0; ii < ll; ++ii) { addSingleElement(TAXONOMY_SYNONYM, tax[TAXONOMY_SYNONYMS][ii]); } } addSingleElement(TAXONOMY_RANK, tax[TAXONOMY_RANK]); close(TAXONOMY); } } if (node[SEQUENCES] && node[SEQUENCES].length > 0) { l = node[SEQUENCES].length; for (i = 0; i < l; ++i) { var seq = node[SEQUENCES][i]; open(SEQUENCE, seq, [SEQUENCE_TYPE_ATTR, SEQUENCE_ID_SOURCE_ATTR, SEQUENCE_ID_REF_ATTR]); addSingleElement(SEQUENCE_SYMBOL, seq[SEQUENCE_SYMBOL]); if (seq[ACCESSION]) { if (!seq[ACCESSION][ACCESSION_SOURCE_ATTR]) { seq[ACCESSION][ACCESSION_SOURCE_ATTR] = UNKNOWN; } addSingleElement(ACCESSION, seq[ACCESSION].value, seq[ACCESSION], [ACCESSION_SOURCE_ATTR, ACCESSION_COMMENT_ATTR]); } addSingleElement(SEQUENCE_NAME, seq[SEQUENCE_NAME]); addSingleElement(SEQUENCE_GENE_NAME, seq[SEQUENCE_GENE_NAME]); addSingleElement(SEQUENCE_LOCATION, seq[SEQUENCE_LOCATION]); if (seq[MOLSEQ]) { addSingleElement(MOLSEQ, seq[MOLSEQ].value, seq[MOLSEQ], [MOLSEQ_IS_ALIGNED_ATTR]); } close(SEQUENCE); } } if (node[EVENTS]) { var ev = node[EVENTS]; open(EVENTS); addSingleElement(EVENTS_TYPE, ev[EVENTS_TYPE]); addSingleElement(EVENTS_DUPLICATIONS, ev[EVENTS_DUPLICATIONS]); addSingleElement(EVENTS_SPECIATIONS, ev[EVENTS_SPECIATIONS]); addSingleElement(EVENTS_LOSSES, ev[EVENTS_LOSSES]); if (ev[CONFIDENCE]) { var evconf = ev[CONFIDENCE]; addSingleElement(CONFIDENCE, evconf.value, evconf, [CONFIDENCE_TYPE_ATTR, CONFIDENCE_STDDEV_ATTR]); } close(EVENTS); } if (node[PROPERTIES] && node[PROPERTIES].length > 0) { l = node[PROPERTIES].length; for (i = 0; i < l; ++i) { var prop = node[PROPERTIES][i]; if (!prop[PROPERTY_APPLIES_TO_ATTR]) { throw new PhyloXmlError("property applies-to is missing"); } if (!prop[PROPERTY_DATATYPE_ATTR]) { throw new PhyloXmlError("property data-type is missing"); } if (!prop[PROPERTY_REF_ATTR]) { throw new PhyloXmlError("property ref is missing"); } addSingleElement(PROPERTY, prop.value, prop, [PROPERTY_REF_ATTR, PROPERTY_UNIT_ATTR, PROPERTY_DATATYPE_ATTR, PROPERTY_APPLIES_TO_ATTR, PROPERTY_ID_REF_ATTR]); } } if (node.children) { l = node.children.length; for (i = 0; i < l; ++i) { toPhyloXMLhelper(node.children[i]); } } else if (node._children) { l = node._children.length; for (i = 0; i < l; ++i) { toPhyloXMLhelper(node._children[i]); } } closeClade(); } // toPhyloXMLhelper function addSingleElement(elemName, elemValue, object, attributeNames) { if ((elemValue !== null) && (elemValue !== undefined)) { if (typeof elemValue === 'string' || elemValue instanceof String) { elemValue = elemValue.trim(); if (elemValue.length > 0) { if ((elemValue.indexOf('&') > -1) || ( elemValue.indexOf('<') > -1) || ( elemValue.indexOf('>') > -1) || (elemValue.indexOf('"') > -1) || ( elemValue.indexOf("'") > -1)) { elemValue = replaceUnsafeChars(elemValue); } } else { return; } } x += ( ind + '<' + elemName); if (object && attributeNames && attributeNames.length > 0) { addAttributes(object, attributeNames); } x += ( '>' + elemValue + '</' + elemName + '>\n'); } } function open(elemName, object, attributeNames) { if (object && attributeNames && attributeNames.length > 0) { x += ( ind + '<' + elemName); addAttributes(object, attributeNames); x += '>\n'; } else { x += (ind + '<' + elemName + '>\n' ); } ind = ind + ' '; } function close(elemName) { reduceInd(); x += ( ind + '</' + elemName + '>\n' ); } function openClade(object, attributeNames) { if (object && attributeNames && attributeNames.length > 0) { x += ind + '<clade'; addAttributes(object, attributeNames); x += '>\n'; } else { x += ind + '<clade>\n'; } ind = ind + ' '; } function closeClade() { reduceInd(); x += ind + '</clade>\n'; } function openPhylogeny(object, attributeNames) { if (object[PHYLOGENY_ROOTED_ATTR] === undefined || object[PHYLOGENY_ROOTED_ATTR] === null) { object[PHYLOGENY_ROOTED_ATTR] = true; } if (object[PHYLOGENY_REROOTABLE_ATTR] === undefined || object[PHYLOGENY_REROOTABLE_ATTR] === null) { object[PHYLOGENY_REROOTABLE_ATTR] = true; } if (object && attributeNames && attributeNames.length > 0) { x += ' <phylogeny'; addAttributes(object, attributeNames); x += '>\n'; } else { x += ' <phylogeny>\n'; } ind = ' '; } function closePhylogeny() { ind = ' '; x += ' </phylogeny>\n'; } function openPhyloXml() { ind = ''; x += '<?xml version="1.0" encoding="UTF-8"?>\n'; x += '<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.20/phyloxml.xsd" xmlns="http://www.phyloxml.org">\n'; } function closePhyloXml() { x += '</phyloxml>\n'; } function addAttributes(object, attributeNames) { var l = attributeNames.length; for (var i = 0; i < l; ++i) { var attributeName = attributeNames[i]; if (attributeName && ( object[attributeName] !== undefined && object[attributeName] !== null )) { x += (' ' + attributeName + '="' + object[attributeName] + '"' ); } } } function roundNumber(num, dec) { return Math.round(num * Math.pow(10, dec)) / Math.pow(10, dec); } function replaceUnsafeChars(str) { return str .replace(/&/g, "&amp;") .replace(/</g, "&lt;") .replace(/>/g, "&gt;") .replace(/"/g, "&quot;") .replace(/'/g, "&apos;"); } function reduceInd() { var l = ind.length; ind = ''; for (var i = 0; i <= l - 2; ++i) { ind += ' '; } } }; // toPhyloXML_ // -------------------------------------------------------------- // Main functions // -------------------------------------------------------------- /** * To parse phyloXML formatted trees from a stream asynchronously. * * @param stream - The stream to be parsed. * @param parseOptions - Options dict for the SAX parser. * (example: {trim: true, normalize: true}). */ phyloXml.parseAsync = function (stream, parseOptions) { _phylogenies = []; startNewPhylogeny(); var sax_parser = sax.createStream(true, parseOptions); addPhyloxmlParserEvents(sax_parser); stream.pipe(sax_parser); sax_parser.on('end', function () { finalSanityCheck(); var len = _phylogenies.length; console.log("parsed " + len + " trees"); // for (var i = 0; i < len; ++i) { // do something, for example: // var str = JSON.stringify(_phylogenies[i], null, 2); //} }); process.stdout.on('drain', function () { stream.resume(); }); }; /** * To parse a phyloXML formatted source. * * @param source - The source. * @param parseOptions - Options dict for the SAX parser * (example: {trim: true, normalize: true}). * @returns {*} - Array of phylogentic tree objects. */ phyloXml.parse = function (source, parseOptions) { source && ( source = source.toString().trim()); if (!source) { throw new Error('phyloXML source is empty'); } _phylogenies = []; startNewPhylogeny(); var sax_parser = sax.parser(true, parseOptions); addPhyloxmlParserEvents(sax_parser); sax_parser.onend = function () { finalSanityCheck(); }; sax_parser.write(source).close(); return _phylogenies; }; /** * To convert a phylogentic tree object to a phyloXML formatted string. * * @param phy - A phylogentic tree object. * @param decPointsMax - Maximal number of decimal points for branch lengths (optional). * @returns A phyloXML formatted string. */ phyloXml.toPhyloXML = function (phy, decPointsMax) { return phyloXml.toPhyloXML_(phy, decPointsMax); }; // -------------------------------------------------------------- // For exporting // -------------------------------------------------------------- if (typeof module !== 'undefined' && module.exports && !global.xmldocAssumeBrowser) module.exports.phyloXml = phyloXml; else if (typeof window !== "undefined") window.phyloXml = phyloXml; else this.phyloXml = phyloXml; })();