UNPKG

seqparse

Version:

Parse sequence files (GenBank, FASTA, SnapGene, SBOL) and accession IDs (NCBI, iGEM) to a common format

1,289 lines (1,230 loc) 64.3 kB
#!/usr/bin/env node /******/ (() => { // webpackBootstrap /******/ "use strict"; /******/ var __webpack_modules__ = ([ /* 0 */, /* 1 */ /***/ ((module) => { module.exports = require("fs"); /***/ }), /* 2 */ /***/ (function(__unused_webpack_module, exports, __webpack_require__) { var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __generator = (this && this.__generator) || function (thisArg, body) { var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; function verb(n) { return function (v) { return step([n, v]); }; } function step(op) { if (f) throw new TypeError("Generator is already executing."); while (_) try { if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; if (y = 0, t) op = [op[0] & 2, t.value]; switch (op[0]) { case 0: case 1: t = op; break; case 4: _.label++; return { value: op[1], done: false }; case 5: _.label++; y = op[1]; op = [0]; continue; case 7: op = _.ops.pop(); _.trys.pop(); continue; default: if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } if (t[2]) _.ops.pop(); _.trys.pop(); continue; } op = body.call(thisArg, _); } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; } }; Object.defineProperty(exports, "__esModule", ({ value: true })); exports.parseFile = void 0; var fetchFile_1 = __webpack_require__(3); var parseFile_1 = __webpack_require__(5); exports.parseFile = parseFile_1.default; /* Parse a sequence file. Or download a sequence with an Accession ID. */ exports["default"] = (function (input, options) { return __awaiter(void 0, void 0, void 0, function () { return __generator(this, function (_a) { switch (_a.label) { case 0: if (!(!(options === null || options === void 0 ? void 0 : options.fileName) && (0, fetchFile_1.isAccession)(input))) return [3 /*break*/, 2]; return [4 /*yield*/, (0, fetchFile_1.default)(input, options)]; case 1: return [2 /*return*/, _a.sent()]; case 2: return [2 /*return*/, (0, parseFile_1.default)(input, options)[0]]; } }); }); }); /***/ }), /* 3 */ /***/ (function(__unused_webpack_module, exports, __webpack_require__) { var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __generator = (this && this.__generator) || function (thisArg, body) { var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; function verb(n) { return function (v) { return step([n, v]); }; } function step(op) { if (f) throw new TypeError("Generator is already executing."); while (_) try { if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; if (y = 0, t) op = [op[0] & 2, t.value]; switch (op[0]) { case 0: case 1: t = op; break; case 4: _.label++; return { value: op[1], done: false }; case 5: _.label++; y = op[1]; op = [0]; continue; case 7: op = _.ops.pop(); _.trys.pop(); continue; default: if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } if (t[2]) _.ops.pop(); _.trys.pop(); continue; } op = body.call(thisArg, _); } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; } }; Object.defineProperty(exports, "__esModule", ({ value: true })); exports.isAccession = void 0; var node_fetch_1 = __webpack_require__(4); var parseFile_1 = __webpack_require__(5); /** * Get a remote sequence from NCBI or the iGEM registry. */ exports["default"] = (function (accession, options) { return __awaiter(void 0, void 0, void 0, function () { var url, body, response, err_1; return __generator(this, function (_a) { switch (_a.label) { case 0: url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=".concat(accession.trim(), "&rettype=gbwithparts&retmode=text"); if (accession.startsWith("BB")) { // it's a BioBrick... target the iGEM repo if ((typeof window !== "undefined" && typeof process === "undefined") || (options === null || options === void 0 ? void 0 : options.cors)) { // use this hack to get around a no-CORS setting on iGEM webserver, pending fix on their side url = "https://cors-anywhere.herokuapp.com/http://parts.igem.org/cgi/xml/part.cgi?part=".concat(accession.trim()); } else { url = "http://parts.igem.org/cgi/xml/part.cgi?part=".concat(accession.trim()); } } body = ""; _a.label = 1; case 1: _a.trys.push([1, 4, , 5]); return [4 /*yield*/, (0, node_fetch_1.default)(url)]; case 2: response = _a.sent(); return [4 /*yield*/, response.text()]; case 3: body = _a.sent(); return [3 /*break*/, 5]; case 4: err_1 = _a.sent(); throw new Error("Failed to get part: accession=".concat(accession, " url=").concat(url, " err=").concat(err_1)); case 5: if (!response.ok || !body.length) { throw new Error("Failed to get part, no body returned: accession=".concat(accession, " url=").concat(url)); } return [4 /*yield*/, (0, parseFile_1.default)(body)]; case 6: return [2 /*return*/, (_a.sent())[0]]; } }); }); }); /** returns whether the passed ID is an accession in iGEM or NCBI */ var isAccession = function (accession) { if (accession.startsWith("BB")) { return true; // biobrick } if (accession.length < 14 && accession.match(/^[a-z0-9_\-.]+$/i)) { return true; } return false; }; exports.isAccession = isAccession; /***/ }), /* 4 */ /***/ ((module) => { module.exports = require("node-fetch"); /***/ }), /* 5 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var path_1 = __webpack_require__(6); var benchling_1 = __webpack_require__(7); var biobrick_1 = __webpack_require__(9); var fasta_1 = __webpack_require__(11); var genbank_1 = __webpack_require__(12); var jbei_1 = __webpack_require__(13); var sbol_1 = __webpack_require__(14); var seqbuilder_1 = __webpack_require__(17); var snapgene_1 = __webpack_require__(18); var utils_1 = __webpack_require__(8); /** * parseFile converts the contents of a sequence file to a an array of Seq */ exports["default"] = (function (file, opts) { var fileName = (opts === null || opts === void 0 ? void 0 : opts.fileName) || ""; var sourceName = fileName.split(path_1.sep).pop() || fileName; if (!file) { throw Error("cannot parse null or empty string"); } // this is a check for an edge case, where the user uploads come kind // of file that's full of bps but doesn't fit into a defined type var firstLine = file.substring(0, file.search("\n")); var dnaCharLength = firstLine.replace(/[^atcgATCG]/, "").length; var dnaOnlyFile = dnaCharLength / firstLine.length > 0.8; // is it >80% dna? var name = fileName && sourceName ? sourceName.substring(0, sourceName.search("\\.")) : "Untitled"; // another edge case check for whether the seq is a JSON seq from Benchling // just a heuristic that says 1) yes it can be parsed 2) it contains a list of // fields that are common to Benchling files var isBenchling = false; try { var benchlingJSON_1 = JSON.parse(file); // will err out if not JSON if (["bases", "annotations", "primers"].every(function (k) { return typeof benchlingJSON_1[k] !== "undefined"; })) { isBenchling = true; } } catch (ex) { // expected } var prefix = file.substring(0, 200); var seqs; switch (true) { // JBEI case prefix.includes(':seq="http://jbei.org/sequence"'): case file.startsWith("<seq:seq"): seqs = (0, jbei_1.default)(file); break; // FASTA case file.startsWith(">"): case file.startsWith(";"): case fileName.endsWith(".seq"): case fileName.endsWith(".fa"): case fileName.endsWith(".fas"): case fileName.endsWith(".fasta"): seqs = (0, fasta_1.default)(file, fileName); break; // Genbank case file.includes("LOCUS") && file.includes("ORIGIN"): case fileName.endsWith(".gb"): case fileName.endsWith(".gbk"): case fileName.endsWith(".genbank"): case fileName.endsWith(".ape"): seqs = (0, genbank_1.default)(file, fileName); break; // SnapGene case fileName.endsWith(".dna"): seqs = (0, snapgene_1.default)(opts); break; // SeqBuilder case prefix.includes("Written by SeqBuilder"): case fileName.endsWith(".sbd"): seqs = (0, seqbuilder_1.default)(file, fileName); break; // BioBrick XML case prefix.includes("Parts from the iGEM"): case prefix.includes("<part_list>"): seqs = (0, biobrick_1.default)(file); break; // Benchling JSON case isBenchling: seqs = (0, benchling_1.default)(file); break; // SBOL case prefix.includes("RDF"): seqs = (0, sbol_1.default)(file, fileName); break; // a DNA text file without an official formatting case dnaOnlyFile: { var seq = (0, utils_1.complement)(file).seq; seqs = [{ annotations: [], name: name, seq: seq, type: (0, utils_1.guessType)(seq) }]; break; } default: throw Error("".concat(fileName, " File type not recognized: ").concat(file)); } // bit of clean up to: only return the fields in a Seq and reorder to match expectations. return seqs.map(function (p) { return ({ annotations: p.annotations .sort(function (a, b) { return a.start - b.start || a.end - b.end; }) .map(function (a) { return ({ color: a.color, direction: a.direction, end: a.end, name: a.name, start: a.start, type: a.type, }); }), name: p.name, seq: p.seq, type: p.type, }); }); }); /***/ }), /* 6 */ /***/ ((module) => { module.exports = require("path"); /***/ }), /* 7 */ /***/ (function(__unused_webpack_module, exports, __webpack_require__) { var __assign = (this && this.__assign) || function () { __assign = Object.assign || function(t) { for (var s, i = 1, n = arguments.length; i < n; i++) { s = arguments[i]; for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p]; } return t; }; return __assign.apply(this, arguments); }; Object.defineProperty(exports, "__esModule", ({ value: true })); var utils_1 = __webpack_require__(8); /** * Benchling format is just JSON. It's virtually the same format. */ exports["default"] = (function (text) { var partJSON = JSON.parse(text); var seq = (0, utils_1.complement)(partJSON.bases).seq; // throw an error if the sequence is empty if (seq.length < 1) { throw new Error("Invalid Benchling part: empty sequence"); } return [ { annotations: partJSON.annotations.map(function (a) { return (__assign(__assign({}, a), { direction: (0, utils_1.parseDirection)(a.strand) })); }), name: partJSON.name || partJSON._id, seq: seq, type: (0, utils_1.guessType)(seq), }, ]; }); /***/ }), /* 8 */ /***/ ((__unused_webpack_module, exports) => { Object.defineProperty(exports, "__esModule", ({ value: true })); exports.guessType = exports.parseDirection = exports.firstElement = exports.reverseComplement = exports.complement = void 0; // from http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html var comp = { A: "T", B: "V", C: "G", D: "H", G: "C", H: "D", K: "M", M: "K", N: "N", R: "Y", S: "S", T: "A", U: "A", V: "B", W: "W", X: "X", Y: "R", a: "t", b: "v", c: "g", d: "h", g: "c", h: "d", k: "m", m: "k", n: "n", r: "y", s: "s", t: "a", u: "a", v: "b", w: "w", x: "x", y: "r", }; /** * Return the filtered sequence and its complement if its an empty string, return the same for both. */ var complement = function (origSeq) { if (!origSeq) { return { compSeq: "", seq: "" }; } // filter out unrecognized basepairs and build up the complement var seq = ""; var compSeq = ""; for (var i = 0, origLength = origSeq.length; i < origLength; i += 1) { if (comp[origSeq[i]]) { seq += origSeq[i]; compSeq += comp[origSeq[i]]; } } return { compSeq: compSeq, seq: seq }; }; exports.complement = complement; /** * Return the reverse complement of a DNA sequence */ var reverseComplement = function (inputSeq) { var compSeq = (0, exports.complement)(inputSeq).compSeq; return compSeq.split("").reverse().join(""); }; exports.reverseComplement = reverseComplement; var firstElement = function (arr) { if (!Array.isArray(arr)) return undefined; return arr[0]; }; exports.firstElement = firstElement; var fwd = new Set(["FWD", "fwd", "FORWARD", "forward", "FOR", "for", "TOP", "top", "1", 1]); var rev = new Set(["REV", "rev", "REVERSE", "reverse", "BOTTOM", "bottom", "-1", -1]); /** * Parse the user defined direction, estimate the direction of the element * * ```js * parseDirection("FWD") => 1 * parseDirection("FORWARD") => 1 * ``` */ var parseDirection = function (direction) { if (!direction) { return 0; } if (fwd.has(direction)) { return 1; } if (rev.has(direction)) { return -1; } return 0; }; exports.parseDirection = parseDirection; /** * mapping the 64 standard codons to amino acids * no synth AA's * * adapted from: "https://github.com/keithwhor/NtSeq/blob/master/lib/nt.js */ var codon2AA = { AAA: "K", AAC: "N", AAG: "K", AAT: "N", ACA: "T", ACC: "T", ACG: "T", ACT: "T", AGA: "R", AGC: "S", AGG: "R", AGT: "S", ATA: "I", ATC: "I", ATG: "M", ATT: "I", CAA: "Q", CAC: "H", CAG: "Q", CAT: "H", CCA: "P", CCC: "P", CCG: "P", CCT: "P", CGA: "R", CGC: "R", CGG: "R", CGT: "R", CTA: "L", CTC: "L", CTG: "L", CTT: "L", GAA: "E", GAC: "D", GAG: "E", GAT: "D", GCA: "A", GCC: "A", GCG: "A", GCT: "A", GGA: "G", GGC: "G", GGG: "G", GGT: "G", GTA: "V", GTC: "V", GTG: "V", GTT: "V", TAA: "*", TAC: "Y", TAG: "*", TAT: "Y", TCA: "S", TCC: "S", TCG: "S", TCT: "S", TGA: "*", TGC: "C", TGG: "W", TGT: "C", TTA: "L", TTC: "F", TTG: "L", TTT: "F", }; var aminoAcids = Array.from(new Set(Object.values(codon2AA)).values()).join(""); var aminoAcidRegex = new RegExp("^[".concat(aminoAcids, "]+$"), "i"); /** Infer the type of a sequence. This only allows a couple wildcard characters so may be overly strict. */ var guessType = function (seq) { if (/^[atgcn.]+$/i.test(seq)) { return "dna"; } else if (/^[augcn.]+$/i.test(seq)) { return "rna"; } else if (aminoAcidRegex.test(seq)) { return "aa"; } return "unknown"; }; exports.guessType = guessType; /***/ }), /* 9 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var fast_xml_parser_1 = __webpack_require__(10); var utils_1 = __webpack_require__(8); /** * Parse a BioBrick in XML format to Seq[] * * Eg: https://parts.igem.org/cgi/xml/part.cgi?part=BBa_J23100 */ exports["default"] = (function (file) { var bail = function (err) { throw new Error("Failed on BioBrick: ".concat(err)); }; // parse var parsedBiobrick = new fast_xml_parser_1.XMLParser({ isArray: function (name) { return ["features", "part_name", "sequences"].includes(name); }, removeNSPrefix: true, }).parse(file); // get the first part var part = parsedBiobrick.rsbpml.part_list.part; if (!part) bail("No part seen in part_list"); // extract the useful fields var features = part.features, part_name = part.part_name, sequences = part.sequences; var name = (0, utils_1.firstElement)(part_name); // parse the iGEM annotations var annotations = features .map(function (_a) { var feature = _a.feature; if (!feature) return null; var direction = feature.direction, endpos = feature.endpos, startpos = feature.startpos, type = feature.type; return { direction: (0, utils_1.parseDirection)(direction), end: +endpos, name: "".concat(direction, "-").concat(startpos), start: +startpos || 0, type: type || undefined, }; }) .filter(function (a) { return a; }); // parse the sequence var seq = (0, utils_1.complement)(sequences[0].seq_data).seq; return [ { annotations: annotations, name: name, seq: seq, type: (0, utils_1.guessType)(seq), }, ]; }); /***/ }), /* 10 */ /***/ ((module) => { module.exports = require("fast-xml-parser"); /***/ }), /* 11 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var utils_1 = __webpack_require__(8); exports["default"] = (function (text, fileName) { // partFactory returns a negative "circular" prop, we assume they're all linear if (text.trim().startsWith(">")) { return text .split(">") // split up if it's a multi-seq FASTA file .map(function (t) { // this starts at the end of the first line, grabs all other characters, // and removes any newlines (leaving only the original sequence) // sequence "cleaning" happens in complement (we don't support bps other than // the most common right now) var seq = t.substr(t.indexOf("\n"), t.length).replace(/\s/g, ""); // the first line contains the name, though there's lots of variability around // the information on this line... // >MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken var name = t.substring(0, t.search(/\n|\|/)).replace(/\//g, ""); return { annotations: [], name: name, seq: seq, type: (0, utils_1.guessType)(seq), }; }) .filter(function (p) { return p.name && p.seq; }); } if (text.trim().startsWith(";")) { // it's an old-school style FASTA that's punctuated with semi-colons // ;my|NAME // ;my comment // actGacgata var name_1 = text.substring(0, text.search(/\n|\|/)).replace(/\//g, ""); var newlineBeforeSeq = text.indexOf("\n", text.lastIndexOf(";")); var seq_1 = text.substring(newlineBeforeSeq, text.length); return [ { annotations: [], name: name_1, seq: seq_1, type: (0, utils_1.guessType)(seq_1), }, ]; } // assume that it's a no name FASTA. Ie it's just a file with dna and no header // try and get the name from the fileName var lastChar = fileName.lastIndexOf(".") || fileName.length; var name = fileName.substring(0, lastChar) || "Untitled"; var seq = text; return [ { annotations: [], name: name, seq: seq, type: (0, utils_1.guessType)(seq), }, ]; }); /***/ }), /* 12 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var utils_1 = __webpack_require__(8); // a list of recognized types that would constitute an annotation name var tagNameSet = new Set(["gene", "product", "note", "db_xref", "protein_id", "label", "lab_host", "locus_tag"]); // a list of tags that could represent colors var tagColorSet = new Set(["ApEinfo_fwdcolor", "ApEinfo_revcolor", "loom_color"]); /** * takes in a string representation of a GenBank file and outputs our * part representation of it. an example of a Genbank file can be found * at ./parsers/Gebank, though there is significant variability to the * format * * another official example can be found at: * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html */ exports["default"] = (function (fileInput, fileName) { return fileInput .split(/\/\/\s/g) .filter(function (f) { return f.length > 5; }) .map(function (file) { // the first row contains the name of the part and its creation date // LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 var HEADER_ROW = file.substring(file.indexOf("LOCUS"), file.search(/\\n|\n/)); var _a = HEADER_ROW.split(/\s{2,}/g).filter(function (h) { return h; }), name = _a[1]; // trying to avoid giving a stupid name like Exported which Snapgene has by default // also, if there is not name in header, the seq length will be used as name, which should // be corrected (Number.parseInt to check for this case) https://stackoverflow.com/a/175787/7541747 var parsedName = name; if ((parsedName === "Exported" && file.includes("SnapGene")) || // stupid Snapgene name Number.parseInt(parsedName, 10) // it thinks seq-length is the name ) { // first try and get the name from ACCESSION var accessionName = false; if (file.includes("ACCESSION")) { // this will be undefined is there is no var accession = file .substring(file.indexOf("ACCESSION"), file.indexOf("\n", file.indexOf("ACCESSION"))) .replace(".", "") .split(/\s{2,}/) .filter(function (a) { return a !== "ACCESSION"; }) .pop(); if (accession) { parsedName = accession; accessionName = true; } } // otherwise, revert to trying to get the part name from the file name if (!accessionName && fileName) { parsedName = fileName .substring(0, Math.max(fileName.search(/\n|\||\./), fileName.lastIndexOf("."))) .replace(/\/\s/g, ""); } else if (!accessionName) { parsedName = "Unnamed"; // give up } } // the part sequence is contained in and after the line that begins with ORIGIN // do this before annotations so we can calc seqlength // // ORIGIN // 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg // 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct var SEQ_ROWS = file.substring(file.lastIndexOf("ORIGIN") + "ORIGIN".length, file.length); var seq = SEQ_ROWS.replace(/[^gatc]/gi, ""); (seq = (0, utils_1.complement)(seq).seq); // seq and compSeq // the features are translated into annotations // region is FEATURES thru ORIGIN // FEATURES Location/Qualifiers // source 1..5028 // /organism="Saccharomyces cerevisiae" // /db_xref="taxon:4932" // /chromosome="IX" // /map="9" // // in the example above, source is the annotation "type" and name is "taxon:4932" // because "db_xref" is a recognized name type // the name depends on whether the tag type is in the reocgnized list of types var annotations = []; var primers = []; if (file.indexOf("FEATURES")) { var FEATURES_LINE = file.indexOf("FEATURES"); var FEATURES_NEW_LINE = file.indexOf("\n", FEATURES_LINE); var ORIGIN_LINE = file.lastIndexOf("ORIGIN"); // some files have a contig file line that needs to parsed out/ shouldn't be included in // the features parsing if (file.includes("CONTIG")) { ORIGIN_LINE = Math.min(ORIGIN_LINE, file.indexOf("CONTIG")); } var FEATURES_ROWS = file .substring(FEATURES_NEW_LINE, ORIGIN_LINE) .split(/\n/) .filter(function (r) { return r; }); FEATURES_ROWS.forEach(function (r) { // in the example above, the following converts it to ['source', '1..5028'] var currLine = r.split(/\s{2,}/g).filter(function (l) { return l; }); if (currLine.length > 1) { // it's the beginning of a new feature/annotation var type = currLine[0], rangeString = currLine[1]; var rangeRegex = /\d+/g; var direction = r.includes("complement") ? -1 : 1; // using the example above, this parses 1..5028 into 1 and 5028 var _a = [0, 0], start = _a[0], end = _a[1]; var startSearch = rangeRegex.exec(rangeString); if (startSearch) { // the - 1 is because genbank is 1-based while we're 0 start = +startSearch[0] - (1 % seq.length); // single bp annotations are a thing in Genbank: // https://github.com/Lattice-Automation/seqviz/issues/117 end = (start + 1) % seq.length; var endSearch = rangeRegex.exec(rangeString); if (endSearch) { end = +endSearch[0] % seq.length; } } if (type !== "source") { // create a new annotation around the properties in this line (type and range) annotations.push({ direction: direction, // set in next block end: end, name: "", start: start, type: type, }); } } else if (currLine.length === 1) { // it's a continuation of a prior feature/annotation // any updates (to name or color) to the last annotation should affect // the last annotation that's in the array if (currLine[0].startsWith("/")) { var tag = currLine[0]; tag = tag.replace(/[/"]/g, ""); // get rid of quotation marks and forward slaches // should now look like ['organism', 'Saccharomyces cerevisiae'] var _b = tag.split(/=/), tagName = _b[0], tagValue = _b[1]; // the two values that can be extracted are name or color var lastAnn = annotations.length - 1; if (tagNameSet.has(tagName.toLowerCase())) { // the key is something we recognize as an annotation name if (lastAnn >= 0 && !annotations[lastAnn].name) { annotations[lastAnn].name = tagValue.trim(); } } else if (tagColorSet.has(tagName)) { // the key is something we recognize as an annotation color if (lastAnn > -1) { annotations[lastAnn].color = tagValue; } } } } }); } return { annotations: annotations, name: parsedName.trim() || fileName, primers: primers, seq: seq, type: (0, utils_1.guessType)(seq), }; }); }); /***/ }), /* 13 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var fast_xml_parser_1 = __webpack_require__(10); var utils_1 = __webpack_require__(8); /** * Converts a JBEI file to a Seq * * https://j5.jbei.org/j5manual/pages/94.html */ exports["default"] = (function (JBEI) { // weird edge case with directed quotation characters var fileString = JBEI.replace(/“|”/g, '"'); // parse var parsedJbei = new fast_xml_parser_1.XMLParser({ removeNSPrefix: true, }).parse(fileString); // destructure the parameters from JBEI var seq = parsedJbei.seq; var features = seq.features, name = seq.name, sequence = seq.sequence; // attempt to get the name out of the JBEI var parsedName = "Unnamed"; if (name) { parsedName = name; } // attempt to get the sequence. fail if it's not findable var parsedSeq = (0, utils_1.complement)(sequence).seq; // seq and compSeq if (!parsedSeq) return []; // attempt to parse the JBEI annotations into our version of annotations var annotations = []; if (features && features.feature) { features.feature.forEach(function (feature) { if (!feature) return; var complement = feature.complement, label = feature.label, location = feature.location, type = feature.type; if (location && location.genbankStart && location.end) { annotations.push({ direction: complement ? -1 : 1, // JBEI is 1-based end: +location.end || 0, name: label || "Untitled", start: +location.genbankStart - 1 || 0, type: type || "N/A", }); } }); } return [ { annotations: annotations, name: parsedName, seq: parsedSeq, type: (0, utils_1.guessType)(parsedSeq), }, ]; }); /***/ }), /* 14 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var sbol_v1_1 = __webpack_require__(15); var sbol_v2_1 = __webpack_require__(16); /** * takes in an SBOL file in v1 or v2 format, and parses to an array of parts * that match the Loom data model */ exports["default"] = (function (sbol, fileName) { return sbol.includes("sbols.org/v1#") ? (0, sbol_v1_1.default)(sbol) : (0, sbol_v2_1.default)(sbol, fileName); }); /***/ }), /* 15 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var fast_xml_parser_1 = __webpack_require__(10); var utils_1 = __webpack_require__(8); /* <sbol:Sequence rdf:about="https://synbiohub.cidarlab.org/public/Demo/A1_sequence/1"> <sbol:persistentIdentity rdf:resource="https://synbiohub.cidarlab.org/public/Demo/A1_sequence"/> <sbol:displayId>A1_sequence</sbol:displayId> <sbol:version>1</sbol:version> <prov:wasDerivedFrom rdf:resource="https://github.com/CIDARLAB/cello/blob/master/resources/UCF/Eco1C1G1T0.UCF.json"/> <prov:wasGeneratedBy rdf:resource="https://synbiohub.cidarlab.org/public/Demo/cello2sbol/1"/> <dcterms:title>A1_sequence</dcterms:title> <sbh:ownedBy rdf:resource="https://synbiohub.cidarlab.org/user/prash"/> <sbh:topLevel rdf:resource="https://synbiohub.cidarlab.org/public/Demo/A1_sequence/1"/> <sbol:elements>AATGTTCCCTAATAATCAGCAAAGAGGTTACTAG</sbol:elements> <sbol:encoding rdf:resource="http://www.chem.qmul.ac.uk/iubmb/misc/naseq.html"/> </sbol:Sequence> */ /** * takes an SBOL file, as a string, and converts it into our DB * representation of a part(s). an example of this type of file can be * found in ../examples/j5.SBOL.xml */ exports["default"] = (function (sbol) { // weird edge case with directed quotation characters var fileString = sbol.replace(/“|”/g, '"'); // parse var parsedSBOL = new fast_xml_parser_1.XMLParser({ ignoreAttributes: false, isArray: function (name) { return [ "Sequence", "Collection", "DnaComponent", "dnaSequence", "ComponentDefinition", "SequenceAnnotation", "sequenceAnnotation", "elements", "component", "annotation", ].includes(name); }, removeNSPrefix: true, }).parse(fileString); var RDF = null; if (parsedSBOL.RDF) (RDF = parsedSBOL.RDF); // @ts-expect-error ts-migrate(2339) FIXME: Property 'Collection' does not exist on type 'null... Remove this comment to see the full error message var Collection = RDF.Collection, DnaComponent = RDF.DnaComponent; if (Collection && Collection.length) { // it's a collection of DnaComponents, parse each to a part var partList_1 = []; Collection.forEach(function (_a) { var component = _a.component; if (component && component.length) { component.forEach(function (_a) { var nestedDnaComponent = _a.DnaComponent; partList_1.push( // @ts-expect-error ts-migrate(2345) FIXME: Argument of type '{ seq: string; compSeq: string; ... Remove this comment to see the full error message dnaComponentToPart(nestedDnaComponent[0], { file: sbol, strict: false, })); }); } }); // check whether any parts were created from the collection if (partList_1.length) return partList_1; } else if (DnaComponent && DnaComponent.length) { // create a single part from the single one passed var validPart = dnaComponentToPart(DnaComponent[0], { file: sbol, strict: false, }); // it will be null if there isn't any sequence information beneath it if (validPart) return [validPart]; } // go on a fishing expedition for DnaComponents // everything else has failed // accumulate all that are "valid" (name + seq) var dnaComponentAccumulator = []; findDnaComponentNodes(dnaComponentAccumulator, RDF); // @ts-ignore var attemptedSeqs = dnaComponentAccumulator .map(function (p) { return dnaComponentToPart(p, { file: sbol, strict: true, }); }) .filter(function (p) { return !!p; }); // invalid parts will be null if (attemptedSeqs.length) return attemptedSeqs; // go on another fishing expedition, but for Sequence nodes var dnaSequenceAccumulator = []; findSequenceNodes(dnaSequenceAccumulator, RDF); return dnaSequenceAccumulator.map(function (p) { return sequenceToPart(p, sbol); }).filter(function (p) { return p; }); // invalid parts will be null }); /** * find all the nodes within the JSON document that are keyed "Sequence" * * this is another last-resort scrapper for trying to find valid parts */ var findSequenceNodes = function (acc, doc) { Object.keys(doc).forEach(function (k) { if (k === "Sequence" && doc[k].length) acc.push.apply(acc, doc[k]); if (Array.isArray(doc[k])) { doc[k].forEach(function (nestedNode) { findSequenceNodes(acc, nestedNode); }); } }); }; /** * after getting a DnaComponent out of the SBOL document, * at either the root RDF level or from within a Collection/Annotation * hierarchy, convert that DnaComponent to a Seq */ var dnaComponentToPart = function (DnaComponent, options) { var _a = options.strict, strict = _a === void 0 ? false : _a; // destructure the params from DnaComponent var annotation = DnaComponent.annotation, displayId = DnaComponent.displayId, dnaSequence = DnaComponent.dnaSequence, name = DnaComponent.name; // attempt to get the name out of the SBOL var parsedName = "Unnamed"; if (name) { parsedName = name; } else if (displayId) { parsedName = displayId; } else if (strict) { // in this scenario, we're really scrapping to find parts, but shouldn't // accept any that don't at least have some name and sequence information return null; } // attempt to get the sequence. fail if it's not findable var seq = ""; if (dnaSequence && dnaSequence[0].DnaSequence) { seq = dnaSequence[0].DnaSequence.nucleotides; } var parsedSeq = (0, utils_1.complement)(seq).seq; // seq and compSeq if (!parsedSeq) return null; // attempt to parse the SBOL annotations into our version of annotations var annotations = []; if (annotation) { annotation.forEach(function (_a) { var SequenceAnnotation = _a.SequenceAnnotation; if (!SequenceAnnotation || !SequenceAnnotation[0]) return; var _b = SequenceAnnotation[0], bioEnd = _b.bioEnd, bioStart = _b.bioStart, strand = _b.strand, subComponent = _b.subComponent; if (subComponent && subComponent.DnaComponent && subComponent.DnaComponent[0]) { var _c = subComponent.DnaComponent[0], annId = _c.displayId, annName = _c.name, annType = _c.type; annotations.push({ direction: strand === "+" ? 1 : -1, end: bioEnd - 1 || 0, name: annName || annId || "Untitled", start: bioStart - 1 || 0, type: annType["@_resource"] || "N/A", }); } }); } return { annotations: annotations, name: parsedName, seq: parsedSeq, type: (0, utils_1.guessType)(seq), }; }; /** * find all nodes that of the type Sequence, and convert those to parts "Sequence" -> Part * * this is not the standard format. see A1.xml */ var sequenceToPart = function (Seq, file) { // get the name var name = Seq.displayId || Seq.title || "Unnamed"; // get the sequence var seqOrig = Seq.elements[0] || ""; var _a = (0, utils_1.complement)(seqOrig), compSeq = _a.compSeq, seq = _a.seq; // guess whether it's circular or not based on the presence of a word like vector. // very ad hoc var circular = file.search(/plasmid/i) > 0; return { annotations: [], circular: circular, compSeq: compSeq, name: name, seq: seq, type: (0, utils_1.guessType)(seq) }; }; /** * find all the nodes within the SBOL JSON document that are keyed "DnaComponent" * * this is a last-resort scrapper that tries to find valid parts that aren't within a root * DnaComponent document or within a root Collection array */ var findDnaComponentNodes = function (acc, doc) { Object.keys(doc).forEach(function (k) { if (k === "DnaComponent" && doc[k].length) acc.push.apply(acc, doc[k]); if (Array.isArray(doc[k])) { doc[k].forEach(function (nestedNode) { findDnaComponentNodes(acc, nestedNode); }); } }); }; /***/ }), /* 16 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var fast_xml_parser_1 = __webpack_require__(10); var utils_1 = __webpack_require__(8); /** * Converts an SBOL file to our Seq format. * * SBOL v2.0 schema definition can be found at: http://sbolstandard.org/wp-content/uploads/2016/06/SBOL-data-model-2.2.1.pdf * differs from SBOL v1.0 in that the ComponentDefinitions are like the root parts, * and the sequence and annotations are separated (they're no longer defined relationally * by nesting but, instead, by id) we only care about components that have sequence information */ exports["default"] = (function (sbol, fileName) { // weird edge case with directed quotation characters var fileString = sbol.replace(/“|”/g, '"'); // parse var parsedSBOL = new fast_xml_parser_1.XMLParser({ ignoreAttributes: false, isArray: function (name) { return ["Sequence", "ComponentDefinition", "SequenceAnnotation", "sequenceAnnotation", "elements"].includes(name); }, removeNSPrefix: true, }).parse(fileString); try { var seqList = parseSBOL2(parsedSBOL, fileName); if (seqList.length) { return seqList; } else { throw new Error("No Sequence info found"); } } catch (err) { throw new Error("Failed to parse SBOL v2 file: ".concat(err)); } }); var parseSBOL2 = function (parsedSBOL, fileName) { var RDF = null; if (parsedSBOL.RDF) { (RDF = parsedSBOL.RDF); } if (!RDF) { throw new Error("No root RDF document"); } // check if anything is defined, return if not var ComponentDefinition = RDF.ComponentDefinition, Sequence = RDF.Sequence; if (!ComponentDefinition && !Sequence) { throw new Error("Failed to parse SBOL v2: No ComponentDefinition or Sequence"); } // read thru the Sequence elements var getSeq = function (seqID) { var seqElement = seqID ? // @ts-ignore Sequence.find(function (s) { return (s.persistentIdentity && s.persistentIdentity.length && s.persistentIdentity["@_resource"] === seqID) || s["@_about"] === seqID; }) : Sequence[0]; if (seqElement && seqElement.elements) { var seq_1 = (0, utils_1.complement)(seqElement.elements[0] || "").seq; return { annotations: [], name: seqElement.displayId, seq: seq_1, type: (0, utils_1.guessType)(seq_1), }; } return null; }; // if it's a collection of DnaComponents, parse each to a part var seqList = []; // @ts-ignore ComponentDefinition === null || ComponentDefinition === void 0 ? void 0 : ComponentDefinition.forEach(function (c, i) { // we're only making parts out of those with seq info if (!c.sequence) { return; } var displayId = c.displayId, sequence = c.sequence, sequenceAnnotation = c.sequenceAnnotation; var name = displayId || "".concat(fileName, "_").concat(i + 1); var annotations = []; (sequenceAnnotation || []).forEach(function (_a) { var SequenceAnnotation = _a.SequenceAnnotation; var ann = SequenceAnnotation[0]; var annId = ann.displayId; var Range = ann.location.Range; var range = Range; if (range) { annotations.push({ end: range.end - 1, name: annId, start: range.start - 1, }); } }); var seq = getSeq(sequence["@_resource"]); if (seq) { seqList.push({ annotations: annotations, name: name, seq: seq.seq, type: seq.type, }); } }); // if it's a single sequence, just try and get the sequence from that alone var seq = getSeq(); if (!seqList.length && seq) { seqList.push(seq); } return seqList; }; /***/ }), /* 17 */ /***/ ((__unused_webpack_module, exports, __webpack_require__) => { Object.defineProperty(exports, "__esModule", ({ value: true })); var utils_1 = __webpack_require__(8); // a list of recognized types that would constitute an annotation name var tagNameList = ["gene", "product", "note", "db_xref", "protein_id", "label", "lab_host"]; // a list of tags that could represent colors var tagColorList = ["ApEinfo_fwdcolor", "ApEinfo_revcolor", "loom_color"]; /** * takes in a string representation of a SeqBuilder file and outputs our * part representation of it. an example of a SeqBuilder file can be found * at imports/io/examples/seqbuilder, though there may be variations to the * format */ exports["default"] = (function (fileInput, fileName) { return fileInput.split(/\/\/\s/g).map(function (file) { // +++++SEQUENCE+++++// // the part sequence comes after the line that specifies the seqbuilder version number // @ts-ignore var SEQ_ROWS = file .substring(file.search(/.*?written by seqbuilder .*?[0-9.]+[^actg]+/i) + // @ts-ignore file.match(/.*?written by seqbuilder .*?[0-9.]+[^actg]+/i)[0].length, file.length) .match(/[actgyrwskmdvhbxn]+/gim)[0]; var seq = SEQ_ROWS; (seq = (0, utils_1.complement)(seq).seq); // seq and compSeq // there may be a genbank-like header row after the sequence // LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 var parsedName = fileName.length > 0 ? fileName : "Unnamed"; if (~file.indexOf("LOCUS")) { var HEADER_ROW = file.substring(file.indexOf("LOCUS"), file.search(/\\n|\n/)); if (HEADER_ROW && HEADER_ROW.split(/\s{2,}/g)) { var _a = HEADER_ROW.split(/\s{2,}/g).filter(function (h) { return h; }), name_1 = _a[1]; parsedName = name_1; } } // Name setting logic ported from GenBank parser if ((parsedName === "Exported" && file.includes("SnapGene")) || // stupid Snapgene name Number.parseInt(parsedName, 10) // it thinks seq-length is the name ) { // first try and get the name from ACCESSION var accessionName = false; if (file.includes("ACCESSION")) { // this will be undefined is there is no var accession = file .substring(file.indexOf("ACCESSION"), file.indexOf("\n", file.indexOf("ACCESSION")))