seqparse
Version:
Parse sequence files (GenBank, FASTA, SnapGene, SBOL) and accession IDs (NCBI, iGEM) to a common format
1,289 lines (1,230 loc) • 64.3 kB
JavaScript
#!/usr/bin/env node
/******/ (() => { // webpackBootstrap
/******/ "use strict";
/******/ var __webpack_modules__ = ([
/* 0 */,
/* 1 */
/***/ ((module) => {
module.exports = require("fs");
/***/ }),
/* 2 */
/***/ (function(__unused_webpack_module, exports, __webpack_require__) {
// TypeScript-emitted helper for down-levelled async functions: runs `generator` (applied with
// `thisArg` / `_arguments`) to completion and returns a promise for its final value. The promise
// is built with `P`, which falls back to the global Promise. An existing `this.__awaiter` is reused.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// wrap anything that is not already a P instance so it can be chained with .then()
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// resume the generator with a fulfilled value; a synchronous throw rejects the outer promise
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// feed a rejection back into the generator through its "throw" method
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// resolve once the generator is done, otherwise wait on the yielded value and continue
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// instantiate the generator and take the first step
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// TypeScript-emitted helper for down-levelled generator functions. It wraps `body`, a state
// machine that is called repeatedly with the shared state object `_` and answers with an
// instruction array `op` = [opcode, operand], in an iterator exposing next / throw / return.
// Opcodes annotated by callers in this file: 2 = return, 3 = break (jump to label), 4 = yield.
var __generator = (this && this.__generator) || function (thisArg, body) {
// _ : machine state. label = next `case` to run; sent() = value passed in on resume (rethrows it
//     when the resume instruction was "throw"); trys = active try-region descriptors; ops = deferred instructions.
// f : set while step() is running (re-entrancy guard); y : iterator currently delegated to;
// t : scratch slot; g : the iterator object handed back to the caller.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
// next / throw / return all funnel into step() with opcode 0 / 1 / 2
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
if (f) throw new TypeError("Generator is already executing.");
while (_) try {
// while delegating to `y`, forward the call to it and hand its result straight back until it is done
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
// delegation finished: continue with the delegate's final value
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// 0 / 1: stash the resume instruction in `t` so _.sent() can return or rethrow it
case 0: case 1: t = op; break;
// 4: advance the label and hand op[1] back to the caller with done: false
case 4: _.label++; return { value: op[1], done: false };
// 5: start delegating to the iterator in op[1]
case 5: _.label++; y = op[1]; op = [0]; continue;
// 7: pick up the instruction deferred on _.ops and leave the current try region
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// t becomes the innermost try region, pushed by callers as [start, t[1], t[2], end];
// t[1] is used as the catch label by the caller in this file, t[2] is presumably the finally label (TODO confirm)
// no active region and the instruction is an error (6) or a return (2): stop the machine
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
// break (3) with no region, or to a label strictly between t[0] and t[3]: plain jump
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
// error (6) raised before label t[1]: jump to t[1] and keep the error available through _.sent()
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
// otherwise run the t[2] label first and defer the instruction on _.ops
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
// run the next chunk of the state machine
op = body.call(thisArg, _);
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// the loop only ends on opcode 6 (rethrow the error) or 2 (finish with the returned value)
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
Object.defineProperty(exports, "__esModule", ({ value: true }));
exports.parseFile = void 0;
var fetchFile_1 = __webpack_require__(3);
var parseFile_1 = __webpack_require__(5);
exports.parseFile = parseFile_1.default;
/**
 * Parse a sequence file, or download a sequence with an accession ID.
 *
 * When `options.fileName` is not set and `input` passes `isAccession`, the sequence is fetched
 * remotely and that result is returned. Otherwise `input` is handed to the file parser and the
 * first parsed entry is returned. The body is wrapped in `__awaiter`, so callers always receive
 * a promise.
 */
exports["default"] = (function (input, options) { return __awaiter(void 0, void 0, void 0, function () {
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// not an accession (or a file name was supplied): skip to label 2 and parse locally
if (!(!(options === null || options === void 0 ? void 0 : options.fileName) && (0, fetchFile_1.isAccession)(input))) return [3 /*break*/, 2];
return [4 /*yield*/, (0, fetchFile_1.default)(input, options)];
// label 1: the awaited remote result
case 1: return [2 /*return*/, _a.sent()];
// label 2: local parse; only the first entry of the parsed array is returned
case 2: return [2 /*return*/, (0, parseFile_1.default)(input, options)[0]];
}
});
}); });
/***/ }),
/* 3 */
/***/ (function(__unused_webpack_module, exports, __webpack_require__) {
// TypeScript-emitted helper for down-levelled async functions: runs `generator` (applied with
// `thisArg` / `_arguments`) to completion and returns a promise for its final value. The promise
// is built with `P`, which falls back to the global Promise. An existing `this.__awaiter` is reused.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// wrap anything that is not already a P instance so it can be chained with .then()
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// resume the generator with a fulfilled value; a synchronous throw rejects the outer promise
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// feed a rejection back into the generator through its "throw" method
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// resolve once the generator is done, otherwise wait on the yielded value and continue
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// instantiate the generator and take the first step
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// TypeScript-emitted helper for down-levelled generator functions. It wraps `body`, a state
// machine that is called repeatedly with the shared state object `_` and answers with an
// instruction array `op` = [opcode, operand], in an iterator exposing next / throw / return.
// Opcodes annotated by callers in this file: 2 = return, 3 = break (jump to label), 4 = yield.
var __generator = (this && this.__generator) || function (thisArg, body) {
// _ : machine state. label = next `case` to run; sent() = value passed in on resume (rethrows it
//     when the resume instruction was "throw"); trys = active try-region descriptors; ops = deferred instructions.
// f : set while step() is running (re-entrancy guard); y : iterator currently delegated to;
// t : scratch slot; g : the iterator object handed back to the caller.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
// next / throw / return all funnel into step() with opcode 0 / 1 / 2
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
if (f) throw new TypeError("Generator is already executing.");
while (_) try {
// while delegating to `y`, forward the call to it and hand its result straight back until it is done
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
// delegation finished: continue with the delegate's final value
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// 0 / 1: stash the resume instruction in `t` so _.sent() can return or rethrow it
case 0: case 1: t = op; break;
// 4: advance the label and hand op[1] back to the caller with done: false
case 4: _.label++; return { value: op[1], done: false };
// 5: start delegating to the iterator in op[1]
case 5: _.label++; y = op[1]; op = [0]; continue;
// 7: pick up the instruction deferred on _.ops and leave the current try region
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// t becomes the innermost try region, pushed below as [start, catch label, t[2], end];
// t[2] is presumably the finally label (TODO confirm)
// no active region and the instruction is an error (6) or a return (2): stop the machine
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
// break (3) with no region, or to a label strictly between t[0] and t[3]: plain jump
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
// error (6) raised before label t[1]: jump to t[1] and keep the error available through _.sent()
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
// otherwise run the t[2] label first and defer the instruction on _.ops
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
// run the next chunk of the state machine
op = body.call(thisArg, _);
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// the loop only ends on opcode 6 (rethrow the error) or 2 (finish with the returned value)
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
Object.defineProperty(exports, "__esModule", ({ value: true }));
exports.isAccession = void 0;
var node_fetch_1 = __webpack_require__(4);
var parseFile_1 = __webpack_require__(5);
/**
 * Get a remote sequence from NCBI or the iGEM registry.
 *
 * Accessions starting with "BB" are treated as BioBricks and requested from parts.igem.org
 * (through a CORS proxy in a browser-like environment or when `options.cors` is set). Anything
 * else is requested from NCBI efetch as a GenBank record. The response body is parsed with
 * parseFile and the first parsed sequence is returned.
 *
 * The trimmed accession is percent-encoded before it is placed in the URL, so characters
 * such as "&", "#" or spaces cannot alter the query string.
 *
 * @param accession the NCBI accession or BioBrick part name
 * @param options   optional settings; only `cors` is read here
 * @returns a promise for the first parsed sequence
 * @throws Error when the request fails, the response is not ok, or the body is empty
 */
exports["default"] = (function (accession, options) { return __awaiter(void 0, void 0, void 0, function () {
    var id, url, body, response, err_1;
    return __generator(this, function (_a) {
        switch (_a.label) {
            case 0:
                id = encodeURIComponent(accession.trim());
                url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=".concat(id, "&rettype=gbwithparts&retmode=text");
                if (accession.startsWith("BB")) {
                    // it's a BioBrick... target the iGEM repo
                    if ((typeof window !== "undefined" && typeof process === "undefined") || (options === null || options === void 0 ? void 0 : options.cors)) {
                        // use this hack to get around a no-CORS setting on iGEM webserver, pending fix on their side
                        url = "https://cors-anywhere.herokuapp.com/http://parts.igem.org/cgi/xml/part.cgi?part=".concat(id);
                    }
                    else {
                        url = "http://parts.igem.org/cgi/xml/part.cgi?part=".concat(id);
                    }
                }
                body = "";
                _a.label = 1;
            case 1:
                // try region: labels 1-3 are the body, 4 is the catch handler, 5 follows the try
                _a.trys.push([1, 4, , 5]);
                return [4 /*yield*/, (0, node_fetch_1.default)(url)];
            case 2:
                response = _a.sent();
                return [4 /*yield*/, response.text()];
            case 3:
                body = _a.sent();
                return [3 /*break*/, 5];
            case 4:
                err_1 = _a.sent();
                throw new Error("Failed to get part: accession=".concat(accession, " url=").concat(url, " err=").concat(err_1));
            case 5:
                if (!response.ok || !body.length) {
                    throw new Error("Failed to get part, no body returned: accession=".concat(accession, " url=").concat(url));
                }
                return [4 /*yield*/, (0, parseFile_1.default)(body)];
            case 6: return [2 /*return*/, (_a.sent())[0]];
        }
    });
}); });
/**
 * Decide whether the passed ID looks like an accession in iGEM or NCBI.
 *
 * Anything starting with "BB" counts as a BioBrick. Otherwise the ID must be shorter than
 * 14 characters and consist only of letters, digits, "_", "-" and ".".
 */
var isAccession = function (accession) {
    var looksLikeBioBrick = accession.startsWith("BB");
    if (looksLikeBioBrick) {
        return true; // biobrick
    }
    var isShortEnough = accession.length < 14;
    var hasOnlyIdChars = isShortEnough && accession.match(/^[a-z0-9_\-.]+$/i);
    return hasOnlyIdChars ? true : false;
};
exports.isAccession = isAccession;
/***/ }),
/* 4 */
/***/ ((module) => {
module.exports = require("node-fetch");
/***/ }),
/* 5 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var path_1 = __webpack_require__(6);
var benchling_1 = __webpack_require__(7);
var biobrick_1 = __webpack_require__(9);
var fasta_1 = __webpack_require__(11);
var genbank_1 = __webpack_require__(12);
var jbei_1 = __webpack_require__(13);
var sbol_1 = __webpack_require__(14);
var seqbuilder_1 = __webpack_require__(17);
var snapgene_1 = __webpack_require__(18);
var utils_1 = __webpack_require__(8);
/**
 * parseFile converts the contents of a sequence file to an array of Seq.
 *
 * The format is detected from the file contents and, failing that, from the extension of
 * `opts.fileName`. Every returned entry is reduced to `annotations`, `name`, `seq` and `type`,
 * with annotations sorted by start and then end.
 *
 * @param file contents of the sequence file
 * @param opts optional settings; `fileName` is read here and the whole object is handed to the SnapGene parser
 * @returns an array of parsed sequences
 * @throws Error when `file` is empty or the format is not recognized
 */
exports["default"] = (function (file, opts) {
    var fileName = (opts === null || opts === void 0 ? void 0 : opts.fileName) || "";
    var sourceName = fileName.split(path_1.sep).pop() || fileName;
    if (!file) {
        throw Error("cannot parse null or empty string");
    }
    // this is a check for an edge case, where the user uploads some kind
    // of file that's full of bps but doesn't fit into a defined type
    //
    // a file without any newline is a single line, so the whole file is its first line
    var firstNewline = file.indexOf("\n");
    var firstLine = firstNewline === -1 ? file : file.substring(0, firstNewline);
    // the global flag matters: without it only one non-DNA character is removed and
    // almost any first line passes the 80% threshold
    var dnaCharLength = firstLine.replace(/[^atcg]/gi, "").length;
    var dnaOnlyFile = firstLine.length > 0 && dnaCharLength / firstLine.length > 0.8; // is it >80% dna?
    // the default name is the file name up to its first "."; a name without an extension is used whole
    var name = "Untitled";
    if (fileName && sourceName) {
        var firstDot = sourceName.indexOf(".");
        name = (firstDot === -1 ? sourceName : sourceName.substring(0, firstDot)) || "Untitled";
    }
    // another edge case check for whether the seq is a JSON seq from Benchling
    // just a heuristic that says 1) yes it can be parsed 2) it contains a list of
    // fields that are common to Benchling files
    var isBenchling = false;
    try {
        var benchlingJSON_1 = JSON.parse(file); // will err out if not JSON
        if (["bases", "annotations", "primers"].every(function (k) { return typeof benchlingJSON_1[k] !== "undefined"; })) {
            isBenchling = true;
        }
    }
    catch (ex) {
        // expected: most inputs are not JSON
    }
    var prefix = file.substring(0, 200);
    var seqs;
    // the first matching case wins, so the order below is the detection priority
    switch (true) {
        // JBEI
        case prefix.includes(':seq="http://jbei.org/sequence"'):
        case file.startsWith("<seq:seq"):
            seqs = (0, jbei_1.default)(file);
            break;
        // FASTA
        case file.startsWith(">"):
        case file.startsWith(";"):
        case fileName.endsWith(".seq"):
        case fileName.endsWith(".fa"):
        case fileName.endsWith(".fas"):
        case fileName.endsWith(".fasta"):
            seqs = (0, fasta_1.default)(file, fileName);
            break;
        // Genbank
        case file.includes("LOCUS") && file.includes("ORIGIN"):
        case fileName.endsWith(".gb"):
        case fileName.endsWith(".gbk"):
        case fileName.endsWith(".genbank"):
        case fileName.endsWith(".ape"):
            seqs = (0, genbank_1.default)(file, fileName);
            break;
        // SnapGene
        case fileName.endsWith(".dna"):
            seqs = (0, snapgene_1.default)(opts);
            break;
        // SeqBuilder
        case prefix.includes("Written by SeqBuilder"):
        case fileName.endsWith(".sbd"):
            seqs = (0, seqbuilder_1.default)(file, fileName);
            break;
        // BioBrick XML
        case prefix.includes("Parts from the iGEM"):
        case prefix.includes("<part_list>"):
            seqs = (0, biobrick_1.default)(file);
            break;
        // Benchling JSON
        case isBenchling:
            seqs = (0, benchling_1.default)(file);
            break;
        // SBOL
        case prefix.includes("RDF"):
            seqs = (0, sbol_1.default)(file, fileName);
            break;
        // a DNA text file without an official formatting
        case dnaOnlyFile: {
            var seq = (0, utils_1.complement)(file).seq;
            seqs = [{ annotations: [], name: name, seq: seq, type: (0, utils_1.guessType)(seq) }];
            break;
        }
        default:
            throw Error("".concat(fileName, " File type not recognized: ").concat(file));
    }
    // bit of clean up to: only return the fields in a Seq and reorder to match expectations.
    return seqs.map(function (p) { return ({
        annotations: p.annotations
            .sort(function (a, b) { return a.start - b.start || a.end - b.end; })
            .map(function (a) { return ({
            color: a.color,
            direction: a.direction,
            end: a.end,
            name: a.name,
            start: a.start,
            type: a.type,
        }); }),
        name: p.name,
        seq: p.seq,
        type: p.type,
    }); });
});
/***/ }),
/* 6 */
/***/ ((module) => {
module.exports = require("path");
/***/ }),
/* 7 */
/***/ (function(__unused_webpack_module, exports, __webpack_require__) {
// TypeScript-emitted helper for object spread: copies the own enumerable properties of every
// argument after the first onto the first argument and returns it. On first use it rebinds
// itself to Object.assign when that exists, otherwise to the manual copy below.
var __assign = (this && this.__assign) || function () {
    var copyOwnProps = function (target) {
        for (var idx = 1, count = arguments.length; idx < count; idx++) {
            var source = arguments[idx];
            for (var key in source) {
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    __assign = Object.assign || copyOwnProps;
    return __assign.apply(this, arguments);
};
Object.defineProperty(exports, "__esModule", ({ value: true }));
var utils_1 = __webpack_require__(8);
/**
 * Parse a Benchling export, which is plain JSON that is close to the output format.
 *
 * Reads `bases`, `annotations`, `name` and `_id` from the parsed object. Each annotation is
 * copied and gains a `direction` derived from its `strand`.
 *
 * @throws Error when no recognized bases remain after filtering
 */
exports["default"] = (function (text) {
    var parsed = JSON.parse(text);
    var filtered = (0, utils_1.complement)(parsed.bases);
    var seq = filtered.seq;
    // throw an error if the sequence is empty
    if (seq.length < 1) {
        throw new Error("Invalid Benchling part: empty sequence");
    }
    var withDirection = function (a) {
        return __assign(__assign({}, a), { direction: (0, utils_1.parseDirection)(a.strand) });
    };
    var part = {
        annotations: parsed.annotations.map(function (a) { return withDirection(a); }),
        name: parsed.name || parsed._id,
        seq: seq,
        type: (0, utils_1.guessType)(seq),
    };
    return [part];
});
/***/ }),
/* 8 */
/***/ ((__unused_webpack_module, exports) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
exports.guessType = exports.parseDirection = exports.firstElement = exports.reverseComplement = exports.complement = void 0;
// from http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html
var comp = {
A: "T",
B: "V",
C: "G",
D: "H",
G: "C",
H: "D",
K: "M",
M: "K",
N: "N",
R: "Y",
S: "S",
T: "A",
U: "A",
V: "B",
W: "W",
X: "X",
Y: "R",
a: "t",
b: "v",
c: "g",
d: "h",
g: "c",
h: "d",
k: "m",
m: "k",
n: "n",
r: "y",
s: "s",
t: "a",
u: "a",
v: "b",
w: "w",
x: "x",
y: "r",
};
/**
* Return the filtered sequence and its complement if its an empty string, return the same for both.
*/
var complement = function (origSeq) {
if (!origSeq) {
return { compSeq: "", seq: "" };
}
// filter out unrecognized basepairs and build up the complement
var seq = "";
var compSeq = "";
for (var i = 0, origLength = origSeq.length; i < origLength; i += 1) {
if (comp[origSeq[i]]) {
seq += origSeq[i];
compSeq += comp[origSeq[i]];
}
}
return { compSeq: compSeq, seq: seq };
};
exports.complement = complement;
/**
 * Return the reverse complement of a DNA sequence: the complement strand produced by
 * `complement`, read back to front.
 */
var reverseComplement = function (inputSeq) {
    var strand = (0, exports.complement)(inputSeq).compSeq;
    var reversed = "";
    for (var idx = strand.length - 1; idx >= 0; idx -= 1) {
        reversed += strand[idx];
    }
    return reversed;
};
exports.reverseComplement = reverseComplement;
/** Return the first entry of `arr`, or undefined when `arr` is not an array. */
var firstElement = function (arr) {
    return Array.isArray(arr) ? arr[0] : undefined;
};
exports.firstElement = firstElement;
// spellings (and numeric forms) accepted for each strand; matching is exact and case-sensitive
var fwd = new Set(["FWD", "fwd", "FORWARD", "forward", "FOR", "for", "TOP", "top", "1", 1]);
var rev = new Set(["REV", "rev", "REVERSE", "reverse", "BOTTOM", "bottom", "-1", -1]);
/**
 * Translate a user-supplied direction into 1 (forward), -1 (reverse) or 0 (none/unknown).
 *
 * ```js
 * parseDirection("FWD") => 1
 * parseDirection("REVERSE") => -1
 * parseDirection("sideways") => 0
 * ```
 */
var parseDirection = function (direction) {
    if (direction && fwd.has(direction)) {
        return 1;
    }
    if (direction && rev.has(direction)) {
        return -1;
    }
    // falsy input and unrecognized spellings both mean "no direction"
    return 0;
};
exports.parseDirection = parseDirection;
/**
* mapping the 64 standard codons to amino acids
* no synth AA's
*
* adapted from: https://github.com/keithwhor/NtSeq/blob/master/lib/nt.js
*/
var codon2AA = {
AAA: "K",
AAC: "N",
AAG: "K",
AAT: "N",
ACA: "T",
ACC: "T",
ACG: "T",
ACT: "T",
AGA: "R",
AGC: "S",
AGG: "R",
AGT: "S",
ATA: "I",
ATC: "I",
ATG: "M",
ATT: "I",
CAA: "Q",
CAC: "H",
CAG: "Q",
CAT: "H",
CCA: "P",
CCC: "P",
CCG: "P",
CCT: "P",
CGA: "R",
CGC: "R",
CGG: "R",
CGT: "R",
CTA: "L",
CTC: "L",
CTG: "L",
CTT: "L",
GAA: "E",
GAC: "D",
GAG: "E",
GAT: "D",
GCA: "A",
GCC: "A",
GCG: "A",
GCT: "A",
GGA: "G",
GGC: "G",
GGG: "G",
GGT: "G",
GTA: "V",
GTC: "V",
GTG: "V",
GTT: "V",
TAA: "*",
TAC: "Y",
TAG: "*",
TAT: "Y",
TCA: "S",
TCC: "S",
TCG: "S",
TCT: "S",
TGA: "*",
TGC: "C",
TGG: "W",
TGT: "C",
TTA: "L",
TTC: "F",
TTG: "L",
TTT: "F",
};
// every distinct one-letter code in the codon table (the 20 amino acids plus "*" for stop), joined into one string
var aminoAcids = Array.from(new Set(Object.values(codon2AA)).values()).join("");
// case-insensitive match for non-empty strings made up solely of those codes ("*" is literal inside a character class)
var aminoAcidRegex = new RegExp("^[".concat(aminoAcids, "]+$"), "i");
/**
 * Infer the type of a sequence: "dna", "rna", "aa" or "unknown".
 *
 * The checks run in that order and the first pattern matching the whole string wins, so a
 * string of only a/t/g/c/n/"." is always "dna". Only a couple of wildcard characters are
 * allowed, so this may be overly strict.
 */
var guessType = function (seq) {
    var candidates = [
        ["dna", /^[atgcn.]+$/i],
        ["rna", /^[augcn.]+$/i],
        ["aa", aminoAcidRegex],
    ];
    for (var idx = 0; idx < candidates.length; idx += 1) {
        if (candidates[idx][1].test(seq)) {
            return candidates[idx][0];
        }
    }
    return "unknown";
};
exports.guessType = guessType;
/***/ }),
/* 9 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var fast_xml_parser_1 = __webpack_require__(10);
var utils_1 = __webpack_require__(8);
/**
 * Parse a BioBrick in XML format to Seq[]
 *
 * Eg: https://parts.igem.org/cgi/xml/part.cgi?part=BBa_J23100
 *
 * Only the first part of the part list is read. Every `<feature>` inside `<features>` becomes
 * one annotation. The XML parser yields a single object for one `<feature>` and an array for
 * several, so both shapes are handled.
 *
 * @param file the BioBrick XML document as a string
 * @returns an array holding the single parsed part
 * @throws Error when the document has no part or the part has no sequence
 */
exports["default"] = (function (file) {
    var bail = function (err) {
        throw new Error("Failed on BioBrick: ".concat(err));
    };
    // parse
    var parsedBiobrick = new fast_xml_parser_1.XMLParser({
        isArray: function (name) {
            return ["features", "part_name", "sequences"].includes(name);
        },
        removeNSPrefix: true,
    }).parse(file);
    // get the first part
    var partList = parsedBiobrick && parsedBiobrick.rsbpml && parsedBiobrick.rsbpml.part_list;
    var part = partList && partList.part;
    if (Array.isArray(part)) {
        // several <part> elements parse to an array
        part = part[0];
    }
    if (!part)
        bail("No part seen in part_list");
    // extract the useful fields
    var features = part.features, part_name = part.part_name, sequences = part.sequences;
    var name = (0, utils_1.firstElement)(part_name);
    // parse the iGEM annotations
    var annotations = [];
    (features || []).forEach(function (group) {
        if (!group || !group.feature)
            return;
        // normalise the one-feature (object) and many-features (array) shapes
        [].concat(group.feature).forEach(function (feature) {
            if (!feature)
                return;
            var direction = feature.direction, endpos = feature.endpos, startpos = feature.startpos, type = feature.type;
            annotations.push({
                direction: (0, utils_1.parseDirection)(direction),
                end: +endpos,
                name: "".concat(direction, "-").concat(startpos),
                start: +startpos || 0,
                type: type || undefined,
            });
        });
    });
    // parse the sequence
    if (!sequences || !sequences[0])
        bail("No sequences seen in part");
    var seq = (0, utils_1.complement)(sequences[0].seq_data).seq;
    return [
        {
            annotations: annotations,
            name: name,
            seq: seq,
            type: (0, utils_1.guessType)(seq),
        },
    ];
});
/***/ }),
/* 10 */
/***/ ((module) => {
module.exports = require("fast-xml-parser");
/***/ }),
/* 11 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var utils_1 = __webpack_require__(8);
/**
 * Parse FASTA text into an array of sequences (all are assumed to be linear).
 *
 * Three layouts are handled:
 *  - ">" headers, possibly several records in one file
 *  - the old style where the header and comment lines start with ";"
 *  - no header at all, in which case the name comes from `fileName`
 *
 * Whitespace is removed from the sequence in every layout, so multi-line sequences come back
 * as one contiguous string and can be typed by guessType.
 *
 * @param text     contents of the FASTA file
 * @param fileName name of the source file, used only to name header-less sequences
 * @returns the parsed sequences; ">" records lacking a name or a sequence are dropped
 */
exports["default"] = (function (text, fileName) {
    var trimmed = text.trim();
    if (trimmed.startsWith(">")) {
        return text
            .split(">") // split up if it's a multi-seq FASTA file
            .map(function (t) {
            // the first line contains the name, though there's lots of variability around
            // the information on this line...
            // >MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken
            var headerEnd = t.search(/\n|\|/);
            var name = headerEnd === -1 ? "" : t.substring(0, headerEnd).replace(/\//g, "").trim();
            // everything after the first line, minus whitespace, is the sequence
            var firstNewline = t.indexOf("\n");
            var seq = firstNewline === -1 ? "" : t.substring(firstNewline).replace(/\s/g, "");
            return {
                annotations: [],
                name: name,
                seq: seq,
                type: (0, utils_1.guessType)(seq),
            };
        })
            .filter(function (p) { return p.name && p.seq; });
    }
    if (trimmed.startsWith(";")) {
        // it's an old-school style FASTA that's punctuated with semi-colons
        // ;my|NAME
        // ;my comment
        // actGacgata
        var headerEnd = trimmed.search(/\n|\|/);
        var name_1 = (headerEnd === -1 ? trimmed : trimmed.substring(0, headerEnd))
            .replace(/^;\s*/, "") // the leading ";" is punctuation, not part of the name
            .replace(/\//g, "")
            .trim();
        // the sequence starts on the line after the last ";" line
        var newlineBeforeSeq = trimmed.indexOf("\n", trimmed.lastIndexOf(";"));
        var seq_1 = newlineBeforeSeq === -1 ? "" : trimmed.substring(newlineBeforeSeq).replace(/\s/g, "");
        return [
            {
                annotations: [],
                name: name_1,
                seq: seq_1,
                type: (0, utils_1.guessType)(seq_1),
            },
        ];
    }
    // assume that it's a no name FASTA. Ie it's just a file with dna and no header
    // try and get the name from the fileName
    var sourceName = fileName || "";
    // lastIndexOf gives -1 for "no dot" and 0 for a leading dot; neither marks an extension
    var extensionStart = sourceName.lastIndexOf(".");
    var name = (extensionStart > 0 ? sourceName.substring(0, extensionStart) : sourceName) || "Untitled";
    var seq = text.replace(/\s/g, "");
    return [
        {
            annotations: [],
            name: name,
            seq: seq,
            type: (0, utils_1.guessType)(seq),
        },
    ];
});
/***/ }),
/* 12 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var utils_1 = __webpack_require__(8);
// a list of recognized types that would constitute an annotation name
var tagNameSet = new Set(["gene", "product", "note", "db_xref", "protein_id", "label", "lab_host", "locus_tag"]);
// a list of tags that could represent colors
var tagColorSet = new Set(["ApEinfo_fwdcolor", "ApEinfo_revcolor", "loom_color"]);
/**
 * Takes in a string representation of a GenBank file and outputs our
 * part representation of it. There is significant variability to the format;
 * an official example can be found at:
 * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
 *
 * The input is split into records on "//" followed by whitespace; each record longer than
 * five characters becomes one entry with `annotations`, `name`, `primers` (always empty here),
 * `seq` and `type`.
 *
 * @param fileInput contents of the GenBank file
 * @param fileName  name of the source file, used as a fallback for the part name
 */
exports["default"] = (function (fileInput, fileName) {
    return fileInput
        .split(/\/\/\s/g)
        .filter(function (f) { return f.length > 5; })
        .map(function (file) {
        // the first row contains the name of the part and its creation date
        // LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999
        var HEADER_ROW = file.substring(file.indexOf("LOCUS"), file.search(/\\n|\n/));
        var _a = HEADER_ROW.split(/\s{2,}/g).filter(function (h) { return h; }), name = _a[1];
        // trying to avoid giving a stupid name like Exported which Snapgene has by default
        // also, if there is not name in header, the seq length will be used as name, which should
        // be corrected (Number.parseInt to check for this case) https://stackoverflow.com/a/175787/7541747
        var parsedName = name;
        if ((parsedName === "Exported" && file.includes("SnapGene")) || // stupid Snapgene name
            Number.parseInt(parsedName, 10) // it thinks seq-length is the name
        ) {
            // first try and get the name from ACCESSION
            var accessionName = false;
            if (file.includes("ACCESSION")) {
                // this will be undefined if there is no accession value on the row
                var accession = file
                    .substring(file.indexOf("ACCESSION"), file.indexOf("\n", file.indexOf("ACCESSION")))
                    .replace(".", "")
                    .split(/\s{2,}/)
                    .filter(function (a) { return a !== "ACCESSION"; })
                    .pop();
                if (accession) {
                    parsedName = accession;
                    accessionName = true;
                }
            }
            // otherwise, revert to trying to get the part name from the file name
            if (!accessionName && fileName) {
                parsedName = fileName
                    .substring(0, Math.max(fileName.search(/\n|\||\./), fileName.lastIndexOf(".")))
                    .replace(/\/\s/g, "");
            }
            else if (!accessionName) {
                parsedName = "Unnamed"; // give up
            }
        }
        // the part sequence is contained in and after the line that begins with ORIGIN
        // do this before annotations so we can calc seqlength
        //
        // ORIGIN
        //    1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
        //   61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
        var SEQ_ROWS = file.substring(file.lastIndexOf("ORIGIN") + "ORIGIN".length, file.length);
        var seq = SEQ_ROWS.replace(/[^gatc]/gi, "");
        seq = (0, utils_1.complement)(seq).seq;
        // the features are translated into annotations
        // region is FEATURES thru ORIGIN
        // FEATURES             Location/Qualifiers
        //      source          1..5028
        //                      /organism="Saccharomyces cerevisiae"
        //                      /db_xref="taxon:4932"
        //                      /chromosome="IX"
        //                      /map="9"
        //
        // in the example above, source is the annotation "type" and name is "taxon:4932"
        // because "db_xref" is a recognized name type
        // the name depends on whether the tag type is in the recognized list of types
        var annotations = [];
        var primers = [];
        // includes(), not a truthiness test on indexOf(): indexOf is -1 (truthy) when the
        // section is absent and 0 (falsy) when the record starts with it
        if (file.includes("FEATURES")) {
            var FEATURES_LINE = file.indexOf("FEATURES");
            var FEATURES_NEW_LINE = file.indexOf("\n", FEATURES_LINE);
            var ORIGIN_LINE = file.lastIndexOf("ORIGIN");
            // some files have a contig file line that needs to parsed out/ shouldn't be included in
            // the features parsing
            if (file.includes("CONTIG")) {
                ORIGIN_LINE = Math.min(ORIGIN_LINE, file.indexOf("CONTIG"));
            }
            var FEATURES_ROWS = file
                .substring(FEATURES_NEW_LINE, ORIGIN_LINE)
                .split(/\n/)
                .filter(function (r) { return r; });
            FEATURES_ROWS.forEach(function (r) {
                // in the example above, the following converts it to ['source', '1..5028']
                var currLine = r.split(/\s{2,}/g).filter(function (l) { return l; });
                if (currLine.length > 1) {
                    // it's the beginning of a new feature/annotation
                    var type = currLine[0], rangeString = currLine[1];
                    var rangeRegex = /\d+/g;
                    var direction = r.includes("complement") ? -1 : 1;
                    // using the example above, this parses 1..5028 into 1 and 5028
                    var start = 0;
                    var end = 0;
                    var startSearch = rangeRegex.exec(rangeString);
                    if (startSearch) {
                        // the - 1 is because genbank is 1-based while we're 0; the parentheses make the
                        // modulo apply to the shifted index rather than to the literal 1
                        start = (+startSearch[0] - 1) % seq.length;
                        // single bp annotations are a thing in Genbank:
                        // https://github.com/Lattice-Automation/seqviz/issues/117
                        end = (start + 1) % seq.length;
                        var endSearch = rangeRegex.exec(rangeString);
                        if (endSearch) {
                            end = +endSearch[0] % seq.length;
                        }
                    }
                    if (type !== "source") {
                        // create a new annotation around the properties in this line (type and range)
                        annotations.push({
                            direction: direction,
                            // set in next block
                            end: end,
                            name: "",
                            start: start,
                            type: type,
                        });
                    }
                }
                else if (currLine.length === 1) {
                    // it's a continuation of a prior feature/annotation
                    // any updates (to name or color) to the last annotation should affect
                    // the last annotation that's in the array
                    if (currLine[0].startsWith("/")) {
                        // get rid of quotation marks and forward slashes
                        var tag = currLine[0].replace(/[/"]/g, "");
                        // should now look like organism=Saccharomyces cerevisiae
                        // split on the first "=" only: values may contain "=" themselves and
                        // flag-style qualifiers (eg /pseudo) have no value at all
                        var separator = tag.indexOf("=");
                        var tagName = separator === -1 ? tag : tag.substring(0, separator);
                        var tagValue = separator === -1 ? "" : tag.substring(separator + 1);
                        // the two values that can be extracted are name or color
                        var lastAnn = annotations.length - 1;
                        if (tagNameSet.has(tagName.toLowerCase())) {
                            // the key is something we recognize as an annotation name
                            if (lastAnn >= 0 && !annotations[lastAnn].name) {
                                annotations[lastAnn].name = tagValue.trim();
                            }
                        }
                        else if (tagColorSet.has(tagName)) {
                            // the key is something we recognize as an annotation color
                            if (lastAnn > -1 && tagValue) {
                                annotations[lastAnn].color = tagValue;
                            }
                        }
                    }
                }
            });
        }
        return {
            annotations: annotations,
            // the LOCUS row may carry no name at all, in which case parsedName is undefined
            name: (parsedName || "").trim() || fileName || "Unnamed",
            primers: primers,
            seq: seq,
            type: (0, utils_1.guessType)(seq),
        };
    });
});
/***/ }),
/* 13 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var fast_xml_parser_1 = __webpack_require__(10);
var utils_1 = __webpack_require__(8);
/**
 * Converts a JBEI file to a Seq
 *
 * https://j5.jbei.org/j5manual/pages/94.html
 *
 * @param JBEI the JBEI-seq XML document as a string
 * @returns an array with the one parsed sequence, or an empty array when the document has
 *          no `seq` root or no recognizable sequence
 */
exports["default"] = (function (JBEI) {
    // weird edge case with directed quotation characters
    var fileString = JBEI.replace(/“|”/g, '"');
    // parse
    var parsedJbei = new fast_xml_parser_1.XMLParser({
        removeNSPrefix: true,
    }).parse(fileString);
    // destructure the parameters from JBEI
    var seq = parsedJbei.seq;
    if (!seq)
        return [];
    var features = seq.features, name = seq.name, sequence = seq.sequence;
    // attempt to get the name out of the JBEI
    var parsedName = "Unnamed";
    if (name) {
        parsedName = name;
    }
    // attempt to get the sequence. fail if it's not findable
    var parsedSeq = (0, utils_1.complement)(sequence).seq; // seq and compSeq
    if (!parsedSeq)
        return [];
    // attempt to parse the JBEI annotations into our version of annotations
    var annotations = [];
    if (features && features.feature) {
        // the XML parser returns an object for a single <feature> and an array for several
        [].concat(features.feature).forEach(function (feature) {
            if (!feature)
                return;
            var complement = feature.complement, label = feature.label, location = feature.location, type = feature.type;
            if (location && location.genbankStart && location.end) {
                annotations.push({
                    direction: complement ? -1 : 1,
                    // JBEI is 1-based
                    end: +location.end || 0,
                    name: label || "Untitled",
                    start: +location.genbankStart - 1 || 0,
                    type: type || "N/A",
                });
            }
        });
    }
    return [
        {
            annotations: annotations,
            name: parsedName,
            seq: parsedSeq,
            type: (0, utils_1.guessType)(parsedSeq),
        },
    ];
});
/***/ }),
/* 14 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var sbol_v1_1 = __webpack_require__(15);
var sbol_v2_1 = __webpack_require__(16);
/**
 * Parse an SBOL document into an array of parts. Documents mentioning the v1 namespace
 * ("sbols.org/v1#") go to the v1 parser; everything else goes to the v2 parser, which also
 * receives the file name.
 */
exports["default"] = (function (sbol, fileName) {
    var isVersionOne = sbol.includes("sbols.org/v1#");
    if (isVersionOne) {
        return (0, sbol_v1_1.default)(sbol);
    }
    return (0, sbol_v2_1.default)(sbol, fileName);
});
/***/ }),
/* 15 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var fast_xml_parser_1 = __webpack_require__(10);
var utils_1 = __webpack_require__(8);
/*
<sbol:Sequence rdf:about="https://synbiohub.cidarlab.org/public/Demo/A1_sequence/1">
<sbol:persistentIdentity rdf:resource="https://synbiohub.cidarlab.org/public/Demo/A1_sequence"/>
<sbol:displayId>A1_sequence</sbol:displayId>
<sbol:version>1</sbol:version>
<prov:wasDerivedFrom rdf:resource="https://github.com/CIDARLAB/cello/blob/master/resources/UCF/Eco1C1G1T0.UCF.json"/>
<prov:wasGeneratedBy rdf:resource="https://synbiohub.cidarlab.org/public/Demo/cello2sbol/1"/>
<dcterms:title>A1_sequence</dcterms:title>
<sbh:ownedBy rdf:resource="https://synbiohub.cidarlab.org/user/prash"/>
<sbh:topLevel rdf:resource="https://synbiohub.cidarlab.org/public/Demo/A1_sequence/1"/>
<sbol:elements>AATGTTCCCTAATAATCAGCAAAGAGGTTACTAG</sbol:elements>
<sbol:encoding rdf:resource="http://www.chem.qmul.ac.uk/iubmb/misc/naseq.html"/>
</sbol:Sequence>
*/
/**
 * Takes an SBOL (v1) file, as a string, and converts it into our
 * representation of a part(s).
 *
 * Strategies are tried in order until one yields parts:
 *  1. DnaComponents nested in a top-level Collection, or else the first top-level DnaComponent
 *  2. any DnaComponent found anywhere in the document (strict: it must have a name)
 *  3. any Sequence node found anywhere in the document
 *
 * @param sbol the SBOL XML document as a string
 * @returns the parsed parts; an empty array when the document has no RDF root or nothing usable
 */
exports["default"] = (function (sbol) {
    // weird edge case with directed quotation characters
    var fileString = sbol.replace(/“|”/g, '"');
    // parse
    var parsedSBOL = new fast_xml_parser_1.XMLParser({
        ignoreAttributes: false,
        isArray: function (name) {
            return [
                "Sequence",
                "Collection",
                "DnaComponent",
                "dnaSequence",
                "ComponentDefinition",
                "SequenceAnnotation",
                "sequenceAnnotation",
                "elements",
                "component",
                "annotation",
            ].includes(name);
        },
        removeNSPrefix: true,
    }).parse(fileString);
    var RDF = parsedSBOL.RDF;
    // without an RDF root there is nothing to search; report "no parts" instead of
    // dereferencing null below
    if (!RDF)
        return [];
    var Collection = RDF.Collection, DnaComponent = RDF.DnaComponent;
    if (Collection && Collection.length) {
        // it's a collection of DnaComponents, parse each to a part
        var partList_1 = [];
        Collection.forEach(function (_a) {
            var component = _a.component;
            if (component && component.length) {
                component.forEach(function (_a) {
                    var nestedDnaComponent = _a.DnaComponent;
                    if (!nestedDnaComponent || !nestedDnaComponent[0])
                        return;
                    var part = dnaComponentToPart(nestedDnaComponent[0], {
                        file: sbol,
                        strict: false,
                    });
                    // the conversion yields null when there is no sequence beneath the component;
                    // such entries must not reach the caller
                    if (part)
                        partList_1.push(part);
                });
            }
        });
        // check whether any parts were created from the collection
        if (partList_1.length)
            return partList_1;
    }
    else if (DnaComponent && DnaComponent.length) {
        // create a single part from the single one passed
        var validPart = dnaComponentToPart(DnaComponent[0], {
            file: sbol,
            strict: false,
        });
        // it will be null if there isn't any sequence information beneath it
        if (validPart)
            return [validPart];
    }
    // go on a fishing expedition for DnaComponents
    // everything else has failed
    // accumulate all that are "valid" (name + seq)
    var dnaComponentAccumulator = [];
    findDnaComponentNodes(dnaComponentAccumulator, RDF);
    var attemptedSeqs = dnaComponentAccumulator
        .map(function (p) {
        return dnaComponentToPart(p, {
            file: sbol,
            strict: true,
        });
    })
        .filter(function (p) { return !!p; }); // invalid parts will be null
    if (attemptedSeqs.length)
        return attemptedSeqs;
    // go on another fishing expedition, but for Sequence nodes
    var dnaSequenceAccumulator = [];
    findSequenceNodes(dnaSequenceAccumulator, RDF);
    return dnaSequenceAccumulator.map(function (p) { return sequenceToPart(p, sbol); }).filter(function (p) { return p; }); // invalid parts will be null
});
/**
 * Collect every entry stored under a "Sequence" key into `acc`, walking the parsed document
 * depth-first. Only array values are descended into. Values under "Sequence" are appended
 * first and then searched themselves.
 *
 * this is another last-resort scraper for trying to find valid parts
 */
var findSequenceNodes = function (acc, doc) {
    var keys = Object.keys(doc);
    for (var idx = 0; idx < keys.length; idx += 1) {
        var key = keys[idx];
        if (key === "Sequence" && doc[key].length) {
            acc.push.apply(acc, doc[key]);
        }
        if (!Array.isArray(doc[key])) {
            continue;
        }
        doc[key].forEach(function (child) {
            findSequenceNodes(acc, child);
        });
    }
};
/**
 * Convert a DnaComponent node of an SBOL document into a part.
 *
 * The DnaComponent may come from the root RDF level or from within a
 * Collection/Annotation hierarchy.
 *
 * @param {Object} DnaComponent parsed DnaComponent node; `name`, `displayId`,
 *   `dnaSequence` and `annotation` are read from it
 * @param {Object} [options]
 * @param {boolean} [options.strict=false] when true, a component with neither
 *   a `name` nor a `displayId` is rejected instead of being called "Unnamed".
 *   Other keys (callers also pass `file`) are not read here.
 * @returns {Object|null} `{ annotations, name, seq, type }`, or null when the
 *   component is missing, is rejected by `strict`, or has no sequence
 */
var dnaComponentToPart = function (DnaComponent, options) {
    const strict = options && options.strict !== undefined ? options.strict : false;
    if (!DnaComponent) {
        return null;
    }
    const { annotation, displayId, dnaSequence, name } = DnaComponent;

    // attempt to get the name out of the SBOL
    const parsedName = name || displayId || (strict ? null : "Unnamed");
    if (parsedName === null) {
        // in strict mode we are scraping for parts and should not accept any
        // that lack both a name and an id
        return null;
    }

    // attempt to get the sequence. an empty dnaSequence list or a DnaSequence
    // without nucleotides counts as "no sequence" rather than a crash
    const firstSequence = dnaSequence && dnaSequence[0];
    const seq = (firstSequence && firstSequence.DnaSequence && firstSequence.DnaSequence.nucleotides) || "";
    const complement = utils_1.complement;
    const parsedSeq = complement(seq).seq;
    if (!parsedSeq) {
        return null;
    }

    // attempt to parse the SBOL annotations into our version of annotations
    const annotations = [];
    if (Array.isArray(annotation)) {
        annotation.forEach(function (entry) {
            const SequenceAnnotation = entry && entry.SequenceAnnotation;
            if (!SequenceAnnotation || !SequenceAnnotation[0]) {
                return;
            }
            const { bioEnd, bioStart, strand, subComponent } = SequenceAnnotation[0];
            const described = subComponent && subComponent.DnaComponent && subComponent.DnaComponent[0];
            if (!described) {
                return;
            }
            // the sub-component's type is optional: fall back to "N/A" when absent
            const annType = described.type;
            annotations.push({
                direction: strand === "+" ? 1 : -1,
                // bioStart/bioEnd are shifted down by one; missing values become 0
                end: bioEnd - 1 || 0,
                name: described.name || described.displayId || "Untitled",
                start: bioStart - 1 || 0,
                type: (annType && annType["@_resource"]) || "N/A",
            });
        });
    }

    const guessType = utils_1.guessType;
    return {
        annotations: annotations,
        name: parsedName,
        seq: parsedSeq,
        type: guessType(seq),
    };
};
/**
 * Convert a bare "Sequence" node into a part ("Sequence" -> Part).
 *
 * This is not the standard format. see A1.xml
 *
 * @param {Object} Seq parsed Sequence node; `displayId`, `title` and
 *   `elements` are read from it
 * @param {string} file the source file's text, used only to guess circularity
 * @returns {Object|null} `{ annotations, circular, compSeq, name, seq, type }`,
 *   or null when the node has no `elements` entry (callers filter nulls out)
 */
var sequenceToPart = function (Seq, file) {
    // without an `elements` entry there is nothing to build a part from
    if (!Seq || !Seq.elements) {
        return null;
    }
    // get the name
    const name = Seq.displayId || Seq.title || "Unnamed";
    // get the sequence
    const seqOrig = Seq.elements[0] || "";
    const complement = utils_1.complement;
    const { compSeq, seq } = complement(seqOrig);
    // guess whether it's circular based on the word "plasmid" appearing anywhere
    // in the file, including at the very start (index 0). very ad hoc
    const circular = /plasmid/i.test(file);
    const guessType = utils_1.guessType;
    return { annotations: [], circular: circular, compSeq: compSeq, name: name, seq: seq, type: guessType(seq) };
};
/**
 * Recursively collect every node stored under a "DnaComponent" key of the
 * parsed SBOL document.
 *
 * This is a last-resort scraper that tries to find valid parts that aren't
 * within a root DnaComponent document or within a root Collection array.
 * Matches are appended to `acc` in place; nothing is returned.
 *
 * Only array-valued properties are descended into, and only an array-valued
 * "DnaComponent" property contributes nodes. null, undefined and primitive
 * nodes (e.g. text content) are skipped instead of throwing.
 *
 * @param {Array} acc accumulator that receives the DnaComponent nodes found
 * @param {*} doc node of the parsed document to search
 */
var findDnaComponentNodes = function (acc, doc) {
    // empty elements and text content carry no child nodes to inspect
    if (doc === null || typeof doc !== "object") {
        return;
    }
    Object.keys(doc).forEach(function (key) {
        const children = doc[key];
        if (!Array.isArray(children)) {
            return;
        }
        if (key === "DnaComponent") {
            // pushed one by one so very long lists cannot exceed the argument limit
            for (const componentNode of children) {
                acc.push(componentNode);
            }
        }
        children.forEach(function (nestedNode) {
            findDnaComponentNodes(acc, nestedNode);
        });
    });
};
/***/ }),
/* 16 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var fast_xml_parser_1 = __webpack_require__(10);
var utils_1 = __webpack_require__(8);
/**
 * Converts an SBOL file to our Seq format.
 *
 * SBOL v2.0 schema definition can be found at: http://sbolstandard.org/wp-content/uploads/2016/06/SBOL-data-model-2.2.1.pdf
 * differs from SBOL v1.0 in that the ComponentDefinitions are like the root parts,
 * and the sequence and annotations are separated (they're no longer defined relationally
 * by nesting but, instead, by id). we only care about components that have sequence information
 *
 * @param {string} sbol the SBOL v2 file contents (XML text)
 * @param {string} fileName used to name components that have no displayId
 * @returns {Array} the non-empty list of parts found in the file
 * @throws {Error} "Failed to parse SBOL v2 file: ..." when no part could be
 *   built; the underlying error is attached as `cause`. Errors raised by the
 *   XML parser itself are not wrapped.
 */
exports["default"] = (function (sbol, fileName) {
    // weird edge case with directed quotation characters
    const fileString = sbol.replace(/“|”/g, '"');
    // parse, forcing the tags we iterate over to always be arrays
    const parser = new fast_xml_parser_1.XMLParser({
        ignoreAttributes: false,
        isArray: function (name) {
            return ["Sequence", "ComponentDefinition", "SequenceAnnotation", "sequenceAnnotation", "elements"].includes(name);
        },
        removeNSPrefix: true,
    });
    const parsedSBOL = parser.parse(fileString);
    try {
        const seqList = parseSBOL2(parsedSBOL, fileName);
        if (!seqList.length) {
            throw new Error("No Sequence info found");
        }
        return seqList;
    }
    catch (err) {
        // keep the original error (and its stack) reachable through `cause`
        throw new Error(`Failed to parse SBOL v2 file: ${err}`, { cause: err });
    }
});
/**
 * Build parts from a parsed SBOL v2 document.
 *
 * One part is created for each ComponentDefinition that references a
 * resolvable Sequence. When no part results from that, the first Sequence of
 * the document (if it has elements) is returned as a single part.
 *
 * @param {Object} parsedSBOL XML parser output; the document is read from its `RDF` key
 * @param {string} fileName used to name components without a displayId (`<fileName>_<n>`)
 * @returns {Array} parts of the shape `{ annotations, name, seq, type }`; may be empty
 * @throws {Error} when there is no root RDF node, or when it holds neither a
 *   ComponentDefinition nor a Sequence
 */
var parseSBOL2 = function (parsedSBOL, fileName) {
    const RDF = parsedSBOL && parsedSBOL.RDF;
    if (!RDF) {
        throw new Error("No root RDF document");
    }
    // check if anything is defined, bail out if not
    const ComponentDefinition = RDF.ComponentDefinition;
    if (!ComponentDefinition && !RDF.Sequence) {
        throw new Error("Failed to parse SBOL v2: No ComponentDefinition or Sequence");
    }
    // a document may define components without any Sequence elements; treat
    // that as an empty list so lookups return "not found" instead of throwing
    const sequences = RDF.Sequence || [];
    const complement = utils_1.complement;
    const guessType = utils_1.guessType;

    // resolve a Sequence by id (or take the first one when no id is given)
    // and turn it into a part; null when nothing usable is found
    const getSeq = function (seqID) {
        const seqElement = seqID
            ? sequences.find(function (s) {
                // persistentIdentity is parsed as an attribute object, so it is
                // compared through its "@_resource" value
                return (s.persistentIdentity && s.persistentIdentity["@_resource"] === seqID) ||
                    s["@_about"] === seqID;
            })
            : sequences[0];
        if (!seqElement || !seqElement.elements) {
            return null;
        }
        const seq = complement(seqElement.elements[0] || "").seq;
        return {
            annotations: [],
            name: seqElement.displayId,
            seq: seq,
            type: guessType(seq),
        };
    };

    // parse each ComponentDefinition that carries sequence info to a part
    const seqList = [];
    (ComponentDefinition || []).forEach(function (c, i) {
        // we're only making parts out of those with seq info
        if (!c.sequence) {
            return;
        }
        const name = c.displayId || `${fileName}_${i + 1}`;
        const annotations = [];
        (c.sequenceAnnotation || []).forEach(function (entry) {
            const ann = entry && entry.SequenceAnnotation && entry.SequenceAnnotation[0];
            // only annotations located by a Range are kept
            const range = ann && ann.location && ann.location.Range;
            if (range) {
                annotations.push({
                    // start/end are shifted down by one
                    end: range.end - 1,
                    name: ann.displayId,
                    start: range.start - 1,
                });
            }
        });
        const seq = getSeq(c.sequence["@_resource"]);
        if (seq) {
            seqList.push({
                annotations: annotations,
                name: name,
                seq: seq.seq,
                type: seq.type,
            });
        }
    });

    // if no component produced a part, try and get the sequence from the first
    // Sequence element alone
    if (!seqList.length) {
        const loneSeq = getSeq();
        if (loneSeq) {
            seqList.push(loneSeq);
        }
    }
    return seqList;
};
/***/ }),
/* 17 */
/***/ ((__unused_webpack_module, exports, __webpack_require__) => {
Object.defineProperty(exports, "__esModule", ({ value: true }));
var utils_1 = __webpack_require__(8);
// Recognized tag types whose value would constitute an annotation name.
// NOTE(review): the code that reads this list is outside this excerpt; confirm
// whether the order of the entries matters to it.
var tagNameList = ["gene", "product", "note", "db_xref", "protein_id", "label", "lab_host"];
// Tags that could represent an annotation color.
// NOTE(review): the "ApEinfo_" entries presumably come from ApE-exported files
// and the fwd/rev variants presumably map to strand direction; verify against
// the consumer.
var tagColorList = ["ApEinfo_fwdcolor", "ApEinfo_revcolor", "loom_color"];
/**
* takes in a string representation of a SeqBuilder file and outputs our
* part representation of it. an example of a SeqBuilder file can be found
* at imports/io/examples/seqbuilder, though there may be variations to the
* format
*/
exports["default"] = (function (fileInput, fileName) {
return fileInput.split(/\/\/\s/g).map(function (file) {
// +++++SEQUENCE+++++//
// the part sequence comes after the line that specifies the seqbuilder version number
// @ts-ignore
var SEQ_ROWS = file
.substring(file.search(/.*?written by seqbuilder .*?[0-9.]+[^actg]+/i) +
// @ts-ignore
file.match(/.*?written by seqbuilder .*?[0-9.]+[^actg]+/i)[0].length, file.length)
.match(/[actgyrwskmdvhbxn]+/gim)[0];
var seq = SEQ_ROWS;
(seq = (0, utils_1.complement)(seq).seq); // seq and compSeq
// there may be a genbank-like header row after the sequence
// LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
var parsedName = fileName.length > 0 ? fileName : "Unnamed";
if (~file.indexOf("LOCUS")) {
var HEADER_ROW = file.substring(file.indexOf("LOCUS"), file.search(/\\n|\n/));
if (HEADER_ROW && HEADER_ROW.split(/\s{2,}/g)) {
var _a = HEADER_ROW.split(/\s{2,}/g).filter(function (h) { return h; }), name_1 = _a[1];
parsedName = name_1;
}
}
// Name setting logic ported from GenBank parser
if ((parsedName === "Exported" && file.includes("SnapGene")) || // stupid Snapgene name
Number.parseInt(parsedName, 10) // it thinks seq-length is the name
) {
// first try and get the name from ACCESSION
var accessionName = false;
if (file.includes("ACCESSION")) {
// this will be undefined is there is no
var accession = file
.substring(file.indexOf("ACCESSION"), file.indexOf("\n", file.indexOf("ACCESSION")))