UNPKG

molstar

Version: (not recorded in this extract)

A comprehensive macromolecular library.

259 lines · 9.24 kB
"use strict"; /** * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info. * * @author Alexander Rose <alexander.rose@weirdbyte.de> */ Object.defineProperty(exports, "__esModule", { value: true }); exports.parseCsv = void 0; var tslib_1 = require("tslib"); // import { Column } from 'mol-data/db' var tokenizer_1 = require("../common/text/tokenizer"); var Data = (0, tslib_1.__importStar)(require("./data-model")); var field_1 = require("./field"); var result_1 = require("../result"); var mol_task_1 = require("../../../mol-task"); function State(data, runtimeCtx, opts) { var tokenizer = (0, tokenizer_1.Tokenizer)(data); return { data: data, tokenizer: tokenizer, tokenType: 2 /* End */, runtimeCtx: runtimeCtx, tokens: [], fieldCount: 0, recordCount: 0, columnCount: 0, columnNames: [], quoteCharCode: opts.quote.charCodeAt(0), commentCharCode: opts.comment.charCodeAt(0), delimiterCharCode: opts.delimiter.charCodeAt(0), noColumnNamesRecord: opts.noColumnNames }; } /** * Eat everything until a delimiter or newline occurs. * Ignores whitespace at the end of the value, i.e. trim right. * Returns true when a newline occurs after the value. */ function eatValue(state, delimiterCharCode) { while (state.position < state.length) { var c = state.data.charCodeAt(state.position); ++state.position; switch (c) { case 10: // \n case 13: // \r return true; case delimiterCharCode: return; case 9: // \t case 32: // ' ' break; default: ++state.tokenEnd; break; } } } /** * Eats a quoted value. Can contain a newline. * Returns true when a newline occurs after the quoted value. 
* * Embedded quotes are represented by a pair of double quotes: * - ""xx"" => "xx" */ function eatQuoted(state, quoteCharCode, delimiterCharCode) { ++state.position; while (state.position < state.length) { var c = state.data.charCodeAt(state.position); if (c === quoteCharCode) { var next = state.data.charCodeAt(state.position + 1); if (next !== quoteCharCode) { // get rid of the quotes. state.tokenStart++; state.tokenEnd = state.position; ++state.position; return skipEmpty(state, delimiterCharCode); } } ++state.position; } state.tokenEnd = state.position; } /** * Skips empty chars. * Returns true when the current char is a newline. */ function skipEmpty(state, delimiterCharCode) { while (state.position < state.length) { var c = state.data.charCodeAt(state.position); if (c !== 9 && c !== 32 && c !== delimiterCharCode) { // \t or ' ' return c === 10 || c === 13; // \n or \r } ++state.position; } } function skipWhitespace(state) { var prev = -1; while (state.position < state.length) { var c = state.data.charCodeAt(state.position); switch (c) { case 9: // '\t' case 32: // ' ' prev = c; ++state.position; break; case 10: // \n // handle \r\n if (prev !== 13) { ++state.lineNumber; } prev = c; ++state.position; break; case 13: // \r prev = c; ++state.position; ++state.lineNumber; break; default: return; } } } function skipLine(state) { while (state.position < state.length) { var c = state.data.charCodeAt(state.position); if (c === 10 || c === 13) return; // \n or \r ++state.position; } } /** * Move to the next token. * Returns true when the current char is a newline, i.e. indicating a full record. 
*/ function moveNextInternal(state) { var tokenizer = state.tokenizer; skipWhitespace(tokenizer); if (tokenizer.position >= tokenizer.length) { state.tokenType = 2 /* End */; return false; } tokenizer.tokenStart = tokenizer.position; tokenizer.tokenEnd = tokenizer.position; var c = state.data.charCodeAt(tokenizer.position); switch (c) { case state.commentCharCode: state.tokenType = 1 /* Comment */; skipLine(tokenizer); break; case state.quoteCharCode: state.tokenType = 0 /* Value */; return eatQuoted(tokenizer, state.quoteCharCode, state.delimiterCharCode); default: state.tokenType = 0 /* Value */; return eatValue(tokenizer, state.delimiterCharCode); } } /** * Moves to the next non-comment token/line. * Returns true when the current char is a newline, i.e. indicating a full record. */ function moveNext(state) { var newRecord = moveNextInternal(state); while (state.tokenType === 1 /* Comment */) { newRecord = moveNextInternal(state); } return newRecord; } function readRecordsChunk(chunkSize, state) { if (state.tokenType === 2 /* End */) return 0; var counter = 0; var newRecord; var tokens = state.tokens, tokenizer = state.tokenizer; while (state.tokenType === 0 /* Value */ && counter < chunkSize) { tokenizer_1.TokenBuilder.add(tokens[state.fieldCount % state.columnCount], tokenizer.tokenStart, tokenizer.tokenEnd); ++state.fieldCount; newRecord = moveNext(state); if (newRecord) { ++state.recordCount; ++counter; } } return counter; } function readRecordsChunks(state) { var newRecord = moveNext(state); if (newRecord) ++state.recordCount; return (0, mol_task_1.chunkedSubtask)(state.runtimeCtx, 100000, state, readRecordsChunk, function (ctx, state) { return ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.data.length }); }); } function addColumn(state) { state.columnNames.push(tokenizer_1.Tokenizer.getTokenString(state.tokenizer)); state.tokens.push(tokenizer_1.TokenBuilder.create(state.tokenizer.data, state.data.length / 80)); } function 
init(state) { var newRecord = moveNext(state); while (!newRecord) { addColumn(state); newRecord = moveNext(state); } addColumn(state); state.columnCount = state.columnNames.length; if (state.noColumnNamesRecord) { state.columnNames.forEach(function (x, i, arr) { return arr[i] = i + ''; }); tokenizer_1.Tokenizer.reset(state.tokenizer); } } function handleRecords(state) { return (0, tslib_1.__awaiter)(this, void 0, void 0, function () { var columns, i; return (0, tslib_1.__generator)(this, function (_a) { switch (_a.label) { case 0: init(state); return [4 /*yield*/, readRecordsChunks(state)]; case 1: _a.sent(); columns = Object.create(null); for (i = 0; i < state.columnCount; ++i) { columns[state.columnNames[i]] = (0, field_1.Field)(state.tokens[i]); } return [2 /*return*/, Data.CsvTable(state.recordCount, state.columnNames, columns)]; } }); }); } function parseInternal(data, ctx, opts) { return (0, tslib_1.__awaiter)(this, void 0, void 0, function () { var state, table, result; return (0, tslib_1.__generator)(this, function (_a) { switch (_a.label) { case 0: state = State(data, ctx, opts); ctx.update({ message: 'Parsing...', current: 0, max: data.length }); return [4 /*yield*/, handleRecords(state)]; case 1: table = _a.sent(); result = Data.CsvFile(table); return [2 /*return*/, result_1.ReaderResult.success(result)]; } }); }); } function parseCsv(data, opts) { var _this = this; var completeOpts = Object.assign({}, { quote: '"', comment: '#', delimiter: ',', noColumnNames: false }, opts); return mol_task_1.Task.create('Parse CSV', function (ctx) { return (0, tslib_1.__awaiter)(_this, void 0, void 0, function () { return (0, tslib_1.__generator)(this, function (_a) { switch (_a.label) { case 0: return [4 /*yield*/, parseInternal(data, ctx, completeOpts)]; case 1: return [2 /*return*/, _a.sent()]; } }); }); }); } exports.parseCsv = parseCsv; //# sourceMappingURL=parser.js.map