UNPKG

@tensorflow/tfjs-data

Version:

TensorFlow Data API in JavaScript

379 lines 49.1 kB
/**
 * @license
 * Copyright 2018 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * =============================================================================
 */
import { util } from '@tensorflow/tfjs-core';
import { Dataset } from '../dataset';
import { TextLineDataset } from './text_line_dataset';

const CODE_QUOTE = '"';
// States of the row parser in `CSVDataset.parseRow`.
const STATE_OUT = Symbol('out');
const STATE_FIELD = Symbol('field');
const STATE_QUOTE = Symbol('quote');
const STATE_QUOTE_AFTER_QUOTE = Symbol('quoteafterquote');
const STATE_WITHIN_QUOTE_IN_QUOTE = Symbol('quoteinquote');

/**
 * Represents a potentially large collection of delimited text records.
 *
 * The produced `TensorContainer`s each contain one key-value pair for
 * every column of the table. When a field is empty in the incoming data, the
 * resulting value is the configured default, `undefined` if there is none, or
 * an error is thrown if the column is required. Values that can be parsed as
 * numbers are emitted as type `number`, other values are emitted as `string`.
 *
 * The results are not batched.
 *
 * @doc {heading: 'Data', subheading: 'Classes', namespace: 'data'}
 */
export class CSVDataset extends Dataset {
  /**
   * Returns column names of the csv dataset. If `configuredColumnsOnly` is
   * true, return column names in `columnConfigs` (an empty list when no
   * `columnConfigs` was provided). If `configuredColumnsOnly` is false and
   * `columnNames` is provided, return `columnNames`. If
   * `configuredColumnsOnly` is false and `columnNames` is not provided, return
   * all column names parsed from the csv file. For example usage please go to
   * `tf.data.csv`.
   *
   * @doc {heading: 'Data', subheading: 'Classes'}
   */
  async columnNames() {
    if (!this.columnNamesValidated) {
      await this.setColumnNames();
    }
    if (this.configuredColumnsOnly) {
      // Without `columnConfigs` no column is selected (iteration yields empty
      // elements), so report an empty list rather than letting
      // `Object.keys(undefined)` throw a TypeError.
      return this.columnConfigs ? Object.keys(this.columnConfigs) : [];
    }
    return this.fullColumnNames;
  }

  /* 1) If `columnNames` is provided as string[], use this string[] as output
   * keys in corresponding order. The length must match the number of inferred
   * columns if `hasHeader` is true.
   * 2) If `columnNames` is not provided, parse header line as `columnNames` if
   * `hasHeader` is true. If `hasHeader` is false, throw an error.
   * 3) Column names must be unique.
   * 4) If `columnConfigs` is provided, all the keys in `columnConfigs` must
   * exist in parsed `columnNames`.
   */
  async setColumnNames() {
    const columnNamesFromFile = await this.maybeReadHeaderLine();
    if (!this.fullColumnNames && !columnNamesFromFile) {
      // Throw an error if columnNames is not provided and no header line.
      throw new Error(
          'Column names must be provided if there is no header line.');
    } else if (this.fullColumnNames && columnNamesFromFile) {
      // Check provided columnNames match header line.
      util.assert(
          columnNamesFromFile.length === this.fullColumnNames.length,
          () => `The length of provided columnNames (${
              this.fullColumnNames.length}) does not match the length of ` +
              `the header line read from file (${
                  columnNamesFromFile.length}).`);
    }
    if (!this.fullColumnNames) {
      this.fullColumnNames = columnNamesFromFile;
    }

    // Check if there are duplicate column names. A Map is used instead of a
    // plain object so that names which collide with Object.prototype members
    // (e.g. "toString", "constructor", "__proto__") are counted correctly.
    const counts = new Map();
    for (const name of this.fullColumnNames) {
      counts.set(name, (counts.get(name) || 0) + 1);
    }
    const duplicateNames =
        Array.from(counts.keys()).filter((name) => counts.get(name) > 1);
    util.assert(
        duplicateNames.length === 0,
        () => 'Duplicate column names found: ' + duplicateNames.toString());

    // Check if keys in columnConfigs match columnNames.
    if (this.columnConfigs) {
      const knownNames = new Set(this.fullColumnNames);
      for (const key of Object.keys(this.columnConfigs)) {
        if (!knownNames.has(key)) {
          throw new Error(
              `The key "${key}" provided in columnConfigs does not match ` +
              `any of the column names (${this.fullColumnNames.toString()}).`);
        }
      }
    }
    this.columnNamesValidated = true;
  }

  /**
   * Reads and parses the first line of the underlying text dataset when
   * `hasHeader` is true.
   *
   * @returns The parsed header fields, or `null` when `hasHeader` is false.
   * @throws Error if the underlying dataset yields no lines.
   */
  async maybeReadHeaderLine() {
    if (!this.hasHeader) {
      return null;
    }
    const iter = await this.base.iterator();
    const firstElement = await iter.next();
    if (firstElement.done) {
      throw new Error('No data was found for CSV parsing.');
    }
    const firstLine = firstElement.value;
    // The column count is not known yet, so skip the element-count check.
    return this.parseRow(firstLine, false);
  }

  /**
   * Create a `CSVDataset`.
   *
   * @param input A `DataSource` providing a chunked, UTF8-encoded byte stream.
   * @param csvConfig (Optional) A CSVConfig object that contains configurations
   *     of reading and decoding from CSV file(s).
   *
   *     hasHeader: (Optional) A boolean value that indicates whether the first
   *     row of provided CSV file is a header line with column names, and should
   *     not be included in the data. Defaults to `true`.
   *
   *     columnNames: (Optional) A list of strings that corresponds to
   *     the CSV column names, in order. If provided, it ignores the column
   *     names inferred from the header row. If not provided, infers the column
   *     names from the first row of the records. If hasHeader is false and
   *     columnNames is not provided, an error is thrown when the column names
   *     are first resolved (`columnNames()` or `iterator()`).
   *
   *     columnConfigs: (Optional) A dictionary whose key is column names, value
   *     is an object stating if this column is required, column's data type,
   *     default value, and if this column is label. If provided, keys must
   *     correspond to names provided in columnNames or inferred from the file
   *     header lines. If isLabel is true for any column, each element is an
   *     object `{xs, ys}`: `xs` is a dict of features key/value pairs, `ys`
   *     is a dict of labels key/value pairs. If no column is marked as
   *     label, each element is a dict of features only.
   *
   *     configuredColumnsOnly (Optional) If true, only columns provided in
   *     columnConfigs will be parsed and provided during iteration.
   *
   *     delimiter (Optional) The string used to parse each line of the input
   *     file. Defaults to `,`. Rows are scanned one character at a time, so
   *     only a single-character delimiter can match.
   *
   *     delimWhitespace (Optional) If true, fields are separated by a single
   *     space and runs of consecutive spaces before a field are collapsed.
   *     `delimiter` must not be provided together with this option.
   */
  constructor(input, csvConfig) {
    super();
    this.input = input;
    // Defaults; overridden from `csvConfig` below.
    this.hasHeader = true;
    this.fullColumnNames = null;
    this.columnNamesValidated = false;
    this.columnConfigs = null;
    this.configuredColumnsOnly = false;
    this.delimiter = ',';
    this.delimWhitespace = false;
    this.base = new TextLineDataset(input);

    const config = csvConfig || {};
    // Anything other than an explicit `false` means "has a header".
    this.hasHeader = config.hasHeader !== false;
    this.fullColumnNames = config.columnNames;
    this.columnConfigs = config.columnConfigs;
    this.configuredColumnsOnly = config.configuredColumnsOnly;
    if (config.delimWhitespace) {
      util.assert(
          config.delimiter == null,
          () =>
              'Delimiter should not be provided when delimWhitespace is true.');
      this.delimWhitespace = true;
      this.delimiter = ' ';
    } else {
      this.delimiter = config.delimiter ? config.delimiter : ',';
    }
  }

  /**
   * Returns an iterator over the parsed data rows. Column names are resolved
   * and validated first if that has not happened yet.
   */
  async iterator() {
    if (!this.columnNamesValidated) {
      await this.setColumnNames();
    }
    let lines = await this.base.iterator();
    if (this.hasHeader) {
      // We previously read the first line to get the columnNames.
      // Now that we're providing data, skip it.
      lines = lines.skip(1);
    }
    return lines.map(x => this.makeDataElement(x));
  }

  /**
   * Converts one line of text into a data element.
   *
   * @param line A single row of the CSV input.
   * @returns A dict of features, or `{xs: features, ys: labels}` when at
   *     least one column of this row was stored as a label.
   * @throws Error if a required (or label) column is empty and has no
   *     default, or if the row has the wrong number of fields.
   */
  makeDataElement(line) {
    const values = this.parseRow(line);
    const features = {};
    const labels = {};
    for (let i = 0; i < this.fullColumnNames.length; i++) {
      const key = this.fullColumnNames[i];
      // Only own properties of `columnConfigs` are configs; otherwise a
      // column named e.g. "constructor" would pick up an inherited member.
      const config =
          (this.columnConfigs &&
           Object.prototype.hasOwnProperty.call(this.columnConfigs, key)) ?
          this.columnConfigs[key] :
          null;
      if (this.configuredColumnsOnly && !config) {
        // This column is not selected.
        continue;
      }
      const value = values[i];
      let parsedValue = null;
      if (value === '') {
        // If default value is provided, use it. If default value is not
        // provided, set as undefined (or fail for required/label columns).
        if (config && config.default !== undefined) {
          parsedValue = config.default;
        } else if (config && (config.required || config.isLabel)) {
          throw new Error(
              `Required column ${key} is empty in this line: ${line}`);
        } else {
          parsedValue = undefined;
        }
      } else {
        // A value is present, so parse it based on type.
        const valueAsNum = Number(value);
        if (Number.isNaN(valueAsNum)) {
          // The value is a string; if this column is declared as boolean
          // in config, parse it as boolean, otherwise keep the string.
          if (config && config.dtype === 'bool') {
            parsedValue = this.getBoolean(value);
          } else {
            parsedValue = value;
          }
        } else if (!config || !config.dtype) {
          // If this value is a number and no type config is provided, return
          // it as number.
          parsedValue = valueAsNum;
        } else {
          // If this value is a number and data type is provided, parse it
          // according to provided data type.
          switch (config.dtype) {
            case 'int32':
              parsedValue = Math.floor(valueAsNum);
              break;
            case 'bool':
              parsedValue = this.getBoolean(value);
              break;
            case 'float32':
            default:
              parsedValue = valueAsNum;
          }
        }
      }
      // Route the value to labels or features.
      if (config && config.isLabel) {
        labels[key] = parsedValue;
      } else {
        features[key] = parsedValue;
      }
    }
    // If label exists, return an object of features and labels as {xs:features,
    // ys:labels}, otherwise return features only.
    if (Object.keys(labels).length === 0) {
      return features;
    }
    return { xs: features, ys: labels };
  }

  /**
   * Maps a raw field to a numeric boolean: `1` for `'1'` or a
   * case-insensitive `'true'`, `0` for anything else.
   */
  getBoolean(value) {
    return (value === '1' || value.toLowerCase() === 'true') ? 1 : 0;
  }

  // adapted from https://beta.observablehq.com/@mbostock/streaming-csv
  /**
   * Splits one line into its fields with a small state machine.
   *
   * The enclosing quotes of a quoted field are stripped; any quote characters
   * inside the field are kept exactly as they appear in `line`.
   *
   * @param line The row to split.
   * @param validateElementCount If true (default), throw when the number of
   *     fields differs from the number of column names.
   * @returns The raw string fields of the row, in order.
   */
  parseRow(line, validateElementCount = true) {
    const result = [];
    let readOffset = 0;
    const readLength = line.length;
    let currentState = STATE_OUT;
    // Goes through the line to parse quote.
    for (let i = 0; i < readLength; i++) {
      const char = line.charAt(i);
      switch (currentState) {
        // Before enter a new field
        case STATE_OUT:
          switch (char) {
            // Enter a quoted field
            case CODE_QUOTE:
              readOffset = i + 1;
              currentState = STATE_QUOTE;
              break;
            // Read an empty field
            case this.delimiter:
              readOffset = i + 1;
              // If delimiter is white space and configured to collapse
              // multiple white spaces, ignore this white space.
              if (this.delimiter === ' ' && this.delimWhitespace) {
                break;
              }
              result.push('');
              currentState = STATE_OUT;
              break;
            // Enter an unquoted field
            default:
              currentState = STATE_FIELD;
              readOffset = i;
              break;
          }
          break;
        // In an unquoted field
        case STATE_FIELD:
          switch (char) {
            // Exit an unquoted field, add it to result
            case this.delimiter:
              result.push(line.substring(readOffset, i));
              currentState = STATE_OUT;
              readOffset = i + 1;
              break;
            default:
          }
          break;
        // In a quoted field
        case STATE_QUOTE:
          switch (char) {
            // Read a quote after a quote
            case CODE_QUOTE:
              currentState = STATE_QUOTE_AFTER_QUOTE;
              break;
            default:
          }
          break;
        // This state means it's right after a second quote in a field
        case STATE_QUOTE_AFTER_QUOTE:
          switch (char) {
            // Finished a quoted field; `i - 1` drops the closing quote.
            case this.delimiter:
              result.push(line.substring(readOffset, i - 1));
              currentState = STATE_OUT;
              readOffset = i + 1;
              break;
            // Finished a quoted part in a quoted field
            case CODE_QUOTE:
              currentState = STATE_QUOTE;
              break;
            // In a quoted part in a quoted field
            default:
              currentState = STATE_WITHIN_QUOTE_IN_QUOTE;
              break;
          }
          break;
        case STATE_WITHIN_QUOTE_IN_QUOTE:
          switch (char) {
            // Exit a quoted part in a quoted field
            case CODE_QUOTE:
              currentState = STATE_QUOTE;
              break;
            default:
          }
          break;
        default:
      }
    }
    // Adds last item based on if it is quoted.
    if (currentState === STATE_QUOTE_AFTER_QUOTE) {
      result.push(line.substring(readOffset, readLength - 1));
    } else {
      result.push(line.substring(readOffset));
    }
    // Check if each row has the same number of elements as column names.
    if (validateElementCount && result.length !== this.fullColumnNames.length) {
      throw new Error(`Invalid row in csv file. Should have ${this.fullColumnNames.length} elements in a row, but got ${result}`);
    }
    return result;
  }
}
// TODO(soergel): add more basic datasets for parity with tf.data
// tf.data.FixedLengthRecordDataset()
// tf.data.TFRecordDataset()
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"csv_dataset.js","sourceRoot":"","sources":["../../../../../../tfjs-data/src/datasets/csv_dataset.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,OAAO,EAAkB,IAAI,EAAC,MAAM,uBAAuB,CAAC;AAC5D,OAAO,EAAC,OAAO,EAAC,MAAM,YAAY,CAAC;AAInC,OAAO,EAAC,eAAe,EAAC,MAAM,qBAAqB,CAAC;AAEpD,MAAM,UAAU,GAAG,GAAG,CAAC;AACvB,MAAM,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;AAChC,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;AACpC,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;AACpC,MAAM,uBAAuB,GAAG,MAAM,CAAC,iBAAiB,CAAC,CAAC;AAC1D,MAAM,2BAA2B,GAAG,MAAM,CAAC,cAAc,CAAC,CAAC;AAE3D;;;;;;;;;;;;GAYG;AACH,MAAM,OAAO,UAAW,SAAQ,OAAwB;IAUtD;;;;;;;;;OASG;IACH,KAAK,CAAC,WAAW;QACf,IAAI,CAAC,IAAI,CAAC,oBAAoB,EAAE;YAC9B,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;SAC7B;QACD,OAAO,IAAI,CAAC,qBAAqB,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC;YACjC,IAAI,CAAC,eAAe,CAAC;IAC3D,CAAC;IAED;;;;;;;OAOG;IACK,KAAK,CAAC,cAAc;QAC1B,MAAM,mBAAmB,GAAG,MAAM,IAAI,CAAC,mBAAmB,EAAE,CAAC;QAC7D,IAAI,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,mBAAmB,EAAE;YACjD,oEAAoE;YACpE,MAAM,IAAI,KAAK,CACX,2DAA2D,CAAC,CAAC;SAClE;aAAM,IAAI,IAAI,CAAC,eAAe,IAAI,mBAAmB,EAAE;YACtD,gDAAgD;YAChD,IAAI,CAAC,MAAM,CACP,mBAAmB,CAAC,MAAM,KAAK,IAAI,CAAC,eAAe,CAAC,MAAM,EAC1D,GAAG,EAAE,CAAC,sCAAsC;gBACxC,IAAI,CAAC,eAAe,CAAC,MAAM,CAAC,QAAQ,EAAE;gBACtC,2DAA2D;gBAC3D,QAAQ,GAAG,mBAAmB,CAAC,MAAM,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,CAAC;SAClE;QACD,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE;YACzB,IAAI,CAAC,eAAe,GAAG,mBAAmB,CAAC;SAC5C;QACD,6CAA6C;QAC7C,MAAM,MAAM,GAA4B,IAAI,CAAC,eAAe,CAAC,MAAM,CAC/D,CAAC,QAAiC,EAAE,IAAI,EAAE,EAAE;YAC1C,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC
,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;YAC3C,OAAO,QAAQ,CAAC;QAClB,CAAC,EACD,EAAE,CAAC,CAAC;QACR,MAAM,cAAc,GAChB,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAC7D,IAAI,CAAC,MAAM,CACP,cAAc,CAAC,MAAM,KAAK,CAAC,EAC3B,GAAG,EAAE,CAAC,gCAAgC,GAAG,cAAc,CAAC,QAAQ,EAAE,CAAC,CAAC;QACxE,oDAAoD;QACpD,IAAI,IAAI,CAAC,aAAa,EAAE;YACtB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,EAAE;gBACjD,MAAM,KAAK,GAAG,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;gBAChD,IAAI,KAAK,KAAK,CAAC,CAAC,EAAE;oBAChB,MAAM,IAAI,KAAK,CACX,WAAW,GAAG,GAAG;wBACjB,+DAA+D;wBAC/D,SAAS,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,CAAC;iBACzD;aACF;SACF;QACD,IAAI,CAAC,oBAAoB,GAAG,IAAI,CAAC;IACnC,CAAC;IAEO,KAAK,CAAC,mBAAmB;QAC/B,IAAI,IAAI,CAAC,SAAS,EAAE;YAClB,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACxC,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC;YACvC,IAAI,YAAY,CAAC,IAAI,EAAE;gBACrB,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;aACvD;YACD,MAAM,SAAS,GAAW,YAAY,CAAC,KAAK,CAAC;YAC7C,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;YAChD,OAAO,OAAO,CAAC;SAChB;aAAM;YACL,OAAO,IAAI,CAAC;SACb;IACH,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;OA+BG;IACH,YAA+B,KAAiB,EAAE,SAAqB;QACrE,KAAK,EAAE,CAAC;QADqB,UAAK,GAAL,KAAK,CAAY;QA9HxC,cAAS,GAAG,IAAI,CAAC;QACjB,oBAAe,GAAa,IAAI,CAAC;QACjC,yBAAoB,GAAG,KAAK,CAAC;QAC7B,kBAAa,GAAkC,IAAI,CAAC;QACpD,0BAAqB,GAAG,KAAK,CAAC;QAC9B,cAAS,GAAG,GAAG,CAAC;QAChB,oBAAe,GAAG,KAAK,CAAC;QA0H9B,IAAI,CAAC,IAAI,GAAG,IAAI,eAAe,CAAC,KAAK,CAAC,CAAC;QACvC,IAAI,CAAC,SAAS,EAAE;YACd,SAAS,GAAG,EAAE,CAAC;SAChB;QACD,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC,SAAS,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;QAC9D,IAAI,CAAC,eAAe,GAAG,SAAS,CAAC,WAAW,CAAC;QAC7C,IAAI,CAAC,aAAa,GAAG,SAAS,CAAC,aAAa,CAAC;QAC7C,IAAI,CAAC,qBAAqB,GAAG,SAAS,CAAC,qBAAqB,CAAC;QAC7D,IAAI,SAAS,CAAC,eAAe,EAAE;YAC7B,IAAI,CAAC,MAAM,CACP,SAAS,CAAC,SAAS,IAAI,IAAI,EAC3B,GAAG,EAAE,CACD,gEAAgE,CAAC,CAAC;YAC1E,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;YAC5B,IAAI,CAAC,SAAS,G
AAG,GAAG,CAAC;SACtB;aAAM;YACL,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC;SAClE;IACH,CAAC;IAED,KAAK,CAAC,QAAQ;QACZ,IAAI,CAAC,IAAI,CAAC,oBAAoB,EAAE;YAC9B,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;SAC7B;QACD,IAAI,KAAK,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;QACvC,IAAI,IAAI,CAAC,SAAS,EAAE;YAClB,4DAA4D;YAC5D,0CAA0C;YAC1C,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;SACvB;QACD,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC;IACjD,CAAC;IAED,eAAe,CAAC,IAAY;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,QAAQ,GAAqC,EAAE,CAAC;QACtD,MAAM,MAAM,GAAqC,EAAE,CAAC;QAEpD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;YACpD,MAAM,GAAG,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;YACpC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACnE,IAAI,IAAI,CAAC,qBAAqB,IAAI,CAAC,MAAM,EAAE;gBACzC,+BAA+B;gBAC/B,SAAS;aACV;iBAAM;gBACL,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;gBACxB,IAAI,WAAW,GAAG,IAAI,CAAC;gBACvB,IAAI,KAAK,KAAK,EAAE,EAAE;oBAChB,gEAAgE;oBAChE,8BAA8B;oBAC9B,IAAI,MAAM,IAAI,MAAM,CAAC,OAAO,KAAK,SAAS,EAAE;wBAC1C,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC;qBAC9B;yBAAM,IAAI,MAAM,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,OAAO,CAAC,EAAE;wBACxD,MAAM,IAAI,KAAK,CACX,mBAAmB,GAAG,2BAA2B,IAAI,EAAE,CAAC,CAAC;qBAC9D;yBAAM;wBACL,WAAW,GAAG,SAAS,CAAC;qBACzB;iBACF;qBAAM;oBACL,gDAAgD;oBAChD,MAAM,UAAU,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;oBACjC,IAAI,KAAK,CAAC,UAAU,CAAC,EAAE;wBACrB,+DAA+D;wBAC/D,kCAAkC;wBAClC,IAAI,MAAM,IAAI,MAAM,CAAC,KAAK,KAAK,MAAM,EAAE;4BACrC,WAAW,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;yBACtC;6BAAM;4BACL,sBAAsB;4BACtB,WAAW,GAAG,KAAK,CAAC;yBACrB;qBACF;yBAAM,IAAI,CAAC,MAAM,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE;wBACnC,mEAAmE;wBACnE,gBAAgB;wBAChB,WAAW,GAAG,UAAU,CAAC;qBAC1B;yBAAM;wBACL,gEAAgE;wBAChE,mCAAmC;wBACnC,QAAQ,MAAM,CAAC,KAAK,EAAE;4BACpB,KAAK,SAAS;gCACZ,WAAW,GAAG,UAAU,CAAC;gCACzB,MAAM;4BACR,KAAK,OAAO;gCACV,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,UAA
U,CAAC,CAAC;gCACrC,MAAM;4BACR,KAAK,MAAM;gCACT,WAAW,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;gCACrC,MAAM;4BACR;gCACE,WAAW,GAAG,UAAU,CAAC;yBAC5B;qBACF;iBACF;gBACD,iCAAiC;gBACjC,CAAC,MAAM,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC,CAAC;oBAC3B,QAAQ,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC;aAC1D;SACF;QACD,4EAA4E;QAC5E,8CAA8C;QAC9C,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE;YACpC,OAAO,QAAQ,CAAC;SAEjB;aAAM;YACL,OAAO,EAAC,EAAE,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAC,CAAC;SACnC;IACH,CAAC;IAEO,UAAU,CAAC,KAAa;QAC9B,IAAI,KAAK,KAAK,GAAG,IAAI,KAAK,CAAC,WAAW,EAAE,KAAK,MAAM,EAAE;YACnD,OAAO,CAAC,CAAC;SACV;aAAM;YACL,OAAO,CAAC,CAAC;SACV;IACH,CAAC;IAED,qEAAqE;IAC7D,QAAQ,CAAC,IAAY,EAAE,oBAAoB,GAAG,IAAI;QACxD,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC;QAC/B,IAAI,YAAY,GAAG,SAAS,CAAC;QAC7B,wCAAwC;QACxC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE;YACnC,QAAQ,YAAY,EAAE;gBACpB,2BAA2B;gBAC3B,KAAK,SAAS;oBACZ,QAAQ,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;wBACtB,uBAAuB;wBACvB,KAAK,UAAU;4BACb,UAAU,GAAG,CAAC,GAAG,CAAC,CAAC;4BACnB,YAAY,GAAG,WAAW,CAAC;4BAC3B,MAAM;wBACR,sBAAsB;wBACtB,KAAK,IAAI,CAAC,SAAS;4BACjB,UAAU,GAAG,CAAC,GAAG,CAAC,CAAC;4BACnB,yDAAyD;4BACzD,kDAAkD;4BAClD,IAAI,IAAI,CAAC,SAAS,KAAK,GAAG,IAAI,IAAI,CAAC,eAAe,EAAE;gCAClD,MAAM;6BACP;4BACD,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;4BAChB,YAAY,GAAG,SAAS,CAAC;4BACzB,MAAM;wBACR,0BAA0B;wBAC1B;4BACE,YAAY,GAAG,WAAW,CAAC;4BAC3B,UAAU,GAAG,CAAC,CAAC;4BACf,MAAM;qBACT;oBACD,MAAM;gBACR,uBAAuB;gBACvB,KAAK,WAAW;oBACd,QAAQ,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;wBACtB,2CAA2C;wBAC3C,KAAK,IAAI,CAAC,SAAS;4BACjB,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,CAAC;4BAC3C,YAAY,GAAG,SAAS,CAAC;4BACzB,UAAU,GAAG,CAAC,GAAG,CAAC,CAAC;4BACnB,MAAM;wBACR,QAAQ;qBACT;oBACD,MAAM;gBACR,oBAAoB;gBACpB,KAAK,WAAW;oBACd,QAAQ,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;wBACtB,6BAA6B;wBAC7B,KAAK,UAAU;4BACb,YAAY,GAAG,uBAAuB,CAAC;4BACvC,MAAM;wBACR,QAAQ;qBACT;oBACD,MAAM;gBACR,8DAA8D;gBA
C9D,KAAK,uBAAuB;oBAC1B,QAAQ,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;wBACtB,0BAA0B;wBAC1B,KAAK,IAAI,CAAC,SAAS;4BACjB,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;4BAC/C,YAAY,GAAG,SAAS,CAAC;4BACzB,UAAU,GAAG,CAAC,GAAG,CAAC,CAAC;4BACnB,MAAM;wBACR,2CAA2C;wBAC3C,KAAK,UAAU;4BACb,YAAY,GAAG,WAAW,CAAC;4BAC3B,MAAM;wBACR,qCAAqC;wBACrC;4BACE,YAAY,GAAG,2BAA2B,CAAC;4BAC3C,MAAM;qBACT;oBACD,MAAM;gBACR,KAAK,2BAA2B;oBAC9B,QAAQ,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;wBACtB,uCAAuC;wBACvC,KAAK,UAAU;4BACb,YAAY,GAAG,WAAW,CAAC;4BAC3B,MAAM;wBACR,QAAQ;qBACT;oBACD,MAAM;gBACR,QAAQ;aACT;SACF;QACD,2CAA2C;QAC3C,IAAI,YAAY,KAAK,uBAAuB,EAAE;YAC5C,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC;SACzD;aAAM;YACL,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC;SACzC;QACD,qEAAqE;QACrE,IAAI,oBAAoB,IAAI,MAAM,CAAC,MAAM,KAAK,IAAI,CAAC,eAAe,CAAC,MAAM,EAAE;YACzE,MAAM,IAAI,KAAK,CAAC,wCACZ,IAAI,CAAC,eAAe,CAAC,MAAM,+BAA+B,MAAM,EAAE,CAAC,CAAC;SACzE;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAED,iEAAiE;AACjE,qCAAqC;AACrC,4BAA4B","sourcesContent":["/**\n * @license\n * Copyright 2018 Google LLC. 
All Rights Reserved.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n * =============================================================================\n */\n\nimport {TensorContainer, util} from '@tensorflow/tfjs-core';\nimport {Dataset} from '../dataset';\nimport {DataSource} from '../datasource';\nimport {LazyIterator} from '../iterators/lazy_iterator';\nimport {ColumnConfig, CSVConfig} from '../types';\nimport {TextLineDataset} from './text_line_dataset';\n\nconst CODE_QUOTE = '\"';\nconst STATE_OUT = Symbol('out');\nconst STATE_FIELD = Symbol('field');\nconst STATE_QUOTE = Symbol('quote');\nconst STATE_QUOTE_AFTER_QUOTE = Symbol('quoteafterquote');\nconst STATE_WITHIN_QUOTE_IN_QUOTE = Symbol('quoteinquote');\n\n/**\n * Represents a potentially large collection of delimited text records.\n *\n * The produced `TensorContainer`s each contain one key-value pair for\n * every column of the table.  When a field is empty in the incoming data, the\n * resulting value is `undefined`, or throw error if it is required.  
Values\n * that can be parsed as numbers are emitted as type `number`, other values\n * are parsed as `string`.\n *\n * The results are not batched.\n *\n * @doc {heading: 'Data', subheading: 'Classes', namespace: 'data'}\n */\nexport class CSVDataset extends Dataset<TensorContainer> {\n  base: TextLineDataset;\n  private hasHeader = true;\n  private fullColumnNames: string[] = null;\n  private columnNamesValidated = false;\n  private columnConfigs: {[key: string]: ColumnConfig} = null;\n  private configuredColumnsOnly = false;\n  private delimiter = ',';\n  private delimWhitespace = false;\n\n  /**\n   * Returns column names of the csv dataset. If `configuredColumnsOnly` is\n   * true, return column names in `columnConfigs`. If `configuredColumnsOnly` is\n   * false and `columnNames` is provided, `columnNames`. If\n   * `configuredColumnsOnly` is false and `columnNames` is not provided, return\n   * all column names parsed from the csv file. For example usage please go to\n   * `tf.data.csv`.\n   *\n   * @doc {heading: 'Data', subheading: 'Classes'}\n   */\n  async columnNames() {\n    if (!this.columnNamesValidated) {\n      await this.setColumnNames();\n    }\n    return this.configuredColumnsOnly ? Object.keys(this.columnConfigs) :\n                                        this.fullColumnNames;\n  }\n\n  /* 1) If `columnNames` is provided as string[], use this string[] as output\n   * keys in corresponding order. The length must match the number of inferred\n   * columns if `hasHeader` is true .\n   * 2) If `columnNames` is not provided, parse header line as `columnNames` if\n   * hasHeader is true. 
If `hasHeader` is false, throw an error.\n   * 3) If `columnConfigs` is provided, all the keys in `columnConfigs` must\n   * exist in parsed `columnNames`.\n   */\n  private async setColumnNames() {\n    const columnNamesFromFile = await this.maybeReadHeaderLine();\n    if (!this.fullColumnNames && !columnNamesFromFile) {\n      // Throw an error if columnNames is not provided and no header line.\n      throw new Error(\n          'Column names must be provided if there is no header line.');\n    } else if (this.fullColumnNames && columnNamesFromFile) {\n      // Check provided columnNames match header line.\n      util.assert(\n          columnNamesFromFile.length === this.fullColumnNames.length,\n          () => 'The length of provided columnNames (' +\n              this.fullColumnNames.length.toString() +\n              ') does not match the length of the header line read from ' +\n              'file (' + columnNamesFromFile.length.toString() + ').');\n    }\n    if (!this.fullColumnNames) {\n      this.fullColumnNames = columnNamesFromFile;\n    }\n    // Check if there are duplicate column names.\n    const counts: {[key: string]: number} = this.fullColumnNames.reduce(\n        (countAcc: {[key: string]: number}, name) => {\n          countAcc[name] = (countAcc[name] + 1) || 1;\n          return countAcc;\n        },\n        {});\n    const duplicateNames =\n        Object.keys(counts).filter((name) => (counts[name] > 1));\n    util.assert(\n        duplicateNames.length === 0,\n        () => 'Duplicate column names found: ' + duplicateNames.toString());\n    // Check if keys in columnConfigs match columnNames.\n    if (this.columnConfigs) {\n      for (const key of Object.keys(this.columnConfigs)) {\n        const index = this.fullColumnNames.indexOf(key);\n        if (index === -1) {\n          throw new Error(\n              'The key \"' + key +\n              '\" provided in columnConfigs does not match any of the column ' +\n              'names (' + 
this.fullColumnNames.toString() + ').');\n        }\n      }\n    }\n    this.columnNamesValidated = true;\n  }\n\n  private async maybeReadHeaderLine() {\n    if (this.hasHeader) {\n      const iter = await this.base.iterator();\n      const firstElement = await iter.next();\n      if (firstElement.done) {\n        throw new Error('No data was found for CSV parsing.');\n      }\n      const firstLine: string = firstElement.value;\n      const headers = this.parseRow(firstLine, false);\n      return headers;\n    } else {\n      return null;\n    }\n  }\n\n  /**\n   * Create a `CSVDataset`.\n   *\n   * @param input A `DataSource` providing a chunked, UTF8-encoded byte stream.\n   * @param csvConfig (Optional) A CSVConfig object that contains configurations\n   *     of reading and decoding from CSV file(s).\n   *\n   *     hasHeader: (Optional) A boolean value that indicates whether the first\n   *     row of provided CSV file is a header line with column names, and should\n   *     not be included in the data. Defaults to `true`.\n   *\n   *     columnNames: (Optional) A list of strings that corresponds to\n   *     the CSV column names, in order. If provided, it ignores the column\n   *     names inferred from the header row. If not provided, infers the column\n   *     names from the first row of the records. If hasHeader is false and\n   *     columnNames is not provided, this method throws an error.\n   *\n   *     columnConfigs: (Optional) A dictionary whose key is column names, value\n   *     is an object stating if this column is required, column's data type,\n   *     default value, and if this column is label. If provided, keys must\n   *     correspond to names provided in columnNames or inferred from the file\n   *     header lines. If isLabel is true any column, returns an array of two\n   *     items: the first item is a dict of features key/value pairs, the second\n   *     item is a dict of labels key/value pairs. 
If no feature is marked as\n   *     label, returns a dict of features only.\n   *\n   *     configuredColumnsOnly (Optional) If true, only columns provided in\n   *     columnConfigs will be parsed and provided during iteration.\n   *\n   *     delimiter (Optional) The string used to parse each line of the input\n   *     file. Defaults to `,`.\n   */\n  constructor(protected readonly input: DataSource, csvConfig?: CSVConfig) {\n    super();\n    this.base = new TextLineDataset(input);\n    if (!csvConfig) {\n      csvConfig = {};\n    }\n    this.hasHeader = csvConfig.hasHeader === false ? false : true;\n    this.fullColumnNames = csvConfig.columnNames;\n    this.columnConfigs = csvConfig.columnConfigs;\n    this.configuredColumnsOnly = csvConfig.configuredColumnsOnly;\n    if (csvConfig.delimWhitespace) {\n      util.assert(\n          csvConfig.delimiter == null,\n          () =>\n              'Delimiter should not be provided when delimWhitespace is true.');\n      this.delimWhitespace = true;\n      this.delimiter = ' ';\n    } else {\n      this.delimiter = csvConfig.delimiter ? csvConfig.delimiter : ',';\n    }\n  }\n\n  async iterator(): Promise<LazyIterator<TensorContainer>> {\n    if (!this.columnNamesValidated) {\n      await this.setColumnNames();\n    }\n    let lines = await this.base.iterator();\n    if (this.hasHeader) {\n      // We previously read the first line to get the columnNames.\n      // Now that we're providing data, skip it.\n      lines = lines.skip(1);\n    }\n    return lines.map(x => this.makeDataElement(x));\n  }\n\n  makeDataElement(line: string): TensorContainer {\n    const values = this.parseRow(line);\n    const features: {[key: string]: TensorContainer} = {};\n    const labels: {[key: string]: TensorContainer} = {};\n\n    for (let i = 0; i < this.fullColumnNames.length; i++) {\n      const key = this.fullColumnNames[i];\n      const config = this.columnConfigs ? 
this.columnConfigs[key] : null;\n      if (this.configuredColumnsOnly && !config) {\n        // This column is not selected.\n        continue;\n      } else {\n        const value = values[i];\n        let parsedValue = null;\n        if (value === '') {\n          // If default value is provided, use it. If default value is not\n          // provided, set as undefined.\n          if (config && config.default !== undefined) {\n            parsedValue = config.default;\n          } else if (config && (config.required || config.isLabel)) {\n            throw new Error(\n                `Required column ${key} is empty in this line: ${line}`);\n          } else {\n            parsedValue = undefined;\n          }\n        } else {\n          // A value is present, so parse it based on type\n          const valueAsNum = Number(value);\n          if (isNaN(valueAsNum)) {\n            // The value is a string and this column is declared as boolean\n            // in config, parse it as boolean.\n            if (config && config.dtype === 'bool') {\n              parsedValue = this.getBoolean(value);\n            } else {\n              // Set value as string\n              parsedValue = value;\n            }\n          } else if (!config || !config.dtype) {\n            // If this value is a number and no type config is provided, return\n            // it as number.\n            parsedValue = valueAsNum;\n          } else {\n            // If this value is a number and data type is provided, parse it\n            // according to provided data type.\n            switch (config.dtype) {\n              case 'float32':\n                parsedValue = valueAsNum;\n                break;\n              case 'int32':\n                parsedValue = Math.floor(valueAsNum);\n                break;\n              case 'bool':\n                parsedValue = this.getBoolean(value);\n                break;\n              default:\n                parsedValue = valueAsNum;\n            
}\n          }\n        }\n        // Check if this column is label.\n        (config && config.isLabel) ? labels[key] = parsedValue :\n                                     features[key] = parsedValue;\n      }\n    }\n    // If label exists, return an object of features and labels as {xs:features,\n    // ys:labels}, otherwise return features only.\n    if (Object.keys(labels).length === 0) {\n      return features;\n\n    } else {\n      return {xs: features, ys: labels};\n    }\n  }\n\n  private getBoolean(value: string): number {\n    if (value === '1' || value.toLowerCase() === 'true') {\n      return 1;\n    } else {\n      return 0;\n    }\n  }\n\n  // adapted from https://beta.observablehq.com/@mbostock/streaming-csv\n  private parseRow(line: string, validateElementCount = true): string[] {\n    const result: string[] = [];\n    let readOffset = 0;\n    const readLength = line.length;\n    let currentState = STATE_OUT;\n    // Goes through the line to parse quote.\n    for (let i = 0; i < readLength; i++) {\n      switch (currentState) {\n        // Before enter a new field\n        case STATE_OUT:\n          switch (line.charAt(i)) {\n            // Enter a quoted field\n            case CODE_QUOTE:\n              readOffset = i + 1;\n              currentState = STATE_QUOTE;\n              break;\n            // Read an empty field\n            case this.delimiter:\n              readOffset = i + 1;\n              // If delimiter is white space and configured to collapse\n              // multiple white spaces, ignore this white space.\n              if (this.delimiter === ' ' && this.delimWhitespace) {\n                break;\n              }\n              result.push('');\n              currentState = STATE_OUT;\n              break;\n            // Enter an unquoted field\n            default:\n              currentState = STATE_FIELD;\n              readOffset = i;\n              break;\n          }\n          break;\n        // In an unquoted 
field\n        case STATE_FIELD:\n          switch (line.charAt(i)) {\n            // Exit an unquoted field, add it to result\n            case this.delimiter:\n              result.push(line.substring(readOffset, i));\n              currentState = STATE_OUT;\n              readOffset = i + 1;\n              break;\n            default:\n          }\n          break;\n        // In a quoted field\n        case STATE_QUOTE:\n          switch (line.charAt(i)) {\n            // Read a quote after a quote\n            case CODE_QUOTE:\n              currentState = STATE_QUOTE_AFTER_QUOTE;\n              break;\n            default:\n          }\n          break;\n        // This state means it's right after a second quote in a field\n        case STATE_QUOTE_AFTER_QUOTE:\n          switch (line.charAt(i)) {\n            // Finished a quoted field\n            case this.delimiter:\n              result.push(line.substring(readOffset, i - 1));\n              currentState = STATE_OUT;\n              readOffset = i + 1;\n              break;\n            // Finished a quoted part in a quoted field\n            case CODE_QUOTE:\n              currentState = STATE_QUOTE;\n              break;\n            // In a quoted part in a quoted field\n            default:\n              currentState = STATE_WITHIN_QUOTE_IN_QUOTE;\n              break;\n          }\n          break;\n        case STATE_WITHIN_QUOTE_IN_QUOTE:\n          switch (line.charAt(i)) {\n            // Exit a quoted part in a quoted field\n            case CODE_QUOTE:\n              currentState = STATE_QUOTE;\n              break;\n            default:\n          }\n          break;\n        default:\n      }\n    }\n    // Adds last item based on if it is quoted.\n    if (currentState === STATE_QUOTE_AFTER_QUOTE) {\n      result.push(line.substring(readOffset, readLength - 1));\n    } else {\n      result.push(line.substring(readOffset));\n    }\n    // Check if each row has the same number of elements as 
column names.\n    if (validateElementCount && result.length !== this.fullColumnNames.length) {\n      throw new Error(`Invalid row in csv file. Should have ${\n          this.fullColumnNames.length} elements in a row, but got ${result}`);\n    }\n    return result;\n  }\n}\n\n// TODO(soergel): add more basic datasets for parity with tf.data\n// tf.data.FixedLengthRecordDataset()\n// tf.data.TFRecordDataset()\n"]}