@tensorflow/tfjs-data (dist/dataset.d.ts)
TensorFlow Data API in JavaScript
/**
 * @license
 * Copyright 2018 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * =============================================================================
 */
/// <amd-module name="@tensorflow/tfjs-data/dist/dataset" />
import * as tf from '@tensorflow/tfjs-core';
import { TensorContainer } from '@tensorflow/tfjs-core';
import { LazyIterator } from './iterators/lazy_iterator';
import { Container } from './types';
/**
 * A nested structure of Datasets, used as the input to zip().
 */
export type DatasetContainer = Container<Dataset<TensorContainer>>;
/**
 * Represents a potentially large list of independent data elements (typically
 * 'samples' or 'examples').
 *
 * A 'data example' may be a primitive, an array, a map from string keys to
 * values, or any nested structure of these.
 *
 * A `Dataset` represents an ordered collection of elements, together with a
 * chain of transformations to be performed on those elements. Each
 * transformation is a method of `Dataset` that returns another `Dataset`, so
 * these may be chained, e.g.
 * `const processedDataset = rawDataset.filter(...).map(...).batch(...)`.
 *
 * Data loading and transformation is done in a lazy, streaming fashion. The
 * dataset may be iterated over multiple times; each iteration starts the data
 * loading anew and recapitulates the transformations.
 *
 * A `Dataset` is typically processed as a stream of unbatched examples -- i.e.,
 * its transformations are applied one example at a time. Batching produces a
 * new `Dataset` where each element is a batch. Batching should usually come
 * last in a pipeline, because data transformations are easier to express on a
 * per-example basis than on a per-batch basis.
 *
 * The following code examples call `await dataset.forEachAsync(...)` to
 * iterate once over the entire dataset in order to print out the data.
 *
 * @doc {heading: 'Data', subheading: 'Classes', namespace: 'data'}
 */
export declare abstract class Dataset<T extends tf.TensorContainer> {
    abstract iterator(): Promise<LazyIterator<T>>;
    /** The number of elements in this dataset, where known. */
    readonly size: number;
    /**
     * Groups elements into batches.
     *
     * It is assumed that each of the incoming dataset elements has the same
     * structure -- i.e. the same set of keys at each location in an object
     * hierarchy. For each key, the resulting `Dataset` provides a batched
     * element collecting all of the incoming values for that key.
     *
     * * Incoming primitives are grouped into a 1-D Tensor.
     * * Incoming Tensors are grouped into a new Tensor where the 0th axis is
     *   the batch dimension.
     * * Incoming arrays are converted to Tensor and then batched.
     * * A nested array is interpreted as an n-D Tensor, so the batched result
     *   has n+1 dimensions.
     * * An array that cannot be converted to Tensor produces an error.
     *
     * If an array should not be batched as a unit, it should first be
     * converted to an object with integer keys, as in the sketch below.
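     *
     * For instance (an illustrative sketch, not an example from the original
     * docs): replacing an inner array `[a, b]` with the object `{0: a, 1: b}`
     * makes batch() collect the two entries separately instead of converting
     * the pair to a single Tensor:
     * ```js
     * const d = tf.data.array([{xy: {0: 1, 1: 10}}, {xy: {0: 2, 1: 20}}])
     *     .batch(2);
     * await d.forEachAsync(e => {
     *   e.xy[0].print();  // batched 1-D Tensor of the first entries
     *   e.xy[1].print();  // batched 1-D Tensor of the second entries
     * });
     * ```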
     *
     * Here are a few examples:
     *
     * Batch a dataset of numbers:
     * ```js
     * const a = tf.data.array([1, 2, 3, 4, 5, 6, 7, 8]).batch(4);
     * await a.forEachAsync(e => e.print());
     * ```
     *
     * Batch a dataset of arrays:
     * ```js
     * const b = tf.data.array([[1], [2], [3], [4], [5], [6], [7], [8]])
     *     .batch(4);
     * await b.forEachAsync(e => e.print());
     * ```
     *
     * Batch a dataset of objects:
     * ```js
     * const c = tf.data.array([{a: 1, b: 11}, {a: 2, b: 12}, {a: 3, b: 13},
     *     {a: 4, b: 14}, {a: 5, b: 15}, {a: 6, b: 16}, {a: 7, b: 17},
     *     {a: 8, b: 18}]).batch(4);
     * await c.forEachAsync(e => {
     *   console.log('{');
     *   for (const key in e) {
     *     console.log(key + ':');
     *     e[key].print();
     *   }
     *   console.log('}');
     * });
     * ```
     *
     * @param batchSize The number of elements desired per batch.
     * @param smallLastBatch Whether to emit the final batch when it has fewer
     *   than batchSize elements. Default true.
     * @returns A `Dataset`, from which a stream of batches can be obtained.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    batch(batchSize: number, smallLastBatch?: boolean): Dataset<tf.TensorContainer>;
    /**
     * Concatenates this `Dataset` with another.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3]);
     * const b = tf.data.array([4, 5, 6]);
     * const c = a.concatenate(b);
     * await c.forEachAsync(e => console.log(e));
     * ```
     *
     * @param dataset A `Dataset` to be concatenated onto this one.
     * @returns A `Dataset`.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    concatenate(dataset: Dataset<T>): Dataset<T>;
    /**
     * Filters this dataset according to `predicate`.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
     *     .filter(x => x % 2 === 0);
     * await a.forEachAsync(e => console.log(e));
     * ```
     *
     * @param predicate A function mapping a dataset element to a boolean.
     *
     * @returns A `Dataset` of elements for which the predicate was true.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    filter(predicate: (value: T) => boolean): Dataset<T>;
    /**
     * Apply a function to every element of the dataset.
     *
     * After the function is applied to a dataset element, any Tensors contained
     * within that element are disposed.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3]);
     * await a.forEachAsync(e => console.log(e));
     * ```
     *
     * @param f A function to apply to each dataset element.
     * @returns A `Promise` that resolves after all elements have been processed.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    forEachAsync(f: (input: T) => void): Promise<void>;
    /**
     * Maps this dataset through a 1-to-1 transform.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3]).map(x => x * x);
     * await a.forEachAsync(e => console.log(e));
     * ```
     *
     * @param transform A function mapping a dataset element to a transformed
     *   dataset element.
     *
     * @returns A `Dataset` of transformed elements.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    map<O extends tf.TensorContainer>(transform: (value: T) => O): Dataset<O>;
    /**
     * Maps this dataset through an async 1-to-1 transform.
     *
     * ```js
     * const a =
     *     tf.data.array([1, 2, 3]).mapAsync(x => new Promise(function(resolve) {
     *       setTimeout(() => {
     *         resolve(x * x);
     *       }, Math.random() * 1000 + 500);
     *     }));
     * console.log(await a.toArray());
     * ```
     *
     * @param transform A function mapping a dataset element to a `Promise` for a
     *   transformed dataset element. This transform is responsible for disposing
     *   any intermediate `Tensor`s, e.g. by wrapping its computation in
     *   `tf.tidy()`; that cannot be automated here (as it is in the synchronous
     *   `map()` case).
     *
     * @returns A `Dataset` of transformed elements.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    mapAsync<O extends tf.TensorContainer>(transform: (value: T) => Promise<O>): Dataset<O>;
    /**
     * Creates a `Dataset` that prefetches elements from this dataset.
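     *
     * For example (an illustrative sketch, not an example from the original
     * docs), prefetching lets upcoming elements be prepared while the current
     * one is being consumed:
     * ```js
     * const a = tf.data.array([1, 2, 3, 4, 5, 6]).prefetch(3);
     * await a.forEachAsync(e => console.log(e));
     * ```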
     *
     * @param bufferSize: An integer specifying the number of elements to be
     *   prefetched.
     * @returns A `Dataset`.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    prefetch(bufferSize: number): Dataset<T>;
    /**
     * Repeats this dataset `count` times.
     *
     * NOTE: If this dataset is a function of global state (e.g. a random number
     * generator), then different repetitions may produce different elements.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3]).repeat(3);
     * await a.forEachAsync(e => console.log(e));
     * ```
     *
     * @param count: (Optional) An integer, representing the number of times
     *   the dataset should be repeated. The default behavior (if `count` is
     *   `undefined` or negative) is for the dataset to be repeated indefinitely.
     * @returns A `Dataset`.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    repeat(count?: number): Dataset<T>;
    /**
     * Creates a `Dataset` that skips `count` initial elements from this
     * dataset.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3, 4, 5, 6]).skip(3);
     * await a.forEachAsync(e => console.log(e));
     * ```
     *
     * @param count: The number of elements of this dataset that should be
     *   skipped to form the new dataset. If `count` is greater than the size of
     *   this dataset, the new dataset will contain no elements. If `count`
     *   is `undefined` or negative, skips the entire dataset.
     *
     * @returns A `Dataset`.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    skip(count: number): Dataset<T>;
    static readonly MAX_BUFFER_SIZE = 10000;
    /**
     * Pseudorandomly shuffles the elements of this dataset. This is done in a
     * streaming manner, by sampling from a given number of prefetched elements.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3, 4, 5, 6]).shuffle(3);
     * await a.forEachAsync(e => console.log(e));
     * ```
     *
     * @param bufferSize: An integer specifying the number of elements from this
     *   dataset from which the new dataset will sample.
     * @param seed: (Optional) A string seed that will be used to create the
     *   distribution.
     * @param reshuffleEachIteration: (Optional) A boolean, which if true
     *   indicates that the dataset should be pseudorandomly reshuffled each time
     *   it is iterated over. If false, elements will be returned in the same
     *   shuffled order on each iteration. (Defaults to `true`.)
     * @returns A `Dataset`.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    shuffle(bufferSize: number, seed?: string, reshuffleEachIteration?: boolean): Dataset<T>;
    /**
     * Creates a `Dataset` with at most `count` initial elements from this
     * dataset.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3, 4, 5, 6]).take(3);
     * await a.forEachAsync(e => console.log(e));
     * ```
     *
     * @param count: The number of elements of this dataset that should be taken
     *   to form the new dataset. If `count` is `undefined` or negative, or if
     *   `count` is greater than the size of this dataset, the new dataset will
     *   contain all elements of this dataset.
     * @returns A `Dataset`.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    take(count: number): Dataset<T>;
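    // A common pattern (an illustrative note, not part of the shipped file):
    // skip() and take() compose to split one dataset into two disjoint parts,
    // e.g. for a train/validation split of a hypothetical dataset `ds`:
    //
    //   const train = ds.take(800);
    //   const val = ds.skip(800);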
    /**
     * Collect all elements of this dataset into an array.
     *
     * Obviously this will succeed only for small datasets that fit in memory.
     * Useful for testing and generally should be avoided if possible.
     *
     * ```js
     * const a = tf.data.array([1, 2, 3, 4, 5, 6]);
     * console.log(await a.toArray());
     * ```
     *
     * @returns A Promise for an array of elements, which will resolve
     *   when a new stream has been obtained and fully consumed.
     *
     * @doc {heading: 'Data', subheading: 'Classes'}
     */
    toArray(): Promise<T[]>;
    /**
     * Collect all elements of this dataset into an array with prefetching 100
     * elements. This is useful for testing, because the prefetch changes the
     * order in which the Promises are resolved along the processing pipeline.
     * This may help expose bugs where results are dependent on the order of
     * Promise resolution rather than on the logical order of the stream (i.e.,
     * due to hidden mutable state).
     *
     * @returns A Promise for an array of elements, which will resolve
     *   when a new stream has been obtained and fully consumed.
     */
    toArrayForTest(): Promise<T[]>;
}
/**
 * Create a `Dataset` defined by a provided iterator() function.
 *
 * ```js
 * let i = -1;
 * const func = () =>
 *     ++i < 5 ? {value: i, done: false} : {value: null, done: true};
 * const iter = tf.data.iteratorFromFunction(func);
 * const ds = tf.data.datasetFromIteratorFn(async () => iter);
 * await ds.forEachAsync(e => console.log(e));
 * ```
 */
export declare function datasetFromIteratorFn<T extends tf.TensorContainer>(iteratorFn: () => Promise<LazyIterator<T>>, size?: number): Dataset<T>;
/**
 * Create a `Dataset` from an array of elements.
 *
 * Create a Dataset from an array of objects:
 * ```js
 * const a = tf.data.array([{'item': 1}, {'item': 2}, {'item': 3}]);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * Create a Dataset from an array of numbers:
 * ```js
 * const a = tf.data.array([4, 5, 6]);
 * await a.forEachAsync(e => console.log(e));
 * ```
 * @param items An array of elements that will be parsed as items in a dataset.
 *
 * @doc {heading: 'Data', subheading: 'Creation', namespace: 'data'}
 */
export declare function array<T extends tf.TensorContainer>(items: T[]): Dataset<T>;
/**
 * Create a `Dataset` by zipping together an array, dict, or nested
 * structure of `Dataset`s (and perhaps additional constants).
 *
 * The underlying datasets must provide elements in a consistent order such
 * that they correspond.
 *
 * The number of elements in the resulting dataset is the same as the size of
 * the smallest dataset in `datasets`.
 *
 * The nested structure of the `datasets` argument determines the
 * structure of elements in the resulting iterator.
 *
 * Note this means that, given an array of two datasets that produce dict
 * elements, the result is a dataset that produces elements that are arrays
 * of two dicts:
 *
 * Zip an array of datasets:
 * ```js
 * console.log('Zip two datasets of objects:');
 * const ds1 = tf.data.array([{a: 1}, {a: 2}, {a: 3}]);
 * const ds2 = tf.data.array([{b: 4}, {b: 5}, {b: 6}]);
 * const ds3 = tf.data.zip([ds1, ds2]);
 * await ds3.forEachAsync(e => console.log(JSON.stringify(e)));
 *
 * // If the goal is to merge the dicts in order to produce elements like
 * // {a: ..., b: ...}, this requires a second step such as:
 * console.log('Merge the objects:');
 * const ds4 = ds3.map(x => {return {a: x[0].a, b: x[1].b}});
 * await ds4.forEachAsync(e => console.log(e));
 * ```
 *
 * Zip a dict of datasets:
 * ```js
 * const a = tf.data.array([{a: 1}, {a: 2}, {a: 3}]);
 * const b = tf.data.array([{b: 4}, {b: 5}, {b: 6}]);
 * const c = tf.data.zip({c: a, d: b});
 * await c.forEachAsync(e => console.log(JSON.stringify(e)));
 * ```
 *
 * @doc {heading: 'Data', subheading: 'Operations', namespace: 'data'}
 */
export declare function zip<O extends tf.TensorContainer>(datasets: DatasetContainer): Dataset<O>;
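// A minimal end-to-end sketch (illustrative only, not part of the shipped
// declaration file), chaining the operators declared above into a typical
// input pipeline. Note that batch() comes last, as the class docs recommend:
//
//   const ds = tf.data.array([1, 2, 3, 4, 5, 6, 7, 8])
//       .map(x => x * 2)   // per-example transform
//       .shuffle(4)        // sample from a 4-element shuffle buffer
//       .batch(2)          // group into batches of 2
//       .prefetch(1);      // prepare the next batch while one is consumed
//   await ds.forEachAsync(e => e.print());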