
@tensorflow/tfjs-layers

TensorFlow layers API in JavaScript

/**
 * @license
 * Copyright 2023 Google LLC.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
/// <amd-module name="@tensorflow/tfjs-layers/dist/layers/nlp/models/gpt2/gpt2_preprocessor" />
/**
 * GPT-2 preprocessor layer.
 */
import { NamedTensorMap, Tensor, serialization } from '@tensorflow/tfjs-core';
import { LayerArgs } from '../../../../engine/topology';
import { Preprocessor } from '../preprocessor';
import { GPT2Tokenizer } from './gpt2_tokenizer';
import { StartEndPacker } from '../../preprocessing/start_end_packer';
export declare interface GPT2PreprocessorArgs extends LayerArgs {
    /**
     * A GPT2Tokenizer instance.
     */
    tokenizer: GPT2Tokenizer;
    /**
     * The length of the packed inputs.
     * Defaults to 1024.
     */
    sequenceLength?: number;
    /**
     * If `true`, the preprocessor will prepend the tokenizer start token to
     * each input sequence.
     * Defaults to `true`.
     */
    addStartToken?: boolean;
    /**
     * If `true`, the preprocessor will append the tokenizer end token to each
     * input sequence.
     * Defaults to `true`.
     */
    addEndToken?: boolean;
}
export declare interface GPT2PreprocessorOptions {
    /**
     * Any label data. Will be passed through unaltered.
     */
    y?: Tensor;
    /**
     * Any label weight data. Will be passed through unaltered.
     */
    sampleWeight?: Tensor;
    /**
     * Pass to override the configured `sequenceLength` of the layer.
     */
    sequenceLength?: number;
}
export declare function packXYSampleWeight(x: NamedTensorMap, y?: Tensor, sampleWeight?: Tensor): NamedTensorMap | [NamedTensorMap, Tensor] | [NamedTensorMap, Tensor, Tensor];
/**
 * GPT2 preprocessing layer which tokenizes and packs inputs.
 *
 * This preprocessing layer will do 2 things:
 *
 * - Tokenize the inputs using the `tokenizer`.
 * - Construct a dictionary with keys `"tokenIds"` and `"paddingMask"` that
 *   can be passed directly to a `GPT2Backbone`.
 *
 * The call method of this layer accepts three pieces of data: `x`, `y`, and
 * `sampleWeight` (`y` and `sampleWeight` are passed through the options
 * object). `x` can be a string or tensor representing a single segment, a
 * list of strings representing a batch of single segments, or a list of
 * tensors representing multiple segments to be packed together. `y` and
 * `sampleWeight` are both optional, can have any format, and will be passed
 * through unaltered.
 *
 * `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is
 * mainly used for generation tasks. For tasks having multi-segment inputs
 * like "glue/mnli", please use a model designed for classification purposes
 * such as BERT or RoBERTa.
 *
 * Examples:
 *
 * Directly calling the layer on data.
 * ```js
 * const features = ['a quick fox.', 'a fox quick.'];
 * const vocabulary =
 *     new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);
 * const merges =
 *     ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];
 * const tokenizer = new GPT2Tokenizer({vocabulary, merges});
 *
 * const preprocessor = new GPT2Preprocessor({tokenizer});
 * preprocessor.call(tensor(['the quick brown fox jumped.']))[0].print();
 * ```
 */
export declare class GPT2Preprocessor extends Preprocessor {
    /** @nocollapse */
    static className: string;
    protected readonly sequenceLength: number;
    protected readonly addStartToken: boolean;
    protected readonly addEndToken: boolean;
    protected readonly packer: StartEndPacker;
    constructor(args: GPT2PreprocessorArgs);
    getConfig(): serialization.ConfigDict;
    call(inputs: Tensor | Tensor[], kwargs: GPT2PreprocessorOptions): Tensor | Tensor[];
    private callAndReturnPaddingMask;
    /**
     * Calls the layer and returns extra information like the paddingMask used
     * to pack the sequence, the label data, and the sample weights used.
     */
    callAndPackArgs(inputs: Tensor | Tensor[], kwargs: GPT2PreprocessorOptions): NamedTensorMap | [NamedTensorMap, Tensor] | [NamedTensorMap, Tensor, Tensor];
    static tokenizerCls<T extends serialization.Serializable>(cls: serialization.SerializableConstructor<T>): typeof GPT2Tokenizer;
}
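Below is a minimal usage sketch based only on the declarations above. The deep import paths mirror the `amd-module` name in this file and are an assumption (a published build may expose these classes differently), and the order of the returned tensors is inferred from the class-level example.

```js
// Sketch only: these import paths are assumed from the amd-module name above.
import { tensor } from '@tensorflow/tfjs-core';
import { GPT2Tokenizer } from '@tensorflow/tfjs-layers/dist/layers/nlp/models/gpt2/gpt2_tokenizer';
import { GPT2Preprocessor } from '@tensorflow/tfjs-layers/dist/layers/nlp/models/gpt2/gpt2_preprocessor';

// Toy vocabulary and merge rules, copied from the class-level example.
const vocabulary =
    new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);
const merges =
    ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];
const tokenizer = new GPT2Tokenizer({vocabulary, merges});

// sequenceLength, addStartToken and addEndToken are optional and default to
// 1024, true and true respectively (see GPT2PreprocessorArgs).
const preprocessor = new GPT2Preprocessor({tokenizer, sequenceLength: 8});

// call() tokenizes and packs the inputs; the options object can also carry a
// per-call sequenceLength override plus y / sampleWeight, which pass through
// unaltered. Per the class-level example, the packed token ids come first.
const outputs = preprocessor.call(tensor(['a quick fox.']), {});
outputs[0].print();

// callAndPackArgs() instead returns a NamedTensorMap keyed by 'tokenIds' and
// 'paddingMask' (plus the label and sample-weight tensors when supplied),
// which can be fed directly to a GPT2Backbone.
const packed = preprocessor.callAndPackArgs(tensor(['a quick fox.']), {});
```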