/**
* @license
* Copyright 2023 Google LLC.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================================
*/
/// <amd-module name="@tensorflow/tfjs-layers/dist/layers/nlp/models/gpt2/gpt2_preprocessor" />
/**
* GPT-2 preprocessor layer.
*/
import { NamedTensorMap, Tensor, serialization } from '@tensorflow/tfjs-core';
import { LayerArgs } from '../../../../engine/topology';
import { Preprocessor } from '../preprocessor';
import { GPT2Tokenizer } from './gpt2_tokenizer';
import { StartEndPacker } from '../../preprocessing/start_end_packer';
export declare interface GPT2PreprocessorArgs extends LayerArgs {
/**
* A GPT2Tokenizer instance.
*/
tokenizer: GPT2Tokenizer;
/**
* The length of the packed inputs.
* Defaults to 1024.
*/
sequenceLength?: number;
/**
* If `true`, the preprocessor will prepend the tokenizer start token to each
* input sequence.
* Defaults to `true`.
*/
addStartToken?: boolean;
/**
* If `true`, the preprocessor will append the tokenizer end token to each
* input sequence.
* Defaults to `true`.
*/
addEndToken?: boolean;
}
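/*
 * Construction sketch (illustrative, not from the library docs): assumes a
 * `tokenizer` built as in the `GPT2Preprocessor` docstring further below.
 *
 *    const preprocessor = new GPT2Preprocessor({
 *      tokenizer,             // a GPT2Tokenizer instance
 *      sequenceLength: 128,   // pack every input to 128 tokens
 *      addStartToken: true,   // prepend the tokenizer start token
 *      addEndToken: true,     // append the tokenizer end token
 *    });
 */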
export declare interface GPT2PreprocessorOptions {
/**
* Any label data. Will be passed through unaltered.
*/
y?: Tensor;
/**
* Any label weight data. Will be passed through unaltered.
*/
sampleWeight?: Tensor;
/**
* Pass to override the configured `sequenceLength` of the layer.
*/
sequenceLength?: number;
}
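/*
 * Per-call options sketch (assumes a `preprocessor` built as in the class
 * docstring below and an input tensor `x`): `sequenceLength` overrides the
 * layer's configured value for this call only.
 *
 *    const packed = preprocessor.call(x, {sequenceLength: 64});
 */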
export declare function packXYSampleWeight(x: NamedTensorMap, y?: Tensor, sampleWeight?: Tensor): NamedTensorMap | [NamedTensorMap, Tensor] | [NamedTensorMap, Tensor, Tensor];
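/*
 * Judging by the declared return type, the shape of the result follows the
 * arguments supplied. Sketch (assumes `tokenIds`, `paddingMask`, `labels`
 * and `weights` tensors already exist):
 *
 *    const xOnly = packXYSampleWeight({tokenIds, paddingMask});
 *    const xy = packXYSampleWeight({tokenIds, paddingMask}, labels);
 *    const xyw = packXYSampleWeight({tokenIds, paddingMask}, labels, weights);
 */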
/**
* GPT2 preprocessing layer which tokenizes and packs inputs.
*
* This preprocessing layer will do two things:
*
* - Tokenize the inputs using the `tokenizer`.
* - Construct a dictionary with keys `"tokenIds"` and `"paddingMask"` that can
* be passed directly to a `GPT2Backbone`.
*
* The call method of this layer accepts `x`, `y`, and `sampleWeight`, with
* `y` and `sampleWeight` supplied through the options object. `x` can be a
* string or tensor representing a single segment, a list of strings
* representing a batch of single segments, or a list of tensors representing
* multiple segments to be packed together. `y` and `sampleWeight` are both
* optional, can have any format, and will be passed through unaltered.
*
* `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is
* mainly used for generation tasks. For tasks with multi-segment inputs,
* such as "glue/mnli", please use a model designed for classification, such
* as BERT or RoBERTa.
*
* Examples:
*
* Directly calling the layer on data.
* ```js
* const features = ['a quick fox.', 'a fox quick.'];
* const vocabulary =
*     new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);
* const merges =
*     ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];
* const tokenizer = new GPT2Tokenizer({vocabulary, merges});
*
* const preprocessor = new GPT2Preprocessor({tokenizer});
* preprocessor.call(tensor(features), {})[0].print();
* ```
*/
export declare class GPT2Preprocessor extends Preprocessor {
/** @nocollapse */
static className: string;
protected readonly sequenceLength: number;
protected readonly addStartToken: boolean;
protected readonly addEndToken: boolean;
protected readonly packer: StartEndPacker;
constructor(args: GPT2PreprocessorArgs);
getConfig(): serialization.ConfigDict;
call(inputs: Tensor | Tensor[], kwargs: GPT2PreprocessorOptions): Tensor | Tensor[];
private callAndReturnPaddingMask;
/**
* Calls the layer and returns extra information like the paddingMask used to
* pack the sequence, the label data, and the sample weights used.
*/
callAndPackArgs(inputs: Tensor | Tensor[], kwargs: GPT2PreprocessorOptions): NamedTensorMap | [NamedTensorMap, Tensor] | [NamedTensorMap, Tensor, Tensor];
static tokenizerCls<T extends serialization.Serializable>(cls: serialization.SerializableConstructor<T>): typeof GPT2Tokenizer;
}
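/*
 * Usage sketch for `callAndPackArgs` (illustrative; `preprocessor`, `x`,
 * `labels` and `weights` are assumed to exist). With labels and sample
 * weights supplied, they are passed through alongside the packed
 * `{tokenIds, paddingMask}` map:
 *
 *    const [packedX, y, sw] = preprocessor.callAndPackArgs(
 *        x, {y: labels, sampleWeight: weights}) as
 *        [NamedTensorMap, Tensor, Tensor];
 *    packedX['tokenIds'].print();     // token ids packed to `sequenceLength`
 *    packedX['paddingMask'].print();  // marks the non-padding positions
 */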