@tensorflow/tfjs-layers

TensorFlow layers API in JavaScript

/**
 * @license
 * Copyright 2023 Google LLC.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
/// <amd-module name="@tensorflow/tfjs-layers/dist/layers/nlp/tokenizers" />
/**
 * Tokenizer layers.
 */
import { Tensor, serialization } from '@tensorflow/tfjs-core';
import { Layer, LayerArgs } from '../../engine/topology';
export declare interface TokenizerOptions {
    mode?: 'tokenize' | 'detokenize';
}
/**
 * Base class for Tokenizers.
 *
 * Tokenizers in the tfjs library should all subclass this layer.
 * The class provides two core methods `tokenize()` and `detokenize()` for
 * going from plain text to sequences and back. A tokenizer is a subclass of
 * `Layer` and can be combined with other layers in a `tf.sequential` model.
 *
 * Subclassers should always implement the `tokenize()` method, which will also
 * be the default when calling the layer directly on inputs.
 *
 * Subclassers can optionally implement the `detokenize()` method if the
 * tokenization is reversible. Otherwise, this can be skipped.
 *
 * Subclassers should implement the `vocabulary` and `vocabularySize` getters
 * and the `tokenToId()` and `idToToken()` methods if applicable. For some
 * simple "vocab free" tokenizers, such as the whitespace splitter shown below,
 * these methods do not apply and can be skipped.
 *
 * Example:
 *
 * ```js
 * class WhitespaceSplitterTokenizer extends Tokenizer {
 *   tokenize(inputs: Tensor): Tensor[] {
 *     const stringInputs = inputs.dataSync() as unknown as string[];
 *     return stringInputs.map(input => tensor(input.split(' ')));
 *   }
 *
 *   override detokenize(inputs: Tensor[]): Tensor {
 *     const stringInputs = inputs.map(
 *       input => input.dataSync() as unknown as string[]);
 *     return tensor(stringInputs.map(str => str.join(' ')));
 *   }
 * }
 *
 * const tokenizer = new WhitespaceSplitterTokenizer();
 *
 * tokenizer.tokenize(tensor(['this is a test']))[0].print();
 *
 * tokenizer.detokenize([tensor(['this', 'is', 'a', 'test'])]).print();
 * ```
 */
export declare abstract class Tokenizer extends Layer {
    /**
     * Transform input tensors of strings into output tokens.
     *
     * @param inputs Input tensor.
     */
    abstract tokenize(inputs: Tensor): Tensor[];
    /**
     * Transform tokens back into strings.
     *
     * @param inputs Input tokens.
     */
    detokenize(inputs: Tensor[]): Tensor;
    /**
     * Get the tokenizer vocabulary as a list of string terms.
     */
    get vocabulary(): string[];
    /**
     * Returns the total size of the token id space.
     */
    get vocabularySize(): number;
    /**
     * Convert an integer id to a string token.
     */
    idToToken(id: number): string;
    /**
     * Convert a string token to an integer id.
     */
    tokenToId(token: string): number;
    call(inputs: Tensor | Tensor[], { mode }?: TokenizerOptions): Tensor | Tensor[];
}
export declare interface BytePairTokenizerArgs extends LayerArgs {
    /**
     * Maps tokens to integer ids.
     */
    vocabulary: Map<string, number>;
    /**
     * Array. Contains the merge rules.
     */
    merges: string[];
    /**
     * Integer. If set, the output will be padded or truncated to the
     * `sequenceLength`. Defaults to `null`.
     */
    sequenceLength?: number;
    /**
     * Boolean. Whether to add an initial space to the input. This tokenizer is
     * whitespace aware, and will tokenize a word with a leading space
     * differently. Adding a prefix space to the first word will cause it to be
     * tokenized equivalently to all subsequent words in the sequence.
     * Defaults to `false`.
     */
    addPrefixSpace?: boolean;
    /**
     * Array. A list of strings that will never be split during the word-level
     * splitting applied before the byte-pair encoding. This can be used to
     * ensure special tokens map to unique indices in the vocabulary, even if
     * these special tokens contain splittable characters such as punctuation.
     * Special tokens must still be included in `vocabulary`. Defaults to
     * `null`.
     */
    unsplittableTokens?: string[];
}
/**
 * Byte-pair encoding tokenizer layer.
 *
 * This BPE tokenizer provides the same functionality as the official GPT-2
 * tokenizer. Given the same `vocabulary` which maps tokens to ids, and
 * `merges` which describes BPE merge rules, it should provide the same output
 * as the OpenAI implementation
 * (https://github.com/openai/gpt-2/blob/master/src/encoder.py).
 *
 * If input is a batch of strings (rank > 0):
 * By default, the layer will output a `Tensor[]`.
 * If `sequenceLength` is set, the layer will output a `Tensor[]` where all
 * inputs have been padded or truncated to `sequenceLength`.
 *
 * Examples:
 *
 * Tokenize
 * ```js
 * const vocabulary = new Map([['butter', 1], ['fly', 2]]);
 * const merges = ['b u', 't t', 'e r', 'bu tt', 'butt er', 'f l', 'fl y'];
 * const tokenizer = new BytePairTokenizer({vocabulary, merges});
 *
 * tokenizer.tokenize(tensor(['butterfly']))[0].print();
 * tokenizer.tokenize(tensor(['butterfly', 'butter']))[1].print();
 * ```
 *
 * Detokenize
 * ```js
 * const vocabulary = new Map([['butter', 1], ['fly', 2]]);
 * const merges = ['b u', 't t', 'e r', 'bu tt', 'butt er', 'f l', 'fl y'];
 * const tokenizer = new BytePairTokenizer({vocabulary, merges});
 *
 * tokenizer.detokenize([tensor([1, 2])]).print();
 * ```
 */
export declare class BytePairTokenizer extends Tokenizer {
    /** @nocollapse */
    static readonly className = "BytePairTokenizer";
    private _vocabulary;
    private merges;
    private readonly sequenceLength;
    private readonly addPrefixSpace;
    private readonly unsplittableTokens;
    private readonly byte2Unicode;
    private readonly cache;
    private readonly tokenToIdMap;
    private readonly idToTokenMap;
    private readonly mergeRanksLookupDefault;
    private readonly mergeRanks;
    constructor(args: BytePairTokenizerArgs);
    /**
     * Get the tokenizer vocabulary as a list of string tokens.
     */
    get vocabulary(): string[];
    /**
     * Get the size of the tokenizer vocabulary.
     */
    get vocabularySize(): number;
    /**
     * Convert an integer id to a string token.
     */
    idToToken(id: number): string | undefined;
    /**
     * Convert a string token to an integer id.
     */
    tokenToId(token: string): number | undefined;
    getConfig(): serialization.ConfigDict;
    /**
     * Perform one step of byte-pair merge.
     */
    private bpeMergeOneStep;
    /**
     * Perform byte-pair merge for each word in the inputs.
     */
    private bpeMerge;
    /**
     * Map token bytes to unicode using `byte2Unicode`.
     */
    private transformBytes;
    /**
     * Process unseen tokens and add to cache.
     */
    private bpeMergeAndUpdateCache;
    tokenize(inputs: Tensor): Tensor[];
    detokenize(inputs: Tensor[]): Tensor;
}
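
For reference, a minimal usage sketch tying the declarations above together. It reuses the toy `vocabulary` and `merges` from the doc examples (not a real GPT-2 vocabulary), and assumes the deep import path given by the `amd-module` directive in this file; the printed values are what the toy setup should produce, not verified output.

```js
// Sketch only: toy vocabulary/merges from the doc examples above; the
// import path is assumed from this file's amd-module directive.
import { tensor } from '@tensorflow/tfjs-core';
import { BytePairTokenizer } from '@tensorflow/tfjs-layers/dist/layers/nlp/tokenizers';

const vocabulary = new Map([['butter', 1], ['fly', 2]]);
const merges = ['b u', 't t', 'e r', 'bu tt', 'butt er', 'f l', 'fl y'];
const tokenizer = new BytePairTokenizer({vocabulary, merges});

// tokenize() returns one id Tensor per input string.
const tokens = tokenizer.tokenize(tensor(['butterfly']));
tokens[0].print(); // ids for 'butter' + 'fly', i.e. [1, 2]

// detokenize() maps ids back to text, since this vocabulary is reversible.
tokenizer.detokenize([tensor([1, 2])]).print(); // 'butterfly'

// Calling the layer directly defaults to tokenization; the optional
// `mode` in TokenizerOptions selects the direction.
tokenizer.call(tensor(['butterfly']), {mode: 'tokenize'});

// Vocabulary helpers declared on the Tokenizer base class:
console.log(tokenizer.vocabularySize);      // 2
console.log(tokenizer.idToToken(2));        // 'fly'
console.log(tokenizer.tokenToId('butter')); // 1
```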