@tensorflow/tfjs-layers

Version:

TensorFlow layers API in JavaScript

70 lines (69 loc) • 2.76 kB

TypeScript

/**
 * @license
 * Copyright 2023 Google LLC.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
/// <amd-module name="@tensorflow/tfjs-layers/dist/layers/nlp/tokenizers_utils" />
import { Tensor } from '@tensorflow/tfjs-core';
/**
 * Returns a two-element tuple: a `Uint8Array` and a `string[]`.
 *
 * NOTE(review): presumably the byte values and the unicode characters they
 * map to (byte-level BPE convention) -- the implementation is not visible in
 * this declaration file; confirm before relying on ordering or length.
 */
export declare function bytesToUnicode(): [Uint8Array, string[]];
/**
 * StaticHashTable includes a `lookup` function for multiple keys at once.
 *
 * Keys are of type `K`; values are of type `V`, which is constrained to
 * `number` or `string`.
 */
export declare class StaticHashTable<K, V extends number | string> {
    private readonly defaultValue;
    private _map;
    /**
     * @param keys Keys of the table.
     * @param values Values of the table.
     *     NOTE(review): presumably parallel to `keys` (same length, same
     *     order) -- confirm against the implementation.
     * @param defaultValue NOTE(review): presumably the value reported for keys
     *     that are not in the table -- confirm against the implementation.
     */
    constructor(keys: K[], values: V[], defaultValue: V);
    /**
     * Returns the value associated with a single key.
     *
     * NOTE(review): handling of missing keys is not visible here; presumably
     * `defaultValue` is returned -- confirm.
     */
    get(key: K): V;
    /**
     * Looks up multiple keys at once, taking an array of tensors and returning
     * an array of tensors.
     *
     * NOTE(review): presumably the output has one tensor per input tensor --
     * confirm against the implementation.
     */
    lookup(keys: Tensor[]): Tensor[];
}
/**
 * Creates a `StaticHashTable<K, V>` from the given keys, values and default
 * value.
 *
 * NOTE(review): presumably forwards its arguments to the `StaticHashTable`
 * constructor unchanged -- confirm against the implementation.
 */
export declare function createStaticHashtable<K, V extends number | string>(keys: K[], values: V[], defaultVal: V): StaticHashTable<K, V>;
/**
 * Cache that stores the encoded result of seen tokens.
 *
 * The cache keys are given as a string tensor or an array of strings, and
 * each value is the split tokens joined by whitespace. For example,
 * "dragonfly" => "dragon fly"
 *
 * Examples:
 *
 * ```js
 * const cache = new BytePairTokenizerCache();
 * cache.insert(["butterfly", "dragonfly"], ["but ter fly", "dragon fly"]);
 * cache.lookup(["butterfly"]);
 * ```
 */
export declare class BytePairTokenizerCache {
    private _cache;
    constructor();
    /**
     * Returns the string stored for a single key.
     *
     * NOTE(review): the result for a key that was never inserted is not
     * visible in this declaration file -- confirm against the implementation.
     */
    get(key: string): string;
    /**
     * Insert token <=> encoded outputs pairs.
     *
     * @param keys Tokens, as a tensor or an array of strings.
     * @param values Encoded outputs for the tokens.
     * @returns A `BytePairTokenizerCache`.
     *     NOTE(review): presumably this same instance, to allow chaining --
     *     confirm against the implementation.
     */
    insert(keys: Tensor | string[], values: string[]): BytePairTokenizerCache;
    /**
     * Look up the encoded outputs of given tokens.
     *
     * @param keys Tokens, as a tensor or an array of strings.
     * @returns The encoded outputs as an array of strings.
     */
    lookup(keys: Tensor | string[]): string[];
}
/**
 * Remove certain strings from the input tensors.
 *
 * @param inputs Tensors to filter.
 * @param stringToRemove The string to remove.
 * @returns An array of tensors.
 *     NOTE(review): presumably one output tensor per input tensor -- confirm
 *     against the implementation.
 */
export declare function removeStringsFromInputs(inputs: Tensor[], stringToRemove: string): Tensor[];
/**
 * Create alternates for all special tokens that will not be split during
 * tokenization.
 *
 * @param unsplittableTokens The special tokens that must not be split.
 * @returns The alternates, as an array of strings.
 */
export declare function createAltsForUnsplittableTokens(unsplittableTokens: string[]): string[];
/**
 * A regular expression exported by this module.
 *
 * NOTE(review): the pattern itself is not visible in this declaration file;
 * the name suggests it is used for splitting text -- confirm against the
 * implementation.
 */
export declare const SPLIT_PATTERN_1: RegExp;
/**
 * Splits strings using a delimiter given as a `RegExp` or a string.
 *
 * @param strs Input strings, either a flat array or an array of arrays.
 * @param delimRegexPattern Delimiter pattern.
 * @param keepDelimRegexPattern Optional flag.
 *     NOTE(review): presumably controls whether matched delimiters are kept
 *     in the output; the default is not visible here -- confirm.
 * @returns An array of string arrays.
 */
export declare function regexSplit(strs: string[] | string[][], delimRegexPattern: RegExp | string, keepDelimRegexPattern?: boolean): string[][];
/**
 * Splits the strings of an input tensor for byte-pair encoding.
 *
 * @param inputs The input tensor.
 * @param unsplittableTokens Optional list of tokens.
 *     NOTE(review): presumably tokens that must be kept whole during
 *     splitting -- confirm against the implementation.
 * @returns An array of tensors.
 */
export declare function splitStringsForBpe(inputs: Tensor, unsplittableTokens?: string[]): Tensor[];