@tensorflow/tfjs-layers
Version:
TensorFlow layers API in JavaScript
70 lines (69 loc) • 2.76 kB
TypeScript
/**
* @license
* Copyright 2023 Google LLC.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================================
*/
/// <amd-module name="@tensorflow/tfjs-layers/dist/layers/nlp/tokenizers_utils" />
import { Tensor } from '@tensorflow/tfjs-core';
/**
* Returns a two-element tuple: a `Uint8Array` followed by a `string[]`.
*
* NOTE(review): only the declaration is visible here. Judging by the name, the
* first element presumably holds byte values and the second the unicode
* characters those bytes are mapped to — confirm against the implementation
* before relying on element order or lengths.
*/
export declare function bytesToUnicode(): [Uint8Array, string[]];
/**
* StaticHashTable includes a `lookup` function for multiple keys at once.
*
* `K` is the key type; `V` is the value type and is restricted to
* `number | string`.
*
* NOTE(review): only the declaration is visible here. Presumably `keys[i]` is
* associated with `values[i]`, and `defaultValue` is what a lookup yields for
* a key that is not in the table — confirm against the implementation.
*/
export declare class StaticHashTable<K, V extends number | string> {
private readonly defaultValue;
private _map;
/**
* @param keys Keys of the table.
* @param values Values of the table (presumably parallel to `keys` — confirm).
* @param defaultValue Fallback value of type `V` (presumably returned for
* missing keys — confirm).
*/
constructor(keys: K[], values: V[], defaultValue: V);
/**
* Single-key accessor: takes one key and returns a value of type `V`.
*/
get(key: K): V;
/**
* Multi-key accessor: takes an array of tensors and returns an array of
* tensors.
*
* NOTE(review): presumably each output tensor corresponds to the input tensor
* at the same index, with every element replaced by its table value — confirm.
*/
lookup(keys: Tensor[]): Tensor[];
}
/**
* Factory that takes the same three arguments as the `StaticHashTable`
* constructor and returns a `StaticHashTable<K, V>`.
*
* NOTE(review): presumably a thin wrapper around
* `new StaticHashTable(keys, values, defaultVal)` — confirm against the
* implementation.
*
* @param keys Keys of the table.
* @param values Values of the table.
* @param defaultVal Fallback value of type `V`.
*/
export declare function createStaticHashtable<K, V extends number | string>(keys: K[], values: V[], defaultVal: V): StaticHashTable<K, V>;
/**
* Cache that stores the encoded result of seen tokens.
*
* Keys are supplied either as a string tensor or as an array of strings, and
* each value is the token's split pieces joined by whitespace. For example,
* "dragonfly" => "dragon fly".
*
* Examples:
*
* ```js
* const cache = new BytePairTokenizerCache();
* cache.insert(["butterfly", "dragonfly"], ["but ter fly", "dragon fly"]);
* cache.lookup(["butterfly"]);
* ```
*/
export declare class BytePairTokenizerCache {
private _cache;
constructor();
/**
* Single-key accessor: takes a string key and returns a string.
*
* NOTE(review): the result for a key that was never inserted is not visible
* in this declaration — confirm against the implementation.
*/
get(key: string): string;
/**
* Insert token <=> encoded outputs pairs.
*
* @param keys Tokens, given as a tensor or as an array of strings.
* @param values Encoded outputs (presumably `values[i]` belongs to the i-th
* key — confirm).
* @returns A `BytePairTokenizerCache` (presumably this same instance, to
* allow chaining — confirm).
*/
insert(keys: Tensor | string[], values: string[]): BytePairTokenizerCache;
/**
* Look up the encoded outputs of given tokens.
*
* @param keys Tokens, given as a tensor or as an array of strings.
* @returns The encoded outputs as strings (presumably one per key, in the
* same order — confirm).
*/
lookup(keys: Tensor | string[]): string[];
}
/**
* Remove certain strings from input tensor.
*
* @param inputs Tensors to process.
* @param stringToRemove The string to remove.
* @returns An array of tensors (presumably one per input tensor, with the
* elements equal to `stringToRemove` dropped — confirm against the
* implementation).
*/
export declare function removeStringsFromInputs(inputs: Tensor[], stringToRemove: string): Tensor[];
/**
* Create alternates for all special tokens that will not be split during
* tokenization.
*
* @param unsplittableTokens The special tokens that must not be split.
* @returns The alternates as strings (presumably one per input token, in the
* same order — confirm against the implementation).
*/
export declare function createAltsForUnsplittableTokens(unsplittableTokens: string[]): string[];
/**
* A `RegExp` constant; the pattern itself is not visible in this declaration.
*
* NOTE(review): judging by the name, presumably the first split pattern applied
* when pre-splitting strings for byte-pair encoding — confirm against the
* implementation.
*/
export declare const SPLIT_PATTERN_1: RegExp;
/**
* Splits strings using a delimiter pattern and returns the pieces as an array
* of string arrays.
*
* NOTE(review): only the declaration is visible here; how nested input is
* mapped onto the rows of the result should be confirmed against the
* implementation.
*
* @param strs Strings to split, given as a flat array or an array of arrays.
* @param delimRegexPattern The delimiter, as a `RegExp` or a string.
* @param keepDelimRegexPattern Optional flag (presumably controls whether the
* matched delimiters are kept in the output — confirm).
* @returns An array of string arrays.
*/
export declare function regexSplit(strs: string[] | string[][], delimRegexPattern: RegExp | string, keepDelimRegexPattern?: boolean): string[][];
/**
* Takes a tensor and an optional list of unsplittable tokens and returns an
* array of tensors.
*
* NOTE(review): only the declaration is visible here. Judging by the name,
* this presumably pre-splits each input string into the pieces that are then
* fed to byte-pair encoding, leaving `unsplittableTokens` intact — confirm
* against the implementation.
*
* @param inputs Input tensor (presumably a string tensor — confirm).
* @param unsplittableTokens Optional tokens that should not be split.
* @returns An array of tensors.
*/
export declare function splitStringsForBpe(inputs: Tensor, unsplittableTokens?: string[]): Tensor[];