UNPKG

@huggingface/transformers

Version:

State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!

1,000 lines • 44.3 kB
/** * Checks whether the given Unicode codepoint represents a CJK (Chinese, Japanese, or Korean) character. * * A "chinese character" is defined as anything in the CJK Unicode block: * https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) * * Note that the CJK Unicode block is NOT all Japanese and Korean characters, despite its name. * The modern Korean Hangul alphabet is a different block, as is Japanese Hiragana and Katakana. * Those alphabets are used to write space-separated words, so they are not treated specially * and are handled like all other languages. * * @param {number|bigint} cp The Unicode codepoint to check. * @returns {boolean} True if the codepoint represents a CJK character, false otherwise. */ export function is_chinese_char(cp: number | bigint): boolean; declare const TokenizerModel_base: new () => { (...args: any[]): any; _call(...args: any[]): any; }; /** * Abstract base class for tokenizer models. * * @extends Callable */ export class TokenizerModel extends TokenizerModel_base { /** * Instantiates a new TokenizerModel instance based on the configuration object provided. * @param {Object} config The configuration object for the TokenizerModel. * @param {...*} args Optional arguments to pass to the specific TokenizerModel constructor. * @returns {TokenizerModel} A new instance of a TokenizerModel. * @throws Will throw an error if the TokenizerModel type in the config is not recognized. */ static fromConfig(config: any, ...args: any[]): TokenizerModel; /** * Creates a new instance of TokenizerModel. * @param {Object} config The configuration object for the TokenizerModel. */ constructor(config: any); config: any; /** @type {string[]} */ vocab: string[]; /** * A mapping of tokens to ids. * @type {Map<string, number>} */ tokens_to_ids: Map<string, number>; unk_token_id: any; unk_token: any; end_of_word_suffix: any; /** @type {boolean} Whether to fuse unknown tokens when encoding. Defaults to false. */ fuse_unk: boolean; /** * Internal function to call the TokenizerModel instance. * @param {string[]} tokens The tokens to encode. * @returns {string[]} The encoded tokens. */ _call(tokens: string[]): string[]; /** * Encodes a list of tokens into a list of token IDs. * @param {string[]} tokens The tokens to encode. * @returns {string[]} The encoded tokens. * @throws Will throw an error if not implemented in a subclass. */ encode(tokens: string[]): string[]; /** * Converts a list of tokens into a list of token IDs. * @param {string[]} tokens The tokens to convert. * @returns {number[]} The converted token IDs. */ convert_tokens_to_ids(tokens: string[]): number[]; /** * Converts a list of token IDs into a list of tokens. * @param {number[]|bigint[]} ids The token IDs to convert. * @returns {string[]} The converted tokens. */ convert_ids_to_tokens(ids: number[] | bigint[]): string[]; } declare const PreTrainedTokenizer_base: new () => { (...args: any[]): any; _call(...args: any[]): any; }; /** * @typedef {Object} Message * @property {string} role The role of the message (e.g., "user" or "assistant" or "system"). * @property {string} content The content of the message. */ export class PreTrainedTokenizer extends PreTrainedTokenizer_base { /** * Loads a pre-trained tokenizer from the given `pretrained_model_name_or_path`. * * @param {string} pretrained_model_name_or_path The path to the pre-trained tokenizer. * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. * * @throws {Error} Throws an error if the tokenizer.json or tokenizer_config.json files are not found in the `pretrained_model_name_or_path`. * @returns {Promise<PreTrainedTokenizer>} A new instance of the `PreTrainedTokenizer` class. */ static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, legacy, }?: PretrainedTokenizerOptions): Promise<PreTrainedTokenizer>; /** * Create a new PreTrainedTokenizer instance. * @param {Object} tokenizerJSON The JSON of the tokenizer. * @param {Object} tokenizerConfig The config of the tokenizer. */ constructor(tokenizerJSON: any, tokenizerConfig: any); return_token_type_ids: boolean; padding_side: string; _tokenizer_config: any; normalizer: Normalizer; pre_tokenizer: PreTokenizer; model: TokenizerModel; post_processor: PostProcessor; decoder: Decoder; special_tokens: any[]; all_special_ids: number[]; /** @type {AddedToken[]} */ added_tokens: AddedToken[]; additional_special_tokens: any; added_tokens_splitter: DictionarySplitter; /** @type {Map<string, AddedToken>} */ added_tokens_map: Map<string, AddedToken>; mask_token: string; mask_token_id: number; pad_token: string; pad_token_id: number; sep_token: string; sep_token_id: number; unk_token: string; unk_token_id: number; bos_token: string; bos_token_id: number; eos_token: string; eos_token_id: number; model_max_length: any; /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */ remove_space: boolean; clean_up_tokenization_spaces: any; do_lowercase_and_remove_accent: any; legacy: boolean; chat_template: any; _compiled_template_cache: Map<any, any>; /** * Returns the value of the first matching key in the tokenizer config object. * @param {...string} keys One or more keys to search for in the tokenizer config object. * @returns {string|null} The value associated with the first matching key, or null if no match is found. * @throws {Error} If an object is found for a matching key and its __type property is not "AddedToken". * @private */ private getToken; /** * @typedef {number[]|number[][]|Tensor} BatchEncodingItem * * @typedef {Object} BatchEncoding Holds the output of the tokenizer's call function. * @property {BatchEncodingItem} input_ids List of token ids to be fed to a model. * @property {BatchEncodingItem} attention_mask List of indices specifying which tokens should be attended to by the model. * @property {BatchEncodingItem} [token_type_ids] List of token type ids to be fed to a model. */ /** * Encode/tokenize the given text(s). * @param {string|string[]} text The text to tokenize. * @param {Object} options An optional object containing the following properties: * @param {string|string[]} [options.text_pair=null] Optional second sequence to be encoded. If set, must be the same type as text. * @param {boolean|'max_length'} [options.padding=false] Whether to pad the input sequences. * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. * @param {boolean} [options.truncation=null] Whether to truncate the input sequences. * @param {number} [options.max_length=null] Maximum length of the returned list and optionally padding length. * @param {boolean} [options.return_tensor=true] Whether to return the results as Tensors or arrays. * @param {boolean} [options.return_token_type_ids=null] Whether to return the token type ids. * @returns {BatchEncoding} Object to be passed to the model. */ _call(text: string | string[], { text_pair, add_special_tokens, padding, truncation, max_length, return_tensor, return_token_type_ids, }?: { text_pair?: string | string[]; padding?: boolean | "max_length"; add_special_tokens?: boolean; truncation?: boolean; max_length?: number; return_tensor?: boolean; return_token_type_ids?: boolean; }): { /** * List of token ids to be fed to a model. */ input_ids: number[] | number[][] | Tensor; /** * List of indices specifying which tokens should be attended to by the model. */ attention_mask: number[] | number[][] | Tensor; /** * List of token type ids to be fed to a model. */ token_type_ids?: number[] | number[][] | Tensor; }; /** * Encodes a single text using the preprocessor pipeline of the tokenizer. * * @param {string|null} text The text to encode. * @returns {string[]|null} The encoded tokens. */ _encode_text(text: string | null): string[] | null; /** * Encodes a single text or a pair of texts using the model's tokenizer. * * @param {string} text The text to encode. * @param {Object} options An optional object containing the following properties: * @param {string} [options.text_pair=null] The optional second text to encode. * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. * @param {boolean} [options.return_token_type_ids=null] Whether to return token_type_ids. * @returns {EncodingSingle} An object containing the encoded text. * @private */ private _encode_plus; /** * Internal helper function to tokenize a text, and optionally a pair of texts. * @param {string} text The text to tokenize. * @param {Object} options An optional object containing the following properties: * @param {string} [options.pair=null] The optional second text to tokenize. * @param {boolean} [options.add_special_tokens=false] Whether or not to add the special tokens associated with the corresponding model. * @returns {{tokens: string[], token_type_ids?: number[]}} An object containing the tokens and optionally the token type IDs. */ _tokenize_helper(text: string, { pair, add_special_tokens, }?: { pair?: string; add_special_tokens?: boolean; }): { tokens: string[]; token_type_ids?: number[]; }; /** * Converts a string into a sequence of tokens. * @param {string} text The sequence to be encoded. * @param {Object} options An optional object containing the following properties: * @param {string} [options.pair] A second sequence to be encoded with the first. * @param {boolean} [options.add_special_tokens=false] Whether or not to add the special tokens associated with the corresponding model. * @returns {string[]} The list of tokens. */ tokenize(text: string, { pair, add_special_tokens, }?: { pair?: string; add_special_tokens?: boolean; }): string[]; /** * Encodes a single text or a pair of texts using the model's tokenizer. * * @param {string} text The text to encode. * @param {Object} options An optional object containing the following properties: * @param {string} [options.text_pair=null] The optional second text to encode. * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. * @param {boolean} [options.return_token_type_ids=null] Whether to return token_type_ids. * @returns {number[]} An array of token IDs representing the encoded text(s). */ encode(text: string, { text_pair, add_special_tokens, return_token_type_ids, }?: { text_pair?: string; add_special_tokens?: boolean; return_token_type_ids?: boolean; }): number[]; /** * Decode a batch of tokenized sequences. * @param {number[][]|Tensor} batch List/Tensor of tokenized input sequences. * @param {Object} decode_args (Optional) Object with decoding arguments. * @returns {string[]} List of decoded sequences. */ batch_decode(batch: number[][] | Tensor, decode_args?: any): string[]; /** * Decodes a sequence of token IDs back to a string. * * @param {number[]|bigint[]|Tensor} token_ids List/Tensor of token IDs to decode. * @param {Object} [decode_args={}] * @param {boolean} [decode_args.skip_special_tokens=false] If true, special tokens are removed from the output string. * @param {boolean} [decode_args.clean_up_tokenization_spaces=true] If true, spaces before punctuations and abbreviated forms are removed. * * @returns {string} The decoded string. * @throws {Error} If `token_ids` is not a non-empty array of integers. */ decode(token_ids: number[] | bigint[] | Tensor, decode_args?: { skip_special_tokens?: boolean; clean_up_tokenization_spaces?: boolean; }): string; /** * Decode a single list of token ids to a string. * @param {number[]|bigint[]} token_ids List of token ids to decode * @param {Object} decode_args Optional arguments for decoding * @param {boolean} [decode_args.skip_special_tokens=false] Whether to skip special tokens during decoding * @param {boolean} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding. * If null, the value is set to `this.decoder.cleanup` if it exists, falling back to `this.clean_up_tokenization_spaces` if it exists, falling back to `true`. * @returns {string} The decoded string */ decode_single(token_ids: number[] | bigint[], { skip_special_tokens, clean_up_tokenization_spaces, }: { skip_special_tokens?: boolean; clean_up_tokenization_spaces?: boolean; }): string; /** * Retrieve the chat template string used for tokenizing chat messages. This template is used * internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat * template for better generation tracking. * * @param {Object} options An optional object containing the following properties: * @param {string} [options.chat_template=null] * A Jinja template or the name of a template to use for this conversion. * It is usually not necessary to pass anything to this argument, * as the model's template will be used by default. * @param {Object[]} [options.tools=null] * A list of tools (callable functions) that will be accessible to the model. If the template does not * support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, * giving the name, description and argument types for the tool. See our * [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) * for more information. * @returns {string} The chat template string. */ get_chat_template({ chat_template, tools, }?: { chat_template?: string; tools?: any[]; }): string; /** * Converts a list of message objects with `"role"` and `"content"` keys to a list of token * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to * determine the format and control tokens to use when converting. * * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information. * * **Example:** Applying a chat template to a conversation. * * ```javascript * import { AutoTokenizer } from "@huggingface/transformers"; * * const tokenizer = await AutoTokenizer.from_pretrained("Xenova/mistral-tokenizer-v1"); * * const chat = [ * { "role": "user", "content": "Hello, how are you?" }, * { "role": "assistant", "content": "I'm doing great. How can I help you today?" }, * { "role": "user", "content": "I'd like to show off how chat templating works!" }, * ] * * const text = tokenizer.apply_chat_template(chat, { tokenize: false }); * // "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]" * * const input_ids = tokenizer.apply_chat_template(chat, { tokenize: true, return_tensor: false }); * // [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793] * ``` * * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys, * representing the chat history so far. * @param {Object} options An optional object containing the following properties: * @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If * this is not passed, the model's chat template will be used instead. * @param {Object[]} [options.tools=null] * A list of tools (callable functions) that will be accessible to the model. If the template does not * support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, * giving the name, description and argument types for the tool. See our * [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) * for more information. * @param {Record<string, string>[]} [options.documents=null] * A list of dicts representing documents that will be accessible to the model if it is performing RAG * (retrieval-augmented generation). If the template does not support RAG, this argument will have no * effect. We recommend that each document should be a dict containing "title" and "text" keys. Please * see the RAG section of the [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#arguments-for-RAG) * for examples of passing documents with chat templates. * @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate * the start of an assistant message. This is useful when you want to generate a response from the model. * Note that this argument will be passed to the chat template, and so it must be supported in the * template for this argument to have any effect. * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output will be a string. * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. Has no effect if tokenize is false. * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. Has no effect if tokenize is false. * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false. * If not specified, the tokenizer's `max_length` attribute will be used as a default. * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false. * @param {boolean} [options.return_dict=true] Whether to return a dictionary with named outputs. Has no effect if tokenize is false. * @param {Object} [options.tokenizer_kwargs={}] Additional options to pass to the tokenizer. * @returns {string | Tensor | number[]| number[][]|BatchEncoding} The tokenized output. */ apply_chat_template(conversation: Message[], { tools, documents, chat_template, add_generation_prompt, tokenize, padding, truncation, max_length, return_tensor, return_dict, tokenizer_kwargs, ...kwargs }?: { chat_template?: string; tools?: any[]; documents?: Record<string, string>[]; add_generation_prompt?: boolean; tokenize?: boolean; padding?: boolean; truncation?: boolean; max_length?: number; return_tensor?: boolean; return_dict?: boolean; tokenizer_kwargs?: any; }): string | Tensor | number[] | number[][] | { /** * List of token ids to be fed to a model. */ input_ids: number[] | number[][] | Tensor; /** * List of indices specifying which tokens should be attended to by the model. */ attention_mask: number[] | number[][] | Tensor; /** * List of token type ids to be fed to a model. */ token_type_ids?: number[] | number[][] | Tensor; }; } /** * BertTokenizer is a class used to tokenize text for BERT models. * @extends PreTrainedTokenizer */ export class BertTokenizer extends PreTrainedTokenizer { } /** * Albert tokenizer * @extends PreTrainedTokenizer */ export class AlbertTokenizer extends PreTrainedTokenizer { } export class MobileBertTokenizer extends PreTrainedTokenizer { } export class SqueezeBertTokenizer extends PreTrainedTokenizer { } export class DebertaTokenizer extends PreTrainedTokenizer { } export class DebertaV2Tokenizer extends PreTrainedTokenizer { } export class HerbertTokenizer extends PreTrainedTokenizer { } export class ConvBertTokenizer extends PreTrainedTokenizer { } export class RoFormerTokenizer extends PreTrainedTokenizer { } export class DistilBertTokenizer extends PreTrainedTokenizer { } export class CamembertTokenizer extends PreTrainedTokenizer { } export class XLMTokenizer extends PreTrainedTokenizer { constructor(tokenizerJSON: any, tokenizerConfig: any); } export class ElectraTokenizer extends PreTrainedTokenizer { } export class T5Tokenizer extends PreTrainedTokenizer { } export class GPT2Tokenizer extends PreTrainedTokenizer { } export class BartTokenizer extends PreTrainedTokenizer { } export class MBartTokenizer extends PreTrainedTokenizer { constructor(tokenizerJSON: any, tokenizerConfig: any); languageRegex: RegExp; language_codes: any[]; lang_to_token: (x: any) => any; /** * Helper function to build translation inputs for an `MBartTokenizer`. * @param {string|string[]} raw_inputs The text to tokenize. * @param {Object} tokenizer_options Options to be sent to the tokenizer * @param {Object} generate_kwargs Generation options. * @returns {Object} Object to be passed to the model. */ _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any; } export class MBart50Tokenizer extends MBartTokenizer { } export class RobertaTokenizer extends PreTrainedTokenizer { } export class BloomTokenizer extends PreTrainedTokenizer { } export class LlamaTokenizer extends PreTrainedTokenizer { constructor(tokenizerJSON: any, tokenizerConfig: any); legacy: any; } export class CodeLlamaTokenizer extends PreTrainedTokenizer { } export class XLMRobertaTokenizer extends PreTrainedTokenizer { } export class MPNetTokenizer extends PreTrainedTokenizer { } export class FalconTokenizer extends PreTrainedTokenizer { } export class GPTNeoXTokenizer extends PreTrainedTokenizer { } export class EsmTokenizer extends PreTrainedTokenizer { } export class Qwen2Tokenizer extends PreTrainedTokenizer { } export class GemmaTokenizer extends PreTrainedTokenizer { } export class Grok1Tokenizer extends PreTrainedTokenizer { } /** * The NllbTokenizer class is used to tokenize text for NLLB ("No Language Left Behind") models. * * No Language Left Behind (NLLB) is a first-of-its-kind, AI breakthrough project * that open-sources models capable of delivering high-quality translations directly * between any pair of 200+ languages — including low-resource languages like Asturian, * Luganda, Urdu and more. It aims to help people communicate with anyone, anywhere, * regardless of their language preferences. For more information, check out their * [paper](https://arxiv.org/abs/2207.04672). * * For a list of supported languages (along with their language codes), * @see {@link https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200} */ export class NllbTokenizer extends PreTrainedTokenizer { constructor(tokenizerJSON: any, tokenizerConfig: any); languageRegex: RegExp; language_codes: any[]; lang_to_token: (x: any) => any; /** * Helper function to build translation inputs for an `NllbTokenizer`. * @param {string|string[]} raw_inputs The text to tokenize. * @param {Object} tokenizer_options Options to be sent to the tokenizer * @param {Object} generate_kwargs Generation options. * @returns {Object} Object to be passed to the model. */ _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any; } /** * The M2M100Tokenizer class is used to tokenize text for M2M100 ("Many-to-Many") models. * * M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many * multilingual translation. It was introduced in this [paper](https://arxiv.org/abs/2010.11125) * and first released in [this](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100) repository. * * For a list of supported languages (along with their language codes), * @see {@link https://huggingface.co/facebook/m2m100_418M#languages-covered} */ export class M2M100Tokenizer extends PreTrainedTokenizer { constructor(tokenizerJSON: any, tokenizerConfig: any); languageRegex: RegExp; language_codes: any[]; lang_to_token: (x: any) => string; /** * Helper function to build translation inputs for an `M2M100Tokenizer`. * @param {string|string[]} raw_inputs The text to tokenize. * @param {Object} tokenizer_options Options to be sent to the tokenizer * @param {Object} generate_kwargs Generation options. * @returns {Object} Object to be passed to the model. */ _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any; } /** * WhisperTokenizer tokenizer * @extends PreTrainedTokenizer */ export class WhisperTokenizer extends PreTrainedTokenizer { get timestamp_begin(): number; /** * Decodes automatic speech recognition (ASR) sequences. * @param {Array<{tokens: bigint[], token_timestamps?: number[], stride: number[]}>} sequences The sequences to decode. * @param {Object} options The options to use for decoding. * @returns {Array<string|{chunks?: undefined|Array<{language: string|null, timestamp: Array<number|null>, text: string}>}>} The decoded sequences. */ _decode_asr(sequences: Array<{ tokens: bigint[]; token_timestamps?: number[]; stride: number[]; }>, { return_timestamps, return_language, time_precision, force_full_sequences }?: any): Array<string | { chunks?: undefined | Array<{ language: string | null; timestamp: Array<number | null>; text: string; }>; }>; /** * Finds the longest common sequence among the provided sequences. * @param {number[][]} sequences An array of sequences of token ids to compare. * @returns {number[][]} The longest common sequence found. * @throws {Error} If there is a bug within the function. * @private */ private findLongestCommonSequence; /** @private */ private collateWordTimestamps; /** * Groups tokens by word. Returns a tuple containing a list of strings with the words, * and a list of `token_id` sequences with the tokens making up each word. * @param {number[]} tokens * @param {string} [language] * @param {string} prepend_punctionations * @param {string} append_punctuations * * @private */ private combineTokensIntoWords; /** * @param {number[]|bigint[]} token_ids List of token IDs to decode. * @param {Object} decode_args Optional arguments for decoding * @private */ private decodeWithTimestamps; /** * Combine tokens into words by splitting at any position where the tokens are decoded as valid unicode points. * @param {number[]} tokens * @returns {*} * @private */ private splitTokensOnUnicode; /** * Combine tokens into words by splitting at whitespace and punctuation tokens. * @param {number[]} tokens * @private */ private splitTokensOnSpaces; /** * Merges punctuation tokens with neighboring words. * @param {string[]} words * @param {number[][]} tokens * @param {number[][]} indices * @param {string} prepended * @param {string} appended * @private */ private mergePunctuations; } export class CodeGenTokenizer extends PreTrainedTokenizer { } export class CLIPTokenizer extends PreTrainedTokenizer { } export class SiglipTokenizer extends PreTrainedTokenizer { } /** * @todo This model is not yet supported by Hugging Face's "fast" tokenizers library (https://github.com/huggingface/tokenizers). * Therefore, this implementation (which is based on fast tokenizers) may produce slightly inaccurate results. */ export class MarianTokenizer extends PreTrainedTokenizer { languageRegex: RegExp; supported_language_codes: string[]; /** * Encodes a single text. Overriding this method is necessary since the language codes * must be removed before encoding with sentencepiece model. * @see https://github.com/huggingface/transformers/blob/12d51db243a00726a548a43cc333390ebae731e3/src/transformers/models/marian/tokenization_marian.py#L204-L213 * * @param {string|null} text The text to encode. * @returns {Array} The encoded tokens. */ _encode_text(text: string | null): any[]; } export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer { } export class BlenderbotTokenizer extends PreTrainedTokenizer { } export class BlenderbotSmallTokenizer extends PreTrainedTokenizer { } export class SpeechT5Tokenizer extends PreTrainedTokenizer { } export class NougatTokenizer extends PreTrainedTokenizer { } export class VitsTokenizer extends PreTrainedTokenizer { constructor(tokenizerJSON: any, tokenizerConfig: any); } export class CohereTokenizer extends PreTrainedTokenizer { } export class MgpstrTokenizer extends PreTrainedTokenizer { } /** * Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function. * The chosen tokenizer class is determined by the type specified in the tokenizer config. * * @example * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased'); */ export class AutoTokenizer { static TOKENIZER_CLASS_MAPPING: { T5Tokenizer: typeof T5Tokenizer; DistilBertTokenizer: typeof DistilBertTokenizer; CamembertTokenizer: typeof CamembertTokenizer; DebertaTokenizer: typeof DebertaTokenizer; DebertaV2Tokenizer: typeof DebertaV2Tokenizer; BertTokenizer: typeof BertTokenizer; HerbertTokenizer: typeof HerbertTokenizer; ConvBertTokenizer: typeof ConvBertTokenizer; RoFormerTokenizer: typeof RoFormerTokenizer; XLMTokenizer: typeof XLMTokenizer; ElectraTokenizer: typeof ElectraTokenizer; MobileBertTokenizer: typeof MobileBertTokenizer; SqueezeBertTokenizer: typeof SqueezeBertTokenizer; AlbertTokenizer: typeof AlbertTokenizer; GPT2Tokenizer: typeof GPT2Tokenizer; BartTokenizer: typeof BartTokenizer; MBartTokenizer: typeof MBartTokenizer; MBart50Tokenizer: typeof MBart50Tokenizer; RobertaTokenizer: typeof RobertaTokenizer; WhisperTokenizer: typeof WhisperTokenizer; CodeGenTokenizer: typeof CodeGenTokenizer; CLIPTokenizer: typeof CLIPTokenizer; SiglipTokenizer: typeof SiglipTokenizer; MarianTokenizer: typeof MarianTokenizer; BloomTokenizer: typeof BloomTokenizer; NllbTokenizer: typeof NllbTokenizer; M2M100Tokenizer: typeof M2M100Tokenizer; LlamaTokenizer: typeof LlamaTokenizer; CodeLlamaTokenizer: typeof CodeLlamaTokenizer; XLMRobertaTokenizer: typeof XLMRobertaTokenizer; MPNetTokenizer: typeof MPNetTokenizer; FalconTokenizer: typeof FalconTokenizer; GPTNeoXTokenizer: typeof GPTNeoXTokenizer; EsmTokenizer: typeof EsmTokenizer; Wav2Vec2CTCTokenizer: typeof Wav2Vec2CTCTokenizer; BlenderbotTokenizer: typeof BlenderbotTokenizer; BlenderbotSmallTokenizer: typeof BlenderbotSmallTokenizer; SpeechT5Tokenizer: typeof SpeechT5Tokenizer; NougatTokenizer: typeof NougatTokenizer; VitsTokenizer: typeof VitsTokenizer; Qwen2Tokenizer: typeof Qwen2Tokenizer; GemmaTokenizer: typeof GemmaTokenizer; Grok1Tokenizer: typeof Grok1Tokenizer; CohereTokenizer: typeof CohereTokenizer; MgpstrTokenizer: typeof MgpstrTokenizer; PreTrainedTokenizer: typeof PreTrainedTokenizer; }; /** * Instantiate one of the tokenizer classes of the library from a pretrained model. * * The tokenizer class to instantiate is selected based on the `tokenizer_class` property of the config object * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) * * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: * - A string, the *model id* of a pretrained tokenizer hosted inside a model repo on huggingface.co. * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a * user or organization name, like `dbmdz/bert-base-german-cased`. * - A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/`. * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. * * @returns {Promise<PreTrainedTokenizer>} A new instance of the PreTrainedTokenizer class. */ static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, legacy, }?: PretrainedTokenizerOptions): Promise<PreTrainedTokenizer>; } /** * Additional tokenizer-specific properties. */ export type TokenizerProperties = { /** * Whether or not the `legacy` behavior of the tokenizer should be used. */ legacy?: boolean; }; export type PretrainedTokenizerOptions = import("./utils/hub.js").PretrainedOptions & TokenizerProperties; export type BPENode = { /** * The token associated with the node */ token: string; /** * A positional bias for the node. */ bias: number; /** * The score of the node. */ score?: number; /** * The previous node in the linked list. */ prev?: BPENode; /** * The next node in the linked list. */ next?: BPENode; }; export type SplitDelimiterBehavior = "removed" | "isolated" | "mergedWithPrevious" | "mergedWithNext" | "contiguous"; export type PostProcessedOutput = { /** * List of token produced by the post-processor. */ tokens: string[]; /** * List of token type ids produced by the post-processor. */ token_type_ids?: number[]; }; export type EncodingSingle = { /** * List of token ids to be fed to a model. */ input_ids: number[]; /** * List of token type ids to be fed to a model */ attention_mask: number[]; /** * List of indices specifying which tokens should be attended to by the model */ token_type_ids?: number[]; }; export type Message = { /** * The role of the message (e.g., "user" or "assistant" or "system"). */ role: string; /** * The content of the message. */ content: string; }; declare const Normalizer_base: new () => { (...args: any[]): any; _call(...args: any[]): any; }; /** * A base class for text normalization. * @abstract */ declare class Normalizer extends Normalizer_base { /** * Factory method for creating normalizers from config objects. * @static * @param {Object} config The configuration object for the normalizer. * @returns {Normalizer} A Normalizer object. * @throws {Error} If an unknown Normalizer type is specified in the config. */ static fromConfig(config: any): Normalizer; /** * @param {Object} config The configuration object for the normalizer. */ constructor(config: any); config: any; /** * Normalize the input text. * @abstract * @param {string} text The text to normalize. * @returns {string} The normalized text. * @throws {Error} If this method is not implemented in a subclass. */ normalize(text: string): string; /** * Alias for {@link Normalizer#normalize}. * @param {string} text The text to normalize. * @returns {string} The normalized text. */ _call(text: string): string; } declare const PreTokenizer_base: new () => { (...args: any[]): any; _call(...args: any[]): any; }; /** * A callable class representing a pre-tokenizer used in tokenization. Subclasses * should implement the `pre_tokenize_text` method to define the specific pre-tokenization logic. * @extends Callable */ declare class PreTokenizer extends PreTokenizer_base { /** * Factory method that returns an instance of a subclass of `PreTokenizer` based on the provided configuration. * * @static * @param {Object} config A configuration object for the pre-tokenizer. * @returns {PreTokenizer} An instance of a subclass of `PreTokenizer`. * @throws {Error} If the provided configuration object does not correspond to any known pre-tokenizer. */ static fromConfig(config: any): PreTokenizer; /** * Method that should be implemented by subclasses to define the specific pre-tokenization logic. * * @abstract * @param {string} text The text to pre-tokenize. * @param {Object} [options] Additional options for the pre-tokenization logic. * @returns {string[]} The pre-tokenized text. * @throws {Error} If the method is not implemented in the subclass. */ pre_tokenize_text(text: string, options?: any): string[]; /** * Tokenizes the given text into pre-tokens. * @param {string|string[]} text The text or array of texts to pre-tokenize. * @param {Object} [options] Additional options for the pre-tokenization logic. * @returns {string[]} An array of pre-tokens. */ pre_tokenize(text: string | string[], options?: any): string[]; /** * Alias for {@link PreTokenizer#pre_tokenize}. * @param {string|string[]} text The text or array of texts to pre-tokenize. * @param {Object} [options] Additional options for the pre-tokenization logic. * @returns {string[]} An array of pre-tokens. */ _call(text: string | string[], options?: any): string[]; } declare const PostProcessor_base: new () => { (...args: any[]): any; _call(...args: any[]): any; }; /** * @typedef {Object} PostProcessedOutput * @property {string[]} tokens List of token produced by the post-processor. * @property {number[]} [token_type_ids] List of token type ids produced by the post-processor. */ /** * @typedef {Object} EncodingSingle * @property {number[]} input_ids List of token ids to be fed to a model. * @property {number[]} attention_mask List of token type ids to be fed to a model * @property {number[]} [token_type_ids] List of indices specifying which tokens should be attended to by the model */ /** * @extends Callable */ declare class PostProcessor extends PostProcessor_base { /** * Factory method to create a PostProcessor object from a configuration object. * * @param {Object} config Configuration object representing a PostProcessor. * @returns {PostProcessor} A PostProcessor object created from the given configuration. * @throws {Error} If an unknown PostProcessor type is encountered. */ static fromConfig(config: any): PostProcessor; /** * @param {Object} config The configuration for the post-processor. */ constructor(config: any); config: any; /** * Method to be implemented in subclass to apply post-processing on the given tokens. * * @param {Array} tokens The input tokens to be post-processed. * @param {...*} args Additional arguments required by the post-processing logic. * @returns {PostProcessedOutput} The post-processed tokens. * @throws {Error} If the method is not implemented in subclass. */ post_process(tokens: any[], ...args: any[]): PostProcessedOutput; /** * Alias for {@link PostProcessor#post_process}. * @param {Array} tokens The text or array of texts to post-process. * @param {...*} args Additional arguments required by the post-processing logic. * @returns {PostProcessedOutput} The post-processed tokens. */ _call(tokens: any[], ...args: any[]): PostProcessedOutput; } declare const Decoder_base: new () => { (...args: any[]): any; _call(...args: any[]): any; }; /** * The base class for token decoders. * @extends Callable */ declare class Decoder extends Decoder_base { /** * Creates a decoder instance based on the provided configuration. * * @param {Object} config The configuration object. * @returns {Decoder} A decoder instance. * @throws {Error} If an unknown decoder type is provided. */ static fromConfig(config: any): Decoder; /** * Creates an instance of `Decoder`. * * @param {Object} config The configuration object. */ constructor(config: any); config: any; /** @type {AddedToken[]} */ added_tokens: AddedToken[]; end_of_word_suffix: any; trim_offsets: any; /** * Calls the `decode` method. * * @param {string[]} tokens The list of tokens. * @returns {string} The decoded string. */ _call(tokens: string[]): string; /** * Decodes a list of tokens. * @param {string[]} tokens The list of tokens. * @returns {string} The decoded string. */ decode(tokens: string[]): string; /** * Apply the decoder to a list of tokens. * * @param {string[]} tokens The list of tokens. * @returns {string[]} The decoded list of tokens. * @throws {Error} If the `decode_chain` method is not implemented in the subclass. */ decode_chain(tokens: string[]): string[]; } /** * Represent a token added by the user on top of the existing Model vocabulary. * AddedToken can be configured to specify the behavior they should have in various situations like: * - Whether they should only match single words * - Whether to include any whitespace on its left or right */ declare class AddedToken { /** * Creates a new instance of AddedToken. * @param {Object} config Added token configuration object. * @param {string} config.content The content of the added token. * @param {number} config.id The id of the added token. * @param {boolean} [config.single_word=false] Whether this token must be a single word or can break words. * @param {boolean} [config.lstrip=false] Whether this token should strip whitespaces on its left. * @param {boolean} [config.rstrip=false] Whether this token should strip whitespaces on its right. * @param {boolean} [config.normalized=false] Whether this token should be normalized. * @param {boolean} [config.special=false] Whether this token is special. */ constructor(config: { content: string; id: number; single_word?: boolean; lstrip?: boolean; rstrip?: boolean; normalized?: boolean; special?: boolean; }); content: string; id: number; single_word: boolean; lstrip: boolean; rstrip: boolean; special: boolean; normalized: boolean; } import { DictionarySplitter } from './utils/data-structures.js'; import { Tensor } from './utils/tensor.js'; export {}; //# sourceMappingURL=tokenizers.d.ts.map