chonkie
Version:
🦛 CHONK your texts in TS with Chonkie! ✨ The no-nonsense, lightweight, and efficient chunking library.
232 lines (231 loc) • 8.76 kB
TypeScript
import { PreTrainedTokenizer } from "@huggingface/transformers";
/** Local alias for the `PreTrainedTokenizer` class imported from `@huggingface/transformers`. */
type TransformersJsTokenizer = PreTrainedTokenizer;
/** Function signature: takes a text and returns a single number (its token count). */
type CallableTokenizerCounter = (text: string) => number;
/** Function signature: takes a text and returns an array of numbers (token IDs). */
type CallableTokenizerEncoder = (text: string) => number[];
/** Function signature: takes an array of numbers (token IDs) and returns a string. */
type CallableTokenizerDecoder = (tokens: number[]) => string;
/** A tokenizer object: either a `BaseTokenizer` subclass instance or a transformers.js tokenizer. */
type SupportedTokenizerInstance = BaseTokenizer | TransformersJsTokenizer;
/**
* A custom tokenizer supplied as a plain object of functions.
* Only `countTokens` is required; `encode` and `decode` may be omitted.
*/
export type CallableTokenizer = {
/** Optional: converts a text into token IDs. */
encode?: CallableTokenizerEncoder;
/** Optional: converts token IDs back into text. */
decode?: CallableTokenizerDecoder;
/** Required: returns the number of tokens in a text. */
countTokens: CallableTokenizerCounter;
};
/**
* Abstract base class for tokenizers, providing a foundational structure for text tokenization.
* It manages a vocabulary of tokens and their corresponding IDs.
* This class is intended to be extended by specific tokenizer implementations
* (e.g., CharacterTokenizer, WordTokenizer).
*
* Subclasses must implement `encode`, `decode`, and `countTokens`. The batch
* variants (`encodeBatch`, `decodeBatch`, `countTokensBatch`) are declared here
* as non-abstract methods and are therefore inherited.
*
* NOTE: this is an ambient declaration; only signatures are visible here, not method bodies.
*/
export declare abstract class BaseTokenizer {
/** Array storing the tokens in the vocabulary. */
vocab: string[];
/** Map from tokens (strings) to their unique IDs (numbers). */
token2id: Map<string, number>;
/**
* Initializes the BaseTokenizer.
* Sets up an empty vocabulary (`this.vocab`) and an empty token-to-ID map (`this.token2id`).
* It also adds a space character (" ") as the initial token to the vocabulary.
*/
constructor();
/**
* Adds a token to the vocabulary if it's not already present.
* Protected: callable only from this class and its subclasses.
* @param token The token to add.
* @returns The ID of the token.
*/
protected addTokenToVocab(token: string): number;
/**
* Return a string representation of the BaseTokenizer.
* @returns String representation.
*/
toString(): string;
/**
* Return the vocabulary.
* The return type is read-only; whether this is a copy or the live `vocab`
* array is not visible from this declaration.
* @returns The vocabulary.
*/
getVocab(): readonly string[];
/**
* Return token-to-id mapping.
* The return type is read-only; whether this is a copy or the live `token2id`
* map is not visible from this declaration.
* @returns The token-to-ID map.
*/
getToken2id(): ReadonlyMap<string, number>;
/**
* Encode the given text into tokens. Must be implemented by subclasses.
* @param text The text to encode.
* @returns Encoded sequence of token IDs.
*/
abstract encode(text: string): number[];
/**
* Decode the given tokens back into text. Must be implemented by subclasses.
* @param tokens The tokens to decode.
* @returns Decoded text.
*/
abstract decode(tokens: number[]): string;
/**
* Count the number of tokens in the given text. Must be implemented by subclasses.
* @param text The text to count tokens in.
* @returns Number of tokens.
*/
abstract countTokens(text: string): number;
/**
* Batch encode a list of texts into tokens.
* @param texts The texts to encode.
* @returns List of encoded sequences, one `number[]` per input text.
*/
encodeBatch(texts: string[]): number[][];
/**
* Batch decode a list of tokens back into text.
* @param tokenSequences The token ID sequences to decode.
* @returns List of decoded texts.
*/
decodeBatch(tokenSequences: number[][]): string[];
/**
* Count the number of tokens in a batch of texts.
* @param texts The texts to count tokens in.
* @returns List of token counts.
*/
countTokensBatch(texts: string[]): number[];
}
/**
* Character-based tokenizer.
* Extends `BaseTokenizer` and declares the three methods that are abstract on the
* base class: `encode`, `decode`, and `countTokens`.
*/
export declare class CharacterTokenizer extends BaseTokenizer {
/**
* Encode the given text into tokens.
* @param text The text to encode.
* @returns Encoded sequence of character IDs.
*/
encode(text: string): number[];
/**
* Decode the given tokens back into text.
* @param tokens The character IDs to decode.
* @returns Decoded text.
*/
decode(tokens: number[]): string;
/**
* Count the number of tokens in the given text.
* For CharacterTokenizer, this is the length of the text.
* @param text The text to count tokens in.
* @returns Number of characters (tokens).
*/
countTokens(text: string): number;
}
/**
* Word-based tokenizer.
* Extends `BaseTokenizer`, declares the three methods that are abstract on the
* base class (`encode`, `decode`, `countTokens`), and adds a public `tokenize`
* method that returns the word strings rather than their IDs.
*/
export declare class WordTokenizer extends BaseTokenizer {
/**
* Tokenize the given text into words.
* Splits the text by spaces.
* @param text The text to tokenize.
* @returns List of word tokens.
*/
tokenize(text: string): string[];
/**
* Encode the given text into tokens.
* @param text The text to encode.
* @returns Encoded sequence of word IDs.
*/
encode(text: string): number[];
/**
* Decode token ids back to text.
* Joins tokens with spaces.
* @param tokens The word IDs to decode.
* @returns Decoded text.
*/
decode(tokens: number[]): string;
/**
* Count the number of tokens in the given text.
* For WordTokenizer, this is the number of words after splitting by space.
* @param text The text to count tokens in.
* @returns Number of words (tokens).
*/
countTokens(text: string): number;
}
/**
* Unified tokenizer interface for Chonkie.
* This class provides a consistent API for various tokenization backends.
*
* Instances cannot be built with `new` (the constructor is private); use the
* asynchronous factory `Tokenizer.create()` instead. Every operation on an
* instance returns a Promise.
*/
export declare class Tokenizer {
/** The underlying tokenizer instance (its type is erased in this declaration). */
private tokenizerInstance;
/** Name of the backend in use; exposed read-only through the `backend` getter. */
private _backend;
/**
* Private constructor. Use `Tokenizer.create()` to instantiate.
* @param tokenizerInstance The underlying tokenizer instance.
* @param backend The name of the backend being used.
*/
private constructor();
/**
* Creates and initializes a Tokenizer instance.
* @param tokenizer Tokenizer identifier (e.g., "google-bert/bert-base-uncased", "character", "word"),
* a pre-initialized tokenizer instance, or a custom callable tokenizer.
* Defaults to "google-bert/bert-base-uncased".
* @returns A promise that resolves to a Tokenizer instance.
* @throws Error if the specified tokenizer cannot be loaded or is unsupported.
*/
static create(tokenizer?: string | SupportedTokenizerInstance | CallableTokenizer): Promise<Tokenizer>;
/**
* Loads the tokenizer based on the identifier string.
* Tries loading from 'tokenizers', then 'transformers'.
* Also supports 'character' and 'word' for basic tokenizers.
*
* NOTE(review): this declaration file imports only `@huggingface/transformers`;
* confirm against the implementation whether a separate 'tokenizers' backend
* is actually attempted.
* @param tokenizerName The name or path of the tokenizer to load.
* @returns A promise that resolves to a supported tokenizer instance.
* @throws Error if the tokenizer cannot be found or loaded.
*/
private static _loadTokenizer;
/**
* Determines the backend name from a tokenizer instance.
* @param instance The tokenizer instance.
* @returns The backend name (e.g., "chonkie", "tokenizers", "transformers", "callable").
* @throws Error if the instance type is unsupported.
*/
private static _getBackendFromInstance;
/**
* Gets the name of the backend currently used by this tokenizer instance.
* Declared as a getter only, so it is read-only to callers.
* @returns The backend name.
*/
get backend(): string;
/**
* Encode the text into tokens.
* @param text The text to encode.
* @returns A promise that resolves to an array of token IDs.
* @throws Error if encoding is not supported by the backend or fails.
*/
encode(text: string): Promise<number[]>;
/**
* Decode the tokens back into text.
* @param tokens An array of token IDs.
* @returns A promise that resolves to the decoded string.
* @throws Error if decoding is not supported by the backend or fails.
*/
decode(tokens: number[]): Promise<string>;
/**
* Count the number of tokens in the text.
* @param text The text to count tokens in.
* @returns A promise that resolves to the number of tokens.
* @throws Error if token counting is not supported by the backend or fails.
*/
countTokens(text: string): Promise<number>;
/**
* Batch encode a list of texts into tokens.
* @param texts An array of strings to encode.
* @returns A promise that resolves to a list of encoded token ID sequences.
* @throws Error if batch encoding is not supported by the backend or fails.
*/
encodeBatch(texts: string[]): Promise<number[][]>;
/**
* Batch decode a list of token sequences back into text.
* @param tokenSequences An array of token ID sequences.
* @returns A promise that resolves to a list of decoded strings.
* @throws Error if batch decoding is not supported by the backend or fails.
*/
decodeBatch(tokenSequences: number[][]): Promise<string[]>;
/**
* Count the number of tokens in a batch of texts.
* @param texts An array of strings to count tokens in.
* @returns A promise that resolves to a list of token counts.
* @throws Error if batch token counting is not supported by the backend or fails.
*/
countTokensBatch(texts: string[]): Promise<number[]>;
}
export {};