UNPKG

chonkie

Version:

🦛 CHONK your texts in TS with Chonkie! ✨ The no-nonsense, lightweight, and efficient chunking library.

232 lines (231 loc) • 8.76 kB
import type { PreTrainedTokenizer } from "@huggingface/transformers";

/** Alias for the tokenizer class provided by `@huggingface/transformers`. */
type TransformersJsTokenizer = PreTrainedTokenizer;

/** Function that maps a text to its token count. */
type CallableTokenizerCounter = (text: string) => number;

/** Function that maps a text to a sequence of token IDs. */
type CallableTokenizerEncoder = (text: string) => number[];

/** Function that maps a sequence of token IDs back to text. */
type CallableTokenizerDecoder = (tokens: number[]) => string;

/** Pre-initialized tokenizer instances accepted by `Tokenizer.create()`. */
type SupportedTokenizerInstance = BaseTokenizer | TransformersJsTokenizer;

/**
 * A custom tokenizer supplied as plain functions.
 * Only `countTokens` is required; `encode` and `decode` are optional.
 */
export type CallableTokenizer = {
    encode?: CallableTokenizerEncoder;
    decode?: CallableTokenizerDecoder;
    countTokens: CallableTokenizerCounter;
};

/**
 * Base class for tokenizers, providing a foundational structure for text tokenization.
 * It manages a vocabulary of tokens and their corresponding IDs.
 * This class is abstract and is intended to be extended by specific tokenizer
 * implementations (e.g., CharacterTokenizer, WordTokenizer).
 */
export declare abstract class BaseTokenizer {
    /** An array storing the tokens in the vocabulary. */
    vocab: string[];
    /** A map from tokens (strings) to their unique IDs (numbers). */
    token2id: Map<string, number>;
    /**
     * Initializes the BaseTokenizer.
     * Sets up an empty vocabulary (`this.vocab`) and an empty token-to-ID map (`this.token2id`).
     * It also adds a space character (" ") as the initial token to the vocabulary.
     */
    constructor();
    /**
     * Adds a token to the vocabulary if it's not already present.
     * @param token The token to add.
     * @returns The ID of the token.
     */
    protected addTokenToVocab(token: string): number;
    /**
     * Return a string representation of the BaseTokenizer.
     * @returns String representation.
     */
    toString(): string;
    /**
     * Return the vocabulary.
     * @returns The vocabulary, typed as read-only.
     */
    getVocab(): readonly string[];
    /**
     * Return the token-to-ID mapping.
     * @returns The token-to-ID map, typed as read-only.
     */
    getToken2id(): ReadonlyMap<string, number>;
    /**
     * Encode the given text into tokens.
     * @param text The text to encode.
     * @returns Encoded sequence of token IDs.
     */
    abstract encode(text: string): number[];
    /**
     * Decode the given tokens back into text.
     * @param tokens The tokens to decode.
     * @returns Decoded text.
     */
    abstract decode(tokens: number[]): string;
    /**
     * Count the number of tokens in the given text.
     * @param text The text to count tokens in.
     * @returns Number of tokens.
     */
    abstract countTokens(text: string): number;
    /**
     * Batch encode a list of texts into tokens.
     * @param texts The texts to encode.
     * @returns List of encoded sequences.
     */
    encodeBatch(texts: string[]): number[][];
    /**
     * Batch decode a list of token sequences back into text.
     * @param tokenSequences The token sequences to decode.
     * @returns List of decoded texts.
     */
    decodeBatch(tokenSequences: number[][]): string[];
    /**
     * Count the number of tokens in a batch of texts.
     * @param texts The texts to count tokens in.
     * @returns List of token counts.
     */
    countTokensBatch(texts: string[]): number[];
}

/**
 * Character-based tokenizer.
 */
export declare class CharacterTokenizer extends BaseTokenizer {
    /**
     * Encode the given text into tokens.
     * @param text The text to encode.
     * @returns Encoded sequence of character IDs.
     */
    encode(text: string): number[];
    /**
     * Decode the given tokens back into text.
     * @param tokens The tokens to decode.
     * @returns Decoded text.
     */
    decode(tokens: number[]): string;
    /**
     * Count the number of tokens in the given text.
     * For CharacterTokenizer, this is the length of the text.
     * @param text The text to count tokens in.
     * @returns Number of characters (tokens).
     */
    countTokens(text: string): number;
}

/**
 * Word-based tokenizer.
 */
export declare class WordTokenizer extends BaseTokenizer {
    /**
     * Tokenize the given text into words.
     * Splits the text by spaces.
     * @param text The text to tokenize.
     * @returns List of word tokens.
     */
    tokenize(text: string): string[];
    /**
     * Encode the given text into tokens.
     * @param text The text to encode.
     * @returns Encoded sequence of word IDs.
     */
    encode(text: string): number[];
    /**
     * Decode token IDs back to text.
     * Joins tokens with spaces.
     * @param tokens The tokens to decode.
     * @returns Decoded text.
     */
    decode(tokens: number[]): string;
    /**
     * Count the number of tokens in the given text.
     * For WordTokenizer, this is the number of words after splitting by space.
     * @param text The text to count tokens in.
     * @returns Number of words (tokens).
     */
    countTokens(text: string): number;
}

/**
 * Unified tokenizer interface for Chonkie.
 * This class provides a consistent API for various tokenization backends.
 * Unlike the `BaseTokenizer` family, every operation here returns a promise.
 */
export declare class Tokenizer {
    private tokenizerInstance;
    private _backend;
    /**
     * Private constructor. Use `Tokenizer.create()` to instantiate.
     *
     * The implementation receives the underlying tokenizer instance and the
     * name of the backend being used; no parameters are listed in this
     * declaration, so they are described here in prose instead of `@param` tags.
     */
    private constructor();
    /**
     * Creates and initializes a Tokenizer instance.
     * @param tokenizer Tokenizer identifier (e.g., "google-bert/bert-base-uncased", "character", "word"),
     * a pre-initialized tokenizer instance, or a custom callable tokenizer.
     * Defaults to "google-bert/bert-base-uncased".
     * @returns A promise that resolves to a Tokenizer instance.
     * @throws Error if the specified tokenizer cannot be loaded or is unsupported.
     */
    static create(tokenizer?: string | SupportedTokenizerInstance | CallableTokenizer): Promise<Tokenizer>;
    /**
     * Loads the tokenizer based on the identifier string.
     * Tries loading from 'tokenizers', then 'transformers'.
     * Also supports 'character' and 'word' for basic tokenizers.
     *
     * Takes the name or path of the tokenizer to load and yields a promise
     * that resolves to a supported tokenizer instance.
     * @throws Error if the tokenizer cannot be found or loaded.
     */
    private static _loadTokenizer;
    /**
     * Determines the backend name from a tokenizer instance.
     *
     * Takes the tokenizer instance and yields the backend name
     * (e.g., "chonkie", "tokenizers", "transformers", "callable").
     * @throws Error if the instance type is unsupported.
     */
    private static _getBackendFromInstance;
    /**
     * Gets the name of the backend currently used by this tokenizer instance.
     * @returns The backend name.
     */
    get backend(): string;
    /**
     * Encode the text into tokens.
     * @param text The text to encode.
     * @returns A promise that resolves to an array of token IDs.
     * @throws Error if encoding is not supported by the backend or fails.
     */
    encode(text: string): Promise<number[]>;
    /**
     * Decode the tokens back into text.
     * @param tokens An array of token IDs.
     * @returns A promise that resolves to the decoded string.
     * @throws Error if decoding is not supported by the backend or fails.
     */
    decode(tokens: number[]): Promise<string>;
    /**
     * Count the number of tokens in the text.
     * @param text The text to count tokens in.
     * @returns A promise that resolves to the number of tokens.
     * @throws Error if token counting is not supported by the backend or fails.
     */
    countTokens(text: string): Promise<number>;
    /**
     * Batch encode a list of texts into tokens.
     * @param texts An array of strings to encode.
     * @returns A promise that resolves to a list of encoded token ID sequences.
     * @throws Error if batch encoding is not supported by the backend or fails.
     */
    encodeBatch(texts: string[]): Promise<number[][]>;
    /**
     * Batch decode a list of token sequences back into text.
     * @param tokenSequences An array of token ID sequences.
     * @returns A promise that resolves to a list of decoded strings.
     * @throws Error if batch decoding is not supported by the backend or fails.
     */
    decodeBatch(tokenSequences: number[][]): Promise<string[]>;
    /**
     * Count the number of tokens in a batch of texts.
     * @param texts An array of strings to count tokens in.
     * @returns A promise that resolves to a list of token counts.
     * @throws Error if batch token counting is not supported by the backend or fails.
     */
    countTokensBatch(texts: string[]): Promise<number[]>;
}
export {};