UNPKG

chonkie

Version:

🦛 CHONK your texts in TS with Chonkie! ✨ The no-nonsense, lightweight, and efficient chunking library.

449 lines • 18.8 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.Tokenizer = exports.WordTokenizer = exports.CharacterTokenizer = exports.BaseTokenizer = void 0; const transformers_1 = require("@huggingface/transformers"); /** * @abstract * Base class for tokenizers, providing a foundational structure for text tokenization. * It manages a vocabulary of tokens and their corresponding IDs. * This class is intended to be extended by specific tokenizer implementations * (e.g., CharacterTokenizer, WordTokenizer). * * @property {string[]} vocab - An array storing the tokens in the vocabulary. * @property {Map<string, number>} token2id - A map that stores the mapping from tokens (strings) to their unique IDs (numbers). */ class BaseTokenizer { /** * Initializes the BaseTokenizer. * Sets up an empty vocabulary (`this.vocab`) and an empty token-to-ID map (`this.token2id`). * It also adds a space character (" ") as the initial token to the vocabulary. */ constructor() { this.vocab = []; this.token2id = new Map(); this.addTokenToVocab(" "); // Add space to the vocabulary } /** * Adds a token to the vocabulary if it's not already present. * @param token The token to add. * @returns The ID of the token. 
*/ addTokenToVocab(token) { if (!this.token2id.has(token)) { this.token2id.set(token, this.vocab.length); this.vocab.push(token); } return this.token2id.get(token); } /** * Return a string representation of the BaseTokenizer. * @returns String representation. */ toString() { return `${this.constructor.name}(vocab_size=${this.vocab.length})`; } /** * Return the vocabulary. * @returns The vocabulary. */ getVocab() { return this.vocab; } /** * Return token-to-id mapping. * @returns The token-to-ID map. */ getToken2id() { return this.token2id; } /** * Batch encode a list of texts into tokens. * @param texts The texts to encode. * @returns List of encoded sequences. */ encodeBatch(texts) { return texts.map((text) => this.encode(text)); } /** * Batch decode a list of tokens back into text. * @param tokenSequences The tokens to decode. * @returns List of decoded texts. */ decodeBatch(tokenSequences) { return tokenSequences.map((tokens) => this.decode(tokens)); } /** * Count the number of tokens in a batch of texts. * @param texts The texts to count tokens in. * @returns List of token counts. */ countTokensBatch(texts) { return texts.map((text) => this.countTokens(text)); } } exports.BaseTokenizer = BaseTokenizer; /** * Character-based tokenizer. */ class CharacterTokenizer extends BaseTokenizer { /** * Encode the given text into tokens. * @param text The text to encode. * @returns Encoded sequence of character IDs. */ encode(text) { const encoded = []; for (const char of text) { const id = this.addTokenToVocab(char); encoded.push(id); } return encoded; } /** * Decode the given tokens back into text. * @param tokens The tokens to decode. * @returns Decoded text. */ decode(tokens) { try { return tokens.map((token) => this.vocab[token]).join(""); } catch (e) { throw new Error(`Decoding failed. Tokens: [${tokens.join(", ")}] not found in vocab.`); } } /** * Count the number of tokens in the given text. * For CharacterTokenizer, this is the length of the text. 
* @param text The text to count tokens in. * @returns Number of characters (tokens). */ countTokens(text) { return text.length; } } exports.CharacterTokenizer = CharacterTokenizer; /** * Word-based tokenizer. */ class WordTokenizer extends BaseTokenizer { /** * Tokenize the given text into words. * Splits the text by spaces. * @param text The text to tokenize. * @returns List of word tokens. */ tokenize(text) { return text.split(" "); } /** * Encode the given text into tokens. * @param text The text to encode. * @returns Encoded sequence of word IDs. */ encode(text) { const encoded = []; for (const token of this.tokenize(text)) { const id = this.addTokenToVocab(token); encoded.push(id); } return encoded; } /** * Decode token ids back to text. * Joins tokens with spaces. * @param tokens The tokens to decode. * @returns Decoded text. */ decode(tokens) { try { return tokens.map((token) => this.vocab[token]).join(" "); } catch (e) { throw new Error(`Decoding failed. Tokens: [${tokens.join(", ")}] not found in vocab.`); } } /** * Count the number of tokens in the given text. * For WordTokenizer, this is the number of words after splitting by space. * @param text The text to count tokens in. * @returns Number of words (tokens). */ countTokens(text) { return this.tokenize(text).length; } } exports.WordTokenizer = WordTokenizer; /** * Unified tokenizer interface for Chonkie. * This class provides a consistent API for various tokenization backends. */ class Tokenizer { /** * Private constructor. Use `Tokenizer.create()` to instantiate. * @param tokenizerInstance The underlying tokenizer instance. * @param backend The name of the backend being used. */ constructor(tokenizerInstance, backend) { this.tokenizerInstance = tokenizerInstance; this._backend = backend; } /** * Creates and initializes a Tokenizer instance. 
* @param tokenizer Tokenizer identifier (e.g., "google-bert/bert-base-uncased", "character", "word"), * a pre-initialized tokenizer instance, or a custom callable tokenizer. * Defaults to "google-bert/bert-base-uncased". * @returns A promise that resolves to a Tokenizer instance. * @throws Error if the specified tokenizer cannot be loaded or is unsupported. */ static create() { return __awaiter(this, arguments, void 0, function* (tokenizer = "google-bert/bert-base-uncased") { if (typeof tokenizer === "string") { const instance = yield Tokenizer._loadTokenizer(tokenizer); const backend = Tokenizer._getBackendFromInstance(instance); return new Tokenizer(instance, backend); } else { const backend = Tokenizer._getBackendFromInstance(tokenizer); return new Tokenizer(tokenizer, backend); } }); } /** * Loads the tokenizer based on the identifier string. * Tries loading from 'tokenizers', then 'transformers'. * Also supports 'character' and 'word' for basic tokenizers. * @param tokenizerName The name or path of the tokenizer to load. * @returns A promise that resolves to a supported tokenizer instance. * @throws Error if the tokenizer cannot be found or loaded. */ static _loadTokenizer(tokenizerName) { return __awaiter(this, void 0, void 0, function* () { if (tokenizerName === "character") { return new CharacterTokenizer(); } if (tokenizerName === "word") { return new WordTokenizer(); } try { // Try transformers library return yield transformers_1.AutoTokenizer.from_pretrained(tokenizerName); } catch (e) { throw new Error(`Tokenizer '${tokenizerName}' not found in 'tokenizers' or 'transformers'. Error: ${e.message}`); } }); } /** * Determines the backend name from a tokenizer instance. * @param instance The tokenizer instance. * @returns The backend name (e.g., "chonkie", "tokenizers", "transformers", "callable"). * @throws Error if the instance type is unsupported. 
*/ static _getBackendFromInstance(instance) { var _a; if (instance instanceof BaseTokenizer) return "chonkie"; // Check for PreTrainedTokenizer from @huggingface/transformers // This check needs to be robust as direct instanceof might not work with all bundlers/versions. // Checking for a known method like 'encode' or 'decode' specific to the class structure. if (typeof instance.encode === "function" && typeof instance.decode === "function" && (((_a = instance.constructor) === null || _a === void 0 ? void 0 : _a.name) === "PreTrainedTokenizer" || instance.name !== undefined) // Heuristic for transformers tokenizer ) { return "transformers"; } if (typeof instance.countTokens === "function") { return "callable"; } throw new Error(`Unsupported tokenizer instance type: ${typeof instance}`); } /** * Gets the name of the backend currently used by this tokenizer instance. * @returns The backend name. */ get backend() { return this._backend; } /** * Encode the text into tokens. * @param text The text to encode. * @returns A promise that resolves to an array of token IDs. * @throws Error if encoding is not supported by the backend or fails. 
*/ encode(text) { return __awaiter(this, void 0, void 0, function* () { const instance = this.tokenizerInstance; switch (this._backend) { case "chonkie": return instance.encode(text); case "transformers": // Type assertion needed as transformers' encode can return number[] or Tensor const result = instance.encode(text, { add_special_tokens: false }); // Transformers v3+ encode directly returns number[] if (Array.isArray(result)) { return result; } const resolvedResult = yield Promise.resolve(result); if (typeof resolvedResult === 'object' && resolvedResult !== null && 'input_ids' in resolvedResult) { return resolvedResult.input_ids; } // If resolvedResult is a Tensor, extract its data if (resolvedResult && typeof resolvedResult === 'object' && 'data' in resolvedResult && resolvedResult.data !== null) { // (resolvedResult as any).data is typically a TypedArray (e.g., Int32Array) or number[] return Array.from(resolvedResult.data); } console.error("Tokenizer.encode: Unexpected result structure from TransformersJsTokenizer.encode after attempting to resolve:", resolvedResult); throw new Error("Failed to convert encoding result from transformers backend to number[]."); case "callable": if (instance.encode) { return instance.encode(text); } throw new Error("Encoding not implemented for this callable tokenizer."); default: throw new Error(`Unsupported tokenizer backend: ${this._backend}`); } }); } /** * Decode the tokens back into text. * @param tokens An array of token IDs. * @returns A promise that resolves to the decoded string. * @throws Error if decoding is not supported by the backend or fails. 
*/ decode(tokens) { return __awaiter(this, void 0, void 0, function* () { const instance = this.tokenizerInstance; switch (this._backend) { case "chonkie": return instance.decode(tokens); case "transformers": return instance.decode(tokens, { skip_special_tokens: true }); case "callable": if (instance.decode) { return instance.decode(tokens); } throw new Error("Decoding not implemented for this callable tokenizer."); default: throw new Error(`Unsupported tokenizer backend: ${this._backend}`); } }); } /** * Count the number of tokens in the text. * @param text The text to count tokens in. * @returns A promise that resolves to the number of tokens. * @throws Error if token counting is not supported by the backend or fails. */ countTokens(text) { return __awaiter(this, void 0, void 0, function* () { const instance = this.tokenizerInstance; switch (this._backend) { case "chonkie": return instance.countTokens(text); case "transformers": const result = instance.encode(text, { add_special_tokens: false }); const resolvedResult = yield Promise.resolve(result); if (Array.isArray(resolvedResult)) { return resolvedResult.length; } if (typeof resolvedResult === 'object' && resolvedResult !== null && 'input_ids' in resolvedResult) { return resolvedResult.input_ids.length; } if (resolvedResult && typeof resolvedResult === 'object' && 'data' in resolvedResult && resolvedResult.data !== null) { // (resolvedResult as any).data is TypedArray or number[], .length will work return resolvedResult.data.length; } console.error("Tokenizer.countTokens: Unexpected result structure from TransformersJsTokenizer.encode after attempting to resolve:", resolvedResult); throw new Error("Failed to count tokens due to unexpected result from transformers backend."); case "callable": return instance.countTokens(text); default: throw new Error(`Unsupported tokenizer backend: ${this._backend}`); } }); } /** * Batch encode a list of texts into tokens. * @param texts An array of strings to encode. 
* @returns A promise that resolves to a list of encoded token ID sequences. * @throws Error if batch encoding is not supported by the backend or fails. */ encodeBatch(texts) { return __awaiter(this, void 0, void 0, function* () { const instance = this.tokenizerInstance; switch (this._backend) { case "chonkie": return instance.encodeBatch(texts); case "transformers": // Process each text in parallel using the existing this.encode() method. // Note: The console.log for "Batch encoding result:" is removed as 'batchEncoding' from the direct call no longer exists. return Promise.all(texts.map(text => this.encode(text))); case "callable": if (instance.encode) { return texts.map(text => instance.encode(text)); } throw new Error("Batch encoding not implemented for this callable tokenizer."); default: throw new Error(`Unsupported tokenizer backend: ${this._backend}`); } }); } /** * Batch decode a list of token sequences back into text. * @param tokenSequences An array of token ID sequences. * @returns A promise that resolves to a list of decoded strings. * @throws Error if batch decoding is not supported by the backend or fails. */ decodeBatch(tokenSequences) { return __awaiter(this, void 0, void 0, function* () { const instance = this.tokenizerInstance; switch (this._backend) { case "chonkie": return instance.decodeBatch(tokenSequences); case "transformers": return instance.batch_decode(tokenSequences, { skip_special_tokens: true }); case "callable": if (instance.decode) { return tokenSequences.map(tokens => instance.decode(tokens)); } throw new Error("Batch decoding not implemented for this callable tokenizer."); default: throw new Error(`Unsupported tokenizer backend: ${this._backend}`); } }); } /** * Count the number of tokens in a batch of texts. * @param texts An array of strings to count tokens in. * @returns A promise that resolves to a list of token counts. * @throws Error if batch token counting is not supported by the backend or fails. 
*/ countTokensBatch(texts) { return __awaiter(this, void 0, void 0, function* () { const instance = this.tokenizerInstance; switch (this._backend) { case "chonkie": return instance.countTokensBatch(texts); case "transformers": return Promise.all(texts.map(text => this.countTokens(text))); case "callable": return texts.map(text => instance.countTokens(text)); default: throw new Error(`Unsupported tokenizer backend: ${this._backend}`); } }); } } exports.Tokenizer = Tokenizer; //# sourceMappingURL=tokenizer.js.map