chonkie
Version:
🦛 CHONK your texts in TS with Chonkie! ✨ The no-nonsense, lightweight, and efficient chunking library.
449 lines • 18.8 kB
JavaScript
"use strict";
/**
 * Async helper that drives a generator function as if it were an `async` function.
 * (Presumably the helper emitted by the TypeScript compiler for pre-ES2017 targets — confirm against the build config.)
 *
 * An already-defined `this.__awaiter` is reused when present; otherwise the fallback below is installed.
 *
 * @param thisArg The `this` value the generator function is invoked with.
 * @param _arguments Arguments forwarded to the generator function (treated as `[]` when falsy).
 * @param P Promise constructor to use; falls back to the global `Promise` when falsy.
 * @param generator Generator function whose `yield`ed values are awaited.
 * @returns A promise (instance of `P`) that settles with the generator's return value or thrown error.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a `P` promise unless it already is one.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the fulfilled value; anything it throws rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-throw a rejection inside the generator so its own try/catch blocks can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finished generator -> resolve the outer promise; otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator (note: `generator` is rebound from the function to the iterator) and start it.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = exports.WordTokenizer = exports.CharacterTokenizer = exports.BaseTokenizer = void 0;
const transformers_1 = require("@huggingface/transformers");
/**
 * @abstract
 * Shared foundation for the built-in tokenizers (e.g., CharacterTokenizer, WordTokenizer).
 * It owns the vocabulary bookkeeping and the batch helpers; concrete subclasses
 * are expected to supply `encode`, `decode` and `countTokens`, which the batch
 * helpers call.
 *
 * @property {string[]} vocab - Tokens in insertion order; a token's index is its ID.
 * @property {Map<string, number>} token2id - Reverse lookup from token to its ID.
 */
class BaseTokenizer {
    /**
     * Creates an empty vocabulary and token-to-ID map, then registers the
     * space character (" ") so it always receives ID 0.
     */
    constructor() {
        this.vocab = [];
        this.token2id = new Map();
        this.addTokenToVocab(" ");
    }

    /**
     * Registers a token in the vocabulary unless it is already known.
     * @param token The token to register.
     * @returns The ID of the token (existing or newly assigned).
     */
    addTokenToVocab(token) {
        const lookup = this.token2id;
        if (lookup.has(token)) {
            return lookup.get(token);
        }
        // The next free ID is always the current vocabulary size.
        const freshId = this.vocab.length;
        lookup.set(token, freshId);
        this.vocab.push(token);
        return freshId;
    }

    /**
     * Human-readable summary: the concrete class name plus vocabulary size.
     * @returns String representation.
     */
    toString() {
        const className = this.constructor.name;
        const size = this.vocab.length;
        return `${className}(vocab_size=${size})`;
    }

    /**
     * Accessor for the vocabulary array (the live array, not a copy).
     * @returns The vocabulary.
     */
    getVocab() {
        return this.vocab;
    }

    /**
     * Accessor for the token-to-ID map (the live map, not a copy).
     * @returns The token-to-ID map.
     */
    getToken2id() {
        return this.token2id;
    }

    /**
     * Encodes every text in the list via `this.encode`.
     * @param texts The texts to encode.
     * @returns List of encoded sequences, in input order.
     */
    encodeBatch(texts) {
        const encodeOne = (text) => this.encode(text);
        return texts.map(encodeOne);
    }

    /**
     * Decodes every token sequence in the list via `this.decode`.
     * @param tokenSequences The token sequences to decode.
     * @returns List of decoded texts, in input order.
     */
    decodeBatch(tokenSequences) {
        const decodeOne = (tokens) => this.decode(tokens);
        return tokenSequences.map(decodeOne);
    }

    /**
     * Counts tokens for every text in the list via `this.countTokens`.
     * @param texts The texts to count tokens in.
     * @returns List of token counts, in input order.
     */
    countTokensBatch(texts) {
        const countOne = (text) => this.countTokens(text);
        return texts.map(countOne);
    }
}
exports.BaseTokenizer = BaseTokenizer;
/**
 * Character-based tokenizer.
 * Every Unicode code point of the input is one token; IDs are assigned on
 * first sight through the inherited vocabulary.
 */
class CharacterTokenizer extends BaseTokenizer {
    /**
     * Encode the given text into tokens.
     * Unseen characters are added to the vocabulary as a side effect.
     * @param text The text to encode.
     * @returns Encoded sequence of character IDs.
     */
    encode(text) {
        const encoded = [];
        // String iteration walks code points, so surrogate pairs stay together.
        for (const char of text) {
            encoded.push(this.addTokenToVocab(char));
        }
        return encoded;
    }

    /**
     * Decode the given tokens back into text.
     * @param tokens The tokens to decode.
     * @returns Decoded text.
     * @throws Error if any token ID has no entry in the vocabulary.
     */
    decode(tokens) {
        const chars = tokens.map((token) => {
            const char = this.vocab[token];
            // An out-of-range index yields `undefined`, which `join` would silently
            // render as "" — check explicitly so unknown IDs are reported.
            if (typeof char !== "string") {
                throw new Error(`Decoding failed. Tokens: [${tokens.join(", ")}] not found in vocab.`);
            }
            return char;
        });
        return chars.join("");
    }

    /**
     * Count the number of tokens in the given text.
     * This is the number of Unicode code points, so it always equals
     * `encode(text).length` (unlike `text.length`, which counts UTF-16 code units).
     * @param text The text to count tokens in.
     * @returns Number of characters (tokens).
     */
    countTokens(text) {
        let count = 0;
        for (const _char of text) {
            count += 1;
        }
        return count;
    }
}
exports.CharacterTokenizer = CharacterTokenizer;
/**
 * Word-based tokenizer.
 * Words are the pieces obtained by splitting on a single space character;
 * IDs are assigned on first sight through the inherited vocabulary.
 */
class WordTokenizer extends BaseTokenizer {
    /**
     * Tokenize the given text into words.
     * Splits on the single space character only, so consecutive spaces produce
     * empty-string tokens and an empty text produces one empty token.
     * @param text The text to tokenize.
     * @returns List of word tokens.
     */
    tokenize(text) {
        return text.split(" ");
    }

    /**
     * Encode the given text into tokens.
     * Unseen words are added to the vocabulary as a side effect.
     * @param text The text to encode.
     * @returns Encoded sequence of word IDs.
     */
    encode(text) {
        const encoded = [];
        for (const token of this.tokenize(text)) {
            encoded.push(this.addTokenToVocab(token));
        }
        return encoded;
    }

    /**
     * Decode token ids back to text.
     * Joins tokens with spaces.
     * @param tokens The tokens to decode.
     * @returns Decoded text.
     * @throws Error if any token ID has no entry in the vocabulary.
     */
    decode(tokens) {
        const words = tokens.map((token) => {
            const word = this.vocab[token];
            // An out-of-range index yields `undefined`, which `join` would silently
            // render as an empty word — check explicitly so unknown IDs are reported.
            if (typeof word !== "string") {
                throw new Error(`Decoding failed. Tokens: [${tokens.join(", ")}] not found in vocab.`);
            }
            return word;
        });
        return words.join(" ");
    }

    /**
     * Count the number of tokens in the given text.
     * For WordTokenizer, this is the number of words after splitting by space.
     * @param text The text to count tokens in.
     * @returns Number of words (tokens).
     */
    countTokens(text) {
        return this.tokenize(text).length;
    }
}
exports.WordTokenizer = WordTokenizer;
/**
 * Unified tokenizer interface for Chonkie.
 * Wraps one of three kinds of tokenizer behind a single promise-based API:
 *  - "chonkie": an instance of BaseTokenizer (CharacterTokenizer, WordTokenizer, ...),
 *  - "transformers": a tokenizer loaded through `@huggingface/transformers`,
 *  - "callable": any object exposing at least `countTokens(text)`.
 */
class Tokenizer {
    /**
     * Private constructor. Use `Tokenizer.create()` to instantiate.
     * @param tokenizerInstance The underlying tokenizer instance.
     * @param backend The name of the backend being used.
     */
    constructor(tokenizerInstance, backend) {
        this.tokenizerInstance = tokenizerInstance;
        this._backend = backend;
    }

    /**
     * Creates and initializes a Tokenizer instance.
     * @param tokenizer Tokenizer identifier (e.g., "google-bert/bert-base-uncased", "character", "word"),
     * a pre-initialized tokenizer instance, or a custom tokenizer object.
     * Defaults to "google-bert/bert-base-uncased".
     * @returns A promise that resolves to a Tokenizer instance.
     * @throws Error if the specified tokenizer cannot be loaded or is unsupported.
     */
    static create() {
        return __awaiter(this, arguments, void 0, function* (tokenizer = "google-bert/bert-base-uncased") {
            if (typeof tokenizer === "string") {
                const instance = yield Tokenizer._loadTokenizer(tokenizer);
                const backend = Tokenizer._getBackendFromInstance(instance);
                return new Tokenizer(instance, backend);
            }
            else {
                const backend = Tokenizer._getBackendFromInstance(tokenizer);
                return new Tokenizer(tokenizer, backend);
            }
        });
    }

    /**
     * Loads the tokenizer based on the identifier string.
     * "character" and "word" map to the built-in tokenizers; any other name is
     * handed to `AutoTokenizer.from_pretrained` from `@huggingface/transformers`
     * (that is the only external loader attempted).
     * @param tokenizerName The name or path of the tokenizer to load.
     * @returns A promise that resolves to a supported tokenizer instance.
     * @throws Error if the tokenizer cannot be found or loaded.
     */
    static _loadTokenizer(tokenizerName) {
        return __awaiter(this, void 0, void 0, function* () {
            if (tokenizerName === "character") {
                return new CharacterTokenizer();
            }
            if (tokenizerName === "word") {
                return new WordTokenizer();
            }
            try {
                return yield transformers_1.AutoTokenizer.from_pretrained(tokenizerName);
            }
            catch (e) {
                // Tolerate thrown values that are not Error-like so the wrapper itself cannot fail.
                const reason = (e !== null && e !== undefined && e.message !== undefined) ? e.message : String(e);
                // Message text is kept as-is for callers that match on it.
                throw new Error(`Tokenizer '${tokenizerName}' not found in 'tokenizers' or 'transformers'. Error: ${reason}`);
            }
        });
    }

    /**
     * Determines the backend name from a tokenizer instance.
     * @param instance The tokenizer instance.
     * @returns The backend name ("chonkie", "transformers" or "callable").
     * @throws Error if the instance type is unsupported (including `null`/`undefined`).
     */
    static _getBackendFromInstance(instance) {
        // Guard first: property access on null/undefined would raise a raw TypeError
        // instead of the documented "unsupported" error.
        if (instance === null || instance === undefined) {
            throw new Error(`Unsupported tokenizer instance type: ${typeof instance}`);
        }
        if (instance instanceof BaseTokenizer) {
            return "chonkie";
        }
        // Duck-typed detection of a transformers tokenizer: `instanceof` is avoided
        // because it might not work with all bundlers/versions. An object with
        // encode/decode is treated as "transformers" when its constructor is named
        // "PreTrainedTokenizer" or it carries a `name` property (heuristic).
        const ctor = instance.constructor;
        const ctorName = (ctor === null || ctor === undefined) ? undefined : ctor.name;
        if (typeof instance.encode === "function" &&
            typeof instance.decode === "function" &&
            (ctorName === "PreTrainedTokenizer" || instance.name !== undefined)) {
            return "transformers";
        }
        if (typeof instance.countTokens === "function") {
            return "callable";
        }
        throw new Error(`Unsupported tokenizer instance type: ${typeof instance}`);
    }

    /**
     * Gets the name of the backend currently used by this tokenizer instance.
     * @returns The backend name.
     */
    get backend() {
        return this._backend;
    }

    /**
     * Encode the text into tokens.
     * @param text The text to encode.
     * @returns A promise that resolves to an array of token IDs.
     * @throws Error if encoding is not supported by the backend or fails.
     */
    encode(text) {
        return __awaiter(this, void 0, void 0, function* () {
            const instance = this.tokenizerInstance;
            switch (this._backend) {
                case "chonkie":
                    return instance.encode(text);
                case "transformers": {
                    const result = instance.encode(text, { add_special_tokens: false });
                    // Fast path: the backend handed back a plain array of IDs.
                    if (Array.isArray(result)) {
                        return result;
                    }
                    // Otherwise the result may be a promise and/or a wrapper object.
                    const resolvedResult = yield Promise.resolve(result);
                    if (typeof resolvedResult === 'object' && resolvedResult !== null && 'input_ids' in resolvedResult) {
                        return resolvedResult.input_ids;
                    }
                    // Tensor-like object: copy its `data` (presumably a TypedArray or number[]) into a plain array.
                    if (resolvedResult && typeof resolvedResult === 'object' && 'data' in resolvedResult && resolvedResult.data !== null) {
                        return Array.from(resolvedResult.data);
                    }
                    console.error("Tokenizer.encode: Unexpected result structure from TransformersJsTokenizer.encode after attempting to resolve:", resolvedResult);
                    throw new Error("Failed to convert encoding result from transformers backend to number[].");
                }
                case "callable":
                    if (instance.encode) {
                        return instance.encode(text);
                    }
                    throw new Error("Encoding not implemented for this callable tokenizer.");
                default:
                    throw new Error(`Unsupported tokenizer backend: ${this._backend}`);
            }
        });
    }

    /**
     * Decode the tokens back into text.
     * For the transformers backend, special tokens are skipped.
     * @param tokens An array of token IDs.
     * @returns A promise that resolves to the decoded string.
     * @throws Error if decoding is not supported by the backend or fails.
     */
    decode(tokens) {
        return __awaiter(this, void 0, void 0, function* () {
            const instance = this.tokenizerInstance;
            switch (this._backend) {
                case "chonkie":
                    return instance.decode(tokens);
                case "transformers":
                    return instance.decode(tokens, { skip_special_tokens: true });
                case "callable":
                    if (instance.decode) {
                        return instance.decode(tokens);
                    }
                    throw new Error("Decoding not implemented for this callable tokenizer.");
                default:
                    throw new Error(`Unsupported tokenizer backend: ${this._backend}`);
            }
        });
    }

    /**
     * Count the number of tokens in the text.
     * For the transformers backend the text is encoded (without special tokens)
     * and the length of the result is reported.
     * @param text The text to count tokens in.
     * @returns A promise that resolves to the number of tokens.
     * @throws Error if token counting is not supported by the backend or fails.
     */
    countTokens(text) {
        return __awaiter(this, void 0, void 0, function* () {
            const instance = this.tokenizerInstance;
            switch (this._backend) {
                case "chonkie":
                    return instance.countTokens(text);
                case "transformers": {
                    const result = instance.encode(text, { add_special_tokens: false });
                    const resolvedResult = yield Promise.resolve(result);
                    if (Array.isArray(resolvedResult)) {
                        return resolvedResult.length;
                    }
                    if (typeof resolvedResult === 'object' && resolvedResult !== null && 'input_ids' in resolvedResult) {
                        return resolvedResult.input_ids.length;
                    }
                    if (resolvedResult && typeof resolvedResult === 'object' && 'data' in resolvedResult && resolvedResult.data !== null) {
                        return resolvedResult.data.length;
                    }
                    console.error("Tokenizer.countTokens: Unexpected result structure from TransformersJsTokenizer.encode after attempting to resolve:", resolvedResult);
                    throw new Error("Failed to count tokens due to unexpected result from transformers backend.");
                }
                case "callable":
                    return instance.countTokens(text);
                default:
                    throw new Error(`Unsupported tokenizer backend: ${this._backend}`);
            }
        });
    }

    /**
     * Batch encode a list of texts into tokens.
     * @param texts An array of strings to encode.
     * @returns A promise that resolves to a list of encoded token ID sequences.
     * @throws Error if batch encoding is not supported by the backend or fails.
     */
    encodeBatch(texts) {
        return __awaiter(this, void 0, void 0, function* () {
            const instance = this.tokenizerInstance;
            switch (this._backend) {
                case "chonkie":
                    return instance.encodeBatch(texts);
                case "transformers":
                    // Reuse this.encode() per text so result normalization stays in one place.
                    return Promise.all(texts.map(text => this.encode(text)));
                case "callable":
                    if (instance.encode) {
                        // Promise.all so custom tokenizers may be sync or async;
                        // plain values pass through unchanged.
                        return Promise.all(texts.map(text => instance.encode(text)));
                    }
                    throw new Error("Batch encoding not implemented for this callable tokenizer.");
                default:
                    throw new Error(`Unsupported tokenizer backend: ${this._backend}`);
            }
        });
    }

    /**
     * Batch decode a list of token sequences back into text.
     * @param tokenSequences An array of token ID sequences.
     * @returns A promise that resolves to a list of decoded strings.
     * @throws Error if batch decoding is not supported by the backend or fails.
     */
    decodeBatch(tokenSequences) {
        return __awaiter(this, void 0, void 0, function* () {
            const instance = this.tokenizerInstance;
            switch (this._backend) {
                case "chonkie":
                    return instance.decodeBatch(tokenSequences);
                case "transformers":
                    return instance.batch_decode(tokenSequences, { skip_special_tokens: true });
                case "callable":
                    if (instance.decode) {
                        // Promise.all so custom tokenizers may be sync or async.
                        return Promise.all(tokenSequences.map(tokens => instance.decode(tokens)));
                    }
                    throw new Error("Batch decoding not implemented for this callable tokenizer.");
                default:
                    throw new Error(`Unsupported tokenizer backend: ${this._backend}`);
            }
        });
    }

    /**
     * Count the number of tokens in a batch of texts.
     * @param texts An array of strings to count tokens in.
     * @returns A promise that resolves to a list of token counts.
     * @throws Error if batch token counting is not supported by the backend or fails.
     */
    countTokensBatch(texts) {
        return __awaiter(this, void 0, void 0, function* () {
            const instance = this.tokenizerInstance;
            switch (this._backend) {
                case "chonkie":
                    return instance.countTokensBatch(texts);
                case "transformers":
                    return Promise.all(texts.map(text => this.countTokens(text)));
                case "callable":
                    // Promise.all so custom tokenizers may be sync or async.
                    return Promise.all(texts.map(text => instance.countTokens(text)));
                default:
                    throw new Error(`Unsupported tokenizer backend: ${this._backend}`);
            }
        });
    }
}
exports.Tokenizer = Tokenizer;
//# sourceMappingURL=tokenizer.js.map