@agnai/web-tokenizers
Version:
| [NPM Package](https://www.npmjs.com/package/@mlc-ai/web-tokenizers) | [WebLLM](https://github.com/mlc-ai/web-llm) |
65 lines • 1.9 kB
TypeScript
/**
* A universal tokenizer that is backed by either
* HF tokenizers rust library or sentencepiece
*/
export declare class Tokenizer {
private handle;
private constructor();
/**
* Dispose this tokenizer.
*
* Call this function when we no longer needs to
*/
dispose(): void;
/**
* Encode text to token ids.
*
* @param text Input text.
* @returns The output tokens
*/
encode(text: string): Int32Array;
/**
* Decode the token ids into string.
*
* @param ids the input ids.
* @returns The decoded string.
*/
decode(ids: Int32Array): string;
/**
* Returns the vocabulary size. Special tokens are considered.
*
* @returns Vocab size.
*/
getVocabSize(): number;
/**
* Convert the given id to its corresponding token if it exists. If not, return an empty string.
*
* @param id the input id.
* @returns The decoded string.
*/
idToToken(id: number): string;
/**
* Create a tokenizer from jsonArrayBuffer
*
* @param json The input array buffer that contains json text.
* @returns The tokenizer
*/
static fromJSON(json: ArrayBuffer): Promise<Tokenizer>;
/**
* Create a tokenizer from byte-level BPE blobs.
*
* @param vocab The vocab blob.
* @param merges The merges blob.
* @param addedTokens The addedTokens blob
* @returns The tokenizer
*/
static fromByteLevelBPE(vocab: ArrayBuffer, merges: ArrayBuffer, addedTokens?: string): Promise<Tokenizer>;
/**
* Create a tokenizer from sentencepiece model.
*
* @param model The model blob.
* @returns The tokenizer
*/
static fromSentencePiece(model: ArrayBuffer): Promise<Tokenizer>;
}
//# sourceMappingURL=tokenizers.d.ts.map