@lenml/llama2-tokenizer

Version:

Our library `@lenml/llama2-tokenizer` has been deprecated. We are excited to introduce our new library `@lenml/tokenizers` as its replacement, offering a broader set of features and an enhanced experience.

github.com/lenML/llama2-tokenizer.js

lenML/llama2-tokenizer.js

1 lines • 22.8 kB

Source Map (JSON)

View Raw

{"version":3,"sources":["../src/Trie.ts","../src/tokenizer.ts"],"sourcesContent":["/**\r\n * Trie in TypeScript. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass\r\n * Loose reference https://en.wikipedia.org/wiki/Trie\r\n */\r\nexport class Trie {\r\n private data: Record<string, any>;\r\n private _tokens: Set<string>;\r\n\r\n constructor() {\r\n this.data = {};\r\n this._tokens = new Set();\r\n }\r\n\r\n /**\r\n * Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.\r\n * The special key `\"\"` is used to represent termination.\r\n *\r\n * This function is idempotent, adding twice the same word will leave the trie unchanged\r\n *\r\n * Example:\r\n *\r\n * ```typescript\r\n * const trie = new Trie();\r\n * trie.add(\"Hello 友達\");\r\n * console.log(trie.data);\r\n * // {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\r\n *\r\n * trie.add(\"Hello\");\r\n * console.log(trie.data);\r\n * // {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\"\": 1, \" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\r\n * ```\r\n */\r\n add(word: string): void {\r\n if (!word) {\r\n // Prevent empty string\r\n return;\r\n }\r\n\r\n this._tokens.add(word);\r\n let ref = this.data;\r\n for (const char of word) {\r\n ref[char] = char in ref ? ref[char] : {};\r\n ref = ref[char];\r\n }\r\n ref[\"\"] = 1;\r\n }\r\n\r\n /**\r\n * Will look for the words added to the trie within `text`. Output is the original string splitted along the\r\n * boundaries of the words found.\r\n *\r\n * This trie will match the longest possible word first !\r\n *\r\n * Example:\r\n *\r\n * ```typescript\r\n * const trie = new Trie();\r\n * console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n * // [\"[CLS] This is a extra_id_100\"]\r\n *\r\n * trie.add(\"[CLS]\");\r\n * trie.add(\"extra_id_1\");\r\n * trie.add(\"extra_id_100\");\r\n * console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n * // [\"[CLS]\", \" This is a \", \"extra_id_100\"]\r\n * ```\r\n */\r\n split(text: string): string[] {\r\n if (!text) {\r\n return [];\r\n }\r\n let states: Record<number, any> = {};\r\n\r\n // indexes are counted left of the chars index.\r\n // \"hello\", index 0, is left of h, index 1 is between h and e.\r\n // index 5 is right of the \"o\".\r\n\r\n // States are going to capture every possible start (indexes as above)\r\n // as keys, and have as values, a pointer to the position in the trie\r\n // where we're at. This is a partial match for now.\r\n // This enables to keep track of multiple matches while we're iterating\r\n // the string\r\n // If the trie contains, \"blowing\", and \"lower\" and we encounter the\r\n // string \"blower\", we need to split into [\"b\", \"lower\"].\r\n // This is where we need to keep track of multiple possible starts.\r\n const offsets: number[] = [0];\r\n\r\n // This is used by the lookahead which needs to skip over\r\n // some text where the full match exceeded the place in the initial\r\n // for loop\r\n let skip = 0;\r\n\r\n // Main loop, Giving this algorithm O(n) complexity\r\n for (let current = 0; current < text.length; current++) {\r\n if (skip && current < skip) {\r\n // Prevents the lookahead for matching twice\r\n // like extra_id_100 and id_100\r\n continue;\r\n }\r\n\r\n // This will track every state\r\n // that stop matching, we need to stop tracking them.\r\n // If we look at \"lowball\", we're going to match \"l\" (add it to states), \"o\", \"w\", then\r\n // fail on \"b\", we need to remove 0 from the valid states.\r\n let toRemove: Set<number> = new Set();\r\n // Whenever we found a match, we need to drop everything\r\n // this is a greedy algorithm, it will match on the first found token\r\n let reset = false;\r\n\r\n // In this case, we already have partial matches (But unfinished)\r\n for (let start in states) {\r\n let triePointer = states[start];\r\n if (\"\" in triePointer) {\r\n // This is a final match, we need to reset and\r\n // store the results in `offsets`.\r\n\r\n // Lookahead to match longest first\r\n // Important in case of extra_id_1 vs extra_id_100\r\n // Here we are also actively looking for other earlier partial\r\n // matches\r\n // \"[CLS]\", \"L\", we need to match CLS even if L is special\r\n let lookaheadIndex: number;\r\n let end: number;\r\n let nextChar: string | null;\r\n\r\n for (const lookStart in states) {\r\n let lookTriePointer = states[lookStart];\r\n if (parseInt(lookStart) > parseInt(start)) {\r\n // This partial match is later, we can stop looking\r\n break;\r\n } else if (parseInt(lookStart) < parseInt(start)) {\r\n // This partial match is earlier, the trie pointer\r\n // was already updated, so index is + 1\r\n lookaheadIndex = current + 1;\r\n end = current + 1;\r\n } else {\r\n // Here lookstart == start and\r\n // looktrie_pointer == trie_pointer\r\n // It wasn't updated yet so indices are current ones\r\n lookaheadIndex = current;\r\n end = current;\r\n }\r\n nextChar =\r\n lookaheadIndex < text.length ? text[lookaheadIndex] : null;\r\n if (\"\" in lookTriePointer) {\r\n start = lookStart;\r\n end = lookaheadIndex;\r\n skip = lookaheadIndex;\r\n }\r\n while (nextChar && nextChar in lookTriePointer) {\r\n lookTriePointer = lookTriePointer[nextChar];\r\n lookaheadIndex += 1;\r\n if (\"\" in lookTriePointer) {\r\n start = lookStart;\r\n end = lookaheadIndex;\r\n skip = lookaheadIndex;\r\n }\r\n\r\n if (lookaheadIndex === text.length) {\r\n // End of string\r\n break;\r\n }\r\n nextChar = text[lookaheadIndex];\r\n }\r\n // End lookahead\r\n }\r\n\r\n // Storing and resetting\r\n offsets.push(parseInt(start));\r\n offsets.push(end!);\r\n reset = true;\r\n break;\r\n } else if (text[current] in triePointer) {\r\n // The current character being looked at has a match within the trie\r\n // update the pointer (it will be stored back into states later).\r\n triePointer = triePointer[text[current]];\r\n\r\n // Storing back the new pointer into the states.\r\n // Partial matches got longer by one.\r\n states[start] = triePointer;\r\n } else {\r\n // The new character has not match in the trie, we need\r\n // to stop keeping track of this partial match.\r\n // We can't do it directly within the loop because of how\r\n // TypeScript iteration works\r\n toRemove.add(parseInt(start));\r\n }\r\n }\r\n\r\n // Either clearing the full start (we found a real match)\r\n // Or clearing only the partial matches that didn't work.\r\n if (reset) {\r\n states = {};\r\n } else {\r\n for (const start of toRemove) {\r\n delete states[start];\r\n }\r\n }\r\n\r\n // If this character is a starting character within the trie\r\n // start keeping track of this partial match.\r\n if (current >= skip && text[current] in this.data) {\r\n states[current] = this.data[text[current]];\r\n }\r\n }\r\n\r\n // We have a cut at the end with states.\r\n for (const start in states) {\r\n const triePointer = states[start];\r\n if (\"\" in triePointer) {\r\n // This is a final match, we need to reset and\r\n // store the results in `offsets`.\r\n const end = text.length;\r\n offsets.push(parseInt(start));\r\n offsets.push(end);\r\n // Longest cut is always the one with lower start so the first\r\n // item so we need to break.\r\n break;\r\n }\r\n }\r\n\r\n return this.cutText(text, offsets);\r\n }\r\n\r\n protected cutText(text: string, offsets: number[]): string[] {\r\n // We have all the offsets now, we just need to do the actual splitting.\r\n // We need to eventually add the first part of the string and the eventual\r\n // last part.\r\n offsets.push(text.length);\r\n const tokens: string[] = [];\r\n let start = 0;\r\n for (const end of offsets) {\r\n if (start > end) {\r\n console.error(\r\n \"There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway.\"\r\n );\r\n continue;\r\n } else if (start === end) {\r\n // This might happen if there's a match at index 0\r\n // we're also preventing zero-width cuts in case of two\r\n // consecutive matches\r\n continue;\r\n }\r\n tokens.push(text.slice(start, end));\r\n start = end;\r\n }\r\n\r\n return tokens;\r\n }\r\n}\r\n\r\n// test case\r\n// const main = () => {\r\n// const trie = new Trie();\r\n// console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n// // [\"[CLS] This is a extra_id_100\"]\r\n\r\n// trie.add(\"[CLS]\");\r\n// trie.add(\"extra_id_1\");\r\n// trie.add(\"extra_id_100\");\r\n// console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n// // [\"[CLS]\", \" This is a \", \"extra_id_100\"]\r\n// };\r\n\r\n// main();\r\n","import { Trie } from \"./Trie\";\r\n\r\nconst utf8Encoder = new TextEncoder();\r\nconst utf8Decoder = new TextDecoder(\"utf-8\");\r\n\r\nconst byteToHex = (byte: number) =>\r\n byte.toString(16).padStart(2, \"0\").toUpperCase();\r\n\r\nexport class Llama2Tokenizer {\r\n protected tokens_trie = new Trie();\r\n protected special_tokens: Record<string, number> = {};\r\n\r\n protected vocab: Record<string, number> = {};\r\n protected vocab_ids: Record<number, string> = {};\r\n\r\n constructor() {}\r\n\r\n /**\r\n * Install the provided vocabulary into the class instance.\r\n *\r\n * @param {Record<string, number>} vocab - The vocabulary to be installed\r\n */\r\n install_vocab(vocab: Record<string, number>) {\r\n this.vocab = vocab;\r\n this.vocab_ids = Object.fromEntries(\r\n Object.entries(vocab).map(([token, id]) => [id, token])\r\n );\r\n this.tokens_trie = new Trie();\r\n for (const [token, id] of Object.entries(vocab)) {\r\n this.tokens_trie.add(token);\r\n }\r\n }\r\n\r\n /**\r\n * Get the size of the vocabulary, including special tokens.\r\n *\r\n * @return {number} the size of the vocabulary\r\n */\r\n get vocab_size(): number {\r\n return (\r\n Object.keys(this.vocab).length + Object.keys(this.special_tokens).length\r\n );\r\n }\r\n\r\n /**\r\n * Get the maximum id from the vocab_ids and special_tokens.\r\n *\r\n * @return {number} the maximum id\r\n */\r\n get max_id(): number {\r\n // NOTE: vocab 最大可能超过 js 函数参数个数最大范围，所以不能 `Math.max(...Object.keys(this.vocab_ids))`\r\n let max_id = 0;\r\n for (const id of Object.keys(this.vocab_ids)) {\r\n max_id = Math.max(max_id, parseInt(id));\r\n }\r\n for (const id of Object.values(this.special_tokens)) {\r\n max_id = Math.max(max_id, id);\r\n }\r\n return max_id;\r\n }\r\n\r\n /**\r\n * Adds a special token with an optional token ID.\r\n *\r\n * @param {string} token - the special token to be added\r\n * @param {number} [token_id] - the optional token ID\r\n * @return {void}\r\n */\r\n add_special_token(token: string, token_id?: number) {\r\n if (token_id === undefined) {\r\n token_id = this.max_id + 1;\r\n }\r\n this.special_tokens[token] = token_id;\r\n this.tokens_trie.add(token);\r\n }\r\n\r\n /**\r\n * Adds special tokens to the list of tokens.\r\n *\r\n * @param {Array} tokens - An array of tokens to add. Each token can be a string or an object with `token` and `token_id` properties.\r\n */\r\n add_special_tokens(\r\n tokens: (\r\n | string\r\n | {\r\n token: string;\r\n token_id: number;\r\n }\r\n )[]\r\n ) {\r\n for (const token of tokens) {\r\n if (typeof token === \"string\") {\r\n this.add_special_token(token);\r\n } else {\r\n this.add_special_token(token.token, token.token_id);\r\n }\r\n }\r\n }\r\n\r\n /**\r\n * Convert an id to a token.\r\n *\r\n * @param {number} id - The id to be converted to a token.\r\n * @return {string} The corresponding token for the given id.\r\n */\r\n ids_to_token(id: number): string {\r\n const token = this.vocab_ids[id];\r\n const special_token = Object.entries(this.special_tokens).find(\r\n ([_, token_id]) => token_id === id\r\n );\r\n if (token) {\r\n return token;\r\n } else if (special_token) {\r\n return special_token[0];\r\n } else {\r\n throw new Error(`Unknown id: ${id}`);\r\n }\r\n }\r\n /**\r\n * token_to_id function takes a token as input and returns its corresponding id if found in the vocabulary, otherwise throws an error.\r\n *\r\n * @param {string} token - the input token\r\n * @return {number} the corresponding id of the input token\r\n */\r\n token_to_id(token: string): number {\r\n const id = this.vocab[token];\r\n const special_token = this.special_tokens[token];\r\n if (id !== undefined) {\r\n return id;\r\n } else if (special_token !== undefined) {\r\n return special_token;\r\n } else {\r\n throw new Error(`Unknown token: ${token}`);\r\n }\r\n }\r\n\r\n /**\r\n * Retrieve the vocabulary.\r\n *\r\n * @return {Object} a shallow copy of the vocabulary\r\n */\r\n get_vocab() {\r\n return { ...this.vocab, ...this.special_tokens };\r\n }\r\n\r\n /**\r\n * Checks if the token is a valid token.\r\n *\r\n * @param {string} token - the token to be checked\r\n * @return {boolean} true if the token is valid, false otherwise\r\n */\r\n valid_token(token: string): boolean {\r\n return token in this.vocab || token in this.special_tokens;\r\n }\r\n\r\n /**\r\n * Converts a string in a sequence of tokens, using the tokenizer.\r\n */\r\n tokenize(text: string): string[] {\r\n const tokens = this.tokens_trie.split(text);\r\n\r\n const result = [] as string[];\r\n for (const token of tokens) {\r\n if (this.valid_token(token)) {\r\n result.push(token);\r\n } else {\r\n // convert unknown unicode to <0xXX>\r\n // TODO: use a better way to handle unknown unicode (某些vocab不支持unknown unicode可能需要<unk>代替)\r\n const bytes = utf8Encoder.encode(token);\r\n for (const byte of bytes) {\r\n result.push(`<0x${byteToHex(byte)}>`);\r\n }\r\n }\r\n }\r\n\r\n return result;\r\n }\r\n\r\n /**\r\n * Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.\r\n */\r\n encode(text: string): number[] {\r\n return this.convert_tokens_to_ids(this.tokenize(text));\r\n }\r\n\r\n /**\r\n * Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens.\r\n */\r\n decode(ids: number[]): string {\r\n return this.convert_tokens_to_string(this.convert_ids_to_tokens(ids));\r\n }\r\n\r\n /**\r\n * Converts a sequence of tokens (string) in a single string.\r\n */\r\n convert_tokens_to_string(tokens: string[]): string {\r\n for (const token of tokens) {\r\n if (!this.valid_token(token)) {\r\n throw new Error(`Unknown token: ${token}`);\r\n }\r\n }\r\n const chars = [] as string[];\r\n\r\n let index = 0;\r\n while (index < tokens.length) {\r\n let token = tokens[index];\r\n index += 1;\r\n if (!token.startsWith(\"<0x\")) {\r\n chars.push(token);\r\n continue;\r\n }\r\n const bytes = [] as number[];\r\n while (token && token.startsWith(\"<0x\")) {\r\n bytes.push(parseInt(token.slice(3, 5), 16));\r\n token = tokens[index];\r\n index += 1;\r\n }\r\n chars.push(utf8Decoder.decode(new Uint8Array(bytes)));\r\n }\r\n\r\n return chars.join(\"\");\r\n }\r\n\r\n /**\r\n * Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary.\r\n */\r\n convert_tokens_to_ids(tokens: string[]): number[] {\r\n let result: number[] = [];\r\n for (const token of tokens) {\r\n const id = this.token_to_id(token);\r\n result.push(id);\r\n }\r\n return result;\r\n }\r\n\r\n /**\r\n * Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens.\r\n */\r\n convert_ids_to_tokens(ids: number[]): string[] {\r\n return ids.map((id) => {\r\n const token = this.ids_to_token(id);\r\n return token;\r\n });\r\n }\r\n}\r\n\r\n// test\r\n// const main = async () => {\r\n// const tokenizer = new Llama2Tokenizer();\r\n// tokenizer.load_llama2_vocab();\r\n// console.log(tokenizer.tokenize(\"你好，世界！\"));\r\n// // [\"你\", \"好\", \"，\", \"世\", \"界\", \"！\"]\r\n// };\r\n// main();\r\n"],"mappings":"+EAIO,IAAMA,EAAN,KAAW,CAJlB,MAIkB,CAAAC,EAAA,aACR,KACA,QAER,aAAc,CACZ,KAAK,KAAO,CAAC,EACb,KAAK,QAAU,IAAI,GACrB,CAqBA,IAAIC,EAAoB,CACtB,GAAI,CAACA,EAEH,OAGF,KAAK,QAAQ,IAAIA,CAAI,EACrB,IAAIC,EAAM,KAAK,KACf,QAAWC,KAAQF,EACjBC,EAAIC,CAAI,EAAIA,KAAQD,EAAMA,EAAIC,CAAI,EAAI,CAAC,EACvCD,EAAMA,EAAIC,CAAI,EAEhBD,EAAI,EAAE,EAAI,CACZ,CAsBA,MAAME,EAAwB,CAC5B,GAAI,CAACA,EACH,MAAO,CAAC,EAEV,IAAIC,EAA8B,CAAC,EAc7BC,EAAoB,CAAC,CAAC,EAKxBC,EAAO,EAGX,QAASC,EAAU,EAAGA,EAAUJ,EAAK,OAAQI,IAAW,CACtD,GAAID,GAAQC,EAAUD,EAGpB,SAOF,IAAIE,EAAwB,IAAI,IAG5BC,EAAQ,GAGZ,QAASC,KAASN,EAAQ,CACxB,IAAIO,EAAcP,EAAOM,CAAK,EAC9B,GAAI,KAAMC,EAAa,CASrB,IAAIC,EACAC,EACAC,EAEJ,QAAWC,KAAaX,EAAQ,CAC9B,IAAIY,EAAkBZ,EAAOW,CAAS,EACtC,GAAI,SAASA,CAAS,EAAI,SAASL,CAAK,EAEtC,MAoBF,IAnBW,SAASK,CAAS,EAAI,SAASL,CAAK,GAG7CE,EAAiBL,EAAU,EAC3BM,EAAMN,EAAU,IAKhBK,EAAiBL,EACjBM,EAAMN,GAERO,EACEF,EAAiBT,EAAK,OAASA,EAAKS,CAAc,EAAI,MACpD,KAAMI,KACRN,EAAQK,EACRF,EAAMD,EACNN,EAAOM,GAEFE,GAAYA,KAAYE,IAC7BA,EAAkBA,EAAgBF,CAAQ,EAC1CF,GAAkB,EACd,KAAMI,IACRN,EAAQK,EACRF,EAAMD,EACNN,EAAOM,GAGLA,IAAmBT,EAAK,SAI5BW,EAAWX,EAAKS,CAAc,CAGlC,CAGAP,EAAQ,KAAK,SAASK,CAAK,CAAC,EAC5BL,EAAQ,KAAKQ,CAAI,EACjBJ,EAAQ,GACR,KACF,MAAWN,EAAKI,CAAO,IAAKI,GAG1BA,EAAcA,EAAYR,EAAKI,CAAO,CAAC,EAIvCH,EAAOM,CAAK,EAAIC,GAMhBH,EAAS,IAAI,SAASE,CAAK,CAAC,CAEhC,CAIA,GAAID,EACFL,EAAS,CAAC,MAEV,SAAWM,KAASF,EAClB,OAAOJ,EAAOM,CAAK,EAMnBH,GAAWD,GAAQH,EAAKI,CAAO,IAAK,KAAK,OAC3CH,EAAOG,CAAO,EAAI,KAAK,KAAKJ,EAAKI,CAAO,CAAC,EAE7C,CAGA,QAAWG,KAASN,EAElB,GAAI,KADgBA,EAAOM,CAAK,EACT,CAGrB,IAAMG,EAAMV,EAAK,OACjBE,EAAQ,KAAK,SAASK,CAAK,CAAC,EAC5BL,EAAQ,KAAKQ,CAAG,EAGhB,KACF,CAGF,OAAO,KAAK,QAAQV,EAAME,CAAO,CACnC,CAEU,QAAQF,EAAcE,EAA6B,CAI3DA,EAAQ,KAAKF,EAAK,MAAM,EACxB,IAAMc,EAAmB,CAAC,EACtBP,EAAQ,EACZ,QAAWG,KAAOR,EAAS,CACzB,GAAIK,EAAQG,EAAK,CACf,QAAQ,MACN,oGACF,EACA,QACF,SAAWH,IAAUG,EAInB,SAEFI,EAAO,KAAKd,EAAK,MAAMO,EAAOG,CAAG,CAAC,EAClCH,EAAQG,CACV,CAEA,OAAOI,CACT,CACF,ECvPA,IAAMC,EAAc,IAAI,YAClBC,EAAc,IAAI,YAAY,OAAO,EAErCC,EAAYC,EAACC,GACjBA,EAAK,SAAS,EAAE,EAAE,SAAS,EAAG,GAAG,EAAE,YAAY,EAD/B,aAGLC,EAAN,KAAsB,CAR7B,MAQ6B,CAAAF,EAAA,wBACjB,YAAc,IAAIG,EAClB,eAAyC,CAAC,EAE1C,MAAgC,CAAC,EACjC,UAAoC,CAAC,EAE/C,aAAc,CAAC,CAOf,cAAcC,EAA+B,CAC3C,KAAK,MAAQA,EACb,KAAK,UAAY,OAAO,YACtB,OAAO,QAAQA,CAAK,EAAE,IAAI,CAAC,CAACC,EAAOC,CAAE,IAAM,CAACA,EAAID,CAAK,CAAC,CACxD,EACA,KAAK,YAAc,IAAIF,EACvB,OAAW,CAACE,EAAOC,CAAE,IAAK,OAAO,QAAQF,CAAK,EAC5C,KAAK,YAAY,IAAIC,CAAK,CAE9B,CAOA,IAAI,YAAqB,CACvB,OACE,OAAO,KAAK,KAAK,KAAK,EAAE,OAAS,OAAO,KAAK,KAAK,cAAc,EAAE,MAEtE,CAOA,IAAI,QAAiB,CAEnB,IAAIE,EAAS,EACb,QAAWD,KAAM,OAAO,KAAK,KAAK,SAAS,EACzCC,EAAS,KAAK,IAAIA,EAAQ,SAASD,CAAE,CAAC,EAExC,QAAWA,KAAM,OAAO,OAAO,KAAK,cAAc,EAChDC,EAAS,KAAK,IAAIA,EAAQD,CAAE,EAE9B,OAAOC,CACT,CASA,kBAAkBF,EAAeG,EAAmB,CAC9CA,IAAa,SACfA,EAAW,KAAK,OAAS,GAE3B,KAAK,eAAeH,CAAK,EAAIG,EAC7B,KAAK,YAAY,IAAIH,CAAK,CAC5B,CAOA,mBACEI,EAOA,CACA,QAAWJ,KAASI,EACd,OAAOJ,GAAU,SACnB,KAAK,kBAAkBA,CAAK,EAE5B,KAAK,kBAAkBA,EAAM,MAAOA,EAAM,QAAQ,CAGxD,CAQA,aAAaC,EAAoB,CAC/B,IAAMD,EAAQ,KAAK,UAAUC,CAAE,EACzBI,EAAgB,OAAO,QAAQ,KAAK,cAAc,EAAE,KACxD,CAAC,CAACC,EAAGH,CAAQ,IAAMA,IAAaF,CAClC,EACA,GAAID,EACF,OAAOA,EACF,GAAIK,EACT,OAAOA,EAAc,CAAC,EAEtB,MAAM,IAAI,MAAM,eAAeJ,CAAE,EAAE,CAEvC,CAOA,YAAYD,EAAuB,CACjC,IAAMC,EAAK,KAAK,MAAMD,CAAK,EACrBK,EAAgB,KAAK,eAAeL,CAAK,EAC/C,GAAIC,IAAO,OACT,OAAOA,EACF,GAAII,IAAkB,OAC3B,OAAOA,EAEP,MAAM,IAAI,MAAM,kBAAkBL,CAAK,EAAE,CAE7C,CAOA,WAAY,CACV,MAAO,CAAE,GAAG,KAAK,MAAO,GAAG,KAAK,cAAe,CACjD,CAQA,YAAYA,EAAwB,CAClC,OAAOA,KAAS,KAAK,OAASA,KAAS,KAAK,cAC9C,CAKA,SAASO,EAAwB,CAC/B,IAAMH,EAAS,KAAK,YAAY,MAAMG,CAAI,EAEpCC,EAAS,CAAC,EAChB,QAAWR,KAASI,EAClB,GAAI,KAAK,YAAYJ,CAAK,EACxBQ,EAAO,KAAKR,CAAK,MACZ,CAGL,IAAMS,EAAQjB,EAAY,OAAOQ,CAAK,EACtC,QAAWJ,KAAQa,EACjBD,EAAO,KAAK,MAAMd,EAAUE,CAAI,CAAC,GAAG,CAExC,CAGF,OAAOY,CACT,CAKA,OAAOD,EAAwB,CAC7B,OAAO,KAAK,sBAAsB,KAAK,SAASA,CAAI,CAAC,CACvD,CAKA,OAAOG,EAAuB,CAC5B,OAAO,KAAK,yBAAyB,KAAK,sBAAsBA,CAAG,CAAC,CACtE,CAKA,yBAAyBN,EAA0B,CACjD,QAAWJ,KAASI,EAClB,GAAI,CAAC,KAAK,YAAYJ,CAAK,EACzB,MAAM,IAAI,MAAM,kBAAkBA,CAAK,EAAE,EAG7C,IAAMW,EAAQ,CAAC,EAEXC,EAAQ,EACZ,KAAOA,EAAQR,EAAO,QAAQ,CAC5B,IAAIJ,EAAQI,EAAOQ,CAAK,EAExB,GADAA,GAAS,EACL,CAACZ,EAAM,WAAW,KAAK,EAAG,CAC5BW,EAAM,KAAKX,CAAK,EAChB,QACF,CACA,IAAMS,EAAQ,CAAC,EACf,KAAOT,GAASA,EAAM,WAAW,KAAK,GACpCS,EAAM,KAAK,SAAST,EAAM,MAAM,EAAG,CAAC,EAAG,EAAE,CAAC,EAC1CA,EAAQI,EAAOQ,CAAK,EACpBA,GAAS,EAEXD,EAAM,KAAKlB,EAAY,OAAO,IAAI,WAAWgB,CAAK,CAAC,CAAC,CACtD,CAEA,OAAOE,EAAM,KAAK,EAAE,CACtB,CAKA,sBAAsBP,EAA4B,CAChD,IAAII,EAAmB,CAAC,EACxB,QAAWR,KAASI,EAAQ,CAC1B,IAAMH,EAAK,KAAK,YAAYD,CAAK,EACjCQ,EAAO,KAAKP,CAAE,CAChB,CACA,OAAOO,CACT,CAKA,sBAAsBE,EAAyB,CAC7C,OAAOA,EAAI,IAAKT,GACA,KAAK,aAAaA,CAAE,CAEnC,CACH,CACF","names":["Trie","__name","word","ref","char","text","states","offsets","skip","current","toRemove","reset","start","triePointer","lookaheadIndex","end","nextChar","lookStart","lookTriePointer","tokens","utf8Encoder","utf8Decoder","byteToHex","__name","byte","Llama2Tokenizer","Trie","vocab","token","id","max_id","token_id","tokens","special_token","_","text","result","bytes","ids","chars","index"]}