UNPKG

@lenml/llama2-tokenizer

Version:

Our library `@lenml/llama2-tokenizer` has been deprecated. We are excited to introduce our new library `@lenml/tokenizers` as its replacement, offering a broader set of features and an enhanced experience.

2 lines 3.85 kB
"use strict";var Llama2Tokenizer=(()=>{var b=Object.defineProperty;var m=Object.getOwnPropertyDescriptor;var v=Object.getOwnPropertyNames;var w=Object.prototype.hasOwnProperty;var f=(o,t)=>b(o,"name",{value:t,configurable:!0});var y=(o,t)=>{for(var e in t)b(o,e,{get:t[e],enumerable:!0})},I=(o,t,e,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let s of v(t))!w.call(o,s)&&s!==e&&b(o,s,{get:()=>t[s],enumerable:!(n=m(t,s))||n.enumerable});return o};var j=o=>I(b({},"__esModule",{value:!0}),o);var S={};y(S,{Llama2Tokenizer:()=>g});var k=class{static{f(this,"Trie")}data;_tokens;constructor(){this.data={},this._tokens=new Set}add(t){if(!t)return;this._tokens.add(t);let e=this.data;for(let n of t)e[n]=n in e?e[n]:{},e=e[n];e[""]=1}split(t){if(!t)return[];let e={},n=[0],s=0;for(let r=0;r<t.length;r++){if(s&&r<s)continue;let a=new Set,p=!1;for(let c in e){let d=e[c];if(""in d){let i,l,u;for(let h in e){let _=e[h];if(parseInt(h)>parseInt(c))break;for(parseInt(h)<parseInt(c)?(i=r+1,l=r+1):(i=r,l=r),u=i<t.length?t[i]:null,(""in _)&&(c=h,l=i,s=i);u&&u in _&&(_=_[u],i+=1,""in _&&(c=h,l=i,s=i),i!==t.length);)u=t[i]}n.push(parseInt(c)),n.push(l),p=!0;break}else t[r]in d?(d=d[t[r]],e[c]=d):a.add(parseInt(c))}if(p)e={};else for(let c of a)delete e[c];r>=s&&t[r]in this.data&&(e[r]=this.data[t[r]])}for(let r in e)if(""in e[r]){let p=t.length;n.push(parseInt(r)),n.push(p);break}return this.cutText(t,n)}cutText(t,e){e.push(t.length);let n=[],s=0;for(let r of e){if(s>r){console.error("There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway.");continue}else if(s===r)continue;n.push(t.slice(s,r)),s=r}return n}};var O=new TextEncoder,x=new TextDecoder("utf-8"),R=f(o=>o.toString(16).padStart(2,"0").toUpperCase(),"byteToHex"),g=class{static{f(this,"Llama2Tokenizer")}tokens_trie=new k;special_tokens={};vocab={};vocab_ids={};constructor(){}install_vocab(t){this.vocab=t,this.vocab_ids=Object.fromEntries(Object.entries(t).map(([e,n])=>[n,e])),this.tokens_trie=new k;for(let[e,n]of Object.entries(t))this.tokens_trie.add(e)}get vocab_size(){return Object.keys(this.vocab).length+Object.keys(this.special_tokens).length}get max_id(){let t=0;for(let e of Object.keys(this.vocab_ids))t=Math.max(t,parseInt(e));for(let e of Object.values(this.special_tokens))t=Math.max(t,e);return t}add_special_token(t,e){e===void 0&&(e=this.max_id+1),this.special_tokens[t]=e,this.tokens_trie.add(t)}add_special_tokens(t){for(let e of t)typeof e=="string"?this.add_special_token(e):this.add_special_token(e.token,e.token_id)}ids_to_token(t){let e=this.vocab_ids[t],n=Object.entries(this.special_tokens).find(([s,r])=>r===t);if(e)return e;if(n)return n[0];throw new Error(`Unknown id: ${t}`)}token_to_id(t){let e=this.vocab[t],n=this.special_tokens[t];if(e!==void 0)return e;if(n!==void 0)return n;throw new Error(`Unknown token: ${t}`)}get_vocab(){return{...this.vocab,...this.special_tokens}}valid_token(t){return t in this.vocab||t in this.special_tokens}tokenize(t){let e=this.tokens_trie.split(t),n=[];for(let s of e)if(this.valid_token(s))n.push(s);else{let r=O.encode(s);for(let a of r)n.push(`<0x${R(a)}>`)}return n}encode(t){return this.convert_tokens_to_ids(this.tokenize(t))}decode(t){return this.convert_tokens_to_string(this.convert_ids_to_tokens(t))}convert_tokens_to_string(t){for(let s of t)if(!this.valid_token(s))throw new Error(`Unknown token: ${s}`);let e=[],n=0;for(;n<t.length;){let s=t[n];if(n+=1,!s.startsWith("<0x")){e.push(s);continue}let r=[];for(;s&&s.startsWith("<0x");)r.push(parseInt(s.slice(3,5),16)),s=t[n],n+=1;e.push(x.decode(new Uint8Array(r)))}return e.join("")}convert_tokens_to_ids(t){let e=[];for(let n of t){let s=this.token_to_id(n);e.push(s)}return e}convert_ids_to_tokens(t){return t.map(e=>this.ids_to_token(e))}};return j(S);})(); //# sourceMappingURL=main.global.js.map