UNPKG

@lenml/llama2-tokenizer

Version:

Our library `@lenml/llama2-tokenizer` has been deprecated. We are excited to introduce our new library `@lenml/tokenizers` as its replacement, offering a broader set of features and an enhanced experience.

2 lines 3.39 kB
/** Encodes out-of-vocabulary text to UTF-8 bytes for the byte-fallback tokens. */
const textEncoder = new TextEncoder();

/** Decodes runs of byte-fallback tokens back into text. */
const textDecoder = new TextDecoder("utf-8");

/** Matches exactly one byte-fallback token such as `<0x0A>`; group 1 is the hex pair. */
const BYTE_TOKEN_PATTERN = /^<0x([0-9A-Fa-f]{2})>$/;

/**
 * Formats a byte as two upper-case hexadecimal digits.
 *
 * @param {number} byte - Value in the range 0-255.
 * @returns {string} For example `"0A"`.
 */
const byteToHex = (byte) => byte.toString(16).padStart(2, "0").toUpperCase();

/**
 * Prefix tree over the registered tokens, used to cut a text into pieces that
 * are either registered tokens or the unmatched text between them.
 *
 * Nodes are plain objects keyed by a single UTF-16 code unit; the empty-string
 * key marks the end of a registered token. Because every key is either one
 * code unit or "", keys can never collide with names on `Object.prototype`,
 * which is why the `in` operator is safe on these nodes.
 */
class Trie {
  data;
  _tokens;

  constructor() {
    this.data = {};
    this._tokens = new Set();
  }

  /**
   * Registers a token. Empty or otherwise falsy input is ignored.
   *
   * @param {string} word - Token to register.
   * @returns {void}
   */
  add(word) {
    if (!word) return;
    this._tokens.add(word);

    let node = this.data;
    // Walk by UTF-16 code unit (not by code point) so the keys line up with
    // the `text[index]` lookups performed by split(). Iterating by code point
    // made every token containing an astral character unmatchable.
    for (let i = 0; i < word.length; i++) {
      const unit = word[i];
      if (!(unit in node)) node[unit] = {};
      node = node[unit];
    }
    node[""] = 1;
  }

  /**
   * Cuts `text` at the boundaries of registered tokens.
   *
   * @param {string} text - Text to split.
   * @returns {string[]} Consecutive slices of `text`; each is either a
   *   registered token or the unmatched text between two matches. Empty input
   *   yields an empty array.
   */
  split(text) {
    if (!text) return [];

    // Partial matches in flight: start offset -> trie node reached so far.
    // Integer-like keys enumerate in ascending order, i.e. earliest start first.
    let states = {};
    const offsets = [0];
    // Index up to which the text was already consumed by an emitted match.
    let skip = 0;

    for (let current = 0; current < text.length; current++) {
      if (skip && current < skip) continue;

      const char = text[current];
      const toRemove = new Set();
      let reset = false;

      for (const startKey of Object.keys(states)) {
        const node = states[startKey];

        if ("" in node) {
          // A complete token ends here. Before emitting it, give this state
          // and every state that started earlier the chance to extend into a
          // longer match by looking ahead in the text.
          let start = Number.parseInt(startKey, 10);
          let end;

          for (const lookKey of Object.keys(states)) {
            const lookStart = Number.parseInt(lookKey, 10);
            if (lookStart > start) break;

            let lookNode = states[lookKey];
            // Earlier states have not consumed `char` yet in this round, the
            // state under inspection has; hence the differing offsets.
            let lookIndex = lookStart < start ? current + 1 : current;
            end = lookIndex;
            let nextChar = lookIndex < text.length ? text[lookIndex] : null;

            if ("" in lookNode) {
              start = lookStart;
              end = lookIndex;
              skip = lookIndex;
            }

            while (nextChar && nextChar in lookNode) {
              lookNode = lookNode[nextChar];
              lookIndex += 1;
              if ("" in lookNode) {
                start = lookStart;
                end = lookIndex;
                skip = lookIndex;
              }
              if (lookIndex === text.length) break;
              nextChar = text[lookIndex];
            }
          }

          offsets.push(start, end);
          reset = true;
          break;
        } else if (char in node) {
          states[startKey] = node[char];
        } else {
          toRemove.add(startKey);
        }
      }

      if (reset) {
        states = {};
      } else {
        for (const key of toRemove) delete states[key];
      }

      // Open a new partial match at this position unless it lies inside a
      // region that an emitted match already consumed.
      if (current >= skip && char in this.data) {
        states[current] = this.data[char];
      }
    }

    // A token that ends exactly at the end of the text is still pending.
    for (const startKey of Object.keys(states)) {
      if ("" in states[startKey]) {
        offsets.push(Number.parseInt(startKey, 10), text.length);
        break;
      }
    }

    return this.cutText(text, offsets);
  }

  /**
   * Slices `text` at the given offsets, skipping empty slices.
   *
   * @param {string} text - Text to slice.
   * @param {number[]} offsets - Cut positions. NOTE: `text.length` is appended
   *   to this array in place.
   * @returns {string[]} The non-empty slices, in order.
   */
  cutText(text, offsets) {
    offsets.push(text.length);

    const pieces = [];
    let start = 0;
    for (const end of offsets) {
      if (start > end) {
        // Offsets should never go backwards; report it and carry on.
        console.error(
          "There was a bug in Trie algorithm in tokenization. Attempting to recover. \nPlease report it anyway."
        );
        continue;
      }
      if (start === end) continue;
      pieces.push(text.slice(start, end));
      start = end;
    }
    return pieces;
  }
}

/**
 * Vocabulary-driven tokenizer: text is split on registered tokens with a
 * {@link Trie}; anything left over is emitted as UTF-8 byte tokens (`<0xHH>`).
 */
class Llama2Tokenizer {
  tokens_trie = new Trie();
  special_tokens = {};
  vocab = {};
  vocab_ids = {};

  constructor() {}

  /**
   * Replaces the vocabulary and rebuilds the id lookup and the trie.
   *
   * NOTE: the trie is rebuilt from `vocab` only. Special tokens registered
   * before this call stay in `special_tokens` but are not re-added to the new
   * trie.
   *
   * @param {Record<string, number>} vocab - Mapping of token to id.
   * @returns {void}
   */
  install_vocab(vocab) {
    this.vocab = vocab;
    this.vocab_ids = Object.fromEntries(
      Object.entries(vocab).map(([token, id]) => [id, token])
    );
    this.tokens_trie = new Trie();
    for (const token of Object.keys(vocab)) {
      this.tokens_trie.add(token);
    }
  }

  /** @returns {number} Number of vocabulary entries plus special tokens. */
  get vocab_size() {
    return (
      Object.keys(this.vocab).length + Object.keys(this.special_tokens).length
    );
  }

  /** @returns {number} Largest id in use, or 0 when nothing is registered. */
  get max_id() {
    let max = 0;
    for (const id of Object.keys(this.vocab_ids)) {
      max = Math.max(max, Number.parseInt(id, 10));
    }
    for (const id of Object.values(this.special_tokens)) {
      max = Math.max(max, id);
    }
    return max;
  }

  /**
   * Registers one special token.
   *
   * @param {string} token - Token text.
   * @param {number} [token_id] - Id to assign; defaults to `max_id + 1`.
   * @returns {void}
   */
  add_special_token(token, token_id) {
    if (token_id === undefined) {
      token_id = this.max_id + 1;
    }
    this.special_tokens[token] = token_id;
    this.tokens_trie.add(token);
  }

  /**
   * Registers several special tokens.
   *
   * @param {Array<string | {token: string, token_id?: number}>} tokens
   * @returns {void}
   */
  add_special_tokens(tokens) {
    for (const entry of tokens) {
      if (typeof entry === "string") {
        this.add_special_token(entry);
      } else {
        this.add_special_token(entry.token, entry.token_id);
      }
    }
  }

  /**
   * Looks up the token for an id; the vocabulary takes precedence over the
   * special tokens.
   *
   * @param {number} id
   * @returns {string}
   * @throws {Error} If the id is unknown.
   */
  ids_to_token(id) {
    // Own-property check instead of truthiness, so an empty-string token is
    // still found and inherited properties are never returned.
    if (Object.hasOwn(this.vocab_ids, id)) {
      return this.vocab_ids[id];
    }
    const special = Object.entries(this.special_tokens).find(
      ([, tokenId]) => tokenId === id
    );
    if (special) return special[0];
    throw new Error(`Unknown id: ${id}`);
  }

  /**
   * Looks up the id for a token; the vocabulary takes precedence over the
   * special tokens.
   *
   * @param {string} token
   * @returns {number}
   * @throws {Error} If the token is unknown.
   */
  token_to_id(token) {
    // Own-property checks keep names such as "constructor" from resolving to
    // members inherited from Object.prototype.
    const id = Object.hasOwn(this.vocab, token) ? this.vocab[token] : undefined;
    if (id !== undefined) return id;

    const specialId = Object.hasOwn(this.special_tokens, token)
      ? this.special_tokens[token]
      : undefined;
    if (specialId !== undefined) return specialId;

    throw new Error(`Unknown token: ${token}`);
  }

  /** @returns {Record<string, number>} Fresh object merging vocab and special tokens. */
  get_vocab() {
    return { ...this.vocab, ...this.special_tokens };
  }

  /**
   * @param {string} token
   * @returns {boolean} Whether the token is in the vocabulary or is a special token.
   */
  valid_token(token) {
    return (
      Object.hasOwn(this.vocab, token) ||
      Object.hasOwn(this.special_tokens, token)
    );
  }

  /**
   * Splits text into tokens; pieces that are not registered tokens are
   * replaced by one `<0xHH>` token per UTF-8 byte.
   *
   * @param {string} text
   * @returns {string[]}
   */
  tokenize(text) {
    const tokens = [];
    for (const piece of this.tokens_trie.split(text)) {
      if (this.valid_token(piece)) {
        tokens.push(piece);
        continue;
      }
      for (const byte of textEncoder.encode(piece)) {
        tokens.push(`<0x${byteToHex(byte)}>`);
      }
    }
    return tokens;
  }

  /**
   * @param {string} text
   * @returns {number[]} Ids of the tokens produced by {@link tokenize}.
   * @throws {Error} If a produced token has no id.
   */
  encode(text) {
    return this.convert_tokens_to_ids(this.tokenize(text));
  }

  /**
   * @param {number[]} ids
   * @returns {string} Text reconstructed from the ids.
   * @throws {Error} If an id is unknown.
   */
  decode(ids) {
    return this.convert_tokens_to_string(this.convert_ids_to_tokens(ids));
  }

  /**
   * Joins tokens into text, decoding each run of consecutive `<0xHH>` tokens
   * as one UTF-8 byte sequence.
   *
   * @param {string[]} tokens
   * @returns {string}
   * @throws {Error} If any token is neither in the vocabulary nor special.
   */
  convert_tokens_to_string(tokens) {
    for (const token of tokens) {
      if (!this.valid_token(token)) {
        throw new Error(`Unknown token: ${token}`);
      }
    }

    const parts = [];
    let index = 0;
    while (index < tokens.length) {
      if (!BYTE_TOKEN_PATTERN.test(tokens[index])) {
        parts.push(tokens[index]);
        index += 1;
        continue;
      }

      // Collect the whole run of byte tokens. The index only advances past
      // tokens that were actually consumed, so the token that ends the run is
      // handled by the next outer iteration instead of being dropped.
      const bytes = [];
      while (index < tokens.length) {
        const match = BYTE_TOKEN_PATTERN.exec(tokens[index]);
        if (!match) break;
        bytes.push(Number.parseInt(match[1], 16));
        index += 1;
      }
      parts.push(textDecoder.decode(new Uint8Array(bytes)));
    }
    return parts.join("");
  }

  /**
   * @param {string[]} tokens
   * @returns {number[]}
   * @throws {Error} If a token is unknown.
   */
  convert_tokens_to_ids(tokens) {
    const ids = [];
    for (const token of tokens) {
      ids.push(this.token_to_id(token));
    }
    return ids;
  }

  /**
   * @param {number[]} ids
   * @returns {string[]}
   * @throws {Error} If an id is unknown.
   */
  convert_ids_to_tokens(ids) {
    return ids.map((id) => this.ids_to_token(id));
  }
}

export { Llama2Tokenizer };