// kitoken
// Version:
// Fast tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization
// 624 lines (590 loc) • 22.1 kB
// JavaScript
/**
* Kitoken tokenizer.
* A fast and versatile tokenizer for language models.
*
* Thin JavaScript wrapper around a pointer (`__wbg_ptr`) that is handed to the
* `wasm.kitoken_*` exports on each call. The pointer is registered with
* `KitokenFinalization` in the constructor and released explicitly via `free()`.
*/
export class Kitoken {
/**
* Detaches this wrapper from its pointer: zeroes `__wbg_ptr`, unregisters the
* instance from `KitokenFinalization`, and returns the previous pointer value.
* @returns {number}
*/
__destroy_into_raw() {
const ptr = this.__wbg_ptr;
this.__wbg_ptr = 0;
KitokenFinalization.unregister(this);
return ptr;
}
/**
* Detaches the pointer from this wrapper and passes it to `wasm.__wbg_kitoken_free`.
* After this call `__wbg_ptr` is 0.
*/
free() {
const ptr = this.__destroy_into_raw();
wasm.__wbg_kitoken_free(ptr, 0);
}
/**
* Returns the configuration of the tokenizer.
* @returns {any}
*/
config() {
const ret = wasm.kitoken_config(this.__wbg_ptr);
// `ret` is an index into the JS object heap; takeObject reads the value and releases the slot.
return takeObject(ret);
}
/**
* Decodes the given sequence of tokens into text.
*
* `decode_specials` specifies which tokens from the special vocabulary are included in the output.
* Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
*
* Returns a list of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
* @param {Uint32Array} tokens
* @param {any} decode_specials
* @returns {Uint8Array}
* @throws {any} The object stored in the error slot when the export sets the error flag.
*/
decode(tokens, decode_specials) {
try {
// Move the stack pointer down by 16 bytes; the export writes four 32-bit result words at `retptr`.
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
// Copy the tokens into wasm memory; `wasm.__wbindgen_export` is passed as the `malloc` callback.
const ptr0 = passArray32ToWasm0(tokens, wasm.__wbindgen_export);
const len0 = WASM_VECTOR_LEN;
wasm.kitoken_decode(retptr, this.__wbg_ptr, ptr0, len0, addHeapObject(decode_specials));
// Result words (little-endian): r0 = data pointer, r1 = length, r2 = heap index of the error, r3 = error flag.
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
if (r3) {
throw takeObject(r2);
}
// `.slice()` copies the bytes out of wasm memory before the buffer is handed back.
var v2 = getArrayU8FromWasm0(r0, r1).slice();
// NOTE(review): presumably the deallocator (pointer, byte size, alignment) — confirm against the wasm exports.
wasm.__wbindgen_export4(r0, r1 * 1, 1);
return v2;
} finally {
// Undo the 16-byte stack adjustment on both the success and the error path.
wasm.__wbindgen_add_to_stack_pointer(16);
}
}
/**
* Decodes the given sequences of tokens into texts.
*
* `decode_specials` specifies which tokens from the special vocabulary are included in the output.
* Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
*
* Returns a list of lists of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
* @param {any[]} tokens
* @param {any} decode_specials
* @returns {any[]}
* @throws {any} The object stored in the error slot when the export sets the error flag.
*/
decode_all(tokens, decode_specials) {
try {
// Same 16-byte return area layout as `decode`.
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
// Each array element is placed on the JS object heap and its index written into wasm memory.
const ptr0 = passArrayJsValueToWasm0(tokens, wasm.__wbindgen_export);
const len0 = WASM_VECTOR_LEN;
wasm.kitoken_decode_all(retptr, this.__wbg_ptr, ptr0, len0, addHeapObject(decode_specials));
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
if (r3) {
throw takeObject(r2);
}
// The result buffer holds `r1` 32-bit heap indices; each is resolved with takeObject.
var v2 = getArrayJsValueFromWasm0(r0, r1).slice();
wasm.__wbindgen_export4(r0, r1 * 4, 4);
return v2;
} finally {
wasm.__wbindgen_add_to_stack_pointer(16);
}
}
/**
* Encodes the given text into a sequence of tokens.
*
* `encode_specials` specifies which special tokens are tokenized with the special vocabulary instead of the regular vocabulary.
* Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
* When `true`, all special token categories from the special vocabulary are used.
*
* Returns a list of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
* @param {string} text
* @param {any} encode_specials
* @returns {Uint32Array}
* @throws {any} The object stored in the error slot when the export sets the error flag.
*/
encode(text, encode_specials) {
try {
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
// The string is written to wasm memory as UTF-8; `__wbindgen_export2` is passed as the `realloc` callback.
const ptr0 = passStringToWasm0(text, wasm.__wbindgen_export, wasm.__wbindgen_export2);
const len0 = WASM_VECTOR_LEN;
wasm.kitoken_encode(retptr, this.__wbg_ptr, ptr0, len0, addHeapObject(encode_specials));
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
if (r3) {
throw takeObject(r2);
}
// Copy the 32-bit values out of wasm memory before the buffer is handed back.
var v2 = getArrayU32FromWasm0(r0, r1).slice();
wasm.__wbindgen_export4(r0, r1 * 4, 4);
return v2;
} finally {
wasm.__wbindgen_add_to_stack_pointer(16);
}
}
/**
* Encodes the given texts into sequences of tokens.
*
* `encode_specials` specifies which special tokens are tokenized with the special vocabulary instead of the regular vocabulary.
* Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
* When `true`, all special token categories from the special vocabulary are used.
*
* Returns a list of lists of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
* @param {string[]} text
* @param {any} encode_specials
* @returns {any[]}
* @throws {any} The object stored in the error slot when the export sets the error flag.
*/
encode_all(text, encode_specials) {
try {
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
// The strings are passed as JS values (heap indices), not as encoded bytes.
const ptr0 = passArrayJsValueToWasm0(text, wasm.__wbindgen_export);
const len0 = WASM_VECTOR_LEN;
wasm.kitoken_encode_all(retptr, this.__wbg_ptr, ptr0, len0, addHeapObject(encode_specials));
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
if (r3) {
throw takeObject(r2);
}
var v2 = getArrayJsValueFromWasm0(r0, r1).slice();
wasm.__wbindgen_export4(r0, r1 * 4, 4);
return v2;
} finally {
wasm.__wbindgen_add_to_stack_pointer(16);
}
}
/**
* Returns the metadata of the tokenizer.
* @returns {any}
*/
meta() {
const ret = wasm.kitoken_meta(this.__wbg_ptr);
return takeObject(ret);
}
/**
* Initializes the tokenizer from a serialized `kitoken` definition.
* @param {Uint8Array} data
* @throws {any} The object stored in the error slot when `wasm.kitoken_new` sets the error flag.
*/
constructor(data) {
try {
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
const ptr0 = passArray8ToWasm0(data, wasm.__wbindgen_export);
const len0 = WASM_VECTOR_LEN;
wasm.kitoken_new(retptr, ptr0, len0);
// Only three result words are read here: r0 = instance pointer, r1 = heap index of the error, r2 = error flag.
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
if (r2) {
throw takeObject(r1);
}
this.__wbg_ptr = r0;
// The pointer is the held value; `this` doubles as the unregister token used by `__destroy_into_raw`.
KitokenFinalization.register(this, this.__wbg_ptr, this);
return this;
} finally {
wasm.__wbindgen_add_to_stack_pointer(16);
}
}
}
// Where the runtime defines `Symbol.dispose`, expose `free` under it as well.
if (Symbol.dispose) Kitoken.prototype[Symbol.dispose] = Kitoken.prototype.free;
/**
 * Import shim: creates an `Error` whose message is the string decoded from wasm memory.
 * @param {number} arg0 - Pointer to the message bytes.
 * @param {number} arg1 - Length forwarded together with the pointer.
 * @returns {number} Heap index of the new error object.
 */
export function __wbg_Error_bce6d499ff0a4aff(arg0, arg1) {
    const message = getStringFromWasm0(arg0, arg1);
    return addHeapObject(Error(message));
}
/**
 * Import shim: converts the heap object at `arg0` with `Number(...)`.
 * @param {number} arg0 - Heap index of the value.
 * @returns {number}
 */
export function __wbg_Number_b7972a139bfbfdf0(arg0) {
    return Number(getObject(arg0));
}
/**
 * Import shim: stringifies the heap object at `arg1` with `String(...)`, copies the
 * text into wasm memory and writes the (pointer, length) pair at `arg0`.
 * @param {number} arg0 - Address receiving the pointer (offset 0) and the length (offset 4).
 * @param {number} arg1 - Heap index of the value to stringify.
 */
export function __wbg_String_8564e559799eccda(arg0, arg1) {
    const text = String(getObject(arg1));
    const textPtr = passStringToWasm0(text, wasm.__wbindgen_export, wasm.__wbindgen_export2);
    const textLen = WASM_VECTOR_LEN;
    // Fetch the view only after the string was passed, so it reflects the current memory buffer.
    const view = getDataViewMemory0();
    view.setInt32(arg0 + 4, textLen, true);
    view.setInt32(arg0, textPtr, true);
}
/**
 * Import shim: reads a boolean from the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the value.
 * @returns {number} `1` for `true`, `0` for `false`, `0xFFFFFF` when the value is not a boolean.
 */
export function __wbg___wbindgen_boolean_get_2304fb8c853028c8(arg0) {
    const value = getObject(arg0);
    if (typeof value !== 'boolean') {
        return 0xFFFFFF;
    }
    return value ? 1 : 0;
}
/**
 * Import shim: renders the heap object at `arg1` with `debugString`, copies the text
 * into wasm memory and writes the (pointer, length) pair at `arg0`.
 * @param {number} arg0 - Address receiving the pointer (offset 0) and the length (offset 4).
 * @param {number} arg1 - Heap index of the value to describe.
 */
export function __wbg___wbindgen_debug_string_edece8177ad01481(arg0, arg1) {
    const description = debugString(getObject(arg1));
    const textPtr = passStringToWasm0(description, wasm.__wbindgen_export, wasm.__wbindgen_export2);
    const textLen = WASM_VECTOR_LEN;
    const view = getDataViewMemory0();
    view.setInt32(arg0 + 4, textLen, true);
    view.setInt32(arg0, textPtr, true);
}
/**
 * Import shim: reports whether the heap object at `arg0` is a function.
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg___wbindgen_is_function_5cd60d5cf78b4eef(arg0) {
    return typeof getObject(arg0) === 'function';
}
/**
 * Import shim: reports whether the heap object at `arg0` loosely equals `null`
 * (i.e. is `null` or `undefined`).
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg___wbindgen_is_null_or_undefined_74c58242e8529df3(arg0) {
    const value = getObject(arg0);
    return value == null;
}
/**
 * Import shim: reports whether the heap object at `arg0` is a non-null object.
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg___wbindgen_is_object_b4593df85baada48(arg0) {
    const candidate = getObject(arg0);
    // `typeof null` is also 'object', hence the explicit null check.
    return candidate !== null && typeof candidate === 'object';
}
/**
 * Import shim: compares two heap objects with loose equality (`==`).
 * @param {number} arg0 - Heap index of the left operand.
 * @param {number} arg1 - Heap index of the right operand.
 * @returns {boolean}
 */
export function __wbg___wbindgen_jsval_loose_eq_0ad77b7717db155c(arg0, arg1) {
    const left = getObject(arg0);
    const right = getObject(arg1);
    // Loose equality is the purpose of this shim; do not tighten it to `===`.
    return left == right;
}
/**
 * Import shim: writes an optional number for the heap object at `arg1` to `arg0`.
 * Offset 0 receives the presence flag (int32), offset 8 the float64 value
 * (0 when the object is not a number).
 * @param {number} arg0 - Destination address.
 * @param {number} arg1 - Heap index of the value.
 */
export function __wbg___wbindgen_number_get_f73a1244370fcc2c(arg0, arg1) {
    const value = getObject(arg1);
    const isNumber = typeof value === 'number';
    getDataViewMemory0().setFloat64(arg0 + 8, isNumber ? value : 0, true);
    getDataViewMemory0().setInt32(arg0, isNumber, true);
}
/**
 * Import shim: if the heap object at `arg1` is a string, copies it into wasm memory;
 * otherwise uses pointer 0. The (pointer, length) pair is written at `arg0`, where the
 * length is whatever `WASM_VECTOR_LEN` currently holds.
 * @param {number} arg0 - Address receiving the pointer (offset 0) and the length (offset 4).
 * @param {number} arg1 - Heap index of the value.
 */
export function __wbg___wbindgen_string_get_d109740c0d18f4d7(arg0, arg1) {
    const value = getObject(arg1);
    let textPtr = 0;
    if (typeof value === 'string') {
        textPtr = passStringToWasm0(value, wasm.__wbindgen_export, wasm.__wbindgen_export2);
    }
    const textLen = WASM_VECTOR_LEN;
    const view = getDataViewMemory0();
    view.setInt32(arg0 + 4, textLen, true);
    view.setInt32(arg0, textPtr, true);
}
/**
 * Import shim: throws an `Error` whose message is the string decoded from wasm memory.
 * @param {number} arg0 - Pointer to the message bytes.
 * @param {number} arg1 - Length forwarded together with the pointer.
 * @throws {Error} Always.
 */
export function __wbg___wbindgen_throw_9c31b086c2b26051(arg0, arg1) {
    const message = getStringFromWasm0(arg0, arg1);
    throw new Error(message);
}
/**
 * Import shim: invokes the function at heap index `arg0` with the heap object at
 * `arg1` as its `this` value. Exceptions are routed through `handleError`.
 * @returns {number|undefined} Heap index of the call result, or `undefined` if the call threw.
 */
export function __wbg_call_13665d9f14390edc(...shimArgs) {
    return handleError((arg0, arg1) => {
        return addHeapObject(getObject(arg0).call(getObject(arg1)));
    }, shimArgs);
}
/**
 * Import shim: reads the `done` property of the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the object.
 * @returns {any} The raw `done` value.
 */
export function __wbg_done_54b8da57023b7ed2(arg0) {
    return getObject(arg0).done;
}
/**
 * Import shim: builds a string from a single code point via `String.fromCodePoint`.
 * The argument is first reinterpreted as an unsigned 32-bit integer. Exceptions are
 * routed through `handleError`.
 * @returns {number|undefined} Heap index of the string, or `undefined` if the call threw.
 */
export function __wbg_fromCodePoint_a42ea1d19a55f2d5(...shimArgs) {
    return handleError((arg0) => {
        const text = String.fromCodePoint(arg0 >>> 0);
        return addHeapObject(text);
    }, shimArgs);
}
/**
 * Import shim: `Reflect.get(target, key)` where both operands are heap objects.
 * Exceptions are routed through `handleError`.
 * @returns {number|undefined} Heap index of the property value, or `undefined` if the lookup threw.
 */
export function __wbg_get_3e9a707ab7d352eb(...shimArgs) {
    return handleError((arg0, arg1) => {
        const property = Reflect.get(getObject(arg0), getObject(arg1));
        return addHeapObject(property);
    }, shimArgs);
}
/**
 * Import shim: indexes the heap object at `arg0` with `arg1` reinterpreted as an
 * unsigned 32-bit integer.
 * @param {number} arg0 - Heap index of the container.
 * @param {number} arg1 - Element index.
 * @returns {number} Heap index of the element.
 */
export function __wbg_get_unchecked_1dfe6d05ad91d9b7(arg0, arg1) {
    const element = getObject(arg0)[arg1 >>> 0];
    return addHeapObject(element);
}
/**
 * Import shim: reports whether the heap object at `arg0` is an `ArrayBuffer`.
 * Any exception raised by the `instanceof` check is treated as `false`.
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg_instanceof_ArrayBuffer_53db37b06f6b9afe(arg0) {
    try {
        return getObject(arg0) instanceof ArrayBuffer;
    } catch (_) {
        return false;
    }
}
/**
 * Import shim: reports whether the heap object at `arg0` is a `Uint8Array`.
 * Any exception raised by the `instanceof` check is treated as `false`.
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg_instanceof_Uint8Array_abd07d4bd221d50b(arg0) {
    try {
        return getObject(arg0) instanceof Uint8Array;
    } catch (_) {
        return false;
    }
}
/**
 * Import shim: `Array.isArray` applied to the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg_isArray_74b636a53056fecb(arg0) {
    return Array.isArray(getObject(arg0));
}
/**
 * Import shim: `Array.isArray` applied to the heap object at `arg0`.
 * Exported under a second name with the same behaviour as the other `isArray` shim.
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg_isArray_94898ed3aad6947b(arg0) {
    return Array.isArray(getObject(arg0));
}
/**
 * Import shim: `Number.isSafeInteger` applied to the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the value.
 * @returns {boolean}
 */
export function __wbg_isSafeInteger_01e964d144ad3a55(arg0) {
    return Number.isSafeInteger(getObject(arg0));
}
/**
 * Import shim: places `Symbol.iterator` on the object heap.
 * @returns {number} Heap index of the symbol.
 */
export function __wbg_iterator_1441b47f341dc34f() {
    return addHeapObject(Symbol.iterator);
}
/**
 * Import shim: reads the `length` property of the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the object.
 * @returns {any} The raw `length` value.
 */
export function __wbg_length_2591a0f4f659a55c(arg0) {
    return getObject(arg0).length;
}
/**
 * Import shim: reads the `length` property of the heap object at `arg0`.
 * Exported under a second name with the same behaviour as the other `length` shim.
 * @param {number} arg0 - Heap index of the object.
 * @returns {any} The raw `length` value.
 */
export function __wbg_length_56fcd3e2b7e0299d(arg0) {
    return getObject(arg0).length;
}
/**
 * Import shim: creates an empty plain object on the object heap.
 * @returns {number} Heap index of the new object.
 */
export function __wbg_new_02d162bc6cf02f60() {
    return addHeapObject({});
}
/**
 * Import shim: creates an empty array on the object heap.
 * @returns {number} Heap index of the new array.
 */
export function __wbg_new_310879b66b6e95e1() {
    return addHeapObject([]);
}
/**
 * Import shim: constructs a `Uint8Array` from the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the constructor argument.
 * @returns {number} Heap index of the new typed array.
 */
export function __wbg_new_7ddec6de44ff8f5d(arg0) {
    const bytes = new Uint8Array(getObject(arg0));
    return addHeapObject(bytes);
}
/**
 * Import shim: reads (without calling) the `next` property of the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the object.
 * @returns {number} Heap index of the property value.
 */
export function __wbg_next_2a4e19f4f5083b0f(arg0) {
    const nextProperty = getObject(arg0).next;
    return addHeapObject(nextProperty);
}
/**
 * Import shim: calls `next()` on the heap object at `arg0`.
 * Exceptions are routed through `handleError`.
 * @returns {number|undefined} Heap index of the call result, or `undefined` if the call threw.
 */
export function __wbg_next_6429a146bf756f93(...shimArgs) {
    return handleError((arg0) => {
        const step = getObject(arg0).next();
        return addHeapObject(step);
    }, shimArgs);
}
/**
 * Import shim: copies the heap object at `arg2` into the wasm memory region
 * `[arg0, arg0 + arg1)` using `Uint8Array.prototype.set`.
 * @param {number} arg0 - Destination pointer in wasm memory.
 * @param {number} arg1 - Length of the destination region in bytes.
 * @param {number} arg2 - Heap index of the source.
 */
export function __wbg_prototypesetcall_5f9bdc8d75e07276(arg0, arg1, arg2) {
    const destination = getArrayU8FromWasm0(arg0, arg1);
    const source = getObject(arg2);
    Uint8Array.prototype.set.call(destination, source);
}
/**
 * Import shim: assigns `target[key] = value`. The target stays on the heap; the key
 * and value slots are released (key first, then value).
 * @param {number} arg0 - Heap index of the target object.
 * @param {number} arg1 - Heap index of the property key.
 * @param {number} arg2 - Heap index of the value.
 */
export function __wbg_set_6be42768c690e380(arg0, arg1, arg2) {
    const target = getObject(arg0);
    const key = takeObject(arg1);
    const value = takeObject(arg2);
    target[key] = value;
}
/**
 * Import shim: assigns `target[index] = value`, with `arg1` reinterpreted as an
 * unsigned 32-bit index. The value's heap slot is released.
 * @param {number} arg0 - Heap index of the target object.
 * @param {number} arg1 - Element index.
 * @param {number} arg2 - Heap index of the value.
 */
export function __wbg_set_78ea6a19f4818587(arg0, arg1, arg2) {
    const target = getObject(arg0);
    const index = arg1 >>> 0;
    target[index] = takeObject(arg2);
}
/**
 * Import shim: reads the `value` property of the heap object at `arg0`.
 * @param {number} arg0 - Heap index of the object.
 * @returns {number} Heap index of the property value.
 */
export function __wbg_value_9cc0518af87a489c(arg0) {
    const current = getObject(arg0).value;
    return addHeapObject(current);
}
/**
 * Cast intrinsic for `F64 -> Externref`: stores the number on the object heap.
 * @param {number} arg0 - The number to store.
 * @returns {number} Heap index of the stored number.
 */
export function __wbindgen_cast_0000000000000001(arg0) {
    return addHeapObject(arg0);
}
/**
 * Cast intrinsic for `Ref(String) -> Externref`: decodes a string from wasm memory
 * and stores it on the object heap.
 * @param {number} arg0 - Pointer to the string bytes.
 * @param {number} arg1 - Length forwarded together with the pointer.
 * @returns {number} Heap index of the decoded string.
 */
export function __wbindgen_cast_0000000000000002(arg0, arg1) {
    const text = getStringFromWasm0(arg0, arg1);
    return addHeapObject(text);
}
/**
 * Cast intrinsic for `Vector(U32) -> Externref`: copies `arg1` 32-bit values out of
 * wasm memory, hands the source buffer to `wasm.__wbindgen_export4`, and stores the
 * copy on the object heap.
 * @param {number} arg0 - Pointer to the values.
 * @param {number} arg1 - Number of 32-bit elements.
 * @returns {number} Heap index of the copied `Uint32Array`.
 */
export function __wbindgen_cast_0000000000000003(arg0, arg1) {
    // Copy first: the view aliases wasm memory that is handed back on the next line.
    const copied = getArrayU32FromWasm0(arg0, arg1).slice();
    wasm.__wbindgen_export4(arg0, arg1 * 4, 4);
    return addHeapObject(copied);
}
/**
 * Cast intrinsic for `Vector(U8) -> Externref`: copies `arg1` bytes out of wasm
 * memory, hands the source buffer to `wasm.__wbindgen_export4`, and stores the copy
 * on the object heap.
 * @param {number} arg0 - Pointer to the bytes.
 * @param {number} arg1 - Number of bytes.
 * @returns {number} Heap index of the copied `Uint8Array`.
 */
export function __wbindgen_cast_0000000000000004(arg0, arg1) {
    // Copy first: the view aliases wasm memory that is handed back on the next line.
    const copied = getArrayU8FromWasm0(arg0, arg1).slice();
    wasm.__wbindgen_export4(arg0, arg1 * 1, 1);
    return addHeapObject(copied);
}
/**
 * Stores the heap object at `arg0` in a second heap slot.
 * @param {number} arg0 - Heap index of the existing object.
 * @returns {number} Heap index of the additional slot.
 */
export function __wbindgen_object_clone_ref(arg0) {
    return addHeapObject(getObject(arg0));
}
/**
 * Releases the heap slot at `arg0`; the value read from it is discarded.
 * @param {number} arg0 - Heap index to release.
 */
export function __wbindgen_object_drop_ref(arg0) {
    takeObject(arg0);
}
/**
 * Registry that passes a held pointer to `wasm.__wbg_kitoken_free(ptr, 1)` when the
 * owning object is finalized. Falls back to no-op `register`/`unregister` methods in
 * runtimes without `FinalizationRegistry`.
 */
const KitokenFinalization = typeof FinalizationRegistry !== 'undefined'
    ? new FinalizationRegistry((ptr) => wasm.__wbg_kitoken_free(ptr, 1))
    : { register: () => {}, unregister: () => {} };
/**
 * Stores `obj` in the next free slot of the JS object heap and returns its index.
 * Free slots form a linked list: each free slot holds the index of the next free one.
 * @param {any} obj - Value to store.
 * @returns {number} Index of the slot now holding `obj`.
 */
function addHeapObject(obj) {
    // When the free list is exhausted, append a slot that points one past the new end.
    if (heap_next === heap.length) {
        heap.push(heap.length + 1);
    }
    const slot = heap_next;
    heap_next = heap[slot];
    heap[slot] = obj;
    return slot;
}
/**
 * Produces a short, human-readable description of an arbitrary JavaScript value.
 * @param {any} val - Value to describe.
 * @returns {string}
 */
function debugString(val) {
    // Primitives and functions are recognised through `typeof`.
    switch (typeof val) {
        case 'number':
        case 'boolean':
            return `${val}`;
        case 'string':
            return `"${val}"`;
        case 'symbol': {
            const description = val.description;
            return description == null ? 'Symbol' : `Symbol(${description})`;
        }
        case 'function': {
            const fnName = val.name;
            const hasName = typeof fnName == 'string' && fnName.length > 0;
            return hasName ? `Function(${fnName})` : 'Function';
        }
        default:
            break;
    }
    if (val == null) {
        return `${val}`;
    }
    // Arrays are rendered element by element; holes read as `undefined`.
    if (Array.isArray(val)) {
        const count = val.length;
        const parts = [];
        for (let index = 0; index < count; index++) {
            parts.push(debugString(val[index]));
        }
        return `[${parts.join(', ')}]`;
    }
    // Everything else is classified by its `[object ClassName]` tag.
    const tag = toString.call(val);
    const tagMatch = /\[object ([^\]]+)\]/.exec(tag);
    if (!tagMatch || tagMatch.length <= 1) {
        // Failed to match the standard '[object ClassName]'
        return tag;
    }
    const className = tagMatch[1];
    if (className == 'Object') {
        // Plain objects and user-defined classes: JSON.stringify throws on cycles,
        // in which case only the bare tag is reported.
        try {
            return `Object(${JSON.stringify(val)})`;
        } catch (_) {
            return 'Object';
        }
    }
    if (val instanceof Error) {
        return `${val.name}: ${val.message}\n${val.stack}`;
    }
    // TODO we could test for more things here, like `Set`s and `Map`s.
    return className;
}
/**
 * Returns the heap slot `idx` to the free list. Indices below 1028 (the 1024
 * pre-filled slots plus the four fixed values pushed at start-up) are never released.
 * @param {number} idx - Heap index to release.
 */
function dropObject(idx) {
    const isReserved = idx < 1028;
    if (isReserved) {
        return;
    }
    // The freed slot becomes the new head of the free list.
    heap[idx] = heap_next;
    heap_next = idx;
}
/**
 * Reads `len` little-endian 32-bit heap indices starting at `ptr` and resolves each
 * one with `takeObject`, which also releases its heap slot.
 * @param {number} ptr - Address of the first index (treated as unsigned 32-bit).
 * @param {number} len - Number of indices to read.
 * @returns {any[]} The resolved values, in memory order.
 */
function getArrayJsValueFromWasm0(ptr, len) {
    const base = ptr >>> 0;
    const view = getDataViewMemory0();
    const values = [];
    for (let n = 0; n < len; n++) {
        const heapIndex = view.getUint32(base + 4 * n, true);
        values.push(takeObject(heapIndex));
    }
    return values;
}
/**
 * Returns a `Uint32Array` view (not a copy) over `len` elements of wasm memory
 * starting at byte address `ptr`.
 * @param {number} ptr - Byte address (treated as unsigned 32-bit).
 * @param {number} len - Number of 32-bit elements.
 * @returns {Uint32Array}
 */
function getArrayU32FromWasm0(ptr, len) {
    // Convert the byte address into an element index.
    const first = (ptr >>> 0) / 4;
    return getUint32ArrayMemory0().subarray(first, first + len);
}
/**
 * Returns a `Uint8Array` view (not a copy) over `len` bytes of wasm memory starting
 * at address `ptr`.
 * @param {number} ptr - Byte address (treated as unsigned 32-bit).
 * @param {number} len - Number of bytes.
 * @returns {Uint8Array}
 */
function getArrayU8FromWasm0(ptr, len) {
    const start = ptr >>> 0;
    return getUint8ArrayMemory0().subarray(start, start + len);
}
let cachedDataViewMemory0 = null;
/**
 * Returns a cached `DataView` over `wasm.memory.buffer`, rebuilding it when there is
 * no cached view, when the cached buffer reports `detached === true`, or — where
 * `detached` is not available — when the cached buffer is no longer the current one.
 * @returns {DataView}
 */
function getDataViewMemory0() {
    let mustRebuild = cachedDataViewMemory0 === null;
    if (!mustRebuild) {
        const buffer = cachedDataViewMemory0.buffer;
        mustRebuild = buffer.detached === true
            || (buffer.detached === undefined && buffer !== wasm.memory.buffer);
    }
    if (mustRebuild) {
        cachedDataViewMemory0 = new DataView(wasm.memory.buffer);
    }
    return cachedDataViewMemory0;
}
/**
 * Decodes `len` bytes of wasm memory at `ptr` into a string via `decodeText`.
 * @param {number} ptr - Byte address (treated as unsigned 32-bit).
 * @param {number} len - Number of bytes.
 * @returns {string}
 */
function getStringFromWasm0(ptr, len) {
    const address = ptr >>> 0;
    return decodeText(address, len);
}
let cachedUint32ArrayMemory0 = null;
/**
 * Returns a cached `Uint32Array` over `wasm.memory.buffer`. The view is rebuilt when
 * none is cached or when the cached one has a `byteLength` of 0.
 * @returns {Uint32Array}
 */
function getUint32ArrayMemory0() {
    const cached = cachedUint32ArrayMemory0;
    if (cached !== null && cached.byteLength !== 0) {
        return cached;
    }
    cachedUint32ArrayMemory0 = new Uint32Array(wasm.memory.buffer);
    return cachedUint32ArrayMemory0;
}
let cachedUint8ArrayMemory0 = null;
/**
 * Returns a cached `Uint8Array` over `wasm.memory.buffer`. The view is rebuilt when
 * none is cached or when the cached one has a `byteLength` of 0.
 * @returns {Uint8Array}
 */
function getUint8ArrayMemory0() {
    const cached = cachedUint8ArrayMemory0;
    if (cached !== null && cached.byteLength !== 0) {
        return cached;
    }
    cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
    return cachedUint8ArrayMemory0;
}
/**
 * Looks up the value stored at heap index `idx` without releasing the slot.
 * @param {number} idx - Heap index.
 * @returns {any}
 */
function getObject(idx) {
    return heap[idx];
}
/**
 * Invokes `f` with `args`. If `f` throws, the exception is placed on the object heap
 * and its index is passed to `wasm.__wbindgen_export3`; the function then returns
 * `undefined`.
 * @param {Function} f - Function to invoke.
 * @param {ArrayLike<any>} args - Arguments forwarded to `f`.
 * @returns {any} The return value of `f`, or `undefined` if it threw.
 */
function handleError(f, args) {
    let outcome;
    try {
        outcome = f.apply(this, args);
    } catch (thrown) {
        wasm.__wbindgen_export3(addHeapObject(thrown));
    }
    return outcome;
}
// JS object heap shared with the wasm module: 1024 pre-filled `undefined` slots
// followed by four fixed values (`undefined`, `null`, `true`, `false`).
let heap = Array.from({ length: 1024 }, () => undefined);
heap.push(...[undefined, null, true, false]);
// Index of the next free slot; initially one past the fixed values.
let heap_next = heap.length;
/**
 * Reports whether `x` is strictly `null` or `undefined`.
 * @param {any} x - Value to test.
 * @returns {boolean}
 */
function isLikeNone(x) {
    if (x === null) {
        return true;
    }
    return x === undefined;
}
/**
 * Copies an array of 32-bit values into memory obtained from `malloc` and records the
 * element count in `WASM_VECTOR_LEN`.
 * @param {ArrayLike<number>} arg - Values to copy.
 * @param {Function} malloc - Called as `malloc(byteSize, 4)`; its result is used as the address.
 * @returns {number} Address of the copied data (unsigned 32-bit).
 */
function passArray32ToWasm0(arg, malloc) {
    const count = arg.length;
    const address = malloc(count * 4, 4) >>> 0;
    // The typed-array offset is in elements, hence the division by 4.
    getUint32ArrayMemory0().set(arg, address / 4);
    WASM_VECTOR_LEN = count;
    return address;
}
/**
 * Copies an array of bytes into memory obtained from `malloc` and records the byte
 * count in `WASM_VECTOR_LEN`.
 * @param {ArrayLike<number>} arg - Bytes to copy.
 * @param {Function} malloc - Called as `malloc(byteSize, 1)`; its result is used as the address.
 * @returns {number} Address of the copied data (unsigned 32-bit).
 */
function passArray8ToWasm0(arg, malloc) {
    const count = arg.length;
    const address = malloc(count, 1) >>> 0;
    getUint8ArrayMemory0().set(arg, address);
    WASM_VECTOR_LEN = count;
    return address;
}
/**
 * Places every element of `array` on the JS object heap and writes the resulting
 * heap indices as little-endian 32-bit values into memory obtained from `malloc`.
 * The element count is recorded in `WASM_VECTOR_LEN`.
 * @param {any[]} array - Values to pass.
 * @param {Function} malloc - Called as `malloc(byteSize, 4)`; its result is used as the address.
 * @returns {number} Address of the index table (unsigned 32-bit).
 */
function passArrayJsValueToWasm0(array, malloc) {
    const count = array.length;
    const address = malloc(count * 4, 4) >>> 0;
    // Obtain the view after allocating so it reflects the current memory buffer.
    const view = getDataViewMemory0();
    for (let n = 0; n < count; n++) {
        const heapIndex = addHeapObject(array[n]);
        view.setUint32(address + 4 * n, heapIndex, true);
    }
    WASM_VECTOR_LEN = count;
    return address;
}
/**
* Encodes a JavaScript string as UTF-8 into memory obtained from `malloc`, stores the
* number of bytes written in `WASM_VECTOR_LEN`, and returns the address.
* @param {string} arg - Text to encode.
* @param {Function} malloc - Called as `malloc(size, 1)`.
* @param {Function} [realloc] - Called as `realloc(ptr, oldSize, newSize, 1)`. When omitted,
* the string is fully encoded first and copied in one step.
* @returns {number} Address of the encoded bytes (unsigned 32-bit).
*/
function passStringToWasm0(arg, malloc, realloc) {
// No reallocator: encode into a temporary buffer so the exact byte length is known before allocating.
if (realloc === undefined) {
const buf = cachedTextEncoder.encode(arg);
const ptr = malloc(buf.length, 1) >>> 0;
getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf);
WASM_VECTOR_LEN = buf.length;
return ptr;
}
// Optimistic path: allocate one byte per UTF-16 code unit and copy code units up to 0x7F directly.
let len = arg.length;
let ptr = malloc(len, 1) >>> 0;
const mem = getUint8ArrayMemory0();
let offset = 0;
for (; offset < len; offset++) {
const code = arg.charCodeAt(offset);
// Stop at the first code unit above 0x7F; the remainder goes through the encoder below.
if (code > 0x7F) break;
mem[ptr + offset] = code;
}
// The loop stopped early: the remainder of the string still has to be encoded.
if (offset !== len) {
if (offset !== 0) {
// Drop the prefix that was already copied byte-by-byte.
arg = arg.slice(offset);
}
// `len` is reassigned inside the argument list, so `realloc` receives the old size followed by
// the new size (`offset` plus three bytes per remaining code unit).
ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
// The memory view is fetched again here rather than reusing `mem`.
const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len);
const ret = cachedTextEncoder.encodeInto(arg, view);
offset += ret.written;
// Shrink the allocation to the number of bytes actually written.
ptr = realloc(ptr, len, offset, 1) >>> 0;
}
WASM_VECTOR_LEN = offset;
return ptr;
}
/**
 * Reads the value stored at heap index `idx` and then releases the slot via
 * `dropObject`.
 * @param {number} idx - Heap index.
 * @returns {any} The value that was stored in the slot.
 */
function takeObject(idx) {
    const value = getObject(idx);
    dropObject(idx);
    return value;
}
// Byte budget after which the shared decoder is replaced; per its name this relates
// to a Safari decoding limit (NOTE(review): confirm the origin of the exact value).
const MAX_SAFARI_DECODE_BYTES = 2146435072;
// Running total of bytes handed to the current decoder instance.
let numBytesDecoded = 0;
// Shared strict UTF-8 decoder; `decode()` is called once with no input right after construction.
let cachedTextDecoder = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true });
cachedTextDecoder.decode();
/**
 * Decodes `len` bytes of wasm memory starting at `ptr` as UTF-8. Once the running byte
 * total reaches `MAX_SAFARI_DECODE_BYTES`, a fresh decoder is created and the total
 * restarts at `len`.
 * @param {number} ptr - Byte address of the text.
 * @param {number} len - Number of bytes to decode.
 * @returns {string}
 */
function decodeText(ptr, len) {
    numBytesDecoded += len;
    const budgetExhausted = numBytesDecoded >= MAX_SAFARI_DECODE_BYTES;
    if (budgetExhausted) {
        cachedTextDecoder = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true });
        cachedTextDecoder.decode();
        numBytesDecoded = len;
    }
    const bytes = getUint8ArrayMemory0().subarray(ptr, ptr + len);
    return cachedTextDecoder.decode(bytes);
}
// Shared UTF-8 encoder used when copying strings into wasm memory.
const cachedTextEncoder = new TextEncoder();
// Fallback for runtimes whose TextEncoder lacks `encodeInto`: encode into a temporary
// buffer, copy it into the target view, and report the same `{ read, written }` shape.
if (('encodeInto' in cachedTextEncoder) === false) {
    cachedTextEncoder.encodeInto = (arg, view) => {
        const encoded = cachedTextEncoder.encode(arg);
        view.set(encoded);
        return { read: arg.length, written: encoded.length };
    };
}
// The wasm exports object used by every helper in this module; unset until
// `__wbg_set_wasm` is called.
let wasm;
// Length recorded by the most recent `pass*ToWasm0` helper call.
let WASM_VECTOR_LEN = 0;
/**
 * Installs the wasm exports object used by this module.
 * @param {any} val - The exports object to store.
 */
export function __wbg_set_wasm(val) {
    wasm = val;
}