@cloudpss/ubjson
Version:
Opinionated UBJSON encoder/decoder for CloudPSS.
156 lines (141 loc) • 5.2 kB
text/typescript
// Code-unit thresholds used for UTF-8 sizing and encoding, adapted from
// https://github.com/ehmicky/string-byte-length/blob/main/src/char_code.js
// Exclusive upper bound of the 1-byte (ASCII) range: code units below this take 1 byte
const LAST_ASCII_CODEPOINT = 0x80;
// Exclusive upper bound of the 2-byte range: code units below this take at most 2 bytes
const LAST_TWO_BYTES_CODEPOINT = 0x800;
// Every other UTF-16 code unit takes 3 bytes, with one exception:
// a high surrogate (U+D800 to U+DBFF) immediately followed by a low surrogate
// (U+DC00 to U+DFFF) forms a single astral character taking 4 bytes in total.
// A surrogate that is not part of such a pair is still counted as 3 bytes.
const FIRST_HIGH_SURROGATE = 0xd800;
const LAST_HIGH_SURROGATE = 0xdbff;
const FIRST_LOW_SURROGATE = 0xdc00;
const LAST_LOW_SURROGATE = 0xdfff;

/**
 * Compute how many bytes `string` occupies once encoded as UTF-8,
 * without allocating or encoding anything.
 *
 * @param string - The text to measure.
 * @returns The UTF-8 byte length; unpaired surrogates count as 3 bytes each.
 */
export function jsStringByteLength(string: string): number {
    const unitCount = string.length;
    // Every code unit costs at least one byte; only the surplus is tracked here.
    let extraBytes = 0;
    let pos = 0;
    while (pos < unitCount) {
        const unit = string.charCodeAt(pos);
        pos += 1;
        if (unit < LAST_ASCII_CODEPOINT) {
            continue;
        }
        if (unit < LAST_TWO_BYTES_CODEPOINT) {
            extraBytes += 1;
            continue;
        }
        extraBytes += 2;
        const isHighSurrogate = unit >= FIRST_HIGH_SURROGATE && unit <= LAST_HIGH_SURROGATE;
        if (!isHighSurrogate) {
            continue;
        }
        // `pos` already points at the following code unit. Past the end of the
        // string `charCodeAt` yields NaN, which fails both range comparisons.
        const following = string.charCodeAt(pos);
        if (following >= FIRST_LOW_SURROGATE && following <= LAST_LOW_SURROGATE) {
            // Valid pair: 2 code units + 2 surplus bytes = 4 bytes, so the low
            // surrogate must not be counted again on its own.
            pos += 1;
        }
    }
    return unitCount + extraBytes;
}
/**
 * Compute the UTF-8 byte length of `string` by delegating to `Buffer.byteLength`.
 *
 * @param string - The text to measure.
 * @returns The number of bytes reported by `Buffer.byteLength` for the `'utf8'` encoding.
 */
export function nodeStringByteLength(string: string): number {
    const byteCount = Buffer.byteLength(string, 'utf8');
    return byteCount;
}
/**
 * Compute the UTF-8 byte length of a string.
 *
 * Bound by `resetEnv`: `nodeStringByteLength` when `Buffer.byteLength` is
 * available, otherwise `jsStringByteLength`.
 */
export let stringByteLength: (v: string) => number;
/**
 * Encode a string as UTF-8 into an existing byte array, in plain JavaScript.
 *
 * The caller MUST make sure `buf` has enough room after `offset`.
 * Unpaired surrogates are written as U+FFFD (bytes `EF BF BD`).
 *
 * @param v - The string to encode.
 * @param buf - Destination byte array.
 * @param offset - Index in `buf` at which the first byte is written.
 * @returns The number of bytes produced.
 */
export function jsEncodeInto(v: string, buf: Uint8Array, offset: number): number {
    const unitCount = v.length;
    let out = offset;
    let idx = 0;
    while (idx < unitCount) {
        const unit = v.charCodeAt(idx);
        idx += 1;

        // 1 byte: ASCII
        if (unit < LAST_ASCII_CODEPOINT) {
            buf[out] = unit;
            out += 1;
            continue;
        }

        // 2 bytes: 110xxxxx 10xxxxxx
        if (unit < LAST_TWO_BYTES_CODEPOINT) {
            buf[out] = 0xc0 | (unit >> 6);
            buf[out + 1] = 0x80 | (unit & 0x3f);
            out += 2;
            continue;
        }

        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (anything that is not a surrogate)
        const isSurrogate = unit >= FIRST_HIGH_SURROGATE && unit <= LAST_LOW_SURROGATE;
        if (!isSurrogate) {
            buf[out] = 0xe0 | (unit >> 12);
            buf[out + 1] = 0x80 | ((unit >> 6) & 0x3f);
            buf[out + 2] = 0x80 | (unit & 0x3f);
            out += 3;
            continue;
        }

        if (unit <= LAST_HIGH_SURROGATE) {
            // High surrogate: `idx` already points at the following code unit.
            // Past the end `charCodeAt` yields NaN, which fails the range test.
            const low = v.charCodeAt(idx);
            if (low >= FIRST_LOW_SURROGATE && low <= LAST_LOW_SURROGATE) {
                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                const codePoint = ((unit & 0x3ff) << 10) + (low & 0x3ff) + 0x1_0000;
                buf[out] = 0xf0 | (codePoint >> 18);
                buf[out + 1] = 0x80 | ((codePoint >> 12) & 0x3f);
                buf[out + 2] = 0x80 | ((codePoint >> 6) & 0x3f);
                buf[out + 3] = 0x80 | (codePoint & 0x3f);
                out += 4;
                // The low surrogate has been consumed as part of the pair.
                idx += 1;
                continue;
            }
        }

        // Either a low surrogate without a preceding high surrogate, or a high
        // surrogate not followed by a low surrogate: emit U+FFFD. The following
        // code unit (if any) is left in place and handled by the next iteration.
        buf[out] = 0xef;
        buf[out + 1] = 0xbf;
        buf[out + 2] = 0xbd;
        out += 3;
    }
    return out - offset;
}
// Shared `TextEncoder` instance used by `nativeEncodeInto` and by the `encode`
// variant built in `resetEnv`. `resetEnv` assigns it, using `null` when
// `TextEncoder` is not available as a global function.
/* c8 ignore next 1 */
let TEXT_ENCODER: TextEncoder | null;
/**
 * Encode a string as UTF-8 into an existing byte array using `TextEncoder.encodeInto`.
 *
 * The caller MUST make sure `buf` has enough room after `offset`.
 * Requires the module-level `TEXT_ENCODER` to be set; it is dereferenced
 * without a null check.
 *
 * @param v - The string to encode.
 * @param buf - Destination byte array.
 * @param offset - Index in `buf` at which the first byte is written.
 * @returns The `written` count reported by `encodeInto`.
 */
export function nativeEncodeInto(v: string, buf: Uint8Array, offset: number): number {
    const encoder = TEXT_ENCODER!;
    // `subarray` creates a view starting at `offset`, so the encoder writes in place.
    const { written } = encoder.encodeInto(v, buf.subarray(offset));
    return written;
}
/**
 * Encode a string as UTF-8 into an existing byte array, choosing the
 * implementation by string length.
 *
 * Strings shorter than 55 UTF-16 code units go through `jsEncodeInto`; longer
 * ones go through `nativeEncodeInto`.
 * NOTE(review): the threshold presumably reflects the call overhead of the
 * native encoder on short strings — confirm against benchmarks.
 *
 * The caller MUST make sure `buf` has enough room after `offset`.
 *
 * @param v - The string to encode.
 * @param buf - Destination byte array.
 * @param offset - Index in `buf` at which the first byte is written.
 * @returns The number of bytes written, as reported by the chosen implementation.
 */
export function myEncodeInto(v: string, buf: Uint8Array, offset: number): number {
    const encoder = v.length >= 55 ? nativeEncodeInto : jsEncodeInto;
    return encoder(v, buf, offset);
}
/**
 * Encode string into utf-8.
 * Provided `buf` MUST have enough space.
 *
 * Bound by `resetEnv`: `myEncodeInto` when the shared `TextEncoder` exposes
 * `encodeInto`, otherwise `jsEncodeInto`.
 * Returns the number of bytes written starting at `offset`.
 */
export let encodeInto: (v: string, buf: Uint8Array, offset: number) => number;
/**
 * Encode string into utf-8, returning a newly produced byte array.
 *
 * Bound by `resetEnv`, in order of preference: `Buffer.from(v, 'utf8')`,
 * `TextEncoder.encode`, or a fallback that sizes a `Uint8Array` with
 * `stringByteLength` and fills it with `jsEncodeInto`.
 */
export let encode: (v: string) => Uint8Array;
/**
 * Reset the environment bindings.
 *
 * Re-detects the globals `TextEncoder` and `Buffer` and rebinds
 * `TEXT_ENCODER`, `stringByteLength`, `encodeInto` and `encode` to the
 * matching implementation.
 */
export function resetEnv(): void {
    // Shared encoder instance, or `null` when `TextEncoder` is unavailable.
    if (typeof TextEncoder === 'function') {
        TEXT_ENCODER = new TextEncoder();
    } else {
        TEXT_ENCODER = null;
    }

    // Byte length: use `Buffer.byteLength` when it exists.
    const hasBufferByteLength = typeof Buffer === 'function' && typeof Buffer.byteLength === 'function';
    stringByteLength = hasBufferByteLength ? nodeStringByteLength : jsStringByteLength;

    // Encode into an existing buffer: the length-based hybrid needs `encodeInto`.
    const hasNativeEncodeInto = typeof TEXT_ENCODER?.encodeInto === 'function';
    encodeInto = hasNativeEncodeInto ? myEncodeInto : jsEncodeInto;

    // Encode into a fresh buffer: Buffer, then TextEncoder, then pure JS.
    if (typeof Buffer === 'function' && Buffer.from) {
        encode = (v) => Buffer.from(v, 'utf8');
    } else if (TEXT_ENCODER) {
        encode = (v) => TEXT_ENCODER!.encode(v);
    } else {
        encode = (v) => {
            const buf = new Uint8Array(stringByteLength(v));
            jsEncodeInto(v, buf, 0);
            return buf;
        };
    }
}
// Bind `TEXT_ENCODER`, `stringByteLength`, `encodeInto` and `encode` once at module load.
resetEnv();