@technobuddha/library
Version:
A large library of useful functions
59 lines (49 loc) • 2.32 kB
text/typescript
import { empty } from '../constants';
/**
* Decode a UTF8 encoded string into unicode
*
* @param input the utf encoded string
* @returns the decoded strings (which is encoded as UTF-16 by javascript)
*/
export function decodeUTF8(input: string): string {
let result = empty;
for(let i = 0; i < input.length; ++i) {
let c0: number = input.charCodeAt(i);
let c1: number;
let c2: number;
let c3: number;
if(c0 > 0x007F) {
if(c0 > 0x00BF && c0 < 0x00E0) {
c1 = input.charCodeAt(++i);
if(i >= input.length)
throw new Error('Incomplete 2-byte sequence');
if((c1 & 0xC0) !== 0x80)
throw new Error('Incorrect 2 byte sequence');
c0 = (c0 & 0x001F) << 6 | (c1 & 0x003F);
} else if(c0 >= 0x00E0 && c0 < 0x00F0) {
c1 = input.charCodeAt(++i);
c2 = input.charCodeAt(++i);
if(i >= input.length)
throw new Error('Incomplete 3-byte sequence');
if((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80)
throw new Error('Incorrect 3 byte sequence');
c0 = (c0 & 0x000F) << 12 | (c1 & 0x003F) << 6 | (c2 & 0x003F);
} else if(c0 >= 0x00F0 && c0 < 0x00F8) {
c1 = input.charCodeAt(++i);
c2 = input.charCodeAt(++i);
c3 = input.charCodeAt(++i);
if(i >= input.length)
throw new Error('incomplete 4 byte sequence');
if((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80 || (c3 & 0xc0) !== 0x80)
throw new Error('Incorrect 3 byte sequence');
c0 = (c0 & 0x000f) << 18 | (c1 & 0x003F) << 12 | (c2 & 0x003F) << 6 | (c3 & 0x003F);
} else { throw new Error(`unknown multibyte start 0x${c0.toString(16)} @${i}`); }
}
if(c0 <= 0xFFFF) { result += String.fromCharCode(c0); } else if(c0 <= 0x0010FFFF) {
c0 -= 0x00010000;
result += String.fromCharCode(c0 >> 10 | 0xD800) + String.fromCharCode(c0 & 0x03FF | 0xDC00);
} else { throw new Error(`code point 0x${c0.toString(16)} exceeds UTF-16 reach`); }
}
return result;
}
export default decodeUTF8;