xterm
Version:
Full xterm terminal, in your browser
343 lines (321 loc) • 10.4 kB
text/typescript
/**
* Copyright (c) 2019 The xterm.js authors. All rights reserved.
* @license MIT
*/
/**
 * Turn a single UTF32 codepoint into its JS (UTF16) string representation.
 *
 * This is a lightweight stand-in for the built-in `String.fromCodePoint`,
 * which is noticeably slower because of its additional sanity checks.
 * Those checks are skipped here: callers are expected to hand in legal
 * UTF32 only (as produced by the input decoders).
 *
 * @param codePoint UTF32 codepoint to convert.
 * @returns A string of one UTF16 code unit for BMP codepoints, or a
 * surrogate pair (two code units) for codepoints above 0xFFFF.
 */
export function stringFromCodePoint(codePoint: number): string {
  const needsSurrogates = codePoint > 0xFFFF;
  if (!needsSurrogates) {
    return String.fromCharCode(codePoint);
  }
  // 20 bit offset into the supplementary planes
  const offset = codePoint - 0x10000;
  const highSurrogate = 0xD800 + (offset >> 10);
  const lowSurrogate = 0xDC00 + (offset % 0x400);
  return String.fromCharCode(highSurrogate, lowSurrogate);
}
/**
 * Build a JS string from a range of UTF32 codepoints.
 *
 * Performs the same conversion as `stringFromCodePoint`, but inlined in a
 * loop over many codepoints, which is a lot faster than calling it per item.
 *
 * @param data Buffer holding the UTF32 codepoints.
 * @param start Index of the first codepoint to convert (default 0).
 * @param end Index one past the last codepoint to convert (default `data.length`).
 * @returns The UTF16 encoded JS string for `data[start..end)`.
 */
export function utf32ToString(data: Uint32Array, start: number = 0, end: number = data.length): string {
  let text = '';
  let index = start;
  while (index < end) {
    const value = data[index];
    index++;
    if (value > 0xFFFF) {
      // Codepoints outside the BMP have no single UTF16 code unit and are
      // written as a surrogate pair instead:
      // - take the 20 bit offset above 0x10000
      // - upper 10 bits + 0xD800 --> high (first) surrogate
      // - lower 10 bits + 0xDC00 --> low (second) surrogate
      const offset = value - 0x10000;
      text += String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset % 0x400));
    } else {
      text += String.fromCharCode(value);
    }
  }
  return text;
}
/**
 * StringToUtf32 - decodes UTF16 sequences into UTF32 codepoints.
 * To keep the decoder in line with JS strings it handles single surrogates as UCS2,
 * i.e. an unpaired surrogate is passed through as its own codepoint.
 */
export class StringToUtf32 {
  /** High surrogate left over from the end of the previous chunk, 0 if none. */
  private _interim: number = 0;

  /**
   * Clears interim and resets decoder to clean state.
   */
  public clear(): void {
    this._interim = 0;
  }

  /**
   * Decode JS string to UTF32 codepoints.
   * The method assumes stream input and will store a partly transmitted
   * surrogate pair (a high surrogate at the very end of `input`) and decode
   * it with the next data chunk.
   * A high surrogate that is not followed by a low surrogate is written
   * as is (UCS2 handling); the code unit following it is then decoded on its
   * own, so it can still start a valid surrogate pair.
   * Note: The method does no bound checks for target, therefore make sure
   * the provided input data does not exceed the size of `target`
   * (a pending high surrogate from the previous chunk may add one more entry).
   *
   * @param input String chunk to decode.
   * @param target Buffer the decoded codepoints are written to, starting at index 0.
   * @returns The number of written codepoints in `target`.
   */
  public decode(input: string, target: Uint32Array): number {
    const length = input.length;

    if (!length) {
      return 0;
    }

    let size = 0;
    let startPos = 0;

    // handle leftover surrogate high
    if (this._interim) {
      const second = input.charCodeAt(0);
      if (0xDC00 <= second && second <= 0xDFFF) {
        target[size++] = (this._interim - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
        startPos = 1;
      } else {
        // illegal codepoint (UCS2 handling)
        // `second` is not consumed here - the main loop below decodes it,
        // since it might be the high surrogate of a valid pair itself
        target[size++] = this._interim;
      }
      this._interim = 0;
    }

    for (let i = startPos; i < length; ++i) {
      const code = input.charCodeAt(i);
      // surrogate pair first
      if (0xD800 <= code && code <= 0xDBFF) {
        if (++i >= length) {
          // pair is cut off by the chunk border, finish it with the next chunk
          this._interim = code;
          return size;
        }
        const second = input.charCodeAt(i);
        if (0xDC00 <= second && second <= 0xDFFF) {
          target[size++] = (code - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
        } else {
          // illegal codepoint (UCS2 handling)
          target[size++] = code;
          // step back so `second` gets examined by the next iteration,
          // it might be the high surrogate of a valid pair itself
          i--;
        }
        continue;
      }
      target[size++] = code;
    }
    return size;
  }
}
/**
 * Utf8ToUtf32 - decodes UTF8 byte sequences into UTF32 codepoints.
 * Malformed input is not reported; offending bytes are silently dropped.
 */
export class Utf8ToUtf32 {
  /**
   * Bytes of a multibyte sequence that got cut off at the end of the previous chunk:
   * the lead byte at index 0, followed by the continuation bytes received so far.
   * Unused slots are 0 (a stored lead or continuation byte is never 0).
   */
  public interim: Uint8Array = new Uint8Array(3);

  /**
   * Clears interim bytes and resets decoder to clean state.
   */
  public clear(): void {
    this.interim.fill(0);
  }

  /**
   * Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
   * The method assumes stream input and will store partly transmitted bytes
   * and decode them with the next data chunk.
   * Invalid sequences (wrong continuation bytes, overlong encodings,
   * encoded surrogates, values above 0x10FFFF, illegal lead bytes) are skipped.
   * Note: The method does no bound checks for target, therefore make sure
   * the provided data chunk does not exceed the size of `target`.
   *
   * @param input Chunk of UTF8 encoded bytes.
   * @param target Buffer the decoded codepoints are written to, starting at index 0.
   * @returns The number of written codepoints in `target`.
   */
  public decode(input: Uint8Array, target: Uint32Array): number {
    const length = input.length;

    if (!length) {
      return 0;
    }

    let size = 0;
    let byte1: number;
    let byte2: number;
    let byte3: number;
    let byte4: number;
    let codepoint = 0;
    let startPos = 0;

    // handle leftover bytes
    if (this.interim[0]) {
      let discardInterim = false;
      let cp = this.interim[0];
      // strip the length marker bits from the lead byte
      cp &= ((((cp & 0xE0) === 0xC0)) ? 0x1F : (((cp & 0xF0) === 0xE0)) ? 0x0F : 0x07);
      // Re-apply the continuation bytes stored so far.
      // A slot is tested on its raw value, not on its 6 payload bits:
      // the continuation byte 0x80 has a payload of 0 but still counts as stored.
      let pos = 1;
      let tmp: number;
      while (pos < this.interim.length && this.interim[pos]) {
        cp <<= 6;
        cp |= this.interim[pos] & 0x3F;
        pos++;
      }
      // missing bytes - read ahead from input
      const type = (((this.interim[0] & 0xE0) === 0xC0)) ? 2 : (((this.interim[0] & 0xF0) === 0xE0)) ? 3 : 4;
      const missing = type - pos;
      while (startPos < missing) {
        if (startPos >= length) {
          // still incomplete, interim already holds everything seen so far
          return 0;
        }
        tmp = input[startPos++];
        if ((tmp & 0xC0) !== 0x80) {
          // wrong continuation, discard interim bytes completely
          startPos--;
          discardInterim = true;
          break;
        } else {
          // need to save so we can continue short inputs in next call
          // (the 4th byte of a sequence does not fit and is not needed,
          // the out of bounds write is a no-op on a typed array)
          this.interim[pos++] = tmp;
          cp <<= 6;
          cp |= tmp & 0x3F;
        }
      }
      if (!discardInterim) {
        // final test is type dependent
        if (type === 2) {
          if (cp < 0x80) {
            // wrong starter byte
            startPos--;
          } else {
            target[size++] = cp;
          }
        } else if (type === 3) {
          if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
            // illegal codepoint
          } else {
            target[size++] = cp;
          }
        } else {
          // must test `cp` here - `codepoint` belongs to the main loop below
          // and is still 0 at this point
          if (cp < 0x010000 || cp > 0x10FFFF) {
            // illegal codepoint
          } else {
            target[size++] = cp;
          }
        }
      }
      this.interim.fill(0);
    }

    // loop through input
    const fourStop = length - 4;
    let i = startPos;
    while (i < length) {
      /**
       * ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
       * This is a compromise between speed gain for ASCII
       * and penalty for non ASCII:
       * For best ASCII performance the char should be stored directly into target,
       * but even a single attempt to write to target and compare afterwards
       * penalizes non ASCII really bad (-50%), thus we load the char into byteX first,
       * which reduces ASCII performance by ~15%.
       * This trial for ASCII reduces non ASCII performance by ~10% which seems acceptable
       * compared to the gains.
       * Note that this optimization only takes place for 4 consecutive ASCII chars,
       * for any shorter it bails out. Worst case - all 4 bytes being read but
       * thrown away due to the last being a non ASCII char (-10% performance).
       */
      while (i < fourStop
        && !((byte1 = input[i]) & 0x80)
        && !((byte2 = input[i + 1]) & 0x80)
        && !((byte3 = input[i + 2]) & 0x80)
        && !((byte4 = input[i + 3]) & 0x80))
      {
        target[size++] = byte1;
        target[size++] = byte2;
        target[size++] = byte3;
        target[size++] = byte4;
        i += 4;
      }

      // reread byte1
      byte1 = input[i++];

      // 1 byte
      if (byte1 < 0x80) {
        target[size++] = byte1;

        // 2 bytes
      } else if ((byte1 & 0xE0) === 0xC0) {
        if (i >= length) {
          this.interim[0] = byte1;
          return size;
        }
        byte2 = input[i++];
        if ((byte2 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F);
        if (codepoint < 0x80) {
          // wrong starter byte
          i--;
          continue;
        }
        target[size++] = codepoint;

        // 3 bytes
      } else if ((byte1 & 0xF0) === 0xE0) {
        if (i >= length) {
          this.interim[0] = byte1;
          return size;
        }
        byte2 = input[i++];
        if ((byte2 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        if (i >= length) {
          this.interim[0] = byte1;
          this.interim[1] = byte2;
          return size;
        }
        byte3 = input[i++];
        if ((byte3 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F);
        if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
          // illegal codepoint, no i-- here
          continue;
        }
        target[size++] = codepoint;

        // 4 bytes
      } else if ((byte1 & 0xF8) === 0xF0) {
        if (i >= length) {
          this.interim[0] = byte1;
          return size;
        }
        byte2 = input[i++];
        if ((byte2 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        if (i >= length) {
          this.interim[0] = byte1;
          this.interim[1] = byte2;
          return size;
        }
        byte3 = input[i++];
        if ((byte3 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        if (i >= length) {
          this.interim[0] = byte1;
          this.interim[1] = byte2;
          this.interim[2] = byte3;
          return size;
        }
        byte4 = input[i++];
        if ((byte4 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F);
        if (codepoint < 0x010000 || codepoint > 0x10FFFF) {
          // illegal codepoint, no i-- here
          continue;
        }
        target[size++] = codepoint;
      } else {
        // illegal byte, just skip
      }
    }
    return size;
  }
}