// Package: @kayahr/text-encoding (version not shown)
// Description: Text encoder and decoder
// File listing: 140 lines • 4.38 kB • JavaScript
/*
* Copyright (C) 2021 Klaus Reimer <k@ailis.de>
* See LICENSE.md for licensing information.
*/
import { ByteBuffer } from "./ByteBuffer.js";
import { FINISHED } from "./constants.js";
import { getEncoding } from "./Encoding.js";
/**
 * Decodes a JavaScript string (a sequence of UTF-16 code units) into an array of Unicode code points.
 *
 * A well-formed surrogate pair is merged into a single supplementary code point. Any surrogate that is not part
 * of a well-formed pair (a lone low surrogate, or a high surrogate not followed by a low surrogate) is replaced
 * with U+FFFD.
 *
 * @param string - Input string of UTF-16 code units.
 * @returns The code points of the string, in input order.
 */
function stringToCodePoints(string) {
    const REPLACEMENT = 0xFFFD;
    const result = [];
    const length = string.length;
    for (let index = 0; index < length;) {
        // codePointAt combines a valid high/low surrogate pair starting at index; an unpaired surrogate is
        // returned as its own code unit value, which is detected and replaced below.
        const codePoint = string.codePointAt(index);
        const isLoneSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        result.push(isLoneSurrogate ? REPLACEMENT : codePoint);
        // A combined pair occupies two code units, everything else exactly one.
        index += codePoint > 0xFFFF ? 2 : 1;
    }
    return result;
}
/**
 * The TextEncoder represents an encoder for a specific text encoding, such as UTF-8, ISO-8859-2, KOI8-R, GBK, etc.
 * An encoder takes a string and emits an array of encoded bytes.
 */
export class TextEncoder {
    /** The encoding object returned by getEncoding() for the label passed to the constructor. */
    enc;
    /** The encoder created from the encoding. Created lazily on first use and reused afterwards. */
    encoder = null;

    /**
     * Creates a new encoder for the given encoding.
     *
     * @param label - The encoding label. Defaults to UTF-8
     */
    constructor(label = "utf-8") {
        this.enc = getEncoding(label);
    }

    /** @returns The name of the encoding. */
    get encoding() {
        return this.enc.getName();
    }

    /**
     * Encodes the given string and returns the encoded bytes.
     *
     * @param input - The string to encode. Defaults to the empty string.
     * @returns A new Uint8Array holding the encoded bytes.
     */
    encode(input = "") {
        // Initialize encoder if not already done
        this.encoder ??= this.enc.createEncoder();

        const inputStream = new ByteBuffer(stringToCodePoints(input));
        const output = [];
        while (true) {
            // The encoder yields either a single byte, an array of bytes, or FINISHED when the input is exhausted
            const result = this.encoder.encode(inputStream);
            if (result === FINISHED) {
                break;
            }
            if (Array.isArray(result)) {
                output.push(...result);
            }
            else {
                output.push(result);
            }
        }
        return new Uint8Array(output);
    }

    /**
     * Encodes as much of the given string as fits into the destination array, starting at destination index 0.
     * A multi-byte sequence is only written when it fits completely; otherwise encoding stops before it.
     *
     * @param source      - The string to encode.
     * @param destination - The byte array to write the encoded bytes into.
     * @returns An object with `read`, the number of UTF-16 code units of `source` that were encoded (so that
     *          `source.slice(read)` is the part that did not fit), and `written`, the number of bytes stored
     *          in `destination`.
     */
    encodeInto(source, destination) {
        // Initialize encoder if not already done
        this.encoder ??= this.enc.createEncoder();

        const inputStream = new ByteBuffer(stringToCodePoints(source));
        const capacity = destination.byteLength;
        let read = 0;
        let written = 0;
        while (written < capacity) {
            const result = this.encoder.encode(inputStream);
            if (result === FINISHED) {
                break;
            }
            if (Array.isArray(result)) {
                // Never write a partial byte sequence. The code point that produced it is not counted as read.
                if (written + result.length > capacity) {
                    break;
                }
                destination.set(result, written);
                written += result.length;
            }
            else {
                destination[written++] = result;
            }
            // `read` is measured in UTF-16 code units, not code points: a code point that came from a valid
            // surrogate pair occupies two units in `source`, everything else (including a lone surrogate that
            // was replaced by U+FFFD) occupies one. codePointAt only reports a value above 0xFFFF for a valid
            // pair, so it mirrors the grouping done by stringToCodePoints.
            // NOTE(review): assumes each non-FINISHED encoder result corresponds to exactly one consumed code
            // point - confirm for encoders that can emit extra byte sequences.
            read += source.codePointAt(read) > 0xFFFF ? 2 : 1;
        }
        return { read, written };
    }
}
/**
 * Factory for text encoders. When the label is exactly "utf-8" and the runtime provides a global TextEncoder
 * constructor, an instance of that built-in encoder (which only supports utf-8) is returned. In every other
 * case an instance of this module's own TextEncoder implementation is created for the given label.
 *
 * @param label - The encoding label. Defaults to "utf-8".
 * @returns The created text encoder.
 */
export function createTextEncoder(label = "utf-8") {
    // The global is only inspected when the label asks for utf-8
    const canUseBuiltIn = label === "utf-8" && typeof globalThis.TextEncoder === "function";
    if (!canUseBuiltIn) {
        return new TextEncoder(label);
    }
    return new globalThis.TextEncoder();
}
//# sourceMappingURL=TextEncoder.js.map