id3-parser
Version:
A pure JavaScript id3 tag parser.
220 lines (219 loc) • 7.66 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.skipPaddingZeros = exports.getEndpointOfBytes = exports.readBytesToString = exports.readBytesToISO8859 = exports.readBytesToUTF16 = exports.readBytesToUTF8 = void 0;
const toStr = String.fromCharCode;
/**
* Convert utf8 bytes to string.
* @description
* According to utf8 spec, char is encoded to [1,4] byte.
* 1. 1 byte, 0 - 0x7f, the same as Ascii chars.
* 2. 2 bytes, 110xxxxx 10xxxxxx.
* 3. 3 bytes, 1110xxxx 10xxxxxx 10xxxxxx.
* 4. 4 bytes, 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
* For 2-4 bytes, remove leading 10/110/1110/11110 and get final codepoint.
* @param bytes Utf8 binary bytes, usually array of numbers.
* @param maxToRead Max number of bytes to read.
*/
function readBytesToUTF8(bytes, maxToRead) {
if (maxToRead == null || maxToRead < 0) {
maxToRead = bytes.length;
}
else {
maxToRead = Math.min(maxToRead, bytes.length);
}
let index = 0;
// Process BOM(Byte order mark).
if (bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) {
index = 3;
}
const arr = [];
// Continue to insert string to arr until processed bytes' length reach max.
for (let i = 0; index < maxToRead; i++) {
const byte1 = bytes[index++];
let byte2;
let byte3;
let byte4;
let codepoint;
// End flag.
if (byte1 === 0x00) {
break;
}
// ASCII
else if (byte1 < 0x80) {
arr[i] = toStr(byte1);
}
// Check 110yyyyy(C0-DF) 10zzzzzz(80-BF)--> 000080 - 0007FF
// Because C0/C1 is invalid (RFC 3629), begin with C2.
else if (byte1 >= 0xC2 && byte1 < 0xE0) {
byte2 = bytes[index++];
arr[i] = toStr(((byte1 & 0x1F) << 6) + (byte2 & 0x3F));
}
// Check 1110xxxx(E0-EF) 10yyyyyy 10zzzzzz --> 000800 - 00D7FF 00E000 - 00FFFF
else if (byte1 >= 0xE0 && byte1 < 0xF0) {
byte2 = bytes[index++];
byte3 = bytes[index++];
arr[i] = toStr(((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F));
}
// Check 11110www(F0-F7) 10xxxxxx 10yyyyyy 10zzzzzz
// --> 010000 - 10FFFF
// RFC 3629 makes F5, F6, F7 invalid.
else if (byte1 >= 0xF0 && byte1 < 0xF5) {
byte2 = bytes[index++];
byte3 = bytes[index++];
byte4 = bytes[index++];
// See <https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae>
codepoint = ((byte1 & 0x07) << 18) +
((byte2 & 0x3F) << 12) +
((byte3 & 0x3F) << 6) +
(byte4 & 0x3F) - 0x10000;
// Invoke String.fromCharCode(H, L) to get correct char.
arr[i] = toStr((codepoint >> 10) + 0xD800, (codepoint & 0x3FF) + 0xDC00);
}
}
return arr.join('');
}
exports.readBytesToUTF8 = readBytesToUTF8;
/**
* Convert utf16 bytes to string.
* @description
* Utf16 represents char with one or two 16-bit code units per code point.
* 1. Range 0 - 0xFFFF (i.e. the BMP), can be represented with one 16-bit.
* 2. Range 0x10000 - 0x10FFFF (i.e. outside the BMP), can only be encoded using two 16-bit code units.
*
* The two 16-bit is called a surrogate pair.
* - The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF,
* and is called a high surrogate or a lead surrogate.
* - The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF,
* and is called a low surrogate or a trail surrogate.
*
* A codepoint `C` greater than 0xFFFF corresponds to a surrogate pair <H, L>:
* H = Math.floor((C - 0x10000) / 0x400) + 0xD800
* L = (C - 0x10000) % 0x400 + 0xDC00
* C = (H - 0xD800) * 0x400 + L - 0xDC00 + 0x10000
* @param bytes Utf16 binary bytes, usually array of numbers.
* @param isBigEndian Specify whether utf16 bytes big-endian or little-endian.
* @param maxToRead Max number of bytes to read.
*/
function readBytesToUTF16(bytes, isBigEndian, maxToRead) {
if (maxToRead == null || maxToRead < 0) {
maxToRead = bytes.length;
}
else {
maxToRead = Math.min(maxToRead, bytes.length);
}
let index = 0;
let offset1 = 1;
let offset2 = 0;
// Check BOM and set isBigEndian.
if (bytes[0] === 0xFE && bytes[1] === 0xFF) {
isBigEndian = true;
index = 2;
}
else if (bytes[0] === 0xFF && bytes[1] === 0xFE) {
isBigEndian = false;
index = 2;
}
if (isBigEndian) {
offset1 = 0;
offset2 = 1;
}
const arr = [];
let byte1;
let byte2;
let word1;
let word2;
let byte3;
let byte4;
for (let i = 0; index < maxToRead; i++) {
// Set high/low 8 bit corresponding to LE/BE.
byte1 = bytes[index + offset1];
byte2 = bytes[index + offset2];
// Get first 16 bits' value.
word1 = (byte1 << 8) + byte2;
index += 2;
// If 16 bits are all 0, means end.
if (word1 === 0x0000) {
break;
}
// First 16 bit: 0xD800 - 0xDFFF is reversed to indicate that
// the current 32 bits are to represent one char in Supplementary Planes.
// And if first 8 bits are not in [0xD8, 0xE0),means that
// the current 16 bits are represent one char in Basic Multilingual Plane.
else if (byte1 < 0xD8 || byte1 >= 0xE0) {
arr[i] = toStr(word1);
}
else {
// Get next 16 bits.
byte3 = bytes[index + offset1];
byte4 = bytes[index + offset2];
word2 = (byte3 << 8) + byte4;
index += 2;
// Then invoke String.fromCharCode(H, L) to get correct char.
arr[i] = toStr(word1, word2);
}
}
return arr.join('');
}
exports.readBytesToUTF16 = readBytesToUTF16;
function readBytesToISO8859(bytes, maxToRead) {
if (maxToRead == null || maxToRead < 0) {
maxToRead = bytes.length;
}
else {
maxToRead = Math.min(maxToRead, bytes.length);
}
const arr = [];
for (let i = 0; i < maxToRead; i++) {
arr.push(toStr(bytes[i]));
}
return arr.join('');
}
exports.readBytesToISO8859 = readBytesToISO8859;
/**
* Convert bytes to string according to encoding.
* @param bytes Binary bytes.
* @param encoding id3v2 tag encoding, always 0/1/2/3.
* @param maxToRead Max number of bytes to read.
*/
function readBytesToString(bytes, encoding, maxToRead) {
if (encoding === 0) {
return readBytesToISO8859(bytes, maxToRead);
}
else if (encoding === 3) {
return readBytesToUTF8(bytes, maxToRead);
}
else if (encoding === 1 || encoding === 2) {
return readBytesToUTF16(bytes, undefined, maxToRead);
}
else {
return null;
}
}
exports.readBytesToString = readBytesToString;
function getEndpointOfBytes(bytes, encoding, start = 0) {
// ISO-8859 use $00 as end flag, and
// unicode use $00 00 as end flag.
const checker = encoding === 0
? (index) => bytes[index] === 0
: (index) => (bytes[index] === 0 && bytes[index + 1] === 0);
let i = start;
for (; i < bytes.length; i++) {
if (checker(i)) {
break;
}
}
return i;
}
exports.getEndpointOfBytes = getEndpointOfBytes;
function skipPaddingZeros(bytes, start) {
for (let i = start;; i++) {
if (bytes[i] === 0) {
start++;
}
else {
break;
}
}
return start;
}
exports.skipPaddingZeros = skipPaddingZeros;