/*
 * Package listing metadata, as captured from the file viewer:
 *
 * fix-latin1-to-utf8
 * Version:
 * Fix mojibake when converting Latin-1 encoded text to UTF-8
 * 163 lines (156 loc) • 3.3 kB
 * JavaScript
 */
;
/**
 * @description Mojibake sequences ("actual") mapped to the characters they
 * were meant to be ("expected").
 *
 * A key is what a character turns into when its UTF-8 bytes are decoded one
 * byte at a time as Windows-1252 (a superset of printable Latin-1). For
 * example U+00E9 (e-acute) is the UTF-8 bytes C3 A9, which Windows-1252 shows
 * as "\u00C3\u00A9", so the table holds `"\u00C3\u00A9": "\u00E9"`.
 *
 * Coverage, in byte order 0x80-0xFF:
 * - the 27 characters Windows-1252 places in 0x80-0x9F (euro sign, curly
 *   quotes, dashes, etc.);
 * - U+00A0-U+00FF (no-break space through y-diaeresis), including U+00C1
 *   whose key is "\u00C3\u0081".
 *
 * Windows-1252 leaves bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D undefined. When
 * one of them occurs inside a key it is represented by the C1 control
 * character with the same value (e.g. "\u00C3\u008D" for U+00CD), which is
 * what lenient decoders emit.
 *
 * The table is generated from code points instead of being spelled out with
 * literal characters: a file full of literal mojibake is itself corrupted by
 * any tool that re-encodes or "repairs" it, which silently turns entries into
 * no-ops (key === value) or into duplicate keys.
 * @type {Readonly<Record<string, string>>}
 */
const REPLACEMENTS = Object.freeze(
	(() => {
		// Code points Windows-1252 assigns to bytes 0x80-0x9F, in byte order.
		// `null` marks the five bytes the code page leaves undefined.
		const CP1252_80_TO_9F = [
			0x20ac, null, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
			0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, null, 0x017d, null,
			null, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
			0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, null, 0x017e, 0x0178,
		];

		// Code point Windows-1252 defines for a byte in 0x80-0xFF, or `null`
		const codePointForByte = (byte) =>
			byte <= 0x9f ? CP1252_80_TO_9F[byte - 0x80] : byte;

		// UTF-8 bytes of a code point in U+0080-U+FFFF (all this table needs)
		const toUtf8Bytes = (codePoint) =>
			codePoint < 0x800
				? [0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f)]
				: [
						0xe0 | (codePoint >> 12),
						0x80 | ((codePoint >> 6) & 0x3f),
						0x80 | (codePoint & 0x3f),
					];

		// Character shown for a single byte >= 0x80 when it is wrongly decoded
		// as Windows-1252; undefined bytes pass through as C1 controls
		const misdecodeByte = (byte) => {
			const codePoint = codePointForByte(byte);
			return String.fromCodePoint(codePoint === null ? byte : codePoint);
		};

		const table = {};
		for (let byte = 0x80; byte <= 0xff; byte += 1) {
			const codePoint = codePointForByte(byte);
			// Undefined bytes have no character to restore, so they get no entry
			if (codePoint !== null) {
				// Actual: Expected
				const actual = toUtf8Bytes(codePoint).map(misdecodeByte).join("");
				table[actual] = String.fromCodePoint(codePoint);
			}
		}
		return table;
	})()
);
// Both patterns are built once at module load and reused on every call,
// because regexes are expensive to create and garbage collect.

// Quick pre-check used by fixLatin1ToUtf8 to skip the replace pass entirely
const LATIN1_PATTERN = /[ãâåæë]/iu;

// Every REPLACEMENTS key, joined into a single alternation
const MOJIBAKE_ALTERNATION = Object.keys(REPLACEMENTS).join("|");
// eslint-disable-next-line security/detect-non-literal-regexp -- Static regex, no user input
const MATCH_REG = new RegExp(MOJIBAKE_ALTERNATION, "gu");
/**
 * @author Frazer Smith
 * @description Repairs text that was mis-decoded when converting from Latin-1
 * (or Windows-1252) to UTF-8, by swapping each sequence listed in REPLACEMENTS
 * for the character it maps to. Strings that fail the cheap LATIN1_PATTERN
 * pre-check are returned as-is.
 * @see {@link http://www.i18nqa.com/debug/utf8-debug.html | UTF-8 Encoding Debugging Chart}
 * @param {string} str - The string to be converted.
 * @returns {string} The converted string.
 * @throws {TypeError} If the argument is not a string.
 */
function fixLatin1ToUtf8(str) {
	if (typeof str !== "string") {
		throw new TypeError("Expected a string");
	}

	// MATCH_REG only matches REPLACEMENTS keys, so the lookup always hits
	const restore = (mojibake) => REPLACEMENTS[mojibake];

	// Only pay for the full replace pass when the pre-check finds a candidate
	return LATIN1_PATTERN.test(str) ? str.replace(MATCH_REG, restore) : str;
}
// Public surface: the function itself is the module, and the same function is
// also reachable as `.default` and `.fixLatin1ToUtf8`; the lookup table is
// exposed as `.REPLACEMENTS`.
// NOTE(review): these are kept as plain `module.exports.<name> = ...`
// assignments, presumably so that static CommonJS-to-ESM export detection can
// see the named exports; confirm before folding them into `Object.assign` or
// an object literal.
module.exports = fixLatin1ToUtf8; // CommonJS export
module.exports.default = fixLatin1ToUtf8; // ESM default export
module.exports.fixLatin1ToUtf8 = fixLatin1ToUtf8; // TypeScript and named export
module.exports.REPLACEMENTS = REPLACEMENTS;