UNPKG

fix-latin1-to-utf8

Version:

Fix mojibake when converting Latin-1 encoded text to UTF-8

163 lines (156 loc) 3.3 kB
"use strict"; /** * @description Latin-1 characters and their corresponding UTF-8 characters. * @type {Readonly<Record<string, string>>} */ const REPLACEMENTS = Object.freeze({ // Actual: Expected "€": "€", "‚": "‚", "Æ’": "ƒ", "„": "„", "…": "…", "â€\u00A0": "†", "‡": "‡", "ˆ": "ˆ", "‰": "‰", "Å\u00A0": "Š", "‹": "‹", "Å’": "Œ", "Ž": "Ž", "‘": "‘", "’": "’", "“": "“", "â€\u009D": "”", "•": "•", "–": "–", "—": "—", Ëœ: "˜", "â„¢": "™", "Å¡": "š", "›": "›", "Å“": "œ", "ž": "ž", "Ÿ": "Ÿ", " ": " ", "¡": "¡", "¢": "¢", "£": "£", "¤": "¤", "Â¥": "¥", "¦": "¦", "§": "§", "¨": "¨", "©": "©", ª: "ª", "«": "«", "¬": "¬", "­": "­", "®": "®", "¯": "¯", "°": "°", "±": "±", "²": "²", "³": "³", "´": "´", µ: "µ", "¶": "¶", "·": "·", "¸": "¸", "¹": "¹", º: "º", "»": "»", "¼": "¼", "½": "½", "¾": "¾", "¿": "¿", "À": "À", "Â": "Â", Ã: "Ã", "Ä": "Ä", "Ã…": "Å", "Æ": "Æ", "Ç": "Ç", È: "È", "É": "É", Ê: "Ê", "Ë": "Ë", ÃŒ: "Ì", "Ã\u008D": "Í", ÃŽ: "Î", "Ã\u008F": "Ï", "Ã\u0090": "Ð", "Ñ": "Ñ", "Ã’": "Ò", "Ó": "Ó", "Ô": "Ô", "Õ": "Õ", "Ö": "Ö", "×": "×", "Ø": "Ø", "Ù": "Ù", Ú: "Ú", "Û": "Û", Ü: "Ü", "Ã\u009D": "Ý", Þ: "Þ", ß: "ß", "Ã\u00A0": "à", "á": "á", "â": "â", "ã": "ã", "ä": "ä", "Ã¥": "å", "æ": "æ", "ç": "ç", "è": "è", "é": "é", ê: "ê", "ë": "ë", "ì": "ì", "Ã\u00AD": "í", "î": "î", "ï": "ï", "ð": "ð", "ñ": "ñ", "ò": "ò", "ó": "ó", "ô": "ô", õ: "õ", "ö": "ö", "÷": "÷", "ø": "ø", "ù": "ù", ú: "ú", "û": "û", "ü": "ü", "ý": "ý", "þ": "þ", "ÿ": "ÿ", }); // Cache immutable regex as they are expensive to create and garbage collect const LATIN1_PATTERN = /[ãâåæë]/iu; // eslint-disable-next-line security/detect-non-literal-regexp -- Static regex, no user input const MATCH_REG = new RegExp(Object.keys(REPLACEMENTS).join("|"), "gu"); /** * @author Frazer Smith * @description Fixes common encoding errors when converting from Latin-1 (and Windows-1252) to UTF-8. * @see {@link http://www.i18nqa.com/debug/utf8-debug.html | UTF-8 Encoding Debugging Chart} * @param {string} str - The string to be converted. * @returns {string} The converted string. * @throws {TypeError} If the argument is not a string. */ function fixLatin1ToUtf8(str) { if (typeof str !== "string") { throw new TypeError("Expected a string"); } // Early return if no matches if (!LATIN1_PATTERN.test(str)) { return str; } return str.replace(MATCH_REG, (match) => REPLACEMENTS[match]); } module.exports = fixLatin1ToUtf8; // CommonJS export module.exports.default = fixLatin1ToUtf8; // ESM default export module.exports.fixLatin1ToUtf8 = fixLatin1ToUtf8; // TypeScript and named export module.exports.REPLACEMENTS = REPLACEMENTS;