unicode-to-plain-text
Version:
Convert fancy Unicode text to plain ASCII with smart language preservation
94 lines (93 loc) • 3.6 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.mapCharacters = void 0;
const UNICODE_RANGES_1 = require("./maps/UNICODE_RANGES");
const MATH_SYMBOLS_MAP_1 = require("./maps/MATH_SYMBOLS_MAP");
const MIRRORED_MAP_1 = require("./maps/MIRRORED_MAP");
const CANADIAN_ABORIGINAL_MAP_1 = require("./maps/CANADIAN_ABORIGINAL_MAP");
const SMALL_CAPS_MAP_1 = require("./maps/SMALL_CAPS_MAP");
const STROKED_MAP_1 = require("./maps/STROKED_MAP");
const SUPERSCRIPT_MAP_1 = require("./maps/SUPERSCRIPT_MAP");
const SUBSCRIPT_MAP_1 = require("./maps/SUBSCRIPT_MAP");
const CURRENCY_MAP_1 = require("./maps/CURRENCY_MAP");
const CIRCLED_NUMBERS_MAP_1 = require("./maps/CIRCLED_NUMBERS_MAP");
const BLACK_CIRCLES_MAP_1 = require("./maps/BLACK_CIRCLES_MAP");
const DARK_SQUARES_MAP_1 = require("./maps/DARK_SQUARES_MAP");
const REGIONAL_INDICATORS_MAP_1 = require("./maps/REGIONAL_INDICATORS_MAP");
const PARENTHESIZED_MAP_1 = require("./maps/PARENTHESIZED_MAP");
const MISCELLANEOUS_MAP_1 = require("./maps/MISCELLANEOUS_MAP");
/**
 * Converts every mapped character of `text` to its plain replacement.
 *
 * Characters that belong to the preserve set computed for this text are
 * passed through untouched; characters without a mapping are kept as-is.
 *
 * @param {string} text - Input text, processed one code point at a time.
 * @returns {string} The converted text.
 */
const mapCharacters = (text) => {
    const preserved = getPreserveSet(text);
    const lookup = getCharMap();
    const pieces = [];
    for (const symbol of Array.from(text)) {
        if (preserved.has(symbol)) {
            pieces.push(symbol);
        } else {
            // Only a missing (null/undefined) mapping falls back to the input.
            pieces.push(lookup.get(symbol) ?? symbol);
        }
    }
    return pieces.join('');
};
exports.mapCharacters = mapCharacters;
/**
 * Builds the set of characters that must not be converted for this text.
 *
 * When a script detector reports that the text really contains that script,
 * every character of the script's code point ranges is added to the set.
 *
 * @param {string} text - Text being converted.
 * @returns {Set<string>} Characters to leave untouched (possibly empty).
 */
const getPreserveSet = (text) => {
    const preserved = new Set();
    // Detector + inclusive code point ranges, evaluated in this order.
    const scripts = [
        { isPresent: hasRealGreek, ranges: [[0x0370, 0x03ff], [0x1f00, 0x1fff]] },
        { isPresent: hasRealEthiopic, ranges: [[0x1200, 0x137f]] },
    ];
    for (const { isPresent, ranges } of scripts) {
        if (!isPresent(text)) {
            continue;
        }
        for (const [first, last] of ranges) {
            addRange(preserved, first, last);
        }
    }
    return preserved;
};
/**
 * Reports whether `text` contains at least one character in
 * U+0370–U+03FF or U+1F00–U+1FFF.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` as soon as one such character is found.
 */
const hasRealGreek = (text) => {
    for (const symbol of text) {
        const unit = symbol.charCodeAt(0);
        const inBasicRange = unit >= 0x0370 && unit <= 0x03ff;
        const inExtendedRange = unit >= 0x1f00 && unit <= 0x1fff;
        if (inBasicRange || inExtendedRange) {
            return true;
        }
    }
    return false;
};
/**
 * Heuristically decides whether `text` contains genuine Ethiopic writing
 * rather than a handful of isolated Ethiopic glyphs.
 *
 * The text counts as real Ethiopic when either:
 *  - it holds more than five characters from U+1200–U+137F, or
 *  - it holds at least two such characters and a space directly follows
 *    one of them (an Ethiopic word terminated by a space).
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when the Ethiopic characters look like real text.
 */
const hasRealEthiopic = (text) => {
    let count = 0;
    let hasSpace = false;
    let inEthiopicWord = false;
    for (const char of text) {
        const code = char.codePointAt(0);
        if (code >= 0x1200 && code <= 0x137f) {
            // Enough Ethiopic characters on their own settle the question.
            if (++count > 5) {
                return true;
            }
            inEthiopicWord = true;
        } else {
            // The space test must live in this branch: a space is never inside
            // the Ethiopic range, so testing it next to the range check could
            // never succeed and `hasSpace` would stay false forever.
            if (inEthiopicWord && char === ' ') {
                hasSpace = true;
            }
            inEthiopicWord = false;
        }
    }
    return hasSpace && count > 1;
};
/**
 * Adds one single-character string per code point in the inclusive range
 * `start`..`end` to `set`. Nothing is added when `start` is greater than `end`.
 *
 * @param {Set<string>} set - Set to extend in place.
 * @param {number} start - First code point (inclusive).
 * @param {number} end - Last code point (inclusive).
 * @returns {void}
 */
const addRange = (set, start, end) => {
    let codePoint = start;
    while (codePoint <= end) {
        set.add(String.fromCodePoint(codePoint));
        codePoint++;
    }
};
// Module-level cache for the character lookup table; stays null until
// getCharMap() builds it on first use.
let CHAR_MAP = null;
/**
 * Assembles the lookup table from fancy character to plain replacement.
 *
 * First, every `[start, base, length]` entry of UNICODE_RANGES maps the
 * `length` consecutive code points beginning at `start` onto the consecutive
 * characters beginning at `base`. Then the entries of each dedicated map are
 * copied in, in the listed order, so a later map overrides an earlier one
 * (and any range entry) for the same key.
 *
 * @returns {Map<string, string>} Newly built lookup table.
 */
const buildCharMap = () => {
    const lookup = new Map();
    for (const [start, base, length] of UNICODE_RANGES_1.UNICODE_RANGES) {
        const baseCode = base.charCodeAt(0);
        let offset = 0;
        while (offset < length) {
            const fancy = String.fromCodePoint(start + offset);
            const plain = String.fromCharCode(baseCode + offset);
            lookup.set(fancy, plain);
            offset++;
        }
    }
    // Order matters: later tables win when a key appears more than once.
    const overrideTables = [
        MATH_SYMBOLS_MAP_1.MATH_SYMBOLS_MAP,
        MIRRORED_MAP_1.MIRRORED_MAP,
        CANADIAN_ABORIGINAL_MAP_1.CANADIAN_ABORIGINAL_MAP,
        SMALL_CAPS_MAP_1.SMALL_CAPS_MAP,
        STROKED_MAP_1.STROKED_MAP,
        SUPERSCRIPT_MAP_1.SUPERSCRIPT_MAP,
        SUBSCRIPT_MAP_1.SUBSCRIPT_MAP,
        CURRENCY_MAP_1.CURRENCY_MAP,
        CIRCLED_NUMBERS_MAP_1.CIRCLED_NUMBERS_MAP,
        BLACK_CIRCLES_MAP_1.BLACK_CIRCLES_MAP,
        DARK_SQUARES_MAP_1.DARK_SQUARES_MAP,
        REGIONAL_INDICATORS_MAP_1.REGIONAL_INDICATORS_MAP,
        PARENTHESIZED_MAP_1.PARENTHESIZED_MAP,
        MISCELLANEOUS_MAP_1.MISCELLANEOUS_MAP
    ];
    overrideTables.forEach((table) => {
        Object.entries(table).forEach(([fancy, plain]) => {
            lookup.set(fancy, plain);
        });
    });
    return lookup;
};
/**
 * Returns the shared character lookup table, building and caching it in
 * CHAR_MAP the first time it is requested.
 *
 * @returns {Map<string, string>} The cached lookup table.
 */
const getCharMap = () => {
    if (CHAR_MAP === null || CHAR_MAP === undefined) {
        CHAR_MAP = buildCharMap();
    }
    return CHAR_MAP;
};