UNPKG

@cch137/format-utils

Version:

A collection of utility modules for formatting and processing data

86 lines (85 loc) 3.06 kB
/**
 * Marks the free index closest to `centre` as selected.
 *
 * Probes `centre`, then `centre - 1`, `centre + 1`, `centre - 2`, ... and
 * skips positions that fall outside the array. The search only stops once
 * both directions have left the array, so a free slot is always found when
 * one exists (`centre` itself may lie past the end of the array).
 *
 * @param {boolean[]} selected - Selection flags, mutated in place.
 * @param {number} centre - Preferred non-negative index.
 * @returns {boolean} `true` when a slot was claimed, `false` when none is free.
 */
function claimNearestFreeIndex(selected, centre) {
  const total = selected.length;
  for (let offset = 0; centre - offset >= 0 || centre + offset < total; offset++) {
    const below = centre - offset;
    if (below >= 0 && below < total && !selected[below]) {
      selected[below] = true;
      return true;
    }
    const above = centre + offset;
    if (above < total && !selected[above]) {
      selected[above] = true;
      return true;
    }
  }
  return false;
}

/**
 * Shrinks `text` to `targetLength` characters by sampling it roughly evenly,
 * preserving the original character order.
 *
 * Characters are counted and selected as Unicode code points, so a surrogate
 * pair is never split in half.
 *
 * @param {string} text - Text to sample from.
 * @param {number} targetLength - Desired number of characters. Fractions are
 *   floored; values below 1 (or NaN) are treated as 1.
 * @returns {string} `text` unchanged when it is already short enough,
 *   otherwise a sample of exactly the requested number of characters.
 */
function concentrate(text, targetLength) {
  const characters = Array.from(text);
  const total = characters.length;
  if (total <= targetLength) return text;

  const requested = Math.floor(targetLength);
  const wanted = requested >= 1 ? requested : 1;
  if (total <= wanted) return text;

  // Pass 1: take every `stride`-th character. Because the stride is rounded
  // up, this yields at most `wanted` characters.
  const stride = Math.ceil(total / wanted);
  const selected = new Array(total).fill(false);
  let selectedCount = 0;
  for (let index = 0; index < total; index += stride) {
    selected[index] = true;
    selectedCount++;
  }

  // Pass 2: spread the characters still missing over the text, each one
  // claiming the free slot nearest to the middle of its own segment.
  const missing = wanted - selectedCount;
  if (missing > 0) {
    const gap = Math.ceil(total / missing);
    const halfGap = Math.ceil(gap / 2);
    for (let i = 0; i < missing; i++) {
      claimNearestFreeIndex(selected, i * gap + halfGap);
    }
  }

  return characters.filter((character, index) => selected[index]).join("");
}

/**
 * Inclusive Unicode code point ranges used to attribute a character to a
 * language. Languages are tested in declaration order and the first match wins.
 */
const languageCodeRanges = {
  en: [[0x0000, 0x007f]], // Basic Latin (ASCII, including digits and punctuation)
  zh: [
    [0x4e00, 0x9fff], // CJK Unified Ideographs
    [0x3400, 0x4dbf], // CJK Unified Ideographs Extension A
    [0x20000, 0x2a6df], // CJK Unified Ideographs Extension B
    [0x2a700, 0x2b73f], // CJK Unified Ideographs Extension C
    [0x2b740, 0x2b81f], // CJK Unified Ideographs Extension D
  ],
  ja: [
    [0x3040, 0x309f], // Hiragana
    [0x30a0, 0x30ff], // Katakana
    [0x31f0, 0x31ff], // Katakana Phonetic Extensions
    [0x1b000, 0x1b0ff], // Kana Supplement
    [0x1f200, 0x1f2ff], // Enclosed Ideographic Supplement
  ],
  ko: [[0xac00, 0xd7af]], // Hangul Syllables
  ru: [
    [0x0400, 0x04ff], // Cyrillic
    [0x0500, 0x052f], // Cyrillic Supplement
  ],
};

/**
 * Finds the first language whose ranges contain `codePoint`.
 *
 * @param {number} codePoint - Unicode code point to classify.
 * @returns {string|null} Language code, or `null` when no range matches.
 */
function findLanguageCode(codePoint) {
  for (const languageCode of Object.keys(languageCodeRanges)) {
    for (const [first, last] of languageCodeRanges[languageCode]) {
      if (codePoint >= first && codePoint <= last) return languageCode;
    }
  }
  return null;
}

/**
 * Estimates which languages a text is written in from a sample of its
 * non-whitespace characters.
 *
 * The sample size is `text.length * sampleProportion`, clamped to
 * `[minSampleSize, maxSampleSize]` and floored. Characters that match no known
 * range are ignored and do not count towards the total.
 *
 * @param {string} text - Text to analyse.
 * @param {number} [sampleProportion=0.1] - Fraction of the text to sample.
 * @param {number} [minSampleSize=100] - Lower bound of the sample size.
 * @param {number} [maxSampleSize=1000] - Upper bound of the sample size.
 * @returns {Object<string, number>} Share (0..1) of recognised characters per
 *   language code; empty when no character was recognised.
 */
export default function detectLanguages(text, sampleProportion = 0.1, minSampleSize = 100, maxSampleSize = 1000) {
  const sampleSize = Math.floor(
    Math.min(maxSampleSize, Math.max(minSampleSize, text.length * sampleProportion))
  );
  const sample = concentrate(text.replace(/\s/g, ""), sampleSize);

  const languageDistribution = {};
  let detectedTotal = 0;
  // String iteration yields whole code points, so characters above U+FFFF
  // (e.g. CJK Extension B) reach the range check intact.
  for (const character of sample) {
    const languageCode = findLanguageCode(character.codePointAt(0));
    if (languageCode === null) continue;
    languageDistribution[languageCode] = (languageDistribution[languageCode] || 0) + 1;
    detectedTotal++;
  }

  for (const languageCode of Object.keys(languageDistribution)) {
    languageDistribution[languageCode] /= detectedTotal;
  }
  return languageDistribution;
}

/**
 * Returns the single most likely language of a text.
 *
 * @param {string} text - Text to analyse.
 * @param {number} [sampleProportion=0.1] - Fraction of the text to sample.
 * @param {number} [minSampleSize=100] - Lower bound of the sample size.
 * @param {number} [maxSampleSize=1000] - Upper bound of the sample size.
 * @returns {string} Language code with the largest share (the first one wins
 *   on a tie), or `"en"` when nothing was recognised.
 */
export function detectLanguage(text, sampleProportion = 0.1, minSampleSize = 100, maxSampleSize = 1000) {
  const distribution = detectLanguages(text, sampleProportion, minSampleSize, maxSampleSize);
  let bestLanguage = null;
  let bestShare = 0;
  for (const languageCode of Object.keys(distribution)) {
    const share = distribution[languageCode];
    if (share > bestShare) {
      bestLanguage = languageCode;
      bestShare = share;
    }
  }
  return bestLanguage || "en";
}