UNPKG

safe-code-point

Version:

Ascertains whether a Unicode code point is 'safe' for the purposes of encoding binary data

github.com/qntm/safe-code-point

qntm/safe-code-point

85 lines (75 loc) • 3.17 kB

JavaScript

/** Ascertain whether a Unicode code point is "safe" for use in a binary encoding. */ import * as ucd from './ucd.js' import canonicalCombiningClass from './canonical-combining-class.js' import eastAsianWidth from './east-asian-width.js' import generalCategory from './general-category.js' import normalizationProperties from './normalization-properties.js' import wordBreak from './word-break.js' const quickChecks = [ 'NFD_QC', // canonical decomposition 'NFC_QC', // canonical decomposition + canonical composition 'NFKD_QC', // compatibility decomposition 'NFKC_QC' // compatibility decomposition + canonical composition ] // General Categories CONSIDERED SAFE const DEFAULT_SAFE_CATEGORIES = { Ll: true, // Letter, Lowercase Lm: true, // Letter, Modifier Lo: true, // Letter, Other Lt: true, // Letter, Titlecase Lu: true, // Letter, Uppercase Me: false, // Mark, Enclosing Mn: false, // Mark, Nonspacing Mc: false, // Mark, Spacing Combining Nd: true, // Number, Decimal Digit Nl: true, // Number, Letter No: true, // Number, Other Cc: false, // Other, Control Cf: false, // Other, Format Cn: false, // Other, Not Assigned (no characters in the file have this property) Co: false, // Other, Private Use Cs: false, // Other, Surrogate Pe: false, // Punctuation, Close Pc: false, // Punctuation, Connector Pd: false, // Punctuation, Dash Pf: false, // Punctuation, Final quote (may behave like Ps or Pe depending on usage) Pi: false, // Punctuation, Initial quote (may behave like Ps or Pe depending on usage) Ps: false, // Punctuation, Open Po: false, // Punctuation, Other Zl: false, // Separator, Line Zp: false, // Separator, Paragraph Zs: false, // Separator, Space Sc: true, // Symbol, Currency Sm: true, // Symbol, Math Sk: true, // Symbol, Modifier So: true // Symbol, Other } export default async (version, { safeCategories = DEFAULT_SAFE_CATEGORIES } = {}) => { // First load up a bunch of data const cccData = await ucd.get(version, 'extracted/DerivedCombiningClass.txt') const eawData = await ucd.get(version, 'EastAsianWidth.txt') const gcData = await ucd.get(version, 'extracted/DerivedGeneralCategory.txt') const npData = await ucd.get(version, 'DerivedNormalizationProps.txt') const wbpData = await ucd.get(version, 'auxiliary/WordBreakProperty.txt') const ccc = canonicalCombiningClass(cccData) const eaw = eastAsianWidth(eawData) const gc = generalCategory(gcData) const np = normalizationProperties(npData) const wb = wordBreak(wbpData) const safeCodePoint = codePoint => { const passesQuickChecks = quickChecks.every(property => np(codePoint, property) === 'Y' ) const inSafeGc = safeCategories[gc(codePoint)] === true const hasCcc0 = ccc(codePoint) === 0 return passesQuickChecks && inSafeGc && hasCcc0 } safeCodePoint.canonicalCombiningClass = ccc safeCodePoint.eastAsianWidth = eaw safeCodePoint.generalCategory = gc safeCodePoint.normalizationProperties = np safeCodePoint.wordBreak = wb return safeCodePoint }