/*
 * fullwidth-quotes
 * Version:
 * Convert CJK quotation marks to fullwidth according to Unicode Standardized Variation Sequences (SVS).
 * 201 lines (200 loc) • 7.97 kB
 * JavaScript
 */
import * as chars from "./chars.js";
import { eastAsianWidthType } from "get-east-asian-width";
// Shared segmenter, constructed with default options; used to split text before quote processing.
const segmenter = new Intl.Segmenter();

// The four curly quotation marks in their three spellings.
// Every list uses the same order: left double, right double, left single, right single.
const charSets = {
    // Bare quotation marks without an explicit width form.
    neutral: [chars.LEFT_DOUBLE_QUOTE, chars.RIGHT_DOUBLE_QUOTE, chars.LEFT_SINGLE_QUOTE, chars.RIGHT_SINGLE_QUOTE],
    // Quotation marks explicitly marked as fullwidth.
    fullwidth: [chars.LEFT_DOUBLE_QUOTE_FULLWIDTH, chars.RIGHT_DOUBLE_QUOTE_FULLWIDTH, chars.LEFT_SINGLE_QUOTE_FULLWIDTH, chars.RIGHT_SINGLE_QUOTE_FULLWIDTH],
    // Quotation marks explicitly marked as halfwidth.
    halfwidth: [chars.LEFT_DOUBLE_QUOTE_HALFWIDTH, chars.RIGHT_DOUBLE_QUOTE_HALFWIDTH, chars.LEFT_SINGLE_QUOTE_HALFWIDTH, chars.RIGHT_SINGLE_QUOTE_HALFWIDTH],
};
/**
 * Split a string into the segments reported by the shared `Intl.Segmenter`, so that a
 * base character and anything attached to it (such as a variation selector) stay together
 * in a single element instead of being divided into separate code units.
 * @param str - String.
 * @returns An array with one element per segment (a grapheme cluster under the segmenter's default granularity).
 */
export function toUnicodeStringSequence(str) {
    const clusters = [];
    for (const { segment } of segmenter.segment(str)) {
        clusters.push(segment);
    }
    return clusters;
}
/**
 * Classify a character as fullwidth, halfwidth or ambiguous.
 * Hangul is deliberately reported as halfwidth because Korean uses halfwidth punctuation marks,
 * and an empty string is reported as ambiguous.
 * @param char - Character (only its first code point is used for the East Asian Width lookup).
 * @returns `true` for fullwidth, `false` for halfwidth, `null` when it cannot be decided.
 *
 * East Asian Width | Returns
 * --- | ---
 * fullwidth | true
 * halfwidth | false
 * wide | true
 * narrow | false
 * neutral | false
 * ambiguous | null
 *
 * Special | Returns
 * --- | ---
 * Explicit fullwidth quote (listed in `charSets.fullwidth`) | true
 * Explicit halfwidth quote (listed in `charSets.halfwidth`) | false
 * Hangul | false
 * Empty String | null
 */
export function isFullwidth(char) {
    // Quotation marks that already carry an explicit width decide immediately.
    if (charSets.fullwidth.includes(char)) {
        return true;
    }
    if (charSets.halfwidth.includes(char)) {
        return false;
    }
    // Treat hangul as halfwidth, because korean uses halfwidth punctuation marks.
    if (/\p{Script=Hangul}/u.test(char)) {
        return false;
    }
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) {
        // Empty string, treat as ambiguous.
        return null;
    }
    switch (eastAsianWidthType(codePoint)) {
        case "ambiguous":
            return null;
        case "fullwidth":
        case "wide":
            return true;
        default:
            // Everything else (including neutral) counts as halfwidth.
            return false;
    }
}
/**
 * Decide whether a piece of text calls for fullwidth quotation marks by inspecting its edges.
 *
 * The leading edge is scanned forwards and the trailing edge backwards; ambiguous elements
 * (those for which `isFullwidth` returns `null`) are skipped. The backward scan never revisits
 * the element that settled the forward scan.
 *
 * @param str - Either a string (segmented with `toUnicodeStringSequence`) or an already segmented array.
 * @param side - `"start"` inspects only the leading edge, `"end"` only the trailing edge, `"both"` (default) inspects both.
 * @returns `true` if an inspected edge resolves to fullwidth, otherwise `false`.
 */
function shouldFullwidthInternal(str, side = "both") {
    const clusters = typeof str === "string" ? toUnicodeStringSequence(str) : str;
    let verdict;
    // Index where the forward scan stopped; stays -1 when the forward scan is skipped.
    let head = -1;
    if (side !== "end") {
        head = 0;
        while (head < clusters.length) {
            const width = isFullwidth(clusters[head]);
            if (width !== null) {
                verdict = width;
                break;
            }
            head++;
        }
    }
    if (verdict) {
        return true;
    }
    if (side !== "start") {
        let tail = clusters.length - 1;
        while (tail > head) {
            const width = isFullwidth(clusters[tail]);
            if (width !== null) {
                verdict = width;
                break;
            }
            tail--;
        }
    }
    return verdict === true;
}
/**
 * Determine whether a string should be enclosed in fullwidth brackets or quotation marks.
 *
 * @remarks Rules:
 * 1. Look at the first and the last character of the string; return true if either is a fullwidth character.
 * 2. If a character is ambiguous, move on to the second or penultimate character, and so on.
 * 3. If the string consists only of ambiguous characters (or is empty), return false.
 *
 * @param str - The string to examine.
 * @returns Whether the string should be enclosed in fullwidth brackets or quotation marks.
 */
export function shouldFullwidth(str) {
    // Both edges of the string are inspected.
    return shouldFullwidthInternal(str, "both");
}
// Character class (as regex source text) covering the variation selector ranges:
// U+180B–U+180D, U+FE00–U+FE0F and U+E0100–U+E01EF.
const VARIATION_SELECTORS = "[\\u180b-\\u180d\\ufe00-\\ufe0f\\u{e0100}-\\u{e01ef}]";
// Matches only *bare* curly quotation marks. The negative lookahead skips marks that are
// already followed by a variation selector; without it, a replacement such as `"$1" + vs`
// would insert a second selector in front of the existing one and corrupt the sequence.
const QUOTES_NEUTRAL_REGEXP = new RegExp(`([“”‘’])(?!${VARIATION_SELECTORS})`, "gu");
// Matches a curly quotation mark together with any run of variation selectors after it,
// capturing just the quotation mark so the selectors can be replaced or stripped.
const QUOTES_SVS_REGEXP = new RegExp(`([“”‘’])${VARIATION_SELECTORS}*`, "gu");
/**
 * Append the fullwidth variation selector to the quotation marks in a string,
 * without looking at the surrounding characters.
 * @note Not recommended; prefer `enableSvsQuotes`.
 * @param str - String.
 * @param includesExplicit - When true, quotation marks are matched with `QUOTES_SVS_REGEXP`, so any variation
 * selectors already following a mark are replaced as well. When false (default), `QUOTES_NEUTRAL_REGEXP` is used.
 * @returns The string with the fullwidth variation selector applied to the matched quotation marks.
 */
export function alwaysToFullwidthQuotes(str, includesExplicit = false) {
    const pattern = includesExplicit ? QUOTES_SVS_REGEXP : QUOTES_NEUTRAL_REGEXP;
    return str.replaceAll(pattern, `$1${chars.FULLWIDTH_VS}`);
}
/**
 * Append the halfwidth variation selector to the quotation marks in a string,
 * without looking at the surrounding characters.
 * @note Not recommended; prefer `enableSvsQuotes`.
 * @param str - String.
 * @param includesExplicit - When true, quotation marks are matched with `QUOTES_SVS_REGEXP`, so any variation
 * selectors already following a mark are replaced as well. When false (default), `QUOTES_NEUTRAL_REGEXP` is used.
 * @returns The string with the halfwidth variation selector applied to the matched quotation marks.
 */
export function alwaysToHalfwidthQuotes(str, includesExplicit = false) {
    const pattern = includesExplicit ? QUOTES_SVS_REGEXP : QUOTES_NEUTRAL_REGEXP;
    return str.replaceAll(pattern, `$1${chars.HALFWIDTH_VS}`);
}
/**
 * Strip the variation selectors that follow quotation marks, turning the marks back into
 * plain characters whose width is left ambiguous.
 * @param str - String.
 * @returns The string whose quotation marks carry no variation selectors.
 */
export function disableSvsQuotes(str) {
    // Keep only the captured quotation mark; the selectors matched after it are dropped.
    return str.replaceAll(QUOTES_SVS_REGEXP, (_match, quote) => quote);
}
/**
 * Pick the variation selector matching a width decision.
 * @param fullwidth - Truthy for the fullwidth selector, falsy for the halfwidth selector.
 * @returns `chars.FULLWIDTH_VS` or `chars.HALFWIDTH_VS`.
 */
const getVs = (fullwidth) => {
    if (fullwidth) {
        return chars.FULLWIDTH_VS;
    }
    return chars.HALFWIDTH_VS;
};
/**
 * Attach a fullwidth or halfwidth variation selector to every curly quotation mark,
 * choosing the width from the text each pair encloses.
 *
 * @remarks
 * - When `locale` is truthy the decision is delegated entirely to `enableSvsQuotesWithLocale`.
 * - Otherwise opening quotes are kept on a stack and each closing quote is paired with the most
 *   recently opened one (single and double quotes are not distinguished when pairing).
 * - A pair whose two marks already carry the same non-empty suffix is left untouched; any other
 *   pair gets a selector chosen from the enclosed text.
 * - An unpaired closing quote is decided from the text before it, an unpaired opening quote from
 *   the text after it; unpaired quotes that already carry a suffix are left untouched.
 * - Only the first UTF-16 unit of a quote's segment is kept when a selector is (re)assigned.
 *
 * @param str - String.
 * @param locale - Optional locale (tag string or `Intl.Locale`) that overrides the contextual detection.
 * @returns The string with variation selectors applied to its quotation marks.
 */
export function enableSvsQuotes(str, locale) {
    if (locale) {
        return enableSvsQuotesWithLocale(str, locale);
    }
    const graphemes = toUnicodeStringSequence(str);
    // Opening quotes still waiting for their closing counterpart.
    const openQuotes = [];
    for (const [position, grapheme] of graphemes.entries()) {
        const base = grapheme[0];
        if (!charSets.neutral.includes(base)) {
            continue;
        }
        const opensQuote = base === chars.LEFT_DOUBLE_QUOTE || base === chars.LEFT_SINGLE_QUOTE;
        if (opensQuote) {
            openQuotes.push({ quote: grapheme, index: position });
            continue;
        }
        const closesQuote = base === chars.RIGHT_DOUBLE_QUOTE || base === chars.RIGHT_SINGLE_QUOTE;
        if (!closesQuote) {
            continue;
        }
        // Whatever follows the quote character inside its segment (an existing selector, if any).
        const closingVs = grapheme.slice(1);
        const opening = openQuotes.pop();
        if (!opening) {
            // Found the right quote but can't find the corresponding left quote.
            if (!closingVs) {
                const fullwidth = shouldFullwidthInternal(graphemes.slice(0, position), "end");
                graphemes[position] = base + getVs(fullwidth);
            }
            continue;
        }
        const openingVs = opening.quote.slice(1);
        if (openingVs && closingVs && openingVs === closingVs) {
            // Both marks already agree on an explicit suffix.
            continue;
        }
        const fullwidth = shouldFullwidthInternal(graphemes.slice(opening.index + 1, position));
        const selector = getVs(fullwidth);
        graphemes[opening.index] = opening.quote[0] + selector;
        graphemes[position] = base + selector;
    }
    // Found the left quote but can't find the corresponding right quote:
    // resolve the leftovers from the innermost (last opened) outwards.
    while (openQuotes.length > 0) {
        const { quote, index } = openQuotes.pop();
        if (quote.slice(1)) {
            continue;
        }
        const fullwidth = shouldFullwidthInternal(graphemes.slice(index + 1), "start");
        graphemes[index] = quote[0] + getVs(fullwidth);
    }
    return graphemes.join("");
}
/**
 * Choose the quote width from a locale instead of from the surrounding text.
 *
 * The locale is maximized to obtain its script; the scripts `Hani`, `Hans`, `Hant` and `Jpan`
 * select fullwidth quotation marks, every other script selects halfwidth ones.
 *
 * @param str - String.
 * @param locale - A locale tag string or an `Intl.Locale` instance.
 * @returns The result of `alwaysToFullwidthQuotes` or `alwaysToHalfwidthQuotes` for `str`.
 */
function enableSvsQuotesWithLocale(str, locale) {
    const resolved = typeof locale === "string" ? new Intl.Locale(locale) : locale;
    const { script } = resolved.maximize();
    const usesFullwidth = ["Hani", "Hans", "Hant", "Jpan"].includes(script);
    return usesFullwidth ? alwaysToFullwidthQuotes(str) : alwaysToHalfwidthQuotes(str);
}