typopo
Version:
Fix frequent microtypography errors in multiple languages. Write neat texts without bothering about typography rules. Typopo works for English, German, Slovak, Czech and Rusyn language.
414 lines (351 loc) • 11.9 kB
JavaScript
import { replaceWithOverlapHandling } from "../../utils/regex-overlap.js";
import { base } from "../../const.js";
//
/**
 * Replace a non-breaking space (regular or narrow) with a plain space when it
 * sits between two words that are each at least two letters long.
 *
 * Example:
 *   vo⎵dvore → vo dvore
 *
 * @param {string} string — input text for identification
 * @returns {string} — output with a regular space between multi-character words
 */
export function removeNbspBetweenMultiCharWords(string) {
  const letters = `${base.lowercaseChars}${base.uppercaseChars}`;
  const multiCharWord = `([${letters}]{2,})`;
  const anyNbsp = `([${base.nbsp}${base.narrowNbsp}])`;
  const nbspBetweenWords = new RegExp(
    multiCharWord + anyNbsp + multiCharWord,
    "g"
  );

  // NOTE(review): presumably the helper re-applies the pattern so that a word
  // shared by two adjacent matches is still handled — confirm in regex-overlap.js
  return replaceWithOverlapHandling(string, nbspBetweenWords, `$1 $3`);
}
//
/**
 * Replace a space with a non-breaking space after a single-letter preposition.
 *
 * Examples:
 *   V obchode → V⎵obchode
 *   Skáče o tyči → Skáče o⎵tyči
 *
 * Counterexamples:
 *   see the test cases
 *
 * Approach — three separate passes:
 *   a) lowercase prepositions, which can appear anywhere in a sentence
 *   b) uppercase prepositions, only at the beginning of a sentence
 *      (a mid-sentence capital letter is rather bound to the previous word;
 *      that nbsp is fixed by addNbspBeforeSingleLetter)
 *   c) the pronoun “I”, only for the en-us locale
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option; only `locale.ID` is read here
 * @returns {string} — output with correctly placed non-breaking space
 */
export function addNbspAfterPreposition(string, locale) {
  // a) lowercase prepositions: preceded by the start of the text, a space, or
  //    any character that is not a letter, digit, apostrophe, plus, minus or hyphen
  const beforeLowercase =
    `(^|[${base.space}]|[^${base.allChars}\\d${base.apostrophe}${base.plus}${base.minus}${base.hyphen}])`;
  const lowercasePreposition = new RegExp(
    `${beforeLowercase}([${base.lowercaseChars}])([${base.space}])`,
    "g"
  );
  let result = replaceWithOverlapHandling(
    string,
    lowercasePreposition,
    `$1$2${base.nbsp}`
  );

  // b) uppercase prepositions at the beginning of a sentence: right after the
  //    start of the text, sentence punctuation, an ellipsis or a ©/®/℗ symbol
  const sentenceStart =
    `(^|[${base.sentencePunctuation}${base.ellipsis}${base.copyright}${base.registeredTrademark}${base.soundRecordingCopyright}])`;
  const uppercasePreposition = new RegExp(
    `${sentenceStart}([${base.spaces}]?)([${base.uppercaseChars}])([${base.spaces}])`,
    "g"
  );
  result = result.replace(uppercasePreposition, `$1$2$3${base.nbsp}`);

  // c) "I" in English — other locales are done at this point
  if (locale.ID !== "en-us") {
    return result;
  }
  const pronounI = new RegExp(
    `(^|[${base.spaces}])(I)([${base.spaces}])`,
    "g"
  );
  return result.replace(pronounI, `$1$2${base.nbsp}`);
}
//
/**
 * Normalize the spacing around an ampersand that has a space on both sides:
 * a regular space goes before it and a non-breaking space after it.
 *
 * Example:
 *   Bed & Breakfast → Bed &⎵Breakfast
 *
 * @param {string} string — input text for identification
 * @returns {string} — output with nbsp after a spaced ampersand
 */
export function addNbspAfterAmpersand(string) {
  const anySpace = `([${base.spaces}])`;
  const spacedAmpersand = new RegExp(
    `${anySpace}(${base.ampersand})${anySpace}`,
    "g"
  );

  // whatever space preceded the ampersand is replaced by a regular space
  return string.replace(spacedAmpersand, ` $2${base.nbsp}`);
}
//
/**
 * Add a non-breaking space after a cardinal number (up to 99) that precedes a word.
 *
 * Assumptions and limitations:
 *   Only 1- or 2-digit cardinal numbers are identified; a number that directly
 *   follows another digit or a non-breaking space is left alone.
 *
 * Example:
 *   5 mm → 5⎵mm
 *
 * @param {string} string — input text for identification
 * @returns {string} — output with nbsp after cardinal numbers
 */
export function addNbspAfterCardinalNumber(string) {
  const notAfterDigitOrNbsp = `([^${base.nbsp}\\d]|^)`;
  const shortNumber = `(\\d{1,2})`;
  const anySpace = `([${base.spaces}])`;
  const wordStart = `([${base.allChars}])`;

  const numberBeforeWord = new RegExp(
    notAfterDigitOrNbsp + shortNumber + anySpace + wordStart,
    "g"
  );

  // keep everything except the space itself ($3), which becomes a nbsp
  return string.replace(numberBeforeWord, `$1$2${base.nbsp}$4`);
}
//
/**
 * Add a non-breaking space after an ordinal number (up to 99) that precedes a word.
 *
 * Assumptions and limitations:
 *   Only 1- or 2-digit ordinal numbers are identified; a number that directly
 *   follows a digit, a non-breaking space, `_`, `%` or `-` is left alone.
 *   The space after the ordinal indicator is optional in the input, so a
 *   missing space is added as a nbsp as well.
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option; `locale.ordinalIndicator` is
 *   interpolated into the regular expression as-is
 * @returns {string} — output with nbsp after ordinal numbers
 */
export function addNbspAfterOrdinalNumber(string, locale) {
  const allowedLead = `([^${base.nbsp}\\d_%\\-]|^)`;
  const shortNumber = `(\\d{1,2})`;
  const indicator = `(${locale.ordinalIndicator})`;
  const optionalSpace = `([${base.spaces}]?)`;
  const wordStart = `([${base.allChars}])`;

  const ordinalBeforeWord = new RegExp(
    allowedLead + shortNumber + indicator + optionalSpace + wordStart,
    "g"
  );

  // the optional space ($4) is dropped and a nbsp is put in its place
  return string.replace(ordinalBeforeWord, `$1$2$3${base.nbsp}$5`);
}
//
/**
 * Locale-specific spaces within an ordinal date (day. month. year), usually nbsp.
 *
 * German standard orthography (Duden) recommends only one nbsp (or narrowNbsp)
 * after the day and a regular interword space following the month; that is why
 * the two spaces are configured separately per locale.
 *
 * Examples (with both spaces configured as nbsp):
 *   12. 1. 2017 → 12.⎵1.⎵2017
 *   12.10.2017 → 12.⎵10.⎵2017
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option; `locale.ordinalDate.firstSpace` is
 *   placed after the day and `locale.ordinalDate.secondSpace` after the month
 * @returns {string} — output with locale-specific spaces within ordinal dates
 */
export function addNbspWithinOrdinalDate(string, locale) {
  const optionalSpace = `([${base.spaces}]?)`;

  // prettier-ignore
  const ordinalDate = new RegExp(
    `(\\d)` +        // last digit of the day
    `(\\.)` +
    optionalSpace +
    `(\\d{1,2})` +   // month; two digits are needed for October–December
    `(\\.)` +
    optionalSpace +
    `(\\d)`,         // first digit of the year
    "g"
  );

  return string.replace(
    ordinalDate,
    `$1$2${locale.ordinalDate.firstSpace}$4$5${locale.ordinalDate.secondSpace}$7`
  );
}
//
/**
 * Fix non-breaking space after an ordinal Roman numeral.
 *
 * Examples:
 *   I. kapitola
 *   X. ročník
 *   8. V. 1945
 *
 * A Roman numeral can be identified effectively only when it carries an
 * ordinal indicator, so the text is returned unchanged for locales whose
 * `romanOrdinalIndicator` is an empty string.
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option; `locale.romanOrdinalIndicator` is
 *   interpolated into the regular expression as-is
 * @returns {string} — output with correctly placed non-breaking space
 */
export function addNbspAfterRomanNumeral(string, locale) {
  if (locale.romanOrdinalIndicator === "") {
    return string;
  }

  // Optional prefix: a capitalized one- or two-letter abbreviation followed by
  // the indicator (e.g. “Ch.⎵”). When it is present, the numeral-looking
  // letter after it is most likely an initial, not a Roman numeral.
  const precedingAbbreviation =
    `(\\b[${base.uppercaseChars}][${base.allChars}]?${locale.romanOrdinalIndicator}[${base.spaces}]?)?`;
  const romanOrdinal =
    `(\\b)` +
    `([${base.romanNumerals}]+)` +
    `(${locale.romanOrdinalIndicator})` +
    `([${base.spaces}]?)` +
    `([${base.allChars}\\d])`;
  const re = new RegExp(precedingAbbreviation + romanOrdinal, "g");

  return string.replace(
    re,
    (match, abbreviation, boundary, numeral, indicator, gap, next) => {
      // leave the match alone when the prefix matched,
      // to avoid false positives like G. D. Lambert
      if (abbreviation) {
        return match;
      }
      return `${boundary}${numeral}${indicator}${base.nbsp}${next}`;
    }
  );
}
//
/**
 * Fix non-breaking space around a name with a regnal number.
 *
 * Examples:
 *   Karel IV. → Karel⎵IV.
 *   Karel IV.⎵byl → Karel⎵IV. byl
 *   Charles IV → Charles⎵IV
 *
 * Unsupported:
 *   Charles I → Charles I
 *   (first emperor, English language; otherwise “When I am” would be
 *   incorrectly fixed)
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option; `locale.romanOrdinalIndicator` is
 *   interpolated into the regular expression as-is
 * @returns {string} — output with correctly placed non-breaking space
 */
export function fixNbspForNameWithRegnalNumber(string, locale) {
  const nameWithNumeral = new RegExp(
    `(\\b[${base.uppercaseChars}][${base.lowercaseChars}]+?)` +
      `([${base.spaces}])` +
      `([${base.romanNumerals}]+\\b)` +
      `(${locale.romanOrdinalIndicator})` +
      `([${base.nbsp}]?)`,
    "g"
  );

  return string.replace(
    nameWithNumeral,
    (match, name, gap, numeral, indicator, following) => {
      // A lone “I” is never glued to the preceding word: it gets a regular
      // space in front and whatever followed it is kept as it was.
      const isLoneI =
        numeral === "I" && (following === "" || following === base.nbsp);

      const joiner = isLoneI ? base.space : base.nbsp;

      // When something followed the indicator, a real regnal number gets a
      // regular space there; nothing is appended when nothing followed.
      let trailing = "";
      if (following !== "") {
        trailing = isLoneI ? following : base.space;
      }

      return name + joiner + numeral + indicator + trailing;
    }
  );
}
//
/**
 * Fix the space before % (percent), ‰ (permille) and ‱ (permyriad).
 *
 * Locale differences:
 *   - en-us prefers no space
 *     (https://www.chicagomanualofstyle.org/qanda/data/faq/topics/Numbers/faq0005.html)
 *   - de-de prefers narrow nbsp for percent used as a noun
 *     (https://german.stackexchange.com/questions/41550/what-does-din-5008-exactly-say-about-percent-character)
 *   - sk, cs, rue prefer nbsp for percent used as a noun
 *
 * In sk, cs, rue and de-de there is no space between a number and a percent
 * sign when percent is used as an adjective. This algorithm does not cover
 * that use case; it only fixes a space when there already is one.
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option; `locale.spaceBefore.percent` is the
 *   replacement for the existing space
 * @returns {string} — output with the locale-specific space before the sign
 */
export function fixSpaceBeforePercent(string, locale) {
  const digit = `(\\d)`;
  const anySpace = `([${base.spaces}])`;
  const sign = `([${base.percent}${base.permille}${base.permyriad}])`;
  const spacedSign = new RegExp(digit + anySpace + sign, "g");

  return string.replace(spacedSign, `$1${locale.spaceBefore.percent}$3`);
}
//
/**
 * Add/swap a non-breaking space before a single capital letter in a sentence.
 *
 * Examples:
 *   The product X is missing the feature Y.
 *   Sputnik V
 *   © V Inc.
 *   Človek Č
 *
 * Counterexamples:
 *   When I talk to emerging product designers (capital I in English language)
 *   Dear Christopher
 *   pán Šťastný
 *   pán ŠŤASTNÝ
 *   One sentence ends. A bad apple. (single letter before sentence punctuation)
 *   sentence; C-level executive (single letter before sentence punctuation)
 *   I’d say… A-player.
 *   sentence (brackets) A-player
 *   “quoted part” A capital letter
 *   A × A (this should be fixed in multiplication sign, but maybe irrelevant)
 *   famous company — A Inc. (this should be fixed in dash.js)
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option; reads `locale.ID`,
 *   `locale.rightDoubleQuote` and `locale.rightSingleQuote`
 * @returns {string} — output with correctly added non-breaking space
 */
export function addNbspBeforeSingleLetter(string, locale) {
  const allUppercase = base.uppercaseChars;
  const isEnglish = locale.ID === "en-us";

  // in en-us, remove “I” from the list to avoid placing nbsp before “something I do”
  const singleLetters = isEnglish
    ? allUppercase.replace(/A-Z/g, "A-HJ-Z")
    : allUppercase;

  // the letter must not follow sentence punctuation, a closing bracket or
  // quote, an apostrophe, a multiplication sign or a dash
  const notAfter =
    `([^${base.sentencePunctuation}${base.ellipsis}${base.closingBrackets}${locale.rightDoubleQuote}${locale.rightSingleQuote}${base.apostrophe}${base.multiplicationSign}${base.emDash}${base.enDash}])`;
  // …and must be followed by a space, or close the text (optionally with a period)
  const followedBy = `(([${base.spaces}])|(\\.$|$))`;

  const re = new RegExp(
    `${notAfter}([${base.spaces}])([${singleLetters}])${followedBy}`,
    "g"
  );

  return string.replace(re, (match, before, gap, letter, after, spaceAfter) => {
    // Outside en-us, a non-breaking kind of space after “I” is swapped for a
    // regular space; in every other case the text after the letter is kept.
    const swapSpaceAfterI =
      !isEnglish &&
      letter === "I" &&
      (spaceAfter === base.nbsp ||
        spaceAfter === base.hairSpace ||
        spaceAfter === base.narrowNbsp);

    const tail = swapSpaceAfterI ? base.space : after;
    return before + base.nbsp + letter + tail;
  });
}
//
/**
 * Helper function that adds a nbsp (or a locale-specific space) after a
 * symbol that is directly followed by a character that is neither a space
 * nor the symbol itself. Used by the symbols’ respective *.js files.
 *
 * @param {string} string — input text for identification
 * @param {string} symbol — symbol to look for; interpolated into the regular
 *   expression as-is, so it is treated as a pattern, not escaped
 * @param {string} [space] — space to insert; falls back to `base.nbsp` when
 *   the argument is `undefined`
 * @returns {string} — output with correctly added non-breaking space
 */
export function addNbspAfterSymbol(string, symbol, space) {
  const insertedSpace = space === undefined ? base.nbsp : space;
  const symbolWithoutSpace = new RegExp(
    `(${symbol})([^${base.spaces}${symbol}])`,
    "g"
  );

  return string.replace(symbolWithoutSpace, `$1${insertedSpace}$2`);
}
//
/**
 * Helper function that replaces any run of spaces after a symbol with a
 * single nbsp (or a locale-specific space). Used by the symbols’ respective
 * *.js files.
 *
 * @param {string} string — input text for identification
 * @param {string} symbol — symbol to look for; interpolated into the regular
 *   expression as-is, so it is treated as a pattern, not escaped
 * @param {string} [space] — replacement space; falls back to `base.nbsp` when
 *   the argument is `undefined`
 * @returns {string} — output with correctly placed non-breaking space
 */
export function replaceSpacesWithNbspAfterSymbol(string, symbol, space) {
  const replacementSpace = space === undefined ? base.nbsp : space;
  const symbolWithSpaces = new RegExp(
    `(${symbol})([${base.spaces}]+)`,
    "g"
  );

  return string.replace(symbolWithSpaces, `$1${replacementSpace}`);
}
//
/**
 * Consolidates the use of non-breaking spaces by running every nbsp fix of
 * this module over the text, one after another.
 *
 * @param {string} string — input text for identification
 * @param {object} locale — locale option, handed over to the locale-aware fixes
 * @returns {string} — output with correctly placed non-breaking space
 */
export function fixNbsp(string, locale) {
  // Each step receives the output of the previous one; the sequence below is
  // the order in which the fixes are applied.
  const steps = [
    (text) => removeNbspBetweenMultiCharWords(text),
    (text) => addNbspAfterPreposition(text, locale),
    (text) => addNbspAfterAmpersand(text),
    (text) => addNbspAfterCardinalNumber(text),
    (text) => addNbspAfterOrdinalNumber(text, locale),
    (text) => addNbspWithinOrdinalDate(text, locale),
    (text) => addNbspAfterRomanNumeral(text, locale),
    (text) => addNbspBeforeSingleLetter(text, locale),
    (text) => fixNbspForNameWithRegnalNumber(text, locale),
    (text) => fixSpaceBeforePercent(text, locale),
  ];

  return steps.reduce((text, applyFix) => applyFix(text), string);
}