// @danielhaim/titlecaser
// Version:
// A powerful utility for transforming text to title case with support for multiple style guides and extensive customization options.
// 543 lines (451 loc) • 21.3 kB
// JavaScript
import {
shortWordsList,
specialTermsList,
phraseReplacementMap,
wordReplacementsList,
styleConfigMap,
REGEX_PATTERNS,
} from "./TitleCaserConsts.js";
import { TitleCaserUtils } from "./TitleCaserUtils.js";
/** Upper bound on the number of word-replacement rules an instance may hold. */
const MAX_REPLACEMENT_RULES = 2000;

/** Longest input, in UTF-16 code units, that toTitleCase() accepts. */
const MAX_INPUT_LENGTH = 100000;

/** Matches tokens made up solely of whitespace (no flags, so it carries no state between calls). */
const WHITESPACE_ONLY = /^\s+$/;

/**
 * Returns the nearest token before (step = -1) or after (step = 1) `index`
 * that is not whitespace-only, or null when there is none.
 * Empty-string tokens count as "not whitespace", matching the tokenizer's edge tokens.
 */
function findNeighborWord(tokens, index, step) {
    for (let j = index + step; j >= 0 && j < tokens.length; j += step) {
        if (!WHITESPACE_ONLY.test(tokens[j])) {
            return tokens[j];
        }
    }
    return null;
}

/**
 * Returns the index of the rule whose first key is `term`, or -1.
 * Only the first key is compared because toTitleCase() reads only the first key of each rule.
 */
function findReplacementIndex(list, term) {
    return list.findIndex((rule) => Object.keys(rule)[0] === term);
}

/**
 * Converts text to title case (or sentence case, depending on the style
 * configuration) using per-word rules, word replacements and exact phrase
 * replacements.
 *
 * Recognised options (all optional): `style` (default "ap"), `neverCapitalize`
 * (array of words returned untouched), `wordReplacementsList`, `smartQuotes`
 * (default false), `normalizeWhitespace` (default true) and `debug`.
 */
export class TitleCaser {
    /**
     * @param {object} [options={}] - Configuration, see the class description.
     * @throws {TypeError} If `options` is null.
     */
    constructor(options = {}) {
        if (options === null) {
            throw new TypeError("Invalid options: options must be an object.");
        }
        // Work on a shallow copy: setStyle() and the replace-term mutators write to
        // this.options, and must not leak into the caller's object (which may be
        // shared between several instances).
        const isPlainOptions = typeof options === "object" && !Array.isArray(options);
        this.options = isPlainOptions ? { ...options } : options;
        this.debug = options.debug || false;
        // Deep copies so per-instance edits never touch the imported defaults.
        this.wordReplacementsList = JSON.parse(JSON.stringify(wordReplacementsList));
        this.phraseReplacementMap = JSON.parse(JSON.stringify(phraseReplacementMap));
    }

    /**
     * Emits a console warning, but only when the `debug` option is enabled.
     * @param {string} message - Text to log after the "Warning: " prefix.
     */
    logWarning(message) {
        if (this.debug) {
            console.warn(`Warning: ${message}`);
        }
    }

    /**
     * Converts `str` according to the configured style.
     *
     * Processing order: `<br>` tags become an `nl2br` placeholder, all-caps input is
     * lower-cased, each whitespace-delimited token is cased by the first matching
     * rule, the placeholder is restored, then acronym, short-word and first/final
     * word passes run, followed by exact phrase replacements, optional sentence
     * casing and optional whitespace normalisation.
     *
     * Note: any literal "nl2br" text in the input is also turned into `<br>`.
     *
     * @param {string} str - Non-empty text of at most 100,000 characters.
     * @returns {string} The converted text.
     * @throws {TypeError} If `str` is not a string, is empty or too long, or if the
     *   instance options are not an object.
     */
    toTitleCase(str) {
        try {
            if (typeof str !== "string") throw new TypeError("Invalid input: input must be a string.");
            if (str.length === 0) throw new TypeError("Invalid input: input must not be empty.");
            // Length cap guards against pathological processing time.
            if (str.length > MAX_INPUT_LENGTH)
                throw new TypeError("Invalid input: input exceeds maximum length of 100,000 characters.");
            if (this.options === null || (typeof this.options !== "undefined" && typeof this.options !== "object"))
                throw new TypeError("Invalid options: options must be an object.");

            const options = typeof this.options === "undefined" ? {} : this.options;
            const {
                style = "ap",
                neverCapitalize = [],
                wordReplacementsList = this.wordReplacementsList,
                smartQuotes = false,
                normalizeWhitespace = true,
            } = options;
            const styleConfig = styleConfigMap[style] || {};
            const ignoreList = ["nl2br", ...neverCapitalize];

            // The merged lists this helper returns are not used below; the call is kept
            // in case it validates the options — TODO confirm against TitleCaserUtils.
            TitleCaserUtils.getTitleCaseOptions(options, shortWordsList, wordReplacementsList);

            // Lower-cased term -> replacement, for O(1) case-insensitive lookup.
            const replacements = new Map(
                wordReplacementsList.map((term) => [Object.keys(term)[0].toLowerCase(), Object.values(term)[0]]),
            );

            if (this.debug) {
                this.logWarning(`replaceTermsArray: ${[...replacements.keys()]}`);
                this.logWarning(`this.wordReplacementsList: ${JSON.stringify(this.wordReplacementsList)}`);
            }

            // Set outer whitespace aside. Otherwise split(/(\s+)/) yields empty edge
            // tokens, and every "first token" / "last token" rule below would look at
            // those instead of the real first and last words.
            const core = str.trim();
            const leadingWhitespace = str.slice(0, str.length - str.trimStart().length);
            const trailingWhitespace = core.length === 0 ? "" : str.slice(str.trimEnd().length);

            // Decide "all caps" BEFORE inserting the lowercase placeholder, which would
            // otherwise make any input containing <br> look mixed-case.
            const lettersOnly = core.replace(REGEX_PATTERNS.HTML_BREAK, "").replace(/[^a-zA-Z]/g, "");
            const isEntireStringUppercase = TitleCaserUtils.isEntirelyUppercase(lettersOnly);

            // Replace <br> and <br /> tags with a placeholder token.
            let inputString = core.replace(REGEX_PATTERNS.HTML_BREAK, " nl2br ");
            if (isEntireStringUppercase) {
                this.logWarning("Input string is entirely uppercase, normalizing to lowercase first");
                inputString = inputString.toLowerCase();
            }

            // Tokenize, keeping the whitespace runs as their own tokens.
            const tokens = inputString.split(/(\s+)/);
            const wordsInTitleCase = tokens.map((token, i) => {
                if (!token || WHITESPACE_ONLY.test(token)) return token;
                const word = token;
                const lowerWord = word.toLowerCase();

                // switch (true) is deliberate: a case fires only when its test is strictly
                // `true`, and the first matching case wins.
                switch (true) {
                    case TitleCaserUtils.isWordAmpersand(word):
                        return word;
                    case TitleCaserUtils.hasHtmlBreak(word):
                        return word;
                    case TitleCaserUtils.isWordIgnored(word, ignoreList):
                        return word;
                    case replacements.has(lowerWord):
                        return replacements.get(lowerWord);
                    case TitleCaserUtils.isWordInArray(word, specialTermsList):
                        return TitleCaserUtils.correctTerm(word, specialTermsList);
                    case TitleCaserUtils.isElidedWord(word):
                        return TitleCaserUtils.normalizeElidedWord(word);
                    case TitleCaserUtils.hasHyphen(word): {
                        // Separate the base word from any trailing punctuation.
                        const baseWord = word.replace(/[\W_]+$/, "");
                        const trailingPunctuation = word.slice(baseWord.length);
                        const parts = baseWord.split("-");
                        const replacedParts = parts.map((part) => {
                            const lowerPart = part.toLowerCase();
                            return replacements.has(lowerPart) ? replacements.get(lowerPart) : part;
                        });
                        const isReplaced = !replacedParts.every((part, index) => part === parts[index]);
                        const processedWord = isReplaced
                            ? replacedParts.join("-")
                            : TitleCaserUtils.correctTermHyphenated(word, style);
                        // Re-attach the punctuation unless the helper already kept it.
                        return processedWord.endsWith(trailingPunctuation)
                            ? processedWord
                            : processedWord + trailingPunctuation;
                    }
                    case TitleCaserUtils.hasSuffix(word, style):
                        return TitleCaserUtils.correctSuffix(word, specialTermsList);
                    case TitleCaserUtils.hasUppercaseIntentional(word):
                        return word;
                    case TitleCaserUtils.isShortWord(word, style) && i !== 0: {
                        // A short word that opens a new clause is still capitalized.
                        const prevToken = findNeighborWord(tokens, i, -1);
                        const isAtEndOfSentence =
                            prevToken && TitleCaserUtils.endsWithSymbol(prevToken, [":", "?", "!", "."]);
                        if (isAtEndOfSentence) {
                            return word.charAt(0).toUpperCase() + word.slice(1);
                        }
                        return TitleCaserUtils.normalizeCasingForWordByStyle(word, style);
                    }
                    case TitleCaserUtils.endsWithSymbol(word): {
                        this.logWarning(`Check if the word ends with a symbol: ${word}`);
                        const splitWord = word.split(REGEX_PATTERNS.SPLIT_AT_PUNCTUATION);
                        this.logWarning(`Splitting word at symbols, result: ${splitWord}`);
                        const processedWords = splitWord.map((part) => {
                            this.logWarning(`Processing part: ${part}`);
                            if (TitleCaserUtils.endsWithSymbol(part)) {
                                this.logWarning(`Part is a symbol: ${part}`);
                                return part;
                            }
                            this.logWarning(`Part is a word: ${part}`);
                            if (TitleCaserUtils.isWordInArray(part, specialTermsList)) {
                                const correctedTerm = TitleCaserUtils.correctTerm(part, specialTermsList);
                                this.logWarning(`Word is in specialTermsList, corrected term: ${correctedTerm}`);
                                return correctedTerm;
                            }
                            // Keys are lower-cased, so the lookup must be too.
                            const lowerPart = part.toLowerCase();
                            if (replacements.has(lowerPart)) {
                                const replacement = replacements.get(lowerPart);
                                this.logWarning(`Word is in replaceTermsArray, replacement: ${replacement}`);
                                return replacement;
                            }
                            const titledWord = part.charAt(0).toUpperCase() + part.slice(1).toLowerCase();
                            this.logWarning(`Applying title casing to word: ${titledWord}`);
                            return titledWord;
                        });
                        return processedWords.join("");
                    }
                    case TitleCaserUtils.startsWithSymbol(word):
                        // Same call shape as the specialTermsList case above.
                        return !TitleCaserUtils.isWordInArray(word, specialTermsList)
                            ? word
                            : TitleCaserUtils.correctTerm(word, specialTermsList);
                    case TitleCaserUtils.hasRomanNumeral(word):
                        return word.toUpperCase();
                    case TitleCaserUtils.hasNumbers(word):
                        return word;
                    default:
                        return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
                }
            });
            inputString = wordsInTitleCase.join("");

            // Restore the <br> placeholder.
            inputString = inputString.replace(/nl2br/gi, "<br>");

            // Convert quotation marks to smart quotes if enabled.
            // Refer to: https://github.com/danielhaim1/TitleCaser/issues/4
            if (smartQuotes) {
                inputString = TitleCaserUtils.convertQuotesToCurly(inputString);
            }

            // Acronym pass: normalise dot-less regional acronyms, ignoring trailing punctuation.
            const wordsForAcronyms = inputString.split(/(\s+)/);
            const nonWhitespaceWords = wordsForAcronyms.filter((t) => !WHITESPACE_ONLY.test(t));
            const firstWord = nonWhitespaceWords[0] || null;
            const secondWord = nonWhitespaceWords[1] || null;
            for (let i = 0; i < wordsForAcronyms.length; i++) {
                if (WHITESPACE_ONLY.test(wordsForAcronyms[i])) continue;
                const prevWord = findNeighborWord(wordsForAcronyms, i, -1);
                const nextWord = findNeighborWord(wordsForAcronyms, i, 1);
                let currentWord = wordsForAcronyms[i];
                const punctuationMatch = currentWord.match(REGEX_PATTERNS.TRAILING_PUNCTUATION);
                let punctuation = "";
                if (punctuationMatch) {
                    punctuation = punctuationMatch[0];
                    currentWord = currentWord.replace(REGEX_PATTERNS.TRAILING_PUNCTUATION, "");
                }
                if (TitleCaserUtils.isRegionalAcronymNoDot(currentWord, nextWord, prevWord)) {
                    currentWord = TitleCaserUtils.normalizeRegionalAcronym(currentWord);
                }
                if (punctuation !== "") {
                    currentWord = currentWord + punctuation;
                }
                wordsForAcronyms[i] = currentWord;
            }
            inputString = wordsForAcronyms.join("");

            // Short-word pass: the first and last tokens are never touched.
            const wordsForShortWords = inputString.split(/(\s+)/);
            for (let i = 1; i < wordsForShortWords.length - 1; i++) {
                const currentWord = wordsForShortWords[i];
                if (
                    currentWord === currentWord.toUpperCase() ||
                    TitleCaserUtils.hasUppercaseIntentional(currentWord)
                ) {
                    continue;
                }
                if (TitleCaserUtils.isWordInArray(currentWord, shortWordsList)) {
                    wordsForShortWords[i] = currentWord.length <= 3 ? currentWord.toLowerCase() : currentWord;
                }
            }
            inputString = wordsForShortWords.join("");

            // Final pass: upper-case regional acronyms, including at the first and last word.
            const wordsForFinalPass = inputString.split(/(\s+)/);
            for (let i = 0; i < wordsForFinalPass.length; i++) {
                if (WHITESPACE_ONLY.test(wordsForFinalPass[i])) continue;
                const currentWord = wordsForFinalPass[i];
                const prevWord = findNeighborWord(wordsForFinalPass, i, -1);
                const nextWord = findNeighborWord(wordsForFinalPass, i, 1);
                if (nextWord && TitleCaserUtils.isRegionalAcronymNoDot(currentWord, nextWord, prevWord)) {
                    wordsForFinalPass[i] = currentWord.toUpperCase();
                }
            }
            const nonWhitespaceFinal = wordsForFinalPass.filter((t) => !WHITESPACE_ONLY.test(t));
            const finalWord = nonWhitespaceFinal[nonWhitespaceFinal.length - 1];
            const wordBeforeFinal = nonWhitespaceFinal[nonWhitespaceFinal.length - 2];
            const twoWordsBeforeFinal = nonWhitespaceFinal[nonWhitespaceFinal.length - 3];
            if (firstWord && TitleCaserUtils.isRegionalAcronym(firstWord)) {
                this.logWarning(`firstWord is a regional acronym: ${firstWord}`);
                wordsForFinalPass[0] = firstWord.toUpperCase();
            }
            if (firstWord && secondWord && TitleCaserUtils.isRegionalAcronymNoDot(firstWord, secondWord)) {
                wordsForFinalPass[0] = firstWord.toUpperCase();
            }
            if (
                finalWord &&
                wordBeforeFinal &&
                TitleCaserUtils.isFinalWordRegionalAcronym(finalWord, wordBeforeFinal, twoWordsBeforeFinal)
            ) {
                wordsForFinalPass[wordsForFinalPass.length - 1] = finalWord.toUpperCase();
            }
            inputString = wordsForFinalPass.join("");

            // Exact phrase replacements, matched case-insensitively.
            for (const [phrase, replacement] of Object.entries(this.phraseReplacementMap)) {
                const regex = new RegExp(phrase.replace(REGEX_PATTERNS.REGEX_ESCAPE, "\\$&"), "gi");
                // A replacer function inserts the text literally; a plain string would
                // have "$$", "$&", "$`" and "$'" interpreted as substitution patterns.
                inputString = inputString.replace(regex, () => replacement);
            }

            // Sentence case: capitalize the first alphabetic word, lower-case the rest,
            // leaving acronyms, intentionally-cased words and special terms alone.
            if (styleConfig.caseStyle === "sentence") {
                const words = inputString.split(/(\s+)/);
                let firstWordFound = false;
                for (let i = 0; i < words.length; i++) {
                    const word = words[i];
                    if (!firstWordFound && /[A-Za-z]/.test(word)) {
                        if (!TitleCaser.shouldKeepCasing(word, specialTermsList)) {
                            words[i] = word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
                        }
                        firstWordFound = true;
                        continue;
                    }
                    if (!TitleCaser.shouldKeepCasing(word, specialTermsList)) {
                        words[i] = word.toLowerCase();
                    }
                }
                inputString = words.join("");
            }

            // Put the outer whitespace back; it only survives when normalization is off.
            inputString = `${leadingWhitespace}${inputString}${trailingWhitespace}`;
            if (normalizeWhitespace) {
                inputString = inputString.replace(/\s+/g, " ").trim();
            }
            return inputString;
        } catch (error) {
            // Rethrow Error instances untouched so type, message and stack are preserved.
            if (error instanceof Error) {
                throw error;
            } else {
                throw new Error(String(error));
            }
        }
    }

    /**
     * Adds or updates several word replacements at once.
     *
     * Every key/value pair of every object in `terms` becomes one rule. Entries that
     * are not objects, are arrays, or have no keys are skipped with a console warning.
     * The rule limit is checked before anything is changed, so a failed call leaves
     * the list as it was.
     *
     * @param {Array<Object<string, string>>} terms - Objects mapping term to replacement.
     * @throws {TypeError} If `terms` is not an array.
     * @throws {Error} If the call would push the list past 2000 rules.
     */
    setReplaceTerms(terms) {
        if (!Array.isArray(terms)) {
            throw new TypeError("Invalid argument: setReplaceTerms must be an array of objects.");
        }
        const pendingPairs = [];
        terms.forEach((termObject) => {
            const isEntryObject =
                termObject !== null && typeof termObject === "object" && !Array.isArray(termObject);
            const pairs = isEntryObject ? Object.entries(termObject) : [];
            if (pairs.length === 0) {
                console.warn("Invalid entry in terms array:", termObject);
                return;
            }
            pendingPairs.push(...pairs);
        });

        // Count only terms that would create a new rule, each distinct term once.
        const newTerms = new Set(
            pendingPairs
                .map(([term]) => term)
                .filter((term) => findReplacementIndex(this.wordReplacementsList, term) === -1),
        );
        if (this.wordReplacementsList.length + newTerms.size > MAX_REPLACEMENT_RULES) {
            throw new Error("Too many replacement rules.");
        }

        pendingPairs.forEach(([term, replacement]) => {
            const index = findReplacementIndex(this.wordReplacementsList, term);
            if (index !== -1) {
                this.wordReplacementsList[index][term] = replacement;
            } else {
                this.wordReplacementsList.push({ [term]: replacement });
            }
        });
        this.options.wordReplacementsList = this.wordReplacementsList;
        if (this.debug) {
            this.logWarning(
                `Log the updated this.wordReplacementsList: ${JSON.stringify(this.wordReplacementsList)}`,
            );
        }
    }

    /**
     * Adds one word replacement, or updates it if the term already has a rule.
     * The rule limit is checked before the list is changed.
     *
     * @param {string} term - Word to look for.
     * @param {string} replacement - Text to emit instead.
     * @throws {TypeError} If either argument is not a string.
     * @throws {Error} If the call would push the list past 2000 rules.
     */
    addReplaceTerm(term, replacement) {
        if (typeof term !== "string" || typeof replacement !== "string") {
            throw new TypeError("Invalid argument: term and replacement must be strings.");
        }
        const index = findReplacementIndex(this.wordReplacementsList, term);
        const projectedLength = this.wordReplacementsList.length + (index === -1 ? 1 : 0);
        if (projectedLength > MAX_REPLACEMENT_RULES) {
            throw new Error("Too many replacement rules.");
        }
        if (index !== -1) {
            this.wordReplacementsList[index][term] = replacement;
        } else {
            this.wordReplacementsList.push({ [term]: replacement });
        }
        this.options.wordReplacementsList = this.wordReplacementsList;
    }

    /**
     * Removes the word replacement rule for `term`.
     *
     * @param {string} term - Term whose rule should be removed (exact key match).
     * @throws {TypeError} If `term` is not a string.
     * @throws {Error} If no rule exists for `term`.
     */
    removeReplaceTerm(term) {
        if (typeof term !== "string") {
            throw new TypeError("Invalid argument: term must be a string.");
        }
        const index = findReplacementIndex(this.wordReplacementsList, term);
        if (index === -1) {
            throw new Error(`Term '${term}' not found in word replacements list.`);
        }
        this.wordReplacementsList.splice(index, 1);
        this.options.wordReplacementsList = this.wordReplacementsList;
        if (this.debug) {
            this.logWarning(
                `Log the updated this.wordReplacementsList: ${JSON.stringify(this.wordReplacementsList)}`,
            );
        }
    }

    /**
     * Registers exact phrase replacements, applied case-insensitively by toTitleCase().
     *
     * All items are validated before any is stored, so a failed call changes nothing.
     *
     * @param {Array<Object<string, string>>} newPhrases - Objects mapping phrase to replacement.
     * @throws {TypeError} If `newPhrases` is not an array, an item is not a plain
     *   object, or a replacement is not a string.
     */
    addExactPhraseReplacements(newPhrases) {
        if (!Array.isArray(newPhrases)) {
            throw new TypeError("Invalid argument: newPhrases must be an array.");
        }
        const staged = {};
        newPhrases.forEach((item) => {
            if (item === null || typeof item !== "object" || Array.isArray(item)) {
                throw new TypeError("Invalid argument: Each item must be an object with a single key-value pair.");
            }
            Object.entries(item).forEach(([key, value]) => {
                if (typeof value !== "string") {
                    throw new TypeError("Invalid argument: Each key-value pair must contain strings.");
                }
                staged[key] = value;
            });
        });
        Object.assign(this.phraseReplacementMap, staged);
        if (this.debug) {
            this.logWarning(`Log the this.phraseReplacementMap: ${JSON.stringify(this.phraseReplacementMap)}`);
        }
    }

    /**
     * Selects the style used by later toTitleCase() calls.
     * The name is not checked here; toTitleCase() falls back to an empty style
     * configuration when the name has no entry in styleConfigMap.
     *
     * @param {string} style - Style name.
     * @throws {TypeError} If `style` is not a string.
     */
    setStyle(style) {
        if (typeof style !== "string") {
            throw new TypeError("Invalid argument: style must be a string.");
        }
        this.options.style = style;
    }

    /**
     * Determines if a word should keep its existing casing.
     * @param {string} word - The word to check
     * @param {Array<string>} specialTermsList - List of terms to preserve
     * @returns {boolean} True if the word is a regional acronym, has intentional
     *   uppercase letters, or is in `specialTermsList`
     */
    static shouldKeepCasing(word, specialTermsList) {
        if (TitleCaserUtils.isRegionalAcronym(word)) return true;
        if (TitleCaserUtils.hasUppercaseIntentional(word)) return true;
        if (TitleCaserUtils.isWordInArray(word, specialTermsList)) return true;
        return false;
    }
}