UNPKG

@danielhaim/titlecaser

Version:

A powerful utility for transforming text to title case with support for multiple style guides and extensive customization options.

github.com/danielhaim1/titlecaser

danielhaim1/titlecaser

543 lines (451 loc) • 21.3 kB

JavaScript

import {
    shortWordsList,
    specialTermsList,
    phraseReplacementMap,
    wordReplacementsList,
    styleConfigMap,
    REGEX_PATTERNS,
} from "./TitleCaserConsts.js";
import { TitleCaserUtils } from "./TitleCaserUtils.js";

/** Matches a token that consists solely of whitespace (the separators kept by `split(/(\s+)/)`). */
const WHITESPACE_ONLY = /^\s+$/;

/** Longest input accepted by `toTitleCase`; longer strings are rejected up front. */
const MAX_INPUT_LENGTH = 100000;

/** Largest number of word-replacement rules an instance may hold. */
const MAX_REPLACEMENT_RULES = 2000;

/**
 * Returns the closest token before (`step = -1`) or after (`step = 1`) `index`
 * that is not pure whitespace, or `null` when there is none.
 *
 * Note: an empty-string token is NOT whitespace-only and is therefore returned
 * as-is; callers treat it as falsy.
 *
 * @param {string[]} tokens - Tokens produced by splitting on `/(\s+)/`.
 * @param {number} index - Position to search from (exclusive).
 * @param {number} step - Search direction: `-1` backwards, `1` forwards.
 * @returns {string|null} The neighbouring token, or `null`.
 */
function findAdjacentWord(tokens, index, step) {
    for (let j = index + step; j >= 0 && j < tokens.length; j += step) {
        if (!WHITESPACE_ONLY.test(tokens[j])) {
            return tokens[j];
        }
    }
    return null;
}

/**
 * Updates the rule whose first key equals `term`, or appends a new
 * single-key rule when no such rule exists. Mutates `rules` in place.
 *
 * @param {Array<Object<string, string>>} rules - Replacement rules to modify.
 * @param {string} term - Term to look up.
 * @param {string} replacement - Replacement text for the term.
 */
function upsertReplacementRule(rules, term, replacement) {
    const index = rules.findIndex((rule) => Object.keys(rule)[0] === term);
    if (index !== -1) {
        rules[index][term] = replacement;
    } else {
        rules.push({ [term]: replacement });
    }
}

/**
 * Converts text to title case (or sentence case, depending on the style
 * configuration) with per-instance word and phrase replacement rules.
 */
export class TitleCaser {
    /**
     * @param {Object} [options={}] - Conversion options. Keys read by this class:
     *   `debug`, `style`, `neverCapitalize`, `wordReplacementsList`,
     *   `smartQuotes`, `normalizeWhitespace`. The object is stored by reference
     *   and is updated by the setter methods of this class.
     */
    constructor(options = {}) {
        this.options = options;
        this.debug = options.debug || false;
        // Deep-copy the shared defaults so per-instance edits never leak into the module constants.
        this.wordReplacementsList = JSON.parse(JSON.stringify(wordReplacementsList));
        this.phraseReplacementMap = JSON.parse(JSON.stringify(phraseReplacementMap));
    }

    /**
     * Emits `message` through `console.warn` when the instance is in debug mode.
     *
     * @param {string} message - Text to log.
     */
    logWarning(message) {
        if (this.debug) {
            console.warn(`Warning: ${message}`);
        }
    }

    /**
     * Converts `str` according to the configured style.
     *
     * @param {string} str - Non-empty text of at most 100,000 characters.
     * @returns {string} The converted text.
     * @throws {TypeError} When `str` is not a string, is empty, is too long,
     *   or when the instance options are not an object.
     */
    toTitleCase(str) {
        try {
            if (typeof str !== "string") {
                throw new TypeError("Invalid input: input must be a string.");
            }
            if (str.length === 0) {
                throw new TypeError("Invalid input: input must not be empty.");
            }
            // Length cap guards against pathological processing time.
            if (str.length > MAX_INPUT_LENGTH) {
                throw new TypeError("Invalid input: input exceeds maximum length of 100,000 characters.");
            }
            // `typeof null` is "object", so null needs an explicit check.
            if (this.options === null || typeof this.options !== "object") {
                throw new TypeError("Invalid options: options must be an object.");
            }

            const {
                style = "ap",
                neverCapitalize = [],
                wordReplacementsList: replacementRules = this.wordReplacementsList,
                smartQuotes = false,
                normalizeWhitespace = true,
            } = this.options;

            const styleConfig = styleConfigMap[style] || {};
            const ignoreList = ["nl2br", ...neverCapitalize];

            // NOTE(review): the return value is not used by this method; the call is
            // retained in case the helper validates the options — confirm and drop if not.
            TitleCaserUtils.getTitleCaseOptions(this.options, shortWordsList, replacementRules);

            // Lower-cased term -> replacement. Only the first key of each rule is honoured.
            const replacementByTerm = new Map(
                replacementRules.map((rule) => [Object.keys(rule)[0].toLowerCase(), Object.values(rule)[0]]),
            );

            if (this.debug) {
                const replaceTermsArray = replacementRules.map((rule) => Object.keys(rule)[0].toLowerCase());
                this.logWarning(`replaceTermsArray: ${replaceTermsArray}`);
                this.logWarning(`this.wordReplacementsList: ${JSON.stringify(this.wordReplacementsList)}`);
            }

            // Swap HTML line breaks for a placeholder token; it is restored after casing.
            let inputString = str.replace(REGEX_PATTERNS.HTML_BREAK, " nl2br ");

            // All-caps input is lower-cased first so it is processed like regular text.
            const lettersOnly = inputString.replace(/[^a-zA-Z]/g, "");
            if (TitleCaserUtils.isEntirelyUppercase(lettersOnly)) {
                this.logWarning("Input string is entirely uppercase, normalizing to lowercase first");
                inputString = inputString.toLowerCase();
            }

            // Tokenize while keeping the whitespace separators as their own tokens.
            const tokens = inputString.split(/(\s+)/);

            const wordsInTitleCase = tokens.map((word, i) => {
                if (!word || WHITESPACE_ONLY.test(word)) return word;

                // `switch (true)` is intentional: a case fires only when its test is strictly `true`.
                switch (true) {
                    case TitleCaserUtils.isWordAmpersand(word):
                        // Ampersands are kept unchanged.
                        return word;

                    case TitleCaserUtils.hasHtmlBreak(word):
                        // HTML break markers are kept unchanged.
                        return word;

                    case TitleCaserUtils.isWordIgnored(word, ignoreList):
                        // Ignored words are kept unchanged.
                        return word;

                    case replacementByTerm.has(word.toLowerCase()):
                        // A replacement rule exists for this word.
                        return replacementByTerm.get(word.toLowerCase());

                    case TitleCaserUtils.isWordInArray(word, specialTermsList):
                        // Special terms get their canonical casing.
                        return TitleCaserUtils.correctTerm(word, specialTermsList);

                    case TitleCaserUtils.isElidedWord(word):
                        return TitleCaserUtils.normalizeElidedWord(word);

                    case TitleCaserUtils.hasHyphen(word): {
                        // Separate the base word from any trailing punctuation.
                        const baseWord = word.replace(/[\W_]+$/, "");
                        const trailingPunctuation = word.slice(baseWord.length);

                        // Apply replacement rules to each hyphen-separated part.
                        const parts = baseWord.split("-");
                        const replacedParts = parts.map((part) => {
                            const lowerCasePart = part.toLowerCase();
                            return replacementByTerm.has(lowerCasePart) ? replacementByTerm.get(lowerCasePart) : part;
                        });
                        const isReplaced = !replacedParts.every((part, index) => part === parts[index]);

                        // Without a replacement, the whole word (punctuation included) goes to the hyphen helper.
                        const processedWord = isReplaced
                            ? replacedParts.join("-")
                            : TitleCaserUtils.correctTermHyphenated(word, style);

                        // Re-attach the trailing punctuation only if it is not already present.
                        return processedWord.endsWith(trailingPunctuation)
                            ? processedWord
                            : processedWord + trailingPunctuation;
                    }

                    case TitleCaserUtils.hasSuffix(word, style):
                        return TitleCaserUtils.correctSuffix(word, specialTermsList);

                    case TitleCaserUtils.hasUppercaseIntentional(word):
                        // Intentional internal capitals are preserved.
                        return word;

                    case TitleCaserUtils.isShortWord(word, style) && i !== 0: {
                        // A short word is capitalized when the previous word ends a sentence or clause.
                        const prevToken = findAdjacentWord(tokens, i, -1);
                        const isAtEndOfSentence =
                            prevToken && TitleCaserUtils.endsWithSymbol(prevToken, [":", "?", "!", "."]);
                        if (isAtEndOfSentence) {
                            return word.charAt(0).toUpperCase() + word.slice(1);
                        }
                        return TitleCaserUtils.normalizeCasingForWordByStyle(word, style);
                    }

                    case TitleCaserUtils.endsWithSymbol(word): {
                        this.logWarning(`Check if the word ends with a symbol: ${word}`);
                        const splitWord = word.split(REGEX_PATTERNS.SPLIT_AT_PUNCTUATION);
                        this.logWarning(`Splitting word at symbols, result: ${splitWord}`);

                        // Case each word part; symbol parts pass through unchanged.
                        const processedWords = splitWord.map((part) => {
                            this.logWarning(`Processing part: ${part}`);

                            if (TitleCaserUtils.endsWithSymbol(part)) {
                                this.logWarning(`Part is a symbol: ${part}`);
                                return part;
                            }

                            this.logWarning(`Part is a word: ${part}`);
                            if (TitleCaserUtils.isWordInArray(part, specialTermsList)) {
                                const correctedTerm = TitleCaserUtils.correctTerm(part, specialTermsList);
                                this.logWarning(`Word is in specialTermsList, corrected term: ${correctedTerm}`);
                                return correctedTerm;
                            }

                            // Rule keys are stored lower-cased, so the lookup must be lower-cased too.
                            const lowerCasePart = part.toLowerCase();
                            if (replacementByTerm.has(lowerCasePart)) {
                                const replacement = replacementByTerm.get(lowerCasePart);
                                this.logWarning(`Word is in replaceTermsArray, replacement: ${replacement}`);
                                return replacement;
                            }

                            const titledWord = part.charAt(0).toUpperCase() + part.slice(1).toLowerCase();
                            this.logWarning(`Applying title casing to word: ${titledWord}`);
                            return titledWord;
                        });

                        return processedWords.join("");
                    }

                    case TitleCaserUtils.startsWithSymbol(word):
                        return !TitleCaserUtils.isWordInArray(word, specialTermsList)
                            ? word
                            : TitleCaserUtils.correctTerm(word, specialTermsList);

                    case TitleCaserUtils.hasRomanNumeral(word):
                        return word.toUpperCase();

                    case TitleCaserUtils.hasNumbers(word):
                        // Words containing digits are kept unchanged.
                        return word;

                    default:
                        // Plain word: capitalize the first letter, lower-case the rest.
                        return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
                }
            });

            inputString = wordsInTitleCase.join("");

            // Restore the HTML line breaks.
            inputString = inputString.replace(/nl2br/gi, "<br>");

            // Refer to: https://github.com/danielhaim1/TitleCaser/issues/4
            if (smartQuotes) {
                inputString = TitleCaserUtils.convertQuotesToCurly(inputString);
            }

            // --- Pass: regional acronyms written without dots ---
            const wordsForAcronyms = inputString.split(/(\s+)/);

            // Captured before the pass below mutates the tokens.
            const nonWhitespaceWords = wordsForAcronyms.filter((t) => !WHITESPACE_ONLY.test(t));
            const firstWord = nonWhitespaceWords[0] || null;
            const secondWord = nonWhitespaceWords[1] || null;

            for (let i = 0; i < wordsForAcronyms.length; i++) {
                if (WHITESPACE_ONLY.test(wordsForAcronyms[i])) continue;

                const prevWord = findAdjacentWord(wordsForAcronyms, i, -1);
                const nextWord = findAdjacentWord(wordsForAcronyms, i, 1);

                // Detach trailing punctuation so the acronym test sees the bare word.
                let currentWord = wordsForAcronyms[i];
                const punctuationMatch = currentWord.match(REGEX_PATTERNS.TRAILING_PUNCTUATION);
                let punctuation = "";
                if (punctuationMatch) {
                    punctuation = punctuationMatch[0];
                    currentWord = currentWord.replace(REGEX_PATTERNS.TRAILING_PUNCTUATION, "");
                }

                if (TitleCaserUtils.isRegionalAcronymNoDot(currentWord, nextWord, prevWord)) {
                    currentWord = TitleCaserUtils.normalizeRegionalAcronym(currentWord);
                }

                if (punctuation !== "") {
                    currentWord = currentWord + punctuation;
                }

                wordsForAcronyms[i] = currentWord;
            }

            inputString = wordsForAcronyms.join("");

            // --- Pass: lower-case short words (first and last tokens are skipped) ---
            const wordsForShortWords = inputString.split(/(\s+)/);

            for (let i = 1; i < wordsForShortWords.length - 1; i++) {
                const currentWord = wordsForShortWords[i];

                // All-caps tokens (whitespace included) and intentional capitals are left alone.
                if (
                    currentWord === currentWord.toUpperCase() ||
                    TitleCaserUtils.hasUppercaseIntentional(currentWord)
                ) {
                    continue;
                }

                if (TitleCaserUtils.isWordInArray(currentWord, shortWordsList)) {
                    wordsForShortWords[i] = currentWord.length <= 3 ? currentWord.toLowerCase() : currentWord;
                }
            }

            inputString = wordsForShortWords.join("");

            // --- Pass: final regional-acronym fix-ups ---
            const wordsForFinalPass = inputString.split(/(\s+)/);

            for (let i = 0; i < wordsForFinalPass.length; i++) {
                if (WHITESPACE_ONLY.test(wordsForFinalPass[i])) continue;

                const currentWord = wordsForFinalPass[i];
                const prevWord = findAdjacentWord(wordsForFinalPass, i, -1);
                const nextWord = findAdjacentWord(wordsForFinalPass, i, 1);

                if (nextWord && TitleCaserUtils.isRegionalAcronymNoDot(currentWord, nextWord, prevWord)) {
                    wordsForFinalPass[i] = currentWord.toUpperCase();
                }
            }

            const nonWhitespaceFinal = wordsForFinalPass.filter((t) => !WHITESPACE_ONLY.test(t));
            const finalWord = nonWhitespaceFinal[nonWhitespaceFinal.length - 1];
            const wordBeforeFinal = nonWhitespaceFinal[nonWhitespaceFinal.length - 2];
            const twoWordsBeforeFinal = nonWhitespaceFinal[nonWhitespaceFinal.length - 3];

            if (firstWord && TitleCaserUtils.isRegionalAcronym(firstWord)) {
                this.logWarning(`firstWord is a regional acronym: ${firstWord}`);
                wordsForFinalPass[0] = firstWord.toUpperCase();
            }

            if (firstWord && secondWord && TitleCaserUtils.isRegionalAcronymNoDot(firstWord, secondWord)) {
                wordsForFinalPass[0] = firstWord.toUpperCase();
            }

            if (
                finalWord &&
                wordBeforeFinal &&
                TitleCaserUtils.isFinalWordRegionalAcronym(finalWord, wordBeforeFinal, twoWordsBeforeFinal)
            ) {
                wordsForFinalPass[wordsForFinalPass.length - 1] = finalWord.toUpperCase();
            }

            inputString = wordsForFinalPass.join("");

            // --- Pass: exact phrase replacements (case-insensitive match) ---
            for (const [phrase, replacement] of Object.entries(this.phraseReplacementMap)) {
                const regex = new RegExp(phrase.replace(REGEX_PATTERNS.REGEX_ESCAPE, "\\$&"), "gi");
                // A function replacer inserts the text literally; a string replacement
                // would have `$&`, `$$`, etc. interpreted as special patterns.
                inputString = inputString.replace(regex, () => replacement);
            }

            // --- Sentence-case styles ---
            if (styleConfig.caseStyle === "sentence") {
                const words = inputString.split(/(\s+)/);
                let firstWordFound = false;

                for (let i = 0; i < words.length; i++) {
                    const word = words[i];

                    // First token containing a letter: capitalize it unless its casing must be kept.
                    if (!firstWordFound && /[A-Za-z]/.test(word)) {
                        if (!TitleCaser.shouldKeepCasing(word, specialTermsList)) {
                            words[i] = word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
                        }
                        firstWordFound = true;
                        continue;
                    }

                    // Every other token is lower-cased unless its casing must be kept.
                    if (!TitleCaser.shouldKeepCasing(word, specialTermsList)) {
                        words[i] = word.toLowerCase();
                    }
                }

                inputString = words.join("");
            }

            if (normalizeWhitespace) {
                inputString = inputString.replace(/\s+/g, " ").trim();
            }

            return inputString;
        } catch (error) {
            // Re-throw real errors untouched; wrap anything else so callers always get an Error.
            if (error instanceof Error) {
                throw error;
            } else {
                throw new Error(String(error));
            }
        }
    }

    /**
     * Adds or updates several word-replacement rules at once.
     *
     * Each entry must be a non-array object with at least one key; only its
     * first key/value pair is used. Other entries are reported through
     * `console.warn` and skipped.
     *
     * @param {Array<Object<string, string>>} terms - Rules such as `[{ "js": "JS" }]`.
     * @throws {TypeError} When `terms` is not an array.
     * @throws {Error} When the rule list exceeds 2000 entries after the update.
     */
    setReplaceTerms(terms) {
        if (!Array.isArray(terms)) {
            throw new TypeError("Invalid argument: setReplaceTerms must be an array of objects.");
        }

        terms.forEach((termObject) => {
            const isPlainEntry = termObject && typeof termObject === "object" && !Array.isArray(termObject);
            const entries = isPlainEntry ? Object.entries(termObject) : [];

            // Empty objects have nothing to destructure, so they are skipped like other invalid entries.
            if (entries.length === 0) {
                console.warn("Invalid entry in terms array:", termObject);
                return;
            }

            const [term, replacement] = entries[0];
            upsertReplacementRule(this.wordReplacementsList, term, replacement);
        });

        // Cap the number of rules to keep per-word lookups cheap.
        if (this.wordReplacementsList.length > MAX_REPLACEMENT_RULES) {
            throw new Error("Too many replacement rules.");
        }

        this.options.wordReplacementsList = this.wordReplacementsList;

        if (this.debug) {
            this.logWarning(`Log the updated this.wordReplacementsList: ${JSON.stringify(this.wordReplacementsList)}`);
        }
    }

    /**
     * Adds a single word-replacement rule, or updates it when the term already exists.
     *
     * @param {string} term - Term to replace.
     * @param {string} replacement - Replacement text.
     * @throws {TypeError} When either argument is not a string.
     * @throws {Error} When the rule list exceeds 2000 entries after the update.
     */
    addReplaceTerm(term, replacement) {
        if (typeof term !== "string" || typeof replacement !== "string") {
            throw new TypeError("Invalid argument: term and replacement must be strings.");
        }

        upsertReplacementRule(this.wordReplacementsList, term, replacement);

        if (this.wordReplacementsList.length > MAX_REPLACEMENT_RULES) {
            throw new Error("Too many replacement rules.");
        }

        this.options.wordReplacementsList = this.wordReplacementsList;
    }

    /**
     * Removes the word-replacement rule whose first key equals `term`.
     *
     * @param {string} term - Term whose rule should be removed.
     * @throws {TypeError} When `term` is not a string.
     * @throws {Error} When no rule exists for `term`.
     */
    removeReplaceTerm(term) {
        if (typeof term !== "string") {
            throw new TypeError("Invalid argument: term must be a string.");
        }

        const index = this.wordReplacementsList.findIndex((obj) => Object.keys(obj)[0] === term);
        if (index === -1) {
            throw new Error(`Term '${term}' not found in word replacements list.`);
        }

        this.wordReplacementsList.splice(index, 1);
        this.options.wordReplacementsList = this.wordReplacementsList;

        if (this.debug) {
            this.logWarning(`Log the updated this.wordReplacementsList: ${JSON.stringify(this.wordReplacementsList)}`);
        }
    }

    /**
     * Registers exact phrase replacements, applied case-insensitively near the
     * end of `toTitleCase`.
     *
     * @param {Array<Object<string, string>>} newPhrases - Objects mapping a
     *   phrase to its replacement; every key/value pair of each object is registered.
     * @throws {TypeError} When `newPhrases` is not an array, an item is not a
     *   non-null, non-array object, or a replacement is not a string.
     */
    addExactPhraseReplacements(newPhrases) {
        if (!Array.isArray(newPhrases)) {
            throw new TypeError("Invalid argument: newPhrases must be an array.");
        }

        newPhrases.forEach((item) => {
            // `typeof null` is "object", so null is rejected explicitly.
            if (item === null || typeof item !== "object" || Array.isArray(item)) {
                throw new TypeError("Invalid argument: Each item must be an object with a single key-value pair.");
            }

            Object.entries(item).forEach(([key, value]) => {
                if (typeof value !== "string") {
                    throw new TypeError("Invalid argument: Each key-value pair must contain strings.");
                }
                this.phraseReplacementMap[key] = value;
            });
        });

        if (this.debug) {
            this.logWarning(`Log the this.phraseReplacementMap: ${JSON.stringify(this.phraseReplacementMap)}`);
        }
    }

    /**
     * Selects the style used by subsequent `toTitleCase` calls.
     *
     * @param {string} style - Style key; unknown keys fall back to an empty style configuration.
     * @throws {TypeError} When `style` is not a string.
     */
    setStyle(style) {
        if (typeof style !== "string") {
            throw new TypeError("Invalid argument: style must be a string.");
        }

        this.options.style = style;
    }

    /**
     * Determines if a word should keep its existing casing
     * @param {string} word - The word to check
     * @param {Array<string>} specialTermsList - List of terms to preserve
     * @returns {boolean} True if word should keep its casing
     */
    static shouldKeepCasing(word, specialTermsList) {
        // Regional acronyms keep their casing.
        if (TitleCaserUtils.isRegionalAcronym(word)) return true;

        // So do words with intentional uppercase letters.
        if (TitleCaserUtils.hasUppercaseIntentional(word)) return true;

        // And anything listed in the special terms.
        if (TitleCaserUtils.isWordInArray(word, specialTermsList)) return true;

        // Everything else is safe to re-case.
        return false;
    }
}