sentence-splitter

import { SourceCode } from "./SourceCode.js";
import { Language } from "./lang/LanguageInterface.js";
import { English } from "./lang/English.js";
import { AbstractMarker } from "./AbstractMarker.js";

/** Matches a single whitespace character. */
const WHITE_SPACE = /\s/;
/** Matches text whose first character is an uppercase letter. */
const LEADING_UPPERCASE = /^\p{Uppercase_Letter}/u;
/** Matches text that starts with one uppercase letter immediately followed by a period (e.g. `I.`). */
const LEADING_UPPERCASE_INITIAL = /^\p{Uppercase_Letter}\./u;
/** Matches three or more ASCII letters, each followed by a period (e.g. `U.S.A.`). */
const MULTI_PERIOD_ABBREVIATION = /^([a-zA-Z]\.){3,}$/;
/** Matches a run of digits followed by a single period (e.g. `1.`, `12.`). */
const LINE_INDEX = /^\d+\.$/;

/**
 * Tell whether `text` begins with an uppercase letter.
 * Empty (or otherwise falsy) input is never considered capitalized.
 */
const isCapitalized = (text: string) => {
    return !!text && LEADING_UPPERCASE.test(text);
};

/**
 * Compare two strings for equality after lower-casing both of them.
 */
const compareNoCaseSensitive = (a: string, b: string): boolean => {
    const left = a.toLowerCase();
    const right = b.toLowerCase();
    return left === right;
};

/**
 * CJK characters act as word boundaries because CJK text does not use spaces between words.
 * Returns true when `char` contains a Han, Hiragana, Katakana or Hangul character.
 */
const isCJK = (char: string): boolean => {
    return /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u.test(char);
};

/**
 * A character delimits a word when it is whitespace or a CJK character.
 */
const isWordBoundary = (char: string): boolean => {
    return WHITE_SPACE.test(char) || isCJK(char);
};

/**
 * Tell whether `word` equals any entry of `candidates`, ignoring case.
 */
const includesIgnoringCase = (candidates: readonly string[], word: string): boolean => {
    return candidates.some((candidate) => compareNoCaseSensitive(candidate, word));
};

export const DefaultOptions = {
    language: English
};

export interface AbbrMarkerOptions {
    language?: Language;
}

/**
 * Abbreviation marker.
 *
 * Inspects the word at the current position of a `SourceCode` and, when that word
 * looks like an abbreviation, marks its character range as a context range on the
 * source code.
 */
export class AbbrMarker implements AbstractMarker {
    /** Word lists used for matching; falls back to `DefaultOptions.language`. */
    private readonly lang: Language;

    constructor(readonly options?: AbbrMarkerOptions) {
        this.lang = (options && options.language) || DefaultOptions.language;
    }

    /**
     * Read the word that starts `startIndex` characters away from the current position.
     *
     * The character just before the current position (not before `startIndex`) must be
     * absent, whitespace, or CJK; otherwise the current position is in the middle of a
     * word and an empty string is returned.
     *
     * @param sourceCode source being scanned
     * @param startIndex offset, relative to the current position, of the first character to read
     * @returns the characters up to (not including) the next whitespace/CJK character or
     *          the point where nothing more can be read; may be empty
     */
    private getWord(sourceCode: SourceCode, startIndex: number = 0): string {
        const before = sourceCode.read(-1);
        if (before && !isWordBoundary(before)) {
            return "";
        }
        let collected = "";
        for (let index = startIndex; ; index++) {
            const current = sourceCode.read(index);
            // A falsy read means there is nothing left to consume.
            if (!current || isWordBoundary(current)) {
                break;
            }
            collected += current;
        }
        return collected;
    }

    /**
     * Read the word that precedes the current position.
     *
     * Walks backwards over whitespace, then backwards over the non-whitespace run in
     * front of it, and hands the start of that run to `getWord`. Only whitespace is
     * used as the delimiter while walking back; `getWord` still stops at CJK characters.
     */
    private getPrevWord(sourceCode: SourceCode): string {
        let cursor = -1;
        let current = sourceCode.read(cursor);
        // Skip the whitespace that separates the previous word from the current one.
        while (current && WHITE_SPACE.test(current)) {
            cursor -= 1;
            current = sourceCode.read(cursor);
        }
        // Rewind to just before the first character of the previous word.
        while (current && !WHITE_SPACE.test(current)) {
            cursor -= 1;
            current = sourceCode.read(cursor);
        }
        return this.getWord(sourceCode, cursor + 1);
    }

    /**
     * Mark the current word as a context range when it is recognised as an abbreviation.
     *
     * Does nothing when the source code is already inside a context range or when no
     * word starts at the current position.
     */
    mark(sourceCode: SourceCode) {
        if (sourceCode.isInContextRange()) {
            return;
        }
        const currentWord = this.getWord(sourceCode);
        if (currentWord.length === 0) {
            return;
        }
        const markCurrentWord = () =>
            sourceCode.markContextRange([sourceCode.offset, sourceCode.offset + currentWord.length]);

        // Allow: multi-period abbreviation
        // Example: U.S.A.
        if (MULTI_PERIOD_ABBREVIATION.test(currentWord)) {
            return markCurrentWord();
        }

        // EXCLAMATION_WORDS
        // Example: Yahoo!
        if (includesIgnoringCase(this.lang.EXCLAMATION_WORDS, currentWord)) {
            return markCurrentWord();
        }

        // PREPOSITIVE_ABBREVIATIONS (Example: Mr. Fuji) and numeric line indexes (Example: 1.)
        const isPrepositiveAbbreviation = includesIgnoringCase(this.lang.PREPOSITIVE_ABBREVIATIONS, currentWord);
        const isLineIndex = LINE_INDEX.test(currentWord);
        if (isPrepositiveAbbreviation || isLineIndex) {
            return markCurrentWord();
        }

        // ABBREVIATIONS
        const isKnownAbbreviation = includesIgnoringCase(this.lang.ABBREVIATIONS, currentWord);
        const prevWord = this.getPrevWord(sourceCode);
        // The next word is read from one character past the end of the current word,
        // i.e. a single separator character is skipped.
        const nextWord = this.getWord(sourceCode, currentWord.length + 1);

        // Special case: Capital <ABBR>. Capital
        // Example: `I` as a sentence boundary and `I` as an abbreviation
        // > We make a good team, you and I. Did you see Albert I. Jones yesterday?
        // Related: https://github.com/azu/sentence-splitter/pull/31
        const isInitialBetweenCapitalizedWords =
            isCapitalized(prevWord) && LEADING_UPPERCASE_INITIAL.test(currentWord) && isCapitalized(nextWord);

        // Exception: a known abbreviation followed by a capitalized word is NOT marked,
        // which allows a capitalized word to follow it, as in:
        // A.M. is store.
        const isAbbreviationBeforeLowercase = isKnownAbbreviation && !isCapitalized(nextWord);

        if (isInitialBetweenCapitalizedWords || isAbbreviationBeforeLowercase) {
            markCurrentWord();
        }
    }
}