UNPKG

sentence-splitter

Version:

Split {Japanese, English} text into sentences.

github.com/textlint-rule/sentence-splitter

textlint-rule/sentence-splitter

127 lines • 4.7 kB

JavaScript

import { English } from "./lang/English.js";

/** Matches a single whitespace character. */
const WHITESPACE = /\s/;
/** Matches text whose first character is an uppercase letter (any script). */
const LEADING_UPPERCASE = /^\p{Uppercase_Letter}/u;
/** Matches text starting with one uppercase letter followed by a period, e.g. `I.` */
const LEADING_UPPERCASE_INITIAL = /^\p{Uppercase_Letter}\./u;
/** Matches text containing a Han, Hiragana, Katakana or Hangul character. */
const CJK_CHARACTER = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
/** Matches three or more consecutive `<ASCII letter>.` pairs, e.g. `U.S.A.` */
const MULTI_PERIOD_ABBREVIATION = /^([a-zA-Z]\.){3,}$/;
/** Matches a numeric list index such as `1.` or `42.` */
const LINE_INDEX = /^\d+\.$/;

/**
 * Tell whether the text starts with an uppercase letter.
 * @param {string} text
 * @returns {boolean} `false` for empty or missing text
 */
const isCapitalized = (text) => {
    if (!text) {
        return false;
    }
    return LEADING_UPPERCASE.test(text);
};

/**
 * Compare two strings while ignoring letter case.
 * @param {string} a
 * @param {string} b
 * @returns {boolean}
 */
const compareNoCaseSensitive = (a, b) => {
    return a.toLowerCase() === b.toLowerCase();
};

/**
 * Tell whether any entry of the list equals the word, ignoring letter case.
 * @param {string[]} words
 * @param {string} target
 * @returns {boolean}
 */
const includesIgnoringCase = (words, target) => {
    return words.some((word) => compareNoCaseSensitive(word, target));
};

/**
 * CJK characters act as word boundaries because CJK text does not use spaces between words.
 * @param {string} char
 * @returns {boolean}
 */
const isCJK = (char) => {
    return CJK_CHARACTER.test(char);
};

export const DefaultOptions = {
    language: English
};

/**
 * abbreviation marker
 *
 * Marks the word at the current position as a context range when it looks like
 * an abbreviation (presumably so that its period is not treated as a sentence
 * boundary; confirm against the callers of `markContextRange`).
 */
export class AbbrMarker {
    options;
    lang;

    /**
     * @param {{ language?: object }} [options] falls back to `DefaultOptions.language`
     *        when no truthy `language` is given
     */
    constructor(options) {
        this.options = options;
        this.lang = options?.language || DefaultOptions.language;
    }

    /**
     * Get Word
     * word should have left space and right space (or CJK boundary),
     *
     * The left-boundary check always looks at the character just before the
     * current position (`read(-1)`), regardless of `startIndex`.
     * @param {SourceCode} sourceCode
     * @param {number} startIndex index passed to `sourceCode.read` for the first character
     * @returns {string} the word, or "" when the previous character is not a boundary
     */
    getWord(sourceCode, startIndex = 0) {
        const prevChar = sourceCode.read(-1);
        // Also treat CJK characters as word boundaries
        if (prevChar && !WHITESPACE.test(prevChar) && !isCJK(prevChar)) {
            return "";
        }
        let word = "";
        for (let index = startIndex; ; index++) {
            const char = sourceCode.read(index);
            // A falsy read ends the scan, as does the first boundary character.
            if (!char || WHITESPACE.test(char) || isCJK(char)) {
                break;
            }
            word += char;
        }
        return word;
    }

    /**
     * Get the whitespace-delimited word that precedes the current position.
     * @param {SourceCode} sourceCode
     * @returns {string}
     */
    getPrevWord(sourceCode) {
        let index = -1;
        let char = sourceCode.read(index);
        // Step back over the whitespace that separates the two words.
        while (char && WHITESPACE.test(char)) {
            index--;
            char = sourceCode.read(index);
        }
        // Step back over the previous word itself to find where it begins.
        while (char && !WHITESPACE.test(char)) {
            index--;
            char = sourceCode.read(index);
        }
        return this.getWord(sourceCode, index + 1);
    }

    /**
     * Mark the current word as a context range when it is an abbreviation.
     * Does nothing when the position is already inside a context range or when
     * there is no word at the current position.
     * @param {SourceCode} sourceCode
     * @returns {*} the result of `markContextRange` for the early-exit rules, otherwise `undefined`
     */
    mark(sourceCode) {
        if (sourceCode.isInContextRange()) {
            return;
        }
        const currentWord = this.getWord(sourceCode);
        if (currentWord.length === 0) {
            return;
        }
        const markCurrentWord = () => {
            return sourceCode.markContextRange([sourceCode.offset, sourceCode.offset + currentWord.length]);
        };
        // Allow: Multi-period abbr
        // Example: U.S.A.
        if (MULTI_PERIOD_ABBREVIATION.test(currentWord)) {
            return markCurrentWord();
        }
        // EXCLAMATION_WORDS
        // Example: Yahoo!
        if (includesIgnoringCase(this.lang.EXCLAMATION_WORDS, currentWord)) {
            return markCurrentWord();
        }
        // PREPOSITIVE_ABBREVIATIONS
        // Example: Mr. Fuji
        const isPrepositiveAbbreviation = includesIgnoringCase(this.lang.PREPOSITIVE_ABBREVIATIONS, currentWord);
        const isLineIndex = LINE_INDEX.test(currentWord);
        if (isPrepositiveAbbreviation || isLineIndex) {
            return markCurrentWord();
        }
        // ABBREVIATIONS
        const isAbbreviation = includesIgnoringCase(this.lang.ABBREVIATIONS, currentWord);
        const prevWord = this.getPrevWord(sourceCode);
        const nextWord = this.getWord(sourceCode, currentWord.length + 1);
        // Special case: Capital <ABBR>. Capital
        // Example: `I` as a sentence boundary and `I` as an abbreviation
        // > We make a good team, you and I. Did you see Albert I. Jones yesterday?
        // Related: https://github.com/azu/sentence-splitter/pull/31
        const isInitialBetweenCapitalizedWords =
            isCapitalized(prevWord) && LEADING_UPPERCASE_INITIAL.test(currentWord) && isCapitalized(nextWord);
        // Exception: a known abbreviation is marked only when the next word is
        // not capitalized, so a capitalized word may follow it unmarked.
        // Example: A.M. is store.
        const isAbbreviationBeforeLowercase = isAbbreviation && !isCapitalized(nextWord);
        if (isInitialBetweenCapitalizedWords || isAbbreviationBeforeLowercase) {
            markCurrentWord();
        }
    }
}
//# sourceMappingURL=AbbrMarker.js.map