sentence-splitter
Version:
split {japanese, english} text into sentences.
140 lines (129 loc) • 5.07 kB
text/typescript
import { SourceCode } from "./SourceCode.js";
import { Language } from "./lang/LanguageInterface.js";
import { English } from "./lang/English.js";
import { AbstractMarker } from "./AbstractMarker.js";
/**
 * Report whether `text` begins with an uppercase letter.
 * Uses the Unicode `Uppercase_Letter` property, so non-ASCII capitals count too.
 * An empty string is never considered capitalized.
 */
const isCapitalized = (text: string) => {
    // A falsy value (e.g. "") has no first character to inspect.
    return Boolean(text) && /^\p{Uppercase_Letter}/u.test(text);
};
/**
 * Case-insensitive string equality: both operands are lower-cased
 * before being compared with strict equality.
 */
const compareNoCaseSensitive = (a: string, b: string): boolean => {
    const loweredA = a.toLowerCase();
    const loweredB = b.toLowerCase();
    return loweredA === loweredB;
};
/**
 * Report whether `char` contains a Han, Hiragana, Katakana or Hangul character.
 * CJK text does not put spaces between words, so such characters are
 * treated as word boundaries.
 */
const isCJK = (char: string): boolean => {
    const cjkPattern = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
    return cjkPattern.test(char);
};
/**
 * Fallback options for AbbrMarker.
 * The AbbrMarker constructor reads `DefaultOptions.language` when the caller
 * does not supply a language of its own.
 */
export const DefaultOptions = {
language: English
};
/**
 * Options accepted by the AbbrMarker constructor.
 */
export interface AbbrMarkerOptions {
// Language definition to use; when omitted, the constructor falls back to DefaultOptions.language.
language?: Language;
}
/**
 * Abbreviation marker.
 *
 * Looks at the word starting at the current position of a SourceCode and,
 * when that word looks like an abbreviation, marks the word's range as a
 * context range on the SourceCode (presumably so that its periods are not
 * taken as sentence ends — confirm against the consumer of context ranges).
 */
export class AbbrMarker implements AbstractMarker {
    private lang: Language;

    /**
     * @param options optional settings; `options.language` overrides the default language
     */
    constructor(readonly options?: AbbrMarkerOptions) {
        this.lang = (options && options.language) || DefaultOptions.language;
    }

    /**
     * Read the word that begins `startIndex` characters from the current position.
     *
     * A word is only recognised when the character just before the current
     * position is absent, whitespace, or a CJK character; otherwise "" is returned.
     * Reading stops at the first whitespace/CJK character or when no more
     * characters can be read.
     * @param sourceCode source being scanned
     * @param startIndex relative index at which reading starts
     * @returns the collected word, possibly empty
     */
    private getWord(sourceCode: SourceCode, startIndex: number = 0): string {
        const whiteSpace = /\s/;
        // Whitespace and CJK characters both delimit words.
        const isBoundary = (ch: string): boolean => whiteSpace.test(ch) || isCJK(ch);

        const before = sourceCode.read(-1);
        if (before && !isBoundary(before)) {
            // The current position sits in the middle of a word.
            return "";
        }

        let collected = "";
        for (let offset = startIndex; ; offset++) {
            const ch = sourceCode.read(offset);
            if (!ch || isBoundary(ch)) {
                break;
            }
            collected += ch;
        }
        return collected;
    }

    /**
     * Read the word that precedes the current position.
     *
     * Walks backwards over any whitespace, then over the run of non-whitespace
     * characters before it, and reads forward from the start of that run.
     * @param sourceCode source being scanned
     * @returns the previous word as produced by getWord
     */
    private getPrevWord(sourceCode: SourceCode): string {
        const whiteSpace = /\s/;
        // Step backwards from `from` while characters exist and satisfy `shouldSkip`;
        // returns the first relative index that does not.
        const rewindWhile = (from: number, shouldSkip: (ch: string) => boolean): number => {
            let position = from;
            let ch = sourceCode.read(position);
            while (ch && shouldSkip(ch)) {
                position--;
                ch = sourceCode.read(position);
            }
            return position;
        };

        const endOfPrevWord = rewindWhile(-1, (ch) => whiteSpace.test(ch));
        const beforePrevWord = rewindWhile(endOfPrevWord, (ch) => !whiteSpace.test(ch));
        return this.getWord(sourceCode, beforePrevWord + 1);
    }

    /**
     * Mark the current word as a context range when it is an abbreviation.
     * Does nothing when the position is already inside a context range or
     * when no word starts at the current position.
     * @param sourceCode source being scanned
     */
    mark(sourceCode: SourceCode) {
        if (sourceCode.isInContextRange()) {
            return;
        }
        const currentWord = this.getWord(sourceCode);
        if (currentWord.length === 0) {
            return;
        }

        // Marks exactly the span occupied by the current word.
        const markCurrentWord = () =>
            sourceCode.markContextRange([sourceCode.offset, sourceCode.offset + currentWord.length]);
        // Case-insensitive membership test of the current word in a word list.
        const isListedIn = (candidates: readonly string[]): boolean =>
            candidates.some((candidate) => compareNoCaseSensitive(candidate, currentWord));

        // Multi-period abbreviation: three or more "letter + period" pairs.
        // Example: U.S.A.
        if (/^([a-zA-Z]\.){3,}$/.test(currentWord)) {
            return markCurrentWord();
        }

        // Words of the language that end with an exclamation mark.
        // Example: Yahoo!
        if (isListedIn(this.lang.EXCLAMATION_WORDS)) {
            return markCurrentWord();
        }

        // Abbreviations placed in front of another word, and numbered list indexes.
        // Example: Mr. Fuji / 1.
        const isPrepositive = isListedIn(this.lang.PREPOSITIVE_ABBREVIATIONS);
        const isLineIndex = /^\d+\.$/.test(currentWord);
        if (isPrepositive || isLineIndex) {
            return markCurrentWord();
        }

        // General abbreviations, judged together with the neighbouring words.
        const isKnownAbbreviation = isListedIn(this.lang.ABBREVIATIONS);
        const prevWord = this.getPrevWord(sourceCode);
        const nextWord = this.getWord(sourceCode, currentWord.length + 1);

        // Special case: Capital <ABBR>. Capital
        // Example: `I` as a sentence boundary and `I` as an abbreviation
        // > We make a good team, you and I. Did you see Albert I. Jones yesterday?
        // Related: https://github.com/azu/sentence-splitter/pull/31
        const isInitialBetweenCapitals =
            isCapitalized(prevWord) && /^\p{Uppercase_Letter}\./u.test(currentWord) && isCapitalized(nextWord);
        // A listed abbreviation is only marked when the following word is not
        // capitalized; a capitalized follower is left unmarked.
        // Example: A.M. is store.
        const isAbbreviationBeforeLowercase = isKnownAbbreviation && !isCapitalized(nextWord);

        if (isInitialBetweenCapitals || isAbbreviationBeforeLowercase) {
            markCurrentWord();
        }
    }
}