sentence-splitter
Version:
split {japanese, english} text into sentences.
69 lines (62 loc) • 1.87 kB
text/typescript
import { SourceCode } from "./SourceCode.js";
import { AbstractParser } from "./AbstractParser.js";
export const DefaultOptions = {
separatorCharacters: [
".", // period
".", // (ja) zenkaku-period
"。", // (ja) 句点
"?", // question mark
"!", // exclamation mark
"?", // (ja) zenkaku question mark
"!" // (ja) zenkaku exclamation mark
]
};
export interface SeparatorParserOptions {
/**
* Recognize each characters as separator
* Example [".", "!", "?"]
*/
separatorCharacters?: string[];
}
/**
* Separator parser
*/
export class SeparatorParser implements AbstractParser {
private separatorCharacters: string[];
constructor(readonly options?: SeparatorParserOptions) {
this.separatorCharacters =
options && options.separatorCharacters ? options.separatorCharacters : DefaultOptions.separatorCharacters;
}
test(sourceCode: SourceCode) {
if (sourceCode.isInContext()) {
return false;
}
if (sourceCode.isInContextRange()) {
return false;
}
const firstChar = sourceCode.read();
const nextChar = sourceCode.read(1);
if (!firstChar) {
return false;
}
if (!this.separatorCharacters.includes(firstChar)) {
return false;
}
// Need space after period
// Example: "This is a pen. This is not a pen."
// It will avoid false-position like `1.23`
if (firstChar === ".") {
if (nextChar) {
return /[\s\t\r\n]/.test(nextChar);
} else {
return true;
}
}
return true;
}
seek(sourceCode: SourceCode): void {
while (this.test(sourceCode)) {
sourceCode.peek();
}
}
}