// sentence-splitter
// Version:
// Split {Japanese, English} text into sentences.
// 394 lines (368 loc) • 12.5 kB
// text/typescript
import type { TxtParagraphNode, TxtParentNode, TxtStrNode, TxtTextNode } from "@textlint/ast-node-types";
import { ASTNodeTypes } from "@textlint/ast-node-types";
import { SourceCode } from "./parser/SourceCode.js";
import { AbstractParser } from "./parser/AbstractParser.js";
import { NewLineParser } from "./parser/NewLineParser.js";
import { SpaceParser } from "./parser/SpaceParser.js";
import {
DefaultOptions as DefaultSentenceSplitterOptions,
SeparatorParser,
SeparatorParserOptions
} from "./parser/SeparatorParser.js";
import { AnyValueParser } from "./parser/AnyValueParser.js";
import { AbbrMarker, AbbrMarkerOptions, DefaultOptions as DefaultAbbrMarkerOptions } from "./parser/AbbrMarker.js";
import { PairMaker } from "./parser/PairMaker.js";
import { nodeLog } from "./logger.js";
/**
 * Names of the node/context types used by this module.
 *
 * `WhiteSpace`, `Punctuation`, `Str` and `Sentence` are used as the `type`
 * of the nodes built by the factory functions in this file. The object is
 * declared `as const`, so each value keeps its string-literal type.
 */
export const SentenceSplitterSyntax = {
WhiteSpace: "WhiteSpace",
Punctuation: "Punctuation",
Sentence: "Sentence",
Str: "Str",
PairMark: "PairMark"
} as const;
/**
 * One context entry stored in `TxtSentenceNode.contexts`.
 *
 * - `range`: start and end index of the context
 * - `loc`: line/column of the start and end of the context
 *
 * NOTE(review): `SplitParser.close` also puts a `pairMark` property on the
 * objects it builds for `contexts`; that property is not declared here.
 */
export type SentencePairMarkContext = {
type: "PairMark";
range: readonly [startIndex: number, endIndex: number];
loc: {
start: {
line: number;
column: number;
};
end: {
line: number;
column: number;
};
};
};
// Node kinds that may appear as a direct child of a Sentence node.
// A Sentence node never contains another Sentence node: nesting is not allowed.
export type TxtSentenceNodeChildren =
| TxtParentNode["children"][number]
| TxtWhiteSpaceNode
| TxtPunctuationNode
| TxtStrNode;
/**
 * A Sentence node: a `TxtParentNode` whose `type` is fixed to "Sentence",
 * whose `children` are restricted to `TxtSentenceNodeChildren`, and which
 * additionally carries `contexts`.
 */
export type TxtSentenceNode = Omit<TxtParentNode, "type" | "children"> & {
readonly type: "Sentence";
/**
* Context information attached to the sentence.
* - "PairMark": pair mark information
*/
readonly contexts: SentencePairMarkContext[];
children: TxtSentenceNodeChildren[];
};
/** A `TxtTextNode` whose `type` is replaced by the literal "WhiteSpace". */
export type TxtWhiteSpaceNode = Omit<TxtTextNode, "type"> & {
readonly type: "WhiteSpace";
};
/** A `TxtTextNode` whose `type` is replaced by the literal "Punctuation". */
export type TxtPunctuationNode = Omit<TxtTextNode, "type"> & {
readonly type: "Punctuation";
};
/** Union of the node types this module can produce: Sentence, WhiteSpace, Punctuation and Str. */
export type SentenceSplitterTxtNode = TxtSentenceNode | TxtWhiteSpaceNode | TxtPunctuationNode | TxtStrNode;
/** Union of the string-literal values of `SentenceSplitterSyntax`. */
export type SentenceSplitterTxtNodeType = (typeof SentenceSplitterSyntax)[keyof typeof SentenceSplitterSyntax];
/** A child of a split parent node: either an ordinary `TxtParentNode` child or a node produced by this module. */
export type TxtParentNodeWithSentenceNodeContent = TxtParentNode["children"][number] | SentenceSplitterTxtNode;
/**
 * A `TxtParentNode` whose `children` may also contain the nodes produced by
 * this module. This is the return type of `splitAST`; `split` returns its
 * `children` array type.
 */
export type TxtParentNodeWithSentenceNode = Omit<TxtParentNode, "children"> & {
children: TxtParentNodeWithSentenceNodeContent[];
};
/**
 * Stateful helper that advances a `SourceCode` cursor and collects the nodes
 * produced along the way.
 *
 * Nodes are appended to the currently open Sentence node when there is one,
 * otherwise directly to the top-level result list. `close()` finalizes the
 * open Sentence node and moves it to the result list.
 */
class SplitParser {
    /** Stack of open Sentence nodes; the last element is the current one. */
    private sentenceNodeList: TxtSentenceNode[] = [];
    /** Top-level nodes collected so far: finished sentences and nodes seen outside a sentence. */
    private results: TxtParentNodeWithSentenceNode["children"] = [];
    public source: SourceCode;

    /**
     * @param text plain text or a parent node; handed to `SourceCode` unchanged
     */
    constructor(text: string | TxtParentNode) {
        this.source = new SourceCode(text);
    }

    /** The Sentence node currently being filled, or `undefined` when none is open. */
    get current(): TxtSentenceNode | undefined {
        return this.sentenceNodeList[this.sentenceNodeList.length - 1];
    }

    /**
     * Append `node` to the open Sentence node, or to the top-level results
     * when no Sentence node is open.
     */
    pushNodeToCurrent(node: TxtSentenceNodeChildren) {
        const current = this.current;
        if (current) {
            current.children.push(node);
        } else {
            // Under the root
            this.results.push(node);
        }
    }

    /** Make `parentNode` the current Sentence node. */
    open(parentNode: TxtSentenceNode) {
        this.sentenceNodeList.push(parentNode);
    }

    /** Whether at least one Sentence node is open. */
    isOpened(): boolean {
        return this.sentenceNodeList.length > 0;
    }

    /**
     * Consume what `parser` matches and record it as a WhiteSpace node.
     * @returns the position reached after consuming
     */
    nextLine(parser: AbstractParser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        this.pushNodeToCurrent(createWhiteSpaceNode(value, startPosition, endPosition));
        return endPosition;
    }

    /** Consume what `parser` matches and record it as a WhiteSpace node. */
    nextSpace(parser: AbstractParser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        this.pushNodeToCurrent(createWhiteSpaceNode(value, startPosition, endPosition));
    }

    /** Consume what `parser` matches and record it as a Str node. */
    nextValue(parser: AbstractParser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        this.pushNodeToCurrent(createTextNode(value, startPosition, endPosition));
    }

    /**
     * Close the current Sentence node and remove it from the open list.
     *
     * First consumes what `parser` matches; a non-empty match that is not
     * whitespace-only is recorded as a Punctuation node. The current Sentence
     * node is then popped. If there is none, or it has no children, nothing
     * more is emitted. Otherwise the sentence is pushed to the results with
     * its `range`, `loc` and `raw` spanning from its first child to the
     * current source position, plus the contexts reported by the source.
     */
    close(parser: AbstractParser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        // rest of the value is Punctuation
        // Except for the case of the last character of the value is a space
        // See "space-first-and-space-last" test case
        if (startPosition.offset !== endPosition.offset && !/^\s+$/.test(value)) {
            this.pushNodeToCurrent(createPunctuationNode(value, startPosition, endPosition));
        }
        const currentNode = this.sentenceNodeList.pop();
        if (!currentNode) {
            return;
        }
        if (currentNode.children.length === 0) {
            return;
        }
        const firstChildNode = currentNode.children[0];
        const endNow = this.source.now();
        // update Sentence node's location and range
        const rawValue = this.source.sliceRange(firstChildNode.range[0], endNow.offset);
        // Sort a copy: Array.prototype.sort works in place, so sorting the array
        // handed out by the source directly would reorder it as a side effect.
        const contexts = this.source.consumedContexts
            .slice()
            .sort((a, b) => {
                return a.range[0] - b.range[0];
            })
            .map((context) => {
                return {
                    type: "PairMark" as const,
                    pairMark: context.pairMark,
                    range: context.range,
                    loc: context.loc
                };
            });
        this.results.push({
            ...currentNode,
            loc: {
                start: firstChildNode.loc.start,
                end: {
                    line: endNow.line,
                    column: endNow.column
                }
            },
            range: [firstChildNode.range[0], endNow.offset],
            raw: rawValue,
            contexts: contexts
        });
    }

    /** The top-level nodes collected so far (the internal array itself, not a copy). */
    toList() {
        return this.results;
    }
}
// Re-export the `DefaultOptions` of AbbrMarker and SeparatorParser under the distinct names they were imported as.
export { DefaultAbbrMarkerOptions, DefaultSentenceSplitterOptions };
/**
 * Options accepted by `split` and `splitAST`.
 * Each field is passed unchanged to the constructor of the parser it is named after.
 */
export interface splitOptions {
/**
* Separator & AbbrMarker options
*/
SeparatorParser?: SeparatorParserOptions;
AbbrMarker?: AbbrMarkerOptions;
}
/**
 * Instantiate the parsers and markers used by `split` and `splitAST`.
 *
 * @param options optional settings for the separator parser and abbreviation marker
 * @returns the newline, space and separator parsers, the abbreviation marker,
 *          and the catch-all value parser built on top of them
 */
const createParsers = (options: splitOptions = {}) => {
    const { SeparatorParser: separatorOptions, AbbrMarker: abbrMarkerOptions } = options;
    const newLine = new NewLineParser();
    const space = new SpaceParser();
    const separator = new SeparatorParser(separatorOptions);
    const abbrMarker = new AbbrMarker(abbrMarkerOptions);
    // The catch-all parser is configured with the newline and separator parsers
    // plus the abbreviation and pair markers.
    // Per the original note, it consumes any value until it reaches one of those other values.
    const anyValueParser = new AnyValueParser({
        parsers: [newLine, separator],
        markers: [abbrMarker, new PairMaker()]
    });
    return { newLine, space, separator, abbrMarker, anyValueParser };
};
/**
 * Split plain `text` into a flat list of nodes in which sentences are
 * wrapped in Sentence nodes.
 *
 * @param text the text to split
 * @param options optional separator / abbreviation settings
 * @returns the top-level nodes collected while scanning `text`
 */
export function split(text: string, options?: splitOptions): TxtParentNodeWithSentenceNode["children"] {
    const parsers = createParsers(options);
    const splitParser = new SplitParser(text);
    const source = splitParser.source;
    while (!source.hasEnd) {
        // Checked in this order: newline, space, separator, anything else.
        if (parsers.newLine.test(source)) {
            splitParser.nextLine(parsers.newLine);
            continue;
        }
        if (parsers.space.test(source)) {
            splitParser.nextSpace(parsers.space);
            continue;
        }
        if (parsers.separator.test(source)) {
            splitParser.close(parsers.separator);
            continue;
        }
        // Any other value belongs to a sentence; open one if none is open yet.
        if (!splitParser.isOpened()) {
            splitParser.open(createEmptySentenceNode());
        }
        splitParser.nextValue(parsers.anyValueParser);
    }
    // Close whatever sentence is still open at the end of the text.
    splitParser.close(parsers.space);
    return splitParser.toList();
}
/**
 * Split the content of a Paragraph node into Sentence nodes.
 * The returned node is a shallow copy of `paragraphNode` whose `children`
 * are replaced by the split result. Nodes are based on TxtAST.
 * See https://github.com/textlint/textlint/blob/master/docs/txtnode.md
 *
 * @param paragraphNode the paragraph to split
 * @param options optional separator / abbreviation settings
 */
export function splitAST(paragraphNode: TxtParagraphNode, options?: splitOptions): TxtParentNodeWithSentenceNode {
    const parsers = createParsers(options);
    const splitParser = new SplitParser(paragraphNode);
    const source = splitParser.source;
    // Start a new Sentence node unless one is already open.
    const openSentenceIfNeeded = () => {
        if (splitParser.isOpened()) {
            return;
        }
        nodeLog("open -> createEmptySentenceNode()");
        splitParser.open(createEmptySentenceNode());
    };
    while (!source.hasEnd) {
        const node = source.readNode();
        if (!node) {
            break;
        }
        if (node.type !== ASTNodeTypes.Str) {
            if (node.type === ASTNodeTypes.Break) {
                // Break is pushed as-is and does not open a sentence.
                // https://github.com/azu/sentence-splitter/issues/23
                nodeLog("break", source);
            } else {
                openSentenceIfNeeded();
                nodeLog("other node", source);
            }
            splitParser.pushNodeToCurrent(node);
            source.peekNode(node);
            continue;
        }
        // Str node: checked in this order: space, separator, newline, anything else.
        if (parsers.space.test(source)) {
            nodeLog("space", source);
            splitParser.nextSpace(parsers.space);
            continue;
        }
        if (parsers.separator.test(source)) {
            nodeLog("separator", source);
            splitParser.close(parsers.separator);
            continue;
        }
        if (parsers.newLine.test(source)) {
            nodeLog("newline", source);
            splitParser.nextLine(parsers.newLine);
            continue;
        }
        openSentenceIfNeeded();
        nodeLog("other str value", source);
        splitParser.nextValue(parsers.anyValueParser);
    }
    nodeLog("end separator");
    // Close the trailing text that did not end with a separator.
    // TODO: is the space parser the right one to close with?
    splitParser.close(parsers.space);
    return {
        ...paragraphNode,
        children: splitParser.toList()
    };
}
/**
 * Build a WhiteSpace node (a space or a line break) for `text`.
 *
 * @param text used for both `raw` and `value`
 * @param startPosition where the text starts; supplies `loc.start` and `range[0]`
 * @param endPosition where the text ends; supplies `loc.end` and `range[1]`
 */
function createWhiteSpaceNode(
    text: string,
    startPosition: { line: number; column: number; offset: number },
    endPosition: { line: number; column: number; offset: number }
) {
    const range = [startPosition.offset, endPosition.offset] as const;
    const loc = {
        start: { line: startPosition.line, column: startPosition.column },
        end: { line: endPosition.line, column: endPosition.column }
    };
    return {
        type: SentenceSplitterSyntax.WhiteSpace,
        raw: text,
        value: text,
        loc,
        range
    };
}
/**
 * Build a Punctuation node for `text`.
 *
 * @param text used for both `raw` and `value`
 * @param startPosition where the text starts; supplies `loc.start` and `range[0]`
 * @param endPosition where the text ends; supplies `loc.end` and `range[1]`
 */
function createPunctuationNode(
    text: string,
    startPosition: { line: number; column: number; offset: number },
    endPosition: { line: number; column: number; offset: number }
): TxtPunctuationNode {
    const { line: startLine, column: startColumn, offset: startOffset } = startPosition;
    const { line: endLine, column: endColumn, offset: endOffset } = endPosition;
    return {
        type: SentenceSplitterSyntax.Punctuation,
        raw: text,
        value: text,
        loc: {
            start: { line: startLine, column: startColumn },
            end: { line: endLine, column: endColumn }
        },
        range: [startOffset, endOffset]
    };
}
/**
 * Build a Str node for `text`.
 *
 * @param text used for both `raw` and `value`
 * @param startPosition where the text starts; supplies `loc.start` and `range[0]`
 * @param endPosition where the text ends; supplies `loc.end` and `range[1]`
 */
function createTextNode(
    text: string,
    startPosition: { line: number; column: number; offset: number },
    endPosition: { line: number; column: number; offset: number }
): TxtStrNode {
    // Copy only line/column into loc; the offsets go into range.
    const start = { line: startPosition.line, column: startPosition.column };
    const end = { line: endPosition.line, column: endPosition.column };
    return {
        type: SentenceSplitterSyntax.Str,
        raw: text,
        value: text,
        loc: { start, end },
        range: [startPosition.offset, endPosition.offset]
    };
}
/**
 * Build a Sentence node with no children and no contexts.
 * Its `raw` is empty and every `loc` / `range` number is `NaN`, a placeholder
 * until real values are available.
 */
function createEmptySentenceNode(): TxtSentenceNode {
    // A fresh object per call so start and end never share a reference.
    const placeholderPoint = () => ({ column: NaN, line: NaN });
    return {
        type: SentenceSplitterSyntax.Sentence,
        raw: "",
        loc: {
            start: placeholderPoint(),
            end: placeholderPoint()
        },
        range: [NaN, NaN],
        children: [],
        contexts: []
    };
}