sentence-splitter
Version:
Split {Japanese, English} text into sentences.
293 lines • 10.4 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.DefaultSentenceSplitterOptions = exports.DefaultAbbrMarkerOptions = exports.SentenceSplitterSyntax = void 0;
exports.split = split;
exports.splitAST = splitAST;
const ast_node_types_1 = require("@textlint/ast-node-types");
const SourceCode_js_1 = require("./parser/SourceCode.js");
const NewLineParser_js_1 = require("./parser/NewLineParser.js");
const SpaceParser_js_1 = require("./parser/SpaceParser.js");
const SeparatorParser_js_1 = require("./parser/SeparatorParser.js");
Object.defineProperty(exports, "DefaultSentenceSplitterOptions", { enumerable: true, get: function () { return SeparatorParser_js_1.DefaultOptions; } });
const AnyValueParser_js_1 = require("./parser/AnyValueParser.js");
const AbbrMarker_js_1 = require("./parser/AbbrMarker.js");
Object.defineProperty(exports, "DefaultAbbrMarkerOptions", { enumerable: true, get: function () { return AbbrMarker_js_1.DefaultOptions; } });
const PairMaker_js_1 = require("./parser/PairMaker.js");
const logger_js_1 = require("./logger.js");
/**
 * Names of the node types produced by this module.
 * Each key maps to the identical string, which is used as the `type` value
 * of the generated nodes (WhiteSpace, Punctuation, Sentence, Str) and of the
 * entries collected in a Sentence node's `contexts` (PairMark).
 */
exports.SentenceSplitterSyntax = {
WhiteSpace: "WhiteSpace",
Punctuation: "Punctuation",
Sentence: "Sentence",
Str: "Str",
PairMark: "PairMark"
};
/**
 * Collects nodes while the source is consumed and groups them into
 * Sentence nodes.
 *
 * Nodes are appended to the currently opened parent node when there is one,
 * otherwise directly to the top-level result list. `close()` finalizes the
 * opened parent node and moves it to the result list.
 */
class SplitParser {
    // Stack of opened parent nodes; the last entry is the current one.
    sentenceNodeList = [];
    // Top-level nodes in the order they were produced.
    results = [];
    // SourceCode instance wrapping the value given to the constructor.
    source;
    /**
     * @param text value handed to `SourceCode` as-is
     */
    constructor(text) {
        this.source = new SourceCode_js_1.SourceCode(text);
    }
    /**
     * The most recently opened parent node, or `undefined` when nothing is open.
     */
    get current() {
        return this.sentenceNodeList[this.sentenceNodeList.length - 1];
    }
    /**
     * Append `node` to the current parent node, or to the top-level results
     * when no parent node is open.
     */
    pushNodeToCurrent(node) {
        const current = this.current;
        if (current) {
            current.children.push(node);
        }
        else {
            // Under the root
            this.results.push(node);
        }
    }
    /**
     * Open `parentNode`: it becomes the target of `pushNodeToCurrent()`.
     */
    open(parentNode) {
        this.sentenceNodeList.push(parentNode);
    }
    /**
     * @returns {boolean} true while at least one parent node is open
     */
    isOpened() {
        return this.sentenceNodeList.length > 0;
    }
    /**
     * Seek the source with `parser` and record the consumed value as a WhiteSpace node.
     * @returns the end position reported by the source
     */
    nextLine(parser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        this.pushNodeToCurrent(createWhiteSpaceNode(value, startPosition, endPosition));
        return endPosition;
    }
    /**
     * Seek the source with `parser` and record the consumed value as a WhiteSpace node.
     */
    nextSpace(parser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        this.pushNodeToCurrent(createWhiteSpaceNode(value, startPosition, endPosition));
    }
    /**
     * Seek the source with `parser` and record the consumed value as a Str node.
     */
    nextValue(parser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        this.pushNodeToCurrent(createTextNode(value, startPosition, endPosition));
    }
    /**
     * Close the current parent node and remove it from the stack.
     *
     * The value consumed by `parser` is recorded as a Punctuation node unless
     * nothing was consumed or the consumed value is whitespace only
     * (see the "space-first-and-space-last" test case).
     * A parent node without children is dropped; otherwise a copy of it with
     * `loc`, `range`, `raw` and `contexts` filled in is pushed to the results.
     */
    close(parser) {
        const { value, startPosition, endPosition } = this.source.seekNext(parser);
        if (startPosition.offset !== endPosition.offset && !/^\s+$/.test(value)) {
            this.pushNodeToCurrent(createPunctuationNode(value, startPosition, endPosition));
        }
        const currentNode = this.sentenceNodeList.pop();
        // Nothing was open, or the opened node never received a child.
        if (!currentNode || currentNode.children.length === 0) {
            return;
        }
        const firstChildNode = currentNode.children[0];
        const endNow = this.source.now();
        // The Sentence spans from its first child to the current source position.
        const rawValue = this.source.sliceRange(firstChildNode.range[0], endNow.offset);
        // Sort a copy: Array.prototype.sort works in place and would otherwise
        // reorder the list owned by the source object.
        const contexts = [...this.source.consumedContexts]
            .sort((a, b) => a.range[0] - b.range[0])
            .map((context) => ({
                type: "PairMark",
                pairMark: context.pairMark,
                range: context.range,
                loc: context.loc
            }));
        this.results.push({
            ...currentNode,
            loc: {
                start: firstChildNode.loc.start,
                end: {
                    line: endNow.line,
                    column: endNow.column
                }
            },
            range: [firstChildNode.range[0], endNow.offset],
            raw: rawValue,
            contexts: contexts
        });
    }
    /**
     * @returns the top-level result list (the live array, not a copy)
     */
    toList() {
        return this.results;
    }
}
/**
 * Build the parser set used by `split` and `splitAST`.
 *
 * `options.SeparatorParser` and `options.AbbrMarker` are forwarded to the
 * respective constructors. The PairMaker is only wired into the
 * AnyValueParser and is not part of the returned object.
 */
const createParsers = (options = {}) => {
    const lineBreakParser = new NewLineParser_js_1.NewLineParser();
    const whiteSpaceParser = new SpaceParser_js_1.SpaceParser();
    const separatorParser = new SeparatorParser_js_1.SeparatorParser(options.SeparatorParser);
    const abbreviationMarker = new AbbrMarker_js_1.AbbrMarker(options.AbbrMarker);
    const pairMarker = new PairMaker_js_1.PairMaker();
    // The AnyValueParser combines several parsers and markers:
    // it receives the new-line and separator parsers plus both markers.
    const valueParser = new AnyValueParser_js_1.AnyValueParser({
        parsers: [lineBreakParser, separatorParser],
        markers: [abbreviationMarker, pairMarker]
    });
    const parserSet = {
        newLine: lineBreakParser,
        space: whiteSpaceParser,
        separator: separatorParser,
        abbrMarker: abbreviationMarker,
        anyValueParser: valueParser
    };
    return parserSet;
};
/**
 * Split `text` into a list of nodes, grouping values into Sentence nodes.
 *
 * The source is consumed step by step: new lines and spaces are recorded as
 * they come, a separator closes the current sentence, and any other value is
 * added to a sentence that is opened on demand. After the loop the parser is
 * closed once more so that trailing text without a separator is kept.
 */
function split(text, options) {
    const {
        newLine: lineBreakParser,
        space: spaceParser,
        separator: separatorParser,
        anyValueParser: valueParser
    } = createParsers(options);
    const splitParser = new SplitParser(text);
    const source = splitParser.source;
    while (!source.hasEnd) {
        if (lineBreakParser.test(source)) {
            splitParser.nextLine(lineBreakParser);
            continue;
        }
        if (spaceParser.test(source)) {
            splitParser.nextSpace(spaceParser);
            continue;
        }
        if (separatorParser.test(source)) {
            splitParser.close(separatorParser);
            continue;
        }
        // Plain value: make sure a sentence is open before adding it.
        if (!splitParser.isOpened()) {
            splitParser.open(createEmptySentenceNode());
        }
        splitParser.nextValue(valueParser);
    }
    splitParser.close(spaceParser);
    return splitParser.toList();
}
/**
 * Convert a Paragraph node into a Paragraph node whose children are the
 * nodes produced by the split parser (Sentence nodes and the nodes between them).
 * The node is based on TxtAST.
 * See https://github.com/textlint/textlint/blob/master/docs/txtnode.md
 *
 * Returns a shallow copy of `paragraphNode` with `children` replaced.
 */
function splitAST(paragraphNode, options) {
    const {
        newLine: lineBreakParser,
        space: spaceParser,
        separator: separatorParser,
        anyValueParser: valueParser
    } = createParsers(options);
    const splitParser = new SplitParser(paragraphNode);
    const source = splitParser.source;
    const { ASTNodeTypes } = ast_node_types_1;
    const log = (...args) => (0, logger_js_1.nodeLog)(...args);
    // Open a fresh Sentence node unless one is already open.
    const ensureSentenceOpened = () => {
        if (splitParser.isOpened()) {
            return;
        }
        log("open -> createEmptySentenceNode()");
        splitParser.open(createEmptySentenceNode());
    };
    // Attach a non-Str node as it is and let the source move on from it.
    const appendNode = (node) => {
        splitParser.pushNodeToCurrent(node);
        source.peekNode(node);
    };
    while (!source.hasEnd) {
        const node = source.readNode();
        if (!node) {
            break;
        }
        if (node.type !== ASTNodeTypes.Str) {
            if (node.type === ASTNodeTypes.Break) {
                // Break nodes are attached without opening a sentence.
                // https://github.com/azu/sentence-splitter/issues/23
                log("break", source);
            }
            else {
                ensureSentenceOpened();
                log("other node", source);
            }
            appendNode(node);
            continue;
        }
        // Str node: decide by what the source currently points at.
        if (spaceParser.test(source)) {
            log("space", source);
            splitParser.nextSpace(spaceParser);
            continue;
        }
        if (separatorParser.test(source)) {
            log("separator", source);
            splitParser.close(separatorParser);
            continue;
        }
        if (lineBreakParser.test(source)) {
            log("newline", source);
            splitParser.nextLine(lineBreakParser);
            continue;
        }
        ensureSentenceOpened();
        log("other str value", source);
        splitParser.nextValue(valueParser);
    }
    log("end separator");
    // Text that does not end with a period still has to be closed.
    // TODO: space is correct?
    splitParser.close(spaceParser);
    const children = splitParser.toList();
    return { ...paragraphNode, children };
}
/**
 * Create a WhiteSpace node (a space or a line break).
 *
 * `text` is used for both `raw` and `value`; `loc` copies line/column from
 * the two positions and `range` is built from their offsets.
 */
function createWhiteSpaceNode(text, startPosition, endPosition) {
    const type = exports.SentenceSplitterSyntax.WhiteSpace;
    const { line: startLine, column: startColumn, offset: startOffset } = startPosition;
    const { line: endLine, column: endColumn, offset: endOffset } = endPosition;
    const start = { line: startLine, column: startColumn };
    const end = { line: endLine, column: endColumn };
    return {
        type,
        raw: text,
        value: text,
        loc: { start, end },
        range: [startOffset, endOffset]
    };
}
/**
 * Create a Punctuation node.
 *
 * `text` is used for both `raw` and `value`; `loc` copies line/column from
 * the two positions and `range` is built from their offsets.
 */
function createPunctuationNode(text, startPosition, endPosition) {
    // Keep only line/column so that `offset` does not leak into `loc`.
    const toPoint = (position) => ({ line: position.line, column: position.column });
    const punctuationNode = {
        type: exports.SentenceSplitterSyntax.Punctuation,
        raw: text,
        value: text,
        loc: {
            start: toPoint(startPosition),
            end: toPoint(endPosition)
        },
        range: [startPosition.offset, endPosition.offset]
    };
    return punctuationNode;
}
/**
 * Create a Str node.
 *
 * `text` is used for both `raw` and `value`; `loc` copies line/column from
 * the two positions and `range` is built from their offsets.
 */
function createTextNode(text, startPosition, endPosition) {
    const type = exports.SentenceSplitterSyntax.Str;
    const loc = {
        start: { line: startPosition.line, column: startPosition.column },
        end: { line: endPosition.line, column: endPosition.column }
    };
    const range = [startPosition.offset, endPosition.offset];
    return { type, raw: text, value: text, loc, range };
}
/**
 * Create a Sentence node without content.
 *
 * Every position and range value is NaN and `raw` is empty; each call
 * returns new `children` and `contexts` arrays.
 */
function createEmptySentenceNode() {
    const unknownPosition = () => ({ column: Number.NaN, line: Number.NaN });
    const type = exports.SentenceSplitterSyntax.Sentence;
    return {
        type,
        raw: "",
        loc: { start: unknownPosition(), end: unknownPosition() },
        range: [Number.NaN, Number.NaN],
        children: [],
        contexts: []
    };
}
//# sourceMappingURL=sentence-splitter.js.map