UNPKG

sentence-splitter

Version:

Split {Japanese, English} text into sentences.

github.com/textlint-rule/sentence-splitter

textlint-rule/sentence-splitter

293 lines • 10.4 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.DefaultSentenceSplitterOptions = exports.DefaultAbbrMarkerOptions = exports.SentenceSplitterSyntax = void 0; exports.split = split; exports.splitAST = splitAST; const ast_node_types_1 = require("@textlint/ast-node-types"); const SourceCode_js_1 = require("./parser/SourceCode.js"); const NewLineParser_js_1 = require("./parser/NewLineParser.js"); const SpaceParser_js_1 = require("./parser/SpaceParser.js"); const SeparatorParser_js_1 = require("./parser/SeparatorParser.js"); Object.defineProperty(exports, "DefaultSentenceSplitterOptions", { enumerable: true, get: function () { return SeparatorParser_js_1.DefaultOptions; } }); const AnyValueParser_js_1 = require("./parser/AnyValueParser.js"); const AbbrMarker_js_1 = require("./parser/AbbrMarker.js"); Object.defineProperty(exports, "DefaultAbbrMarkerOptions", { enumerable: true, get: function () { return AbbrMarker_js_1.DefaultOptions; } }); const PairMaker_js_1 = require("./parser/PairMaker.js"); const logger_js_1 = require("./logger.js"); exports.SentenceSplitterSyntax = { WhiteSpace: "WhiteSpace", Punctuation: "Punctuation", Sentence: "Sentence", Str: "Str", PairMark: "PairMark" }; class SplitParser { sentenceNodeList = []; results = []; source; constructor(text) { this.source = new SourceCode_js_1.SourceCode(text); } get current() { return this.sentenceNodeList[this.sentenceNodeList.length - 1]; } pushNodeToCurrent(node) { const current = this.current; if (current) { current.children.push(node); } else { // Under the root this.results.push(node); } } // open with ParentNode open(parentNode) { this.sentenceNodeList.push(parentNode); } isOpened() { return this.sentenceNodeList.length > 0; } nextLine(parser) { const { value, startPosition, endPosition } = this.source.seekNext(parser); this.pushNodeToCurrent(createWhiteSpaceNode(value, startPosition, endPosition)); return endPosition; } nextSpace(parser) { const { value, startPosition, 
endPosition } = this.source.seekNext(parser); this.pushNodeToCurrent(createWhiteSpaceNode(value, startPosition, endPosition)); } nextValue(parser) { const { value, startPosition, endPosition } = this.source.seekNext(parser); this.pushNodeToCurrent(createTextNode(value, startPosition, endPosition)); } // close current Node and remove it from list close(parser) { const { value, startPosition, endPosition } = this.source.seekNext(parser); // rest of the value is Punctuation // Except for the case of the last character of the value is a space // See "space-first-and-space-last" test case if (startPosition.offset !== endPosition.offset && !/^\s+$/.test(value)) { this.pushNodeToCurrent(createPunctuationNode(value, startPosition, endPosition)); } const currentNode = this.sentenceNodeList.pop(); if (!currentNode) { return; } if (currentNode.children.length === 0) { return; } const firstChildNode = currentNode.children[0]; const endNow = this.source.now(); // update Sentence node's location and range const rawValue = this.source.sliceRange(firstChildNode.range[0], endNow.offset); const contexts = this.source.consumedContexts .sort((a, b) => { return a.range[0] - b.range[0]; }) .map((context) => { return { type: "PairMark", pairMark: context.pairMark, range: context.range, loc: context.loc }; }); this.results.push({ ...currentNode, loc: { start: firstChildNode.loc.start, end: { line: endNow.line, column: endNow.column } }, range: [firstChildNode.range[0], endNow.offset], raw: rawValue, contexts: contexts }); } toList() { return this.results; } } const createParsers = (options = {}) => { const newLine = new NewLineParser_js_1.NewLineParser(); const space = new SpaceParser_js_1.SpaceParser(); const separator = new SeparatorParser_js_1.SeparatorParser(options.SeparatorParser); const abbrMarker = new AbbrMarker_js_1.AbbrMarker(options.AbbrMarker); const pairMaker = new PairMaker_js_1.PairMaker(); // anyValueParser has multiple parser and markers. 
// anyValueParse eat any value if it reaches to other value. const anyValueParser = new AnyValueParser_js_1.AnyValueParser({ parsers: [newLine, separator], markers: [abbrMarker, pairMaker] }); return { newLine, space, separator, abbrMarker, anyValueParser }; }; /** * split `text` into Sentence nodes */ function split(text, options) { const { newLine, space, separator, anyValueParser } = createParsers(options); const splitParser = new SplitParser(text); const sourceCode = splitParser.source; while (!sourceCode.hasEnd) { if (newLine.test(sourceCode)) { splitParser.nextLine(newLine); } else if (space.test(sourceCode)) { splitParser.nextSpace(space); } else if (separator.test(sourceCode)) { splitParser.close(separator); } else { if (!splitParser.isOpened()) { splitParser.open(createEmptySentenceNode()); } splitParser.nextValue(anyValueParser); } } splitParser.close(space); return splitParser.toList(); } /** * Convert Paragraph Node to Paragraph node that convert children to Sentence node * This Node is based on TxtAST. 
* See https://github.com/textlint/textlint/blob/master/docs/txtnode.md */ function splitAST(paragraphNode, options) { const { newLine, space, separator, anyValueParser } = createParsers(options); const splitParser = new SplitParser(paragraphNode); const sourceCode = splitParser.source; while (!sourceCode.hasEnd) { const currentNode = sourceCode.readNode(); if (!currentNode) { break; } if (currentNode.type === ast_node_types_1.ASTNodeTypes.Str) { if (space.test(sourceCode)) { (0, logger_js_1.nodeLog)("space", sourceCode); splitParser.nextSpace(space); } else if (separator.test(sourceCode)) { (0, logger_js_1.nodeLog)("separator", sourceCode); splitParser.close(separator); } else if (newLine.test(sourceCode)) { (0, logger_js_1.nodeLog)("newline", sourceCode); splitParser.nextLine(newLine); } else { if (!splitParser.isOpened()) { (0, logger_js_1.nodeLog)("open -> createEmptySentenceNode()"); splitParser.open(createEmptySentenceNode()); } (0, logger_js_1.nodeLog)("other str value", sourceCode); splitParser.nextValue(anyValueParser); } } else if (currentNode.type === ast_node_types_1.ASTNodeTypes.Break) { (0, logger_js_1.nodeLog)("break", sourceCode); // Break // https://github.com/azu/sentence-splitter/issues/23 splitParser.pushNodeToCurrent(currentNode); sourceCode.peekNode(currentNode); } else { if (!splitParser.isOpened()) { (0, logger_js_1.nodeLog)("open -> createEmptySentenceNode()"); splitParser.open(createEmptySentenceNode()); } (0, logger_js_1.nodeLog)("other node", sourceCode); splitParser.pushNodeToCurrent(currentNode); sourceCode.peekNode(currentNode); } } (0, logger_js_1.nodeLog)("end separator"); // It follow some text that is not ended with period. // TODO: space is correct? 
splitParser.close(space); return { ...paragraphNode, children: splitParser.toList() }; } /** * WhiteSpace is space or linebreak */ function createWhiteSpaceNode(text, startPosition, endPosition) { return { type: exports.SentenceSplitterSyntax.WhiteSpace, raw: text, value: text, loc: { start: { line: startPosition.line, column: startPosition.column }, end: { line: endPosition.line, column: endPosition.column } }, range: [startPosition.offset, endPosition.offset] }; } function createPunctuationNode(text, startPosition, endPosition) { return { type: exports.SentenceSplitterSyntax.Punctuation, raw: text, value: text, loc: { start: { line: startPosition.line, column: startPosition.column }, end: { line: endPosition.line, column: endPosition.column } }, range: [startPosition.offset, endPosition.offset] }; } function createTextNode(text, startPosition, endPosition) { return { type: exports.SentenceSplitterSyntax.Str, raw: text, value: text, loc: { start: { line: startPosition.line, column: startPosition.column }, end: { line: endPosition.line, column: endPosition.column } }, range: [startPosition.offset, endPosition.offset] }; } function createEmptySentenceNode() { return { type: exports.SentenceSplitterSyntax.Sentence, raw: "", loc: { start: { column: NaN, line: NaN }, end: { column: NaN, line: NaN } }, range: [NaN, NaN], children: [], contexts: [] }; } //# sourceMappingURL=sentence-splitter.js.map