// sbd-fork
// Version:
// Split text into sentences with Sentence Boundary Detection (SBD).
// 104 lines (103 loc) • 5.61 kB
// JavaScript
;
/*jshint node:true, laxcomma:true */
/*global describe:true, it:true */
// Module-interop helper: reuses `this.__importDefault` when one is already
// present; otherwise returns the loaded module untouched if it is flagged
// `__esModule`, or wraps it as `{ default: <module> }` so callers can always
// read `.default`.
var __importDefault = (this && this.__importDefault) || function (loaded) {
    if (loaded && loaded.__esModule) {
        return loaded;
    }
    return { "default": loaded };
};
Object.defineProperty(exports, "__esModule", { value: true });
var assert_1 = __importDefault(require("assert"));
var tokenizer_1 = __importDefault(require("../lib/tokenizer"));
describe('Abbreviations in sentences', function () {
// Suite: a dotted abbreviation ("I.C.T") in the middle of a sentence must not
// be counted as a sentence boundary.
describe('Skip dotted abbreviations', function () {
    var entry = "Lorem ipsum, dolor sed amat frequentor minimus In I.C.T we have multiple challenges! There should only be two sentences.";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry);
    it("should get 2 sentences", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 2);
    });
});
// Suite: the lower-case dotted abbreviations "a.m." and "p.m." must not split
// the text. The fixture's own wording mentions "two sentences", but the
// expected result asserted below is a single sentence.
describe('Skip dotted abbreviations (B)', function () {
    var entry = "From amat frequentor minimus hello there at 8 a.m. there p.m. should only be two sentences.";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry);
    it("should get 1 sentence", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 1);
    });
});
// Suite: "p.m." followed by a capitalised word ("Saturday") must still be
// treated as an abbreviation, not as the end of the sentence.
describe('Skip dotted abbreviations (C)', function () {
    var entry = "The school, called Booker T and Stevie Ray\'s Wrestling and Mixed Mart Arts Academy, will have an open house 2-6 p.m. Saturday.";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry);
    it("should get 1 sentence", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 1);
    });
});
// Suite: an all-caps token with an inner dot ("CEF.UL") must not be split into
// two sentences.
describe('Skip dotted abbreviations with multiple capital letters', function () {
    var entry = "State-owned lender Caixa Economica Federal (CEF.UL) is planning to list its asset management unit and the payments firm Elo by the beginning of 2022, Chief Executive Pedro Guimaraes said in an interview.";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry);
    it("should get 1 sentence", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 1);
    });
});
// Suite: common abbreviations ("Fig.", "i.e.", "e.g.", "etc.") must not be
// counted as sentence boundaries; only the break before "Currencies" counts.
describe('Skip common abbreviations', function () {
    var entry = "Fig. 2. displays currency rates i.e. something libsum. Currencies widely available (i.e. euro, dollar, pound), or alternatively (e.g. €, $, etc.)";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry);
    it("should get 2 sentences", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 2);
    });
});
// Suite: the two-word abbreviation "et al." followed by a parenthesised
// citation must not end the sentence.
describe('Skip two worded abbreviations', function () {
    var entry = "Claims 1–6 and 15–26 are rejected under pre-AIA 35 USC § 103(a) as being unpatentable over Chalana et al. (US 2012/0179503) in view of Oh (US 2013/0013993).";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry);
    it("should get 1 sentence", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 1);
    });
});
// Suite: "Et al." / "et al." and "etc." used repeatedly in running prose; only
// the break before "It is similar" should count as a sentence boundary.
// The "(B)" suffix keeps this suite's title unique so it can be told apart
// from the other two-worded-abbreviation suite in reporter output and --grep.
describe('Skip two worded abbreviations (B)', function () {
    var entry = "Et al. is an abbreviation of the Latin loanphrase et alii, meaning and others. It is similar to etc. (short for et cetera, meaning and the rest), but whereas etc. applies to things, et al. applies to people.";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry);
    it("should get 2 sentences", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 2);
    });
});
// Suite: a caller-supplied abbreviation containing an accented character
// ("pré") must be honoured, so "pré." does not end the first sentence.
describe('Use other languages (accented)', function () {
    // Options handed to the tokenizer as-is; `abbreviations` is the entry this
    // suite is about.
    var options = {
        "newline_boundaries": true,
        "html_boundaries": false,
        "sanitize": false,
        "allowed_tags": false,
        "preserve_whitespace": true,
        "abbreviations": ["pré"]
    };
    var entry = "Random words pré. other words and things. Different status updates all assigned";
    // Tokenized in the describe callback, outside the test case itself.
    var sentences = tokenizer_1.default.sentences(entry, options);
    it("should get 2 sentences", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentences.length, 2);
    });
});
// Suite: per-call `abbreviations` option, exercised with a Polish sentence that
// contains "np." and "pkt.". Without the option the text is expected to split
// into 3 sentences; with ["np", "pkt"] supplied it is expected to stay whole.
describe('Use other languages', function () {
    var entry = "Trzeba tu coś napisać, np. fragment odnoszący się do pkt. 3 wcześniejszego tekstu.";
    // Both tokenizations run in the describe callback, default call first.
    var sentencesEN = tokenizer_1.default.sentences(entry);
    var sentencesPL = tokenizer_1.default.sentences(entry, { abbreviations: ["np", "pkt"] });
    // One expectation per test so each title states exactly what is checked
    // and a failure points at the specific case.
    it("should get 3 sentences without custom abbreviations", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentencesEN.length, 3);
    });
    it("should get 1 sentence with custom abbreviations", function () {
        assert_1.default.strictEqual(sentencesPL.length, 1);
    });
    it("should not permanently override abbreviations", function () {
        // Re-tokenize without options after the custom-abbreviation call above:
        // the count must be back to the default result.
        var sentences = tokenizer_1.default.sentences(entry);
        assert_1.default.strictEqual(sentences.length, 3);
    });
});
// Suite: caller-supplied Cyrillic abbreviations ("табл", "рис") must be
// honoured, so "табл." and "рис." do not end a sentence.
describe('Use other languages (Cyrillic)', function () {
    // Options handed to the tokenizer as-is; `abbreviations` is the entry this
    // suite is about.
    var options = {
        "newline_boundaries": true,
        "html_boundaries": false,
        "sanitize": false,
        "allowed_tags": false,
        "preserve_whitespace": true,
        "abbreviations": ["табл", "рис"]
    };
    // NOTE(review): the literal below contains no newline character even
    // though its last phrase reads "on a new line" — confirm this is intended.
    var entry = "матрицю SWOT- аналізу (табл. hello). Факторами макросередовища (рис. 5.8.). Things on a new line";
    // Tokenized in the describe callback, outside the test case itself.
    var sentencesCyrillic = tokenizer_1.default.sentences(entry, options);
    it("should get 3 sentences", function () {
        // strictEqual (===) rather than the legacy loose-equality `equal`.
        assert_1.default.strictEqual(sentencesCyrillic.length, 3);
    });
});
});