concepts-parser
Version:
Concept extraction from text
144 lines (143 loc) • 4.28 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
const utils_1 = require("./utils");
const concept_1 = require("./concept");
const data_1 = require("./data");
/**
 * Decides whether a concept may be kept.
 *
 * A concept is rejected when `utils_1.isLower` accepts its value, when the
 * concept's own `isValid()` check fails, or when its value matches any of the
 * patterns returned by `data_1.getInvalidConcepts` for the concept's language.
 *
 * @param {object} concept - Object exposing `value`, `lang` and `isValid()`.
 * @returns {boolean} `true` when none of the rejection rules apply.
 */
function isValid(concept) {
    const text = concept.value;
    if (utils_1.isLower(text) || !concept.isValid()) {
        return false;
    }
    // The language-specific patterns are only fetched once the cheap checks pass.
    for (const pattern of data_1.getInvalidConcepts(concept.lang)) {
        if (pattern.test(text)) {
            return false;
        }
    }
    return true;
}
/**
 * Builds a new `Concept` from its three constituent fields.
 *
 * @param {string} value - Concept text.
 * @param {number} index - Offset associated with the concept.
 * @param {string} lang - Language code passed through to the concept.
 * @returns {object} A freshly constructed `concept_1.Concept`.
 */
function createConcept(value, index, lang) {
    const options = { value, index, lang };
    return new concept_1.Concept(options);
}
/**
 * Tells whether a concept is a candidate for splitting.
 *
 * Concepts flagged as `isKnown` are never split. Otherwise the value must be
 * longer than four characters and its first space must sit past index 2.
 *
 * @param {object} concept - Object exposing `get(key)` and `value`.
 * @returns {boolean} `true` when the concept may be split.
 */
function canSplit(concept) {
    const alreadyKnown = concept.get("isKnown");
    if (!alreadyKnown) {
        const text = concept.value;
        return text.length > 4 && text.indexOf(" ") > 2;
    }
    return false;
}
/**
 * Strips lowercase words from both edges of a multi-word concept.
 *
 * While the concept reports more than one word, a trailing lowercase word is
 * removed first; if there is none, a leading lowercase word is removed and the
 * concept's index is advanced by the number of characters dropped. The concept
 * is modified in place through `concept.reset(...)`.
 *
 * @param {object} concept - Object exposing `value`, `index`, `lang`,
 *   `countWords` and `reset(value, index, lang)`.
 * @returns {object} The same concept instance that was passed in.
 */
function trimLowecaseWords(concept) {
    while (concept.countWords > 1) {
        const text = concept.value;
        if (endsWithLowercaseWord(text)) {
            // Drop everything from the last space onwards; the start offset is unchanged.
            const cut = text.lastIndexOf(" ");
            concept.reset(text.substring(0, cut), concept.index, concept.lang);
        }
        else if (startsWithLowercaseWord(text)) {
            // Drop the first word and its following space, shifting the offset accordingly.
            const skip = text.indexOf(" ") + 1;
            concept.reset(text.substring(skip), concept.index + skip, concept.lang);
        }
        else {
            break;
        }
    }
    return concept;
}
/**
 * Creates a concept from one piece of text and appends it to `list` if it
 * survives validation (and, for multi-word pieces, trimming + re-validation).
 */
function collectValidConcept(list, value, index, lang) {
    let candidate = createConcept(value, index, lang);
    if (!isValid(candidate)) {
        return;
    }
    if (candidate.countWords > 1) {
        // Trimming can change the value, so the result has to be validated again.
        candidate = trimLowecaseWords(candidate);
        if (!isValid(candidate)) {
            return;
        }
    }
    list.push(candidate);
}
/**
 * Splits a concept in two around a separator and returns the valid halves.
 *
 * The left half is the text before `index`; the right half starts right after
 * the separator, and its offset is shifted by the same amount relative to the
 * original concept's `index`.
 *
 * @param {object} concept - Source concept exposing `value`, `index`, `lang`.
 * @param {number} index - Position in `concept.value` where the separator starts.
 * @param {string} [separator=" "] - Text between the two halves; any falsy
 *   value (including an empty string) falls back to a single space.
 * @returns {object[]} Zero, one or two concepts, left half first.
 */
function createConceptsFromConcept(concept, index, separator) {
    const gap = separator || " ";
    const text = concept.value;
    const list = [];
    collectValidConcept(list, text.substring(0, index), concept.index, concept.lang);
    const rightStart = index + gap.length;
    collectValidConcept(list, text.slice(rightStart), concept.index + rightStart, concept.lang);
    return list;
}
exports.createConceptsFromConcept = createConceptsFromConcept;
/**
 * Checks whether the last whitespace-separated word of `text` is unchanged by
 * lowercasing (so words without letters, and the empty string, also qualify).
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when the final word equals its lowercase form.
 */
function endsWithLowercaseWord(text) {
    const lastWord = text.split(/\s+/).pop();
    return lastWord === lastWord.toLowerCase();
}
/**
 * Checks whether the first whitespace-separated word of `text` is unchanged by
 * lowercasing (so words without letters, and the empty string, also qualify).
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when the leading word equals its lowercase form.
 */
function startsWithLowercaseWord(text) {
    const [firstWord] = text.split(/\s+/, 1);
    return firstWord === firstWord.toLowerCase();
}
/**
 * Splits a concept around the first split word that occurs inside its value.
 *
 * Each candidate word is searched surrounded by single spaces, in list order;
 * the first one found at a position greater than zero is used as the
 * separator and the split is delegated to `createConceptsFromConcept`.
 *
 * @param {object} concept - Concept exposing `value` and `lang`.
 * @param {string[]} [words] - Split words; when falsy, the list comes from
 *   `data_1.getSplitWords(concept.lang)`.
 * @returns {object[]} Concepts produced by the split, or an empty array when
 *   no split word matches.
 */
function splitByWords(concept, words) {
    const candidates = words || data_1.getSplitWords(concept.lang);
    for (const candidate of candidates) {
        const separator = " " + candidate + " ";
        const position = concept.value.indexOf(separator);
        if (position > 0) {
            return createConceptsFromConcept(concept, position, separator);
        }
    }
    return [];
}
exports.splitByWords = splitByWords;
/**
 * Splits a concept at every word boundary, from the last boundary to the first.
 *
 * For each number of leading words (from `countWords - 1` down to 1) the cut
 * offset is the total length of those words plus one character per gap between
 * them; the two-way split at that offset is delegated to
 * `createConceptsFromConcept`. The combined results are passed through
 * `uniqConcepts` before being returned.
 *
 * NOTE(review): the offset arithmetic counts exactly one character between
 * words, so it presumably expects single-space separated values — confirm.
 *
 * @param {object} concept - Concept exposing `value` and `countWords`.
 * @returns {object[]} De-duplicated concepts produced by all splits.
 */
function simpleSplit(concept) {
    const collected = [];
    const lastTake = concept.countWords - 1;
    const lengths = concept.value.split(/\s+/).map((word) => word.length);
    for (let take = lastTake; take > 0; take--) {
        // Start with the gaps between the first `take` words, then add the words themselves.
        let cut = take - 1;
        for (const length of lengths.slice(0, take)) {
            cut += length;
        }
        collected.push(...createConceptsFromConcept(concept, cut));
    }
    return uniqConcepts(collected);
}
exports.simpleSplit = simpleSplit;
/**
 * Splits a concept into smaller concepts.
 *
 * Concepts rejected by `canSplit` yield an empty list. Otherwise a split on
 * known split words is tried first, and the word-boundary split
 * (`simpleSplit`) is used only when that produced nothing.
 *
 * @param {object} concept - Concept to split.
 * @returns {object[]} Resulting concepts; empty when the concept cannot be split.
 */
function split(concept) {
    if (!canSplit(concept)) {
        return [];
    }
    const byWords = splitByWords(concept);
    return byWords.length === 0
        ? byWords.concat(simpleSplit(concept))
        : byWords;
}
exports.split = split;
/**
 * Removes duplicate concepts, keeping the first occurrence of each.
 *
 * Two concepts are duplicates when both their `index` and `value` match. The
 * lookup key puts a delimiter between the two fields so that pairs such as
 * (index 1, value "2Abc") and (index 12, value "Abc") are not confused, which
 * plain concatenation would collapse into the same key.
 *
 * @param {object[]} list - Concepts exposing `index` and `value`.
 * @returns {object[]} A new array with duplicates removed, original order kept.
 */
function uniqConcepts(list) {
    const seen = new Set();
    return list.filter((item) => {
        const key = `${item.index}:${item.value}`;
        if (seen.has(key)) {
            return false;
        }
        seen.add(key);
        return true;
    });
}
;