UNPKG

concepts-parser

Version:
144 lines (143 loc) 4.28 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const utils_1 = require("./utils");
const concept_1 = require("./concept");
const data_1 = require("./data");

/**
 * Tells whether a concept is acceptable as a result.
 *
 * A concept is rejected when `utils.isLower(value)` is truthy, when the
 * concept's own `isValid()` is falsy, or when any pattern returned by
 * `getInvalidConcepts(lang)` matches its value.
 *
 * @param {object} concept - Object exposing `value`, `lang` and `isValid()`.
 * @returns {boolean} `true` when none of the rejection rules applies.
 */
function isValid(concept) {
    const value = concept.value;
    if (utils_1.isLower(value)) {
        return false;
    }
    if (!concept.isValid()) {
        return false;
    }
    const invalidPatterns = data_1.getInvalidConcepts(concept.lang);
    // NOTE(review): `test()` is stateful on /g or /y patterns — confirm the
    // patterns supplied by ./data do not carry those flags.
    for (const pattern of invalidPatterns) {
        if (pattern.test(value)) {
            return false;
        }
    }
    return true;
}

/**
 * Builds a new `Concept` from its three constructor fields.
 *
 * @param {string} value - Concept text.
 * @param {number} index - Offset associated with the concept.
 * @param {string} lang - Language identifier passed through to `Concept`.
 * @returns {object} The new `Concept` instance.
 */
function createConcept(value, index, lang) {
    return new concept_1.Concept({ value, index, lang });
}

/**
 * Tells whether a concept is a candidate for splitting.
 *
 * Concepts flagged `isKnown` are never split. Otherwise the value must be
 * longer than 4 characters and its first space must sit at index 3 or later.
 *
 * @param {object} concept - Object exposing `value` and `get(name)`.
 * @returns {boolean} `true` when splitting should be attempted.
 */
function canSplit(concept) {
    if (concept.get("isKnown")) {
        return false;
    }
    return concept.value.length > 4 && concept.value.indexOf(" ") > 2;
}

/**
 * Strips lowercase words from the end, then from the start, of a multi-word
 * concept, repeating until neither edge word is lowercase or only one word
 * is left. The concept is modified in place through `reset()`.
 *
 * @param {object} concept - Object exposing `value`, `index`, `lang`,
 *   `countWords` and `reset(value, index, lang)`.
 * @returns {object} The same concept instance, after trimming.
 */
function trimLowecaseWords(concept) {
    if (concept.countWords <= 1) {
        return concept;
    }
    const value = concept.value;
    if (endsWithLowercaseWord(value)) {
        // Drop the last word; the start offset is unchanged.
        concept.reset(value.substring(0, value.lastIndexOf(" ")), concept.index, concept.lang);
        return trimLowecaseWords(concept);
    }
    if (startsWithLowercaseWord(value)) {
        // Drop the first word and shift the offset past it and its space.
        const offset = value.indexOf(" ") + 1;
        concept.reset(value.slice(offset), concept.index + offset, concept.lang);
        return trimLowecaseWords(concept);
    }
    return concept;
}

/**
 * Appends `candidate` to `list` when it is valid. Multi-word candidates are
 * trimmed of lowercase edge words first and must still be valid afterwards.
 *
 * @param {object[]} list - Accumulator, modified in place.
 * @param {object} candidate - Concept to check.
 * @returns {void}
 */
function pushIfValid(list, candidate) {
    if (!isValid(candidate)) {
        return;
    }
    if (candidate.countWords > 1) {
        const trimmed = trimLowecaseWords(candidate);
        if (!isValid(trimmed)) {
            return;
        }
        list.push(trimmed);
        return;
    }
    list.push(candidate);
}

/**
 * Cuts a concept in two around a separator and returns the valid halves.
 *
 * The left half is `value[0, index)`; the right half starts right after the
 * separator, i.e. at `index + separator.length`. Each half keeps an offset
 * relative to the original concept's `index`.
 *
 * @param {object} concept - Concept to cut (`value`, `index`, `lang` are read).
 * @param {number} index - Position in `concept.value` where the separator starts.
 * @param {string} [separator] - Text removed between the halves; any falsy
 *   value falls back to a single space.
 * @returns {object[]} Zero, one or two concepts, left half first.
 */
function createConceptsFromConcept(concept, index, separator) {
    const sep = separator || " ";
    const list = [];
    const head = createConcept(concept.value.substring(0, index), concept.index, concept.lang);
    pushIfValid(list, head);
    const tailStart = index + sep.length;
    const tail = createConcept(concept.value.slice(tailStart), concept.index + tailStart, concept.lang);
    pushIfValid(list, tail);
    return list;
}
exports.createConceptsFromConcept = createConceptsFromConcept;

/**
 * Tells whether the last whitespace-separated word of `text` is unchanged by
 * `toLowerCase()` (which is also true for words without letters).
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when the last word equals its lowercase form.
 */
function endsWithLowercaseWord(text) {
    const words = text.split(/\s+/g);
    const last = words[words.length - 1];
    return last.toLowerCase() === last;
}

/**
 * Tells whether the first whitespace-separated word of `text` is unchanged by
 * `toLowerCase()` (which is also true for words without letters).
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when the first word equals its lowercase form.
 */
function startsWithLowercaseWord(text) {
    const first = text.split(/\s+/g)[0];
    return first.toLowerCase() === first;
}

/**
 * Splits a concept at the first split word found inside its value.
 *
 * Each word is searched surrounded by single spaces, and only a match at a
 * position greater than 0 counts. Words are tried in list order and the
 * first one that matches decides the split.
 *
 * @param {object} concept - Concept to split.
 * @param {string[]} [words] - Split words; defaults to
 *   `getSplitWords(concept.lang)` when falsy.
 * @returns {object[]} Valid halves of the split, or an empty array when no
 *   split word is found.
 */
function splitByWords(concept, words) {
    const splitWords = words || data_1.getSplitWords(concept.lang);
    for (let i = 0; i < splitWords.length; i++) {
        const separator = " " + splitWords[i] + " ";
        const position = concept.value.indexOf(separator);
        if (position > 0) {
            return createConceptsFromConcept(concept, position, separator);
        }
    }
    return [];
}
exports.splitByWords = splitByWords;

/**
 * Splits a concept at every word boundary, from the last boundary to the
 * first, and returns the de-duplicated valid pieces.
 *
 * @param {object} concept - Concept to split (`value` and `countWords` are read).
 * @returns {object[]} Unique valid concepts produced by all boundary splits.
 */
function simpleSplit(concept) {
    let list = [];
    let splitLength = concept.countWords - 1;
    const wordLengths = concept.value.split(/\s+/g).map((word) => word.length);
    while (splitLength > 0) {
        // Position of the space after the first `splitLength` words: their
        // total length plus the `splitLength - 1` separators between them.
        // NOTE(review): this assumes words are separated by exactly one
        // character — confirm that concept values are whitespace-normalized.
        const lettersBefore = wordLengths
            .slice(0, splitLength)
            .reduce((sum, current) => sum + current, 0);
        const index = lettersBefore + splitLength - 1;
        list = list.concat(createConceptsFromConcept(concept, index));
        splitLength--;
    }
    return uniqConcepts(list);
}
exports.simpleSplit = simpleSplit;

/**
 * Splits a concept into smaller concepts when `canSplit` allows it.
 *
 * Splitting by split words is tried first; the boundary-by-boundary split is
 * used only when that yields nothing.
 *
 * @param {object} concept - Concept to split.
 * @returns {object[]} Resulting concepts; empty when the concept cannot be
 *   split or no valid piece is produced.
 */
function split(concept) {
    if (!canSplit(concept)) {
        return [];
    }
    const byWords = splitByWords(concept);
    if (byWords.length > 0) {
        return byWords;
    }
    return byWords.concat(simpleSplit(concept));
}
exports.split = split;

/**
 * Removes duplicate concepts, keeping the first occurrence of each
 * (index, value) pair and preserving order.
 *
 * @param {object[]} list - Concepts exposing `index` and `value`.
 * @returns {object[]} New array without duplicates.
 */
function uniqConcepts(list) {
    const seen = new Set();
    return list.filter((item) => {
        // The delimiter keeps (1, "2x") and (12, "x") from sharing a key.
        const key = `${item.index}:${item.value}`;
        if (seen.has(key)) {
            return false;
        }
        seen.add(key);
        return true;
    });
}