concepts-parser
Version:
Concepts Extracting from text
203 lines (202 loc) • 6.31 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
const fs = require("fs");
const path = require("path");
// Language codes accepted by `load()`. Each code is also used as the name of
// the sub-directory under `../data` from which the list files are read, and
// the array itself is what `getLanguages()` returns.
const LANGUAGES = [
"ro",
"ru",
"bg",
"hu",
"cs",
"pl",
"it",
"en",
"es"
];
// List names accepted by `load()`. A name doubles as the base name of the
// data file (`<name>.txt`) and as the lookup key into `builders`.
// The array itself is what `getNames()` returns.
const NAMES = [
"connect_words",
"split_words",
"invalid_concepts",
"invalid_prefixes",
"known_concepts",
"partial_concepts",
"valid_prefixes",
"valid_suffixes",
"firstnames"
];
// Per-list flags handed out by `getNameInfo()`. Nothing in this file reads
// `atonic`, `insensitive` or `sort`; they are only exposed to callers.
// NOTE(review): presumably consumed by external tooling that maintains the
// data files (e.g. accent stripping / case folding / sorting) — confirm.
// Only `invalid_concepts` and `invalid_prefixes` set `atonic: true`.
const NAMES_INFO = {
connect_words: { atonic: false, insensitive: false, sort: true },
split_words: { atonic: false, insensitive: false, sort: true },
invalid_concepts: { atonic: true, insensitive: false, sort: true },
invalid_prefixes: { atonic: true, insensitive: false, sort: true },
known_concepts: { atonic: false, insensitive: false, sort: true },
partial_concepts: { atonic: false, insensitive: false, sort: true },
valid_prefixes: { atonic: false, insensitive: false, sort: true },
valid_suffixes: { atonic: false, insensitive: false, sort: true },
firstnames: { atonic: false, insensitive: false, sort: true }
};
// Module-level cache of built lists, keyed by `<lang>_<name>` and filled
// lazily by `get()`; entries are never evicted.
const DATA = {};
/**
 * Converters from the raw lines of a data list to the matcher objects handed
 * out by this module, keyed by list name. `build()` looks a converter up by
 * name; lists without an entry here (`connect_words`, `split_words`) are
 * returned as plain strings.
 *
 * Every item is interpolated into its pattern verbatim (no escaping), so a
 * data line is effectively a regular-expression fragment.
 */
const builders = {
  /**
   * @param {string[]} items
   * @returns {RegExp[]} `[]` for an empty list, otherwise a single
   *   case-insensitive pattern matching a string that is exactly one item.
   */
  invalid_concepts(items) {
    if (items.length > 0) {
      const alternation = items.join("|");
      return [new RegExp(`^(${alternation})$`, "i")];
    }
    return [];
  },

  /**
   * @param {string[]} items
   * @returns {RegExp[]} `[]` for an empty list, otherwise a single
   *   case-insensitive pattern matching an item at the start of a string,
   *   followed by one space.
   */
  invalid_prefixes(items) {
    if (items.length > 0) {
      const alternation = items.join("|");
      return [new RegExp(`^(${alternation}) `, "i")];
    }
    return [];
  },

  /**
   * @param {string[]} items
   * @returns {RegExp[]} one pattern per item, ordered by descending word
   *   count. The patterns carry the `g` flag, so each instance keeps
   *   `lastIndex` state between `test`/`exec` calls.
   */
  known_concepts(items) {
    const ordered = sortByCountWordsDesc(items);
    return ordered.map(
      (item) => new RegExp(`(\\b|\\s)${item}(\\b|\\s)`, "ig")
    );
  },

  /**
   * @param {string[]} items
   * @returns {RegExp[]} `[]` for an empty list, otherwise a single
   *   case-insensitive pattern matching a string that is exactly one item.
   */
  partial_concepts(items) {
    if (items.length > 0) {
      const alternation = items.join("|");
      return [new RegExp(`^(${alternation})$`, "i")];
    }
    return [];
  },

  /**
   * @param {string[]} items
   * @returns {RegExp[]} `[]` for an empty list, otherwise a single
   *   case-insensitive pattern matching an item followed by one space at the
   *   very end of a string; items with more words are tried first.
   */
  valid_prefixes(items) {
    const ordered = sortByCountWordsDesc(items);
    if (ordered.length > 0) {
      return [new RegExp(`(^|\\b|\\s)(${ordered.join("|")}) $`, "i")];
    }
    return [];
  },

  /**
   * Builds suffix rules. A line is either a bare suffix, or tab-separated
   * fields `suffix <TAB> second [<TAB> third]`:
   *  - `concat` is `true` when the second or third field is exactly "1",
   *    otherwise `undefined`;
   *  - `prefix` is a pattern built from the second field when that field is
   *    longer than one character, otherwise `undefined`.
   * All bare suffixes are merged into one trailing rule that only has `reg`.
   *
   * @param {string[]} items
   * @returns {{reg: RegExp, concat?: (true|undefined), prefix?: (RegExp|undefined)}[]}
   */
  valid_suffixes(items) {
    if (items.length === 0) {
      return [];
    }
    const rules = [];
    const bareSuffixes = [];
    for (const rawItem of items) {
      const line = rawItem.trim();
      const fields = line.split(/\s*\t\s*/g);
      if (fields.length === 0) {
        throw new Error(`Invalid suffix line`);
      }
      if (fields.length === 1) {
        bareSuffixes.push(line);
        continue;
      }
      const [suffix, second, third] = fields;
      const concat = (second === "1" || third === "1") ? true : undefined;
      let prefix;
      if (second.length > 1) {
        prefix = new RegExp(`(^|\\b|\\s)(${second})$`, "i");
      }
      rules.push({
        reg: new RegExp(`^ (${suffix})`, "i"),
        concat,
        prefix
      });
    }
    if (bareSuffixes.length) {
      const ordered = sortByCountWordsDesc(bareSuffixes);
      rules.push({
        reg: new RegExp(`^ (${ordered.join("|")})(\\b|\\s)`, "i")
      });
    }
    return rules;
  },

  /**
   * @param {string[]} items
   * @returns {RegExp[]} `[]` for an empty list, otherwise a single
   *   case-sensitive pattern matching an item at the start of a string,
   *   followed by a space or a hyphen.
   */
  firstnames(items) {
    if (items.length > 0) {
      const alternation = items.join("|");
      return [new RegExp(`^(${alternation})[ -]`)];
    }
    return [];
  }
};
/**
 * Returns the items ordered by descending number of whitespace-separated
 * words (presumably so that multi-word entries are tried before shorter ones
 * once the list is joined into a regex alternation).
 *
 * The input array is left untouched: `Array.prototype.sort` reorders in
 * place, so the sort runs on a copy to avoid silently reordering the
 * caller's data.
 *
 * @param {string[]} items
 * @returns {string[]} a new, sorted array.
 */
function sortByCountWordsDesc(items) {
  const countWords = (text) => text.split(/\s+/g).length;
  return items.slice().sort((a, b) => countWords(b) - countWords(a));
}
/**
 * Reads a list file and returns its meaningful lines.
 *
 * Carriage returns are removed, every line is trimmed, and blank lines and
 * lines whose first non-blank character is `#` are dropped. Lines are
 * returned trimmed, so stray leading/trailing whitespace in a data file
 * cannot leak into the lists or the patterns built from them.
 *
 * @param {string} file path of the file to read (UTF-8).
 * @returns {string[]} the remaining lines in file order; `[]` when the file
 *   cannot be read.
 */
function getFileData(file) {
  let content;
  try {
    content = fs.readFileSync(file, "utf8");
  }
  catch (e) {
    // Deliberate best-effort: an unreadable or missing list is treated as
    // an empty list rather than an error.
    return [];
  }
  return content
    .replace(/\r+/g, "")
    .split(/\n/g)
    .map((line) => line.trim())
    .filter((line) => line.length > 0 && line[0] !== "#");
}
/**
 * Loads the raw lines of a list for a language, optionally followed by the
 * lines of the country-specific file of the same name.
 *
 * Files are looked up at `../data/<lang>/<name>.txt` and, when `country` is
 * truthy, `../data/<lang>/<country>/<name>.txt` (relative to this module).
 *
 * @param {string} name one of `NAMES`.
 * @param {string} lang one of `LANGUAGES`.
 * @param {string} [country] sub-directory holding country overrides; it is
 *   not validated here.
 * @returns {string[]} language lines, then country lines.
 * @throws {Error} when `lang` or `name` is not a known value.
 */
function load(name, lang, country) {
  if (!LANGUAGES.includes(lang)) {
    throw new Error("Invalid language: " + lang);
  }
  if (!NAMES.includes(name)) {
    throw new Error("Invalid name: " + name);
  }
  const fileName = name + ".txt";
  const languageLines = getFileData(
    path.join(__dirname, "../data", lang, fileName)
  );
  if (!country) {
    return languageLines;
  }
  const countryLines = getFileData(
    path.join(__dirname, "../data", lang, country, fileName)
  );
  return languageLines.concat(countryLines);
}
/**
 * Loads a list and runs it through its builder, if one is registered.
 *
 * @param {string} name list name.
 * @param {string} lang language code.
 * @param {string} [country] optional country sub-directory.
 * @returns {Array} the builder's output, or the raw lines when `builders`
 *   has no entry for `name`.
 */
function build(name, lang, country) {
  const lines = load(name, lang, country);
  const toMatchers = builders[name];
  return toMatchers ? toMatchers(lines) : lines;
}
/**
 * Returns the built list `name` for language `lang`, building it on first
 * use and serving the cached value (same instance) afterwards.
 *
 * @param {string} name list name.
 * @param {string} lang language code.
 * @returns {Array} the built list.
 * @throws {Error} when `name` or `lang` is falsy, or when building fails
 *   because either value is unknown.
 */
function get(name, lang) {
  if (!name) {
    throw new Error("param `name` is required");
  }
  if (!lang) {
    throw new Error("param `lang` is required");
  }
  const cacheKey = lang + "_" + name;
  const cached = DATA[cacheKey];
  if (cached) {
    return cached;
  }
  const built = build(name, lang);
  DATA[cacheKey] = built;
  return built;
}
exports.get = get;
/**
 * Returns the `connect_words` list for a language. No builder is registered
 * for this list, so the result is the raw lines as strings.
 *
 * @param {string} lang language code.
 * @returns {string[]} cached list.
 */
function getConnectWords(lang) {
  const listName = "connect_words";
  return get(listName, lang);
}
exports.getConnectWords = getConnectWords;
/**
 * Returns the `split_words` list for a language. No builder is registered
 * for this list, so the result is the raw lines as strings.
 *
 * @param {string} lang language code.
 * @returns {string[]} cached list.
 */
function getSplitWords(lang) {
  const listName = "split_words";
  return get(listName, lang);
}
exports.getSplitWords = getSplitWords;
/**
 * Returns the `invalid_concepts` matchers for a language.
 *
 * @param {string} lang language code.
 * @returns {RegExp[]} cached array holding at most one pattern.
 */
function getInvalidConcepts(lang) {
  const listName = "invalid_concepts";
  return get(listName, lang);
}
exports.getInvalidConcepts = getInvalidConcepts;
/**
 * Returns the `invalid_prefixes` matchers for a language.
 *
 * @param {string} lang language code.
 * @returns {RegExp[]} cached array holding at most one pattern.
 */
function getInvalidPrefixes(lang) {
  const listName = "invalid_prefixes";
  return get(listName, lang);
}
exports.getInvalidPrefixes = getInvalidPrefixes;
/**
 * Returns the `known_concepts` matchers for a language: one global,
 * case-insensitive pattern per list item. The instances are cached and
 * shared, and a global pattern keeps `lastIndex` between `test`/`exec` calls.
 *
 * @param {string} lang language code.
 * @returns {RegExp[]} cached patterns.
 */
function getKnownConcepts(lang) {
  const listName = "known_concepts";
  return get(listName, lang);
}
exports.getKnownConcepts = getKnownConcepts;
/**
 * Returns the `partial_concepts` matchers for a language.
 *
 * @param {string} lang language code.
 * @returns {RegExp[]} cached array holding at most one pattern.
 */
function getPartialConcepts(lang) {
  const listName = "partial_concepts";
  return get(listName, lang);
}
exports.getPartialConcepts = getPartialConcepts;
/**
 * Returns the `valid_prefixes` matchers for a language.
 *
 * @param {string} lang language code.
 * @returns {RegExp[]} cached array holding at most one pattern.
 */
function getValidPrefixes(lang) {
  const listName = "valid_prefixes";
  return get(listName, lang);
}
exports.getValidPrefixes = getValidPrefixes;
/**
 * Returns the `valid_suffixes` rules for a language. Each rule has a `reg`
 * pattern and may carry `concat` and `prefix`.
 *
 * @param {string} lang language code.
 * @returns {{reg: RegExp, concat?: boolean, prefix?: RegExp}[]} cached rules.
 */
function getValidSuffixes(lang) {
  const listName = "valid_suffixes";
  return get(listName, lang);
}
exports.getValidSuffixes = getValidSuffixes;
/**
 * Returns the supported language codes.
 *
 * @returns {string[]} the module's own `LANGUAGES` array (not a copy), so
 *   mutating it affects language validation in this module.
 */
function getLanguages() {
  const supported = LANGUAGES;
  return supported;
}
exports.getLanguages = getLanguages;
/**
 * Returns the known list names.
 *
 * @returns {string[]} the module's own `NAMES` array (not a copy), so
 *   mutating it affects name validation in this module.
 */
function getNames() {
  const known = NAMES;
  return known;
}
exports.getNames = getNames;
/**
 * Looks up the flags recorded for a list name in `NAMES_INFO`.
 *
 * @param {string} name list name.
 * @returns {{atonic: boolean, insensitive: boolean, sort: boolean}|undefined}
 *   the shared entry for `name`; names without an entry normally yield
 *   `undefined` (plain property lookup).
 */
function getNameInfo(name) {
  const info = NAMES_INFO[name];
  return info;
}
exports.getNameInfo = getNameInfo;
;