UNPKG

concepts-parser

Version:

Concepts Extracting from text

github.com/entitizer/concepts-parser-js

entitizer/concepts-parser-js

203 lines (202 loc) • 6.31 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const fs = require("fs"); const path = require("path"); const LANGUAGES = [ "ro", "ru", "bg", "hu", "cs", "pl", "it", "en", "es" ]; const NAMES = [ "connect_words", "split_words", "invalid_concepts", "invalid_prefixes", "known_concepts", "partial_concepts", "valid_prefixes", "valid_suffixes", "firstnames" ]; const NAMES_INFO = { connect_words: { atonic: false, insensitive: false, sort: true }, split_words: { atonic: false, insensitive: false, sort: true }, invalid_concepts: { atonic: true, insensitive: false, sort: true }, invalid_prefixes: { atonic: true, insensitive: false, sort: true }, known_concepts: { atonic: false, insensitive: false, sort: true }, partial_concepts: { atonic: false, insensitive: false, sort: true }, valid_prefixes: { atonic: false, insensitive: false, sort: true }, valid_suffixes: { atonic: false, insensitive: false, sort: true }, firstnames: { atonic: false, insensitive: false, sort: true } }; const DATA = {}; const builders = { invalid_concepts: function (items) { return items.length > 0 ? [new RegExp(`^(${items.join("|")})$`, "i")] : []; }, invalid_prefixes: function (items) { return items.length > 0 ? [new RegExp(`^(${items.join("|")}) `, "i")] : []; }, known_concepts: function (items) { items = sortByCountWordsDesc(items); return items.map((item) => { return new RegExp(`(\\b|\\s)${item}(\\b|\\s)`, "ig"); }); }, partial_concepts: function (items) { return items.length > 0 ? [new RegExp(`^(${items.join("|")})$`, "i")] : []; }, valid_prefixes: function (items) { items = sortByCountWordsDesc(items); return items.length > 0 ? [new RegExp(`(^|\\b|\\s)(${items.join("|")}) $`, "i")] : []; }, valid_suffixes: function (items) { if (items.length === 0) { return []; } let simpleList = []; const complexList = []; items.forEach((item) => { item = item.trim(); const parts = item.split(/\s*\t\s*/g); if (parts.length === 0) { throw new Error(`Invalid suffix line`); } if (parts.length === 1) { simpleList.push(item); } else { const concat = parts[1] === "1" || (parts.length > 2 && parts[2] === "1") || undefined; const prefix = parts[1].length > 1 ? new RegExp(`(^|\\b|\\s)(${parts[1]})$`, "i") : undefined; complexList.push({ reg: new RegExp(`^ (${parts[0]})`, "i"), concat, prefix }); } }); if (simpleList.length) { simpleList = sortByCountWordsDesc(simpleList); complexList.push({ reg: new RegExp(`^ (${simpleList.join("|")})(\\b|\\s)`, "i") }); } return complexList; }, firstnames: function (items) { return items.length > 0 ? [new RegExp(`^(${items.join("|")})[ -]`)] : []; } }; function sortByCountWordsDesc(items) { return items.sort((a, b) => b.split(/\s+/g).length - a.split(/\s+/g).length); } function getFileData(file) { let content; try { content = fs.readFileSync(file, "utf8"); } catch (e) { return []; } content = content.replace(/\r+/g, "").trim(); return content.split(/\n/g).filter((item) => { item = item.trim(); if (item.length < 1 || item[0] === "#") { return false; } return true; }); } function load(name, lang, country) { if (LANGUAGES.indexOf(lang) < 0) { throw new Error("Invalid language: " + lang); } if (NAMES.indexOf(name) < 0) { throw new Error("Invalid name: " + name); } let file = path.join(__dirname, "../data", lang, name + ".txt"); let data = getFileData(file); if (country) { file = path.join(__dirname, "../data", lang, country, name + ".txt"); data = data.concat(getFileData(file)); } return data; } function build(name, lang, country) { let data = load(name, lang, country); let builder = builders[name]; if (builder) { return builder(data); } return data; } function get(name, lang) { if (!name) { throw new Error("param `name` is required"); } if (!lang) { throw new Error("param `lang` is required"); } const key = lang + "_" + name; if (!DATA[key]) { DATA[key] = build(name, lang); } return DATA[key]; } exports.get = get; function getConnectWords(lang) { return get("connect_words", lang); } exports.getConnectWords = getConnectWords; function getSplitWords(lang) { return get("split_words", lang); } exports.getSplitWords = getSplitWords; function getInvalidConcepts(lang) { return get("invalid_concepts", lang); } exports.getInvalidConcepts = getInvalidConcepts; function getInvalidPrefixes(lang) { return get("invalid_prefixes", lang); } exports.getInvalidPrefixes = getInvalidPrefixes; function getKnownConcepts(lang) { return get("known_concepts", lang); } exports.getKnownConcepts = getKnownConcepts; function getPartialConcepts(lang) { return get("partial_concepts", lang); } exports.getPartialConcepts = getPartialConcepts; function getValidPrefixes(lang) { return get("valid_prefixes", lang); } exports.getValidPrefixes = getValidPrefixes; function getValidSuffixes(lang) { return get("valid_suffixes", lang); } exports.getValidSuffixes = getValidSuffixes; function getLanguages() { return LANGUAGES; } exports.getLanguages = getLanguages; function getNames() { return NAMES; } exports.getNames = getNames; function getNameInfo(name) { return NAMES_INFO[name]; } exports.getNameInfo = getNameInfo;