// entity-finder
// Version:
// Named entity finder
// 77 lines (76 loc) • 2.84 kB
// JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.findTitles = void 0;
const debug = require("debug")("entity-finder");
const wikiApi = require("./wikipedia/api");
const searchTitles_1 = require("./searchTitles");
const utils_1 = require("./utils");
/**
 * Searches for titles matching a name and resolves with the ones that survive
 * `filterDezambiguizationTitles`, kept in the order the search returned them.
 *
 * @param {string} name - Text to search for; surrounding whitespace is trimmed.
 * @param {string} [lang] - Language code; trimmed and lower-cased, "en" when falsy.
 * @param {{limit?: number, tags?: *, timeout?: *}} [options] - `limit` caps the
 *   number of returned titles (2 when falsy); `tags` and `timeout` are handed to
 *   the search call unchanged.
 * @returns {Promise<Array>} Resolves with at most `limit` title entries.
 */
function findTitles(name, lang, options) {
    const settings = options || {};
    const query = name.trim();
    const language = (lang && lang.trim().toLowerCase()) || "en";
    const limit = settings.limit || 2;
    const { tags, timeout } = settings;

    // Keeps only the candidates that pass the filter, restoring search order
    // because the filter does not promise to return them in input order.
    const keepInSearchOrder = (found) => {
        const candidates = found.slice(0, limit + 5);
        return filterDezambiguizationTitles(candidates, language).then((kept) => {
            const ordered = [];
            candidates.forEach((candidate) => {
                const match = kept.find((entry) => entry.title === candidate.title);
                if (match) {
                    ordered.push(match);
                }
            });
            return ordered;
        });
    };

    // The search is asked for more results than needed; only the first
    // `limit + 5` of them are checked, and `limit` are finally returned.
    return searchTitles_1
        .searchTitles(query, language, { limit: limit + 50, tags, timeout })
        .then(keepInSearchOrder)
        .then((ordered) => ordered.slice(0, limit));
}
// Publish findTitles as a member of this module's `exports` object.
exports.findTitles = findTitles;
/**
 * Removes disambiguation pages from a list of page titles.
 *
 * Sends one `wikiApi.query` request for the categories of every title and keeps
 * the input entries whose page has no category accepted by
 * `hasADezambiguizationCategory`. Each kept entry is the same object that was
 * passed in, with its `categories` property set to the page's category titles
 * (or `undefined` when the response lists none for it).
 *
 * Response pages whose title does not exactly equal one of the requested titles
 * are skipped rather than dereferenced.
 *
 * @param {Array<{title: string}>} pageTitles - Entries to check; mutated as described above.
 * @param {string} lang - Language code forwarded to the API call and the category check.
 * @returns {Promise<Array<{title: string, categories: (string[]|undefined)}>>}
 *   Kept entries, in the key order of the response's `pages` object (not input
 *   order). Rejects with an `Error` carrying the JSON-serialized response when
 *   the response has no `query` member.
 */
function filterDezambiguizationTitles(pageTitles, lang) {
    if (pageTitles.length === 0) {
        return Promise.resolve([]);
    }

    // Index the input once; the first entry wins for duplicate titles.
    const entriesByTitle = new Map();
    for (const entry of pageTitles) {
        if (!entriesByTitle.has(entry.title)) {
            entriesByTitle.set(entry.title, entry);
        }
    }

    const titles = pageTitles.map((item) => item.title).join("|");
    // NOTE(review): parameter names look like MediaWiki `prop=categories`
    // options (non-hidden categories, up to 50) — confirm against the wrapper.
    return wikiApi
        .query(lang, {
            titles: titles,
            prop: "categories",
            clshow: "!hidden",
            cllimit: 50
        })
        .then((data) => {
            if (!data || !data.query) {
                throw new Error(JSON.stringify(data));
            }
            const pages = data.query.pages;

            // Decide which pages survive before touching any input entry, so a
            // failure in the category check leaves the input unmodified.
            const keptPages = Object.keys(pages)
                .map((pageId) => ({
                    title: pages[pageId].title,
                    categories: pages[pageId].categories &&
                        pages[pageId].categories.map((item) => item.title)
                }))
                .filter((page) => !hasADezambiguizationCategory(page.categories, lang));

            const filteredTitles = [];
            for (const page of keptPages) {
                const entry = entriesByTitle.get(page.title);
                if (!entry) {
                    // The API reported a page under a title that was not
                    // requested verbatim; there is no input entry to return.
                    continue;
                }
                entry.categories = page.categories;
                filteredTitles.push(entry);
            }
            return filteredTitles;
        });
}
/**
 * Tells whether at least one of a page's categories is a disambiguation category.
 *
 * @param {string[]|null|undefined} categories - Category titles; may be absent
 *   when a page has none.
 * @param {string} lang - Language code handed to `isDezambiguizationCategory`.
 * @returns {boolean} `true` when any category matches, `false` otherwise
 *   (including when `categories` is missing or empty).
 */
function hasADezambiguizationCategory(categories, lang) {
    if (!categories) {
        return false;
    }
    return categories.some((category) => isDezambiguizationCategory(category, lang));
}
/**
 * Tells whether a category title contains the disambiguation name configured
 * for a language, as a whole word and ignoring case.
 *
 * @param {string} category - Category title to test.
 * @param {string} lang - Language code used to look up the disambiguation name.
 * @returns {boolean} `true` when the category matches.
 * @throws {Error} When `utils_1.getDisambiguationName` yields nothing for `lang`.
 */
function isDezambiguizationCategory(category, lang) {
    const disName = utils_1.getDisambiguationName(lang);
    if (!disName) {
        throw new Error(`No Disambiguation Name for language ${lang}`);
    }
    // The name is data, not a pattern: escape regex metacharacters so that
    // characters such as "." or "(" match literally and "+" cannot make the
    // RegExp constructor throw.
    const escapedName = String(disName).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // NOTE(review): `\b` only recognises ASCII word characters, so a name that
    // begins or ends with a non-ASCII letter may not be delimited as intended —
    // confirm against the configured names.
    const disNameReg = new RegExp(`(^|\\b)${escapedName}(\\b|$)`, "i");
    const isDis = disNameReg.test(category);
    if (isDis) {
        debug(`Category ${category} is a dizambiguization`);
    }
    return isDis;
}