site-metadata-extractor
Version:
web(site) resource metadata extractor
55 lines • 6.35 kB
JavaScript
;
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const cache = {};
const candiateWords = (strippedInput) => {
return strippedInput.split(" ");
};
const getFilePath = (lang) => {
return path_1.default.resolve(__dirname, `../data/stopwords/stopwords-${lang}.txt`);
};
const removePunctuation = (content) => {
return content.replace(/[|@<>[\]"'.,-/#?!$%^&*+;:{}=\-_`~()]/g, "");
};
const stopwords = (content, lang = "en") => {
let filePath = getFilePath(lang);
let stopWords;
if (!fs_1.default.existsSync(filePath)) {
console.warn(`WARNING: No stopwords file found for '${lang}' - defaulting to English!`);
filePath = getFilePath("en");
}
if (Object.prototype.hasOwnProperty.call(cache, lang)) {
stopWords = cache[lang];
}
else {
stopWords = fs_1.default
.readFileSync(filePath)
.toString()
.split("\n")
.filter((str) => {
return str.length > 0;
});
cache[lang] = stopWords;
}
const strippedInput = removePunctuation(content);
const words = candiateWords(strippedInput);
const overlappingStopwords = [];
let count = 0;
words.forEach((word) => {
count++;
if (stopWords.indexOf(word.toLowerCase()) > -1) {
overlappingStopwords.push(word.toLowerCase());
}
});
return {
stopWordCount: overlappingStopwords.length,
stopWords: overlappingStopwords,
wordCount: count,
};
};
exports.default = stopwords;
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoic3RvcHdvcmRzLmpzIiwic291cmNlUm9vdCI6IiIsInNvdXJjZXMiOlsiLi4vc3JjL3N0b3B3b3Jkcy50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiOzs7OztBQUFBLDRDQUFvQjtBQUNwQixnREFBd0I7QUFReEIsTUFBTSxLQUFLLEdBQWdDLEVBQUUsQ0FBQztBQUU5QyxNQUFNLGFBQWEsR0FBRyxDQUFDLGFBQXFCLEVBQVksRUFBRTtJQUN4RCxPQUFPLGFBQWEsQ0FBQyxLQUFLLENBQUMsR0FBRyxDQUFDLENBQUM7QUFDbEMsQ0FBQyxDQUFDO0FBRUYsTUFBTSxXQUFXLEdBQUcsQ0FBQyxJQUFZLEVBQUUsRUFBRTtJQUNuQyxPQUFPLGNBQUksQ0FBQyxPQUFPLENBQUMsU0FBUyxFQUFFLCtCQUErQixJQUFJLE1BQU0sQ0FBQyxDQUFDO0FBQzVFLENBQUMsQ0FBQztBQUVGLE1BQU0saUJBQWlCLEdBQUcsQ0FBQyxPQUFlLEVBQUUsRUFBRTtJQUM1QyxPQUFPLE9BQU8sQ0FBQyxPQUFPLENBQUMsdUNBQXVDLEVBQUUsRUFBRSxDQUFDLENBQUM7QUFDdEUsQ0FBQyxDQUFDO0FBRUYsTUFBTSxTQUFTLEdBQUcsQ0FBQyxPQUFlLEVBQUUsSUFBSSxHQUFHLElBQUksRUFBYSxFQUFFO0lBQzVELElBQUksUUFBUSxHQUFHLFdBQVcsQ0FBQyxJQUFJLENBQUMsQ0FBQztJQUNqQyxJQUFJLFNBQW1CLENBQUM7SUFFeEIsSUFBSSxDQUFDLFlBQUUsQ0FBQyxVQUFVLENBQUMsUUFBUSxDQUFDLEVBQUU7UUFDNUIsT0FBTyxDQUFDLElBQUksQ0FDVix5Q0FBeUMsSUFBSSw0QkFBNEIsQ0FDMUUsQ0FBQztRQUNGLFFBQVEsR0FBRyxXQUFXLENBQUMsSUFBSSxDQUFDLENBQUM7S0FDOUI7SUFFRCxJQUFJLE1BQU0sQ0FBQyxTQUFTLENBQUMsY0FBYyxDQUFDLElBQUksQ0FBQyxLQUFLLEVBQUUsSUFBSSxDQUFDLEVBQUU7UUFDckQsU0FBUyxHQUFHLEtBQUssQ0FBQyxJQUFJLENBQUMsQ0FBQztLQUN6QjtTQUFNO1FBQ0wsU0FBUyxHQUFHLFlBQUU7YUFDWCxZQUFZLENBQUMsUUFBUSxDQUFDO2FBQ3RCLFFBQVEsRUFBRTthQUNWLEtBQUssQ0FBQyxJQUFJLENBQUM7YUFDWCxNQUFNLENBQUMsQ0FBQyxHQUFHLEVBQUUsRUFBRTtZQUNkLE9BQU8sR0FBRyxDQUFDLE1BQU0sR0FBRyxDQUFDLENBQUM7UUFDeEIsQ0FBQyxDQUFDLENBQUM7UUFDTCxLQUFLLENBQUMsSUFBSSxDQUFDLEdBQUcsU0FBUyxDQUFDO0tBQ3pCO0lBRUQsTUFBTSxhQUFhLEdBQUcsaUJBQWlCLENBQUMsT0FBTyxDQUFDLENBQUM7SUFDakQsTUFBTSxLQUFLLEdBQUcsYUFBYSxDQUFDLGFBQWEsQ0FBQyxDQUFDO0lBQzNDLE1BQU0sb0JBQW9CLEdBQWEsRUFBRSxDQUFDO0lBQzFDLElBQUksS0FBSyxHQUFHLENBQUMsQ0FBQztJQUVkLEtBQUssQ0FBQyxPQUFPLENBQUMsQ0FBQyxJQUFJLEVBQUUsRUFBRTtRQUNyQixLQUFLLEVBQUUsQ0FBQztRQUNSLElBQUksU0FBUyxDQUFDLE9BQU8sQ0FBQyxJQUFJLENBQUMsV0FBVyxFQUFFLENBQUMsR0FBRyxDQUFDLENBQUMsRUFBRTtZQUM5QyxvQkFBb0IsQ0FBQyxJQUFJLENBQUMsSUFBSSxDQUFDLFdBQVcsRUFBRSxDQUFDLENBQUM7U0FDL0M7SUFDSCxDQUFDLENBQUMsQ0FBQztJQUVILE9BQU87UUFDTCxhQUFhLEVBQUUsb0JBQW9CLENBQUMsTUFBTTtRQUMxQyxTQUFTLEVBQUUsb0JBQW9CO1FBQy9CLFNBQVMsRUFBRSxLQUFLO0tBQ2pCLENBQUM7QUFDSixDQUFDLENBQUM7QUFFRixrQkFBZSxTQUFTLENBQUMiLCJzb3VyY2VzQ29udGVudCI6WyJpbXBvcnQgZnMgZnJvbSBcImZzXCI7XG5pbXBvcnQgcGF0aCBmcm9tIFwicGF0aFwiO1xuXG5pbnRlcmZhY2UgU3RvcFdvcmRzIHtcbiAgc3RvcFdvcmRDb3VudDogbnVtYmVyO1xuICBzdG9wV29yZHM6IHN0cmluZ1tdO1xuICB3b3JkQ291bnQ6IG51bWJlcjtcbn1cblxuY29uc3QgY2FjaGU6IHsgW2tleTogc3RyaW5nXTogc3RyaW5nW10gfSA9IHt9O1xuXG5jb25zdCBjYW5kaWF0ZVdvcmRzID0gKHN0cmlwcGVkSW5wdXQ6IHN0cmluZyk6IHN0cmluZ1tdID0+IHtcbiAgcmV0dXJuIHN0cmlwcGVkSW5wdXQuc3BsaXQoXCIgXCIpO1xufTtcblxuY29uc3QgZ2V0RmlsZVBhdGggPSAobGFuZzogc3RyaW5nKSA9PiB7XG4gIHJldHVybiBwYXRoLnJlc29sdmUoX19kaXJuYW1lLCBgLi4vZGF0YS9zdG9wd29yZHMvc3RvcHdvcmRzLSR7bGFuZ30udHh0YCk7XG59O1xuXG5jb25zdCByZW1vdmVQdW5jdHVhdGlvbiA9IChjb250ZW50OiBzdHJpbmcpID0+IHtcbiAgcmV0dXJuIGNvbnRlbnQucmVwbGFjZSgvW3xAPD5bXFxdXCInLiwtLyM/ISQlXiYqKzs6e309XFwtX2B+KCldL2csIFwiXCIpO1xufTtcblxuY29uc3Qgc3RvcHdvcmRzID0gKGNvbnRlbnQ6IHN0cmluZywgbGFuZyA9IFwiZW5cIik6IFN0b3BXb3JkcyA9PiB7XG4gIGxldCBmaWxlUGF0aCA9IGdldEZpbGVQYXRoKGxhbmcpO1xuICBsZXQgc3RvcFdvcmRzOiBzdHJpbmdbXTtcblxuICBpZiAoIWZzLmV4aXN0c1N5bmMoZmlsZVBhdGgpKSB7XG4gICAgY29uc29sZS53YXJuKFxuICAgICAgYFdBUk5JTkc6IE5vIHN0b3B3b3JkcyBmaWxlIGZvdW5kIGZvciAnJHtsYW5nfScgLSBkZWZhdWx0aW5nIHRvIEVuZ2xpc2ghYCxcbiAgICApO1xuICAgIGZpbGVQYXRoID0gZ2V0RmlsZVBhdGgoXCJlblwiKTtcbiAgfVxuXG4gIGlmIChPYmplY3QucHJvdG90eXBlLmhhc093blByb3BlcnR5LmNhbGwoY2FjaGUsIGxhbmcpKSB7XG4gICAgc3RvcFdvcmRzID0gY2FjaGVbbGFuZ107XG4gIH0gZWxzZSB7XG4gICAgc3RvcFdvcmRzID0gZnNcbiAgICAgIC5yZWFkRmlsZVN5bmMoZmlsZVBhdGgpXG4gICAgICAudG9TdHJpbmcoKVxuICAgICAgLnNwbGl0KFwiXFxuXCIpXG4gICAgICAuZmlsdGVyKChzdHIpID0+IHtcbiAgICAgICAgcmV0dXJuIHN0ci5sZW5ndGggPiAwO1xuICAgICAgfSk7XG4gICAgY2FjaGVbbGFuZ10gPSBzdG9wV29yZHM7XG4gIH1cblxuICBjb25zdCBzdHJpcHBlZElucHV0ID0gcmVtb3ZlUHVuY3R1YXRpb24oY29udGVudCk7XG4gIGNvbnN0IHdvcmRzID0gY2FuZGlhdGVXb3JkcyhzdHJpcHBlZElucHV0KTtcbiAgY29uc3Qgb3ZlcmxhcHBpbmdTdG9wd29yZHM6IHN0cmluZ1tdID0gW107XG4gIGxldCBjb3VudCA9IDA7XG5cbiAgd29yZHMuZm9yRWFjaCgod29yZCkgPT4ge1xuICAgIGNvdW50Kys7XG4gICAgaWYgKHN0b3BXb3Jkcy5pbmRleE9mKHdvcmQudG9Mb3dlckNhc2UoKSkgPiAtMSkge1xuICAgICAgb3ZlcmxhcHBpbmdTdG9wd29yZHMucHVzaCh3b3JkLnRvTG93ZXJDYXNlKCkpO1xuICAgIH1cbiAgfSk7XG5cbiAgcmV0dXJuIHtcbiAgICBzdG9wV29yZENvdW50OiBvdmVybGFwcGluZ1N0b3B3b3Jkcy5sZW5ndGgsXG4gICAgc3RvcFdvcmRzOiBvdmVybGFwcGluZ1N0b3B3b3JkcyxcbiAgICB3b3JkQ291bnQ6IGNvdW50LFxuICB9O1xufTtcblxuZXhwb3J0IGRlZmF1bHQgc3RvcHdvcmRzO1xuIl19