UNPKG

site-metadata-extractor

Version:
55 lines 6.35 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const fs_1 = __importDefault(require("fs")); const path_1 = __importDefault(require("path")); const cache = {}; const candiateWords = (strippedInput) => { return strippedInput.split(" "); }; const getFilePath = (lang) => { return path_1.default.resolve(__dirname, `../data/stopwords/stopwords-${lang}.txt`); }; const removePunctuation = (content) => { return content.replace(/[|@<>[\]"'.,-/#?!$%^&*+;:{}=\-_`~()]/g, ""); }; const stopwords = (content, lang = "en") => { let filePath = getFilePath(lang); let stopWords; if (!fs_1.default.existsSync(filePath)) { console.warn(`WARNING: No stopwords file found for '${lang}' - defaulting to English!`); filePath = getFilePath("en"); } if (Object.prototype.hasOwnProperty.call(cache, lang)) { stopWords = cache[lang]; } else { stopWords = fs_1.default .readFileSync(filePath) .toString() .split("\n") .filter((str) => { return str.length > 0; }); cache[lang] = stopWords; } const strippedInput = removePunctuation(content); const words = candiateWords(strippedInput); const overlappingStopwords = []; let count = 0; words.forEach((word) => { count++; if (stopWords.indexOf(word.toLowerCase()) > -1) { overlappingStopwords.push(word.toLowerCase()); } }); return { stopWordCount: overlappingStopwords.length, stopWords: overlappingStopwords, wordCount: count, }; }; exports.default = stopwords; //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoic3RvcHdvcmRzLmpzIiwic291cmNlUm9vdCI6IiIsInNvdXJjZXMiOlsiLi4vc3JjL3N0b3B3b3Jkcy50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiOzs7OztBQUFBLDRDQUFvQjtBQUNwQixnREFBd0I7QUFReEIsTUFBTSxLQUFLLEdBQWdDLEVBQUUsQ0FBQztBQUU5QyxNQUFNLGFBQWEsR0FBRyxDQUFDLGFBQXFCLEVBQVksRUFBRTtJQUN4RCxPQUFPLGFBQWEsQ0FBQyxLQUFLLENBQUMsR0FBRyxDQUFDLENBQUM7QUFDbEMsQ0FBQyxDQUFDO0FBRUYsTUFBTSxXQUFXLEdBQUcsQ0FBQyxJQUFZLEVBQUUsRUFBRTtJQUNuQyxPQUFPLGNBQUksQ0FBQyxPQUFPLENBQUMsU0FBUyxFQUFFLCtCQUErQixJQUFJLE1BQU0sQ0FBQyxDQUFDO0FBQzVFLENBQUMsQ0FBQztBQUVGLE1BQU0saUJBQWlCLEdBQUcsQ0FBQyxPQUFlLEVBQUUsRUFBRTtJQUM1QyxPQUFPLE9BQU8sQ0FBQyxPQUFPLENBQUMsdUNBQXVDLEVBQUUsRUFBRSxDQUFDLENBQUM7QUFDdEUsQ0FBQyxDQUFDO0FBRUYsTUFBTSxTQUFTLEdBQUcsQ0FBQyxPQUFlLEVBQUUsSUFBSSxHQUFHLElBQUksRUFBYSxFQUFFO0lBQzVELElBQUksUUFBUSxHQUFHLFdBQVcsQ0FBQyxJQUFJLENBQUMsQ0FBQztJQUNqQyxJQUFJLFNBQW1CLENBQUM7SUFFeEIsSUFBSSxDQUFDLFlBQUUsQ0FBQyxVQUFVLENBQUMsUUFBUSxDQUFDLEVBQUU7UUFDNUIsT0FBTyxDQUFDLElBQUksQ0FDVix5Q0FBeUMsSUFBSSw0QkFBNEIsQ0FDMUUsQ0FBQztRQUNGLFFBQVEsR0FBRyxXQUFXLENBQUMsSUFBSSxDQUFDLENBQUM7S0FDOUI7SUFFRCxJQUFJLE1BQU0sQ0FBQyxTQUFTLENBQUMsY0FBYyxDQUFDLElBQUksQ0FBQyxLQUFLLEVBQUUsSUFBSSxDQUFDLEVBQUU7UUFDckQsU0FBUyxHQUFHLEtBQUssQ0FBQyxJQUFJLENBQUMsQ0FBQztLQUN6QjtTQUFNO1FBQ0wsU0FBUyxHQUFHLFlBQUU7YUFDWCxZQUFZLENBQUMsUUFBUSxDQUFDO2FBQ3RCLFFBQVEsRUFBRTthQUNWLEtBQUssQ0FBQyxJQUFJLENBQUM7YUFDWCxNQUFNLENBQUMsQ0FBQyxHQUFHLEVBQUUsRUFBRTtZQUNkLE9BQU8sR0FBRyxDQUFDLE1BQU0sR0FBRyxDQUFDLENBQUM7UUFDeEIsQ0FBQyxDQUFDLENBQUM7UUFDTCxLQUFLLENBQUMsSUFBSSxDQUFDLEdBQUcsU0FBUyxDQUFDO0tBQ3pCO0lBRUQsTUFBTSxhQUFhLEdBQUcsaUJBQWlCLENBQUMsT0FBTyxDQUFDLENBQUM7SUFDakQsTUFBTSxLQUFLLEdBQUcsYUFBYSxDQUFDLGFBQWEsQ0FBQyxDQUFDO0lBQzNDLE1BQU0sb0JBQW9CLEdBQWEsRUFBRSxDQUFDO0lBQzFDLElBQUksS0FBSyxHQUFHLENBQUMsQ0FBQztJQUVkLEtBQUssQ0FBQyxPQUFPLENBQUMsQ0FBQyxJQUFJLEVBQUUsRUFBRTtRQUNyQixLQUFLLEVBQUUsQ0FBQztRQUNSLElBQUksU0FBUyxDQUFDLE9BQU8sQ0FBQyxJQUFJLENBQUMsV0FBVyxFQUFFLENBQUMsR0FBRyxDQUFDLENBQUMsRUFBRTtZQUM5QyxvQkFBb0IsQ0FBQyxJQUFJLENBQUMsSUFBSSxDQUFDLFdBQVcsRUFBRSxDQUFDLENBQUM7U0FDL0M7SUFDSCxDQUFDLENBQUMsQ0FBQztJQUVILE9BQU87UUFDTCxhQUFhLEVBQUUsb0JBQW9CLENBQUMsTUFBTTtRQUMxQyxTQUFTLEVBQUUsb0JBQW9CO1FBQy9CLFNBQVMsRUFBRSxLQUFLO0tBQ2pCLENBQUM7QUFDSixDQUFDLENBQUM7QUFFRixrQkFBZSxTQUFTLENBQUMiLCJzb3VyY2VzQ29udGVudCI6WyJpbXBvcnQgZnMgZnJvbSBcImZzXCI7XG5pbXBvcnQgcGF0aCBmcm9tIFwicGF0aFwiO1xuXG5pbnRlcmZhY2UgU3RvcFdvcmRzIHtcbiAgc3RvcFdvcmRDb3VudDogbnVtYmVyO1xuICBzdG9wV29yZHM6IHN0cmluZ1tdO1xuICB3b3JkQ291bnQ6IG51bWJlcjtcbn1cblxuY29uc3QgY2FjaGU6IHsgW2tleTogc3RyaW5nXTogc3RyaW5nW10gfSA9IHt9O1xuXG5jb25zdCBjYW5kaWF0ZVdvcmRzID0gKHN0cmlwcGVkSW5wdXQ6IHN0cmluZyk6IHN0cmluZ1tdID0+IHtcbiAgcmV0dXJuIHN0cmlwcGVkSW5wdXQuc3BsaXQoXCIgXCIpO1xufTtcblxuY29uc3QgZ2V0RmlsZVBhdGggPSAobGFuZzogc3RyaW5nKSA9PiB7XG4gIHJldHVybiBwYXRoLnJlc29sdmUoX19kaXJuYW1lLCBgLi4vZGF0YS9zdG9wd29yZHMvc3RvcHdvcmRzLSR7bGFuZ30udHh0YCk7XG59O1xuXG5jb25zdCByZW1vdmVQdW5jdHVhdGlvbiA9IChjb250ZW50OiBzdHJpbmcpID0+IHtcbiAgcmV0dXJuIGNvbnRlbnQucmVwbGFjZSgvW3xAPD5bXFxdXCInLiwtLyM/ISQlXiYqKzs6e309XFwtX2B+KCldL2csIFwiXCIpO1xufTtcblxuY29uc3Qgc3RvcHdvcmRzID0gKGNvbnRlbnQ6IHN0cmluZywgbGFuZyA9IFwiZW5cIik6IFN0b3BXb3JkcyA9PiB7XG4gIGxldCBmaWxlUGF0aCA9IGdldEZpbGVQYXRoKGxhbmcpO1xuICBsZXQgc3RvcFdvcmRzOiBzdHJpbmdbXTtcblxuICBpZiAoIWZzLmV4aXN0c1N5bmMoZmlsZVBhdGgpKSB7XG4gICAgY29uc29sZS53YXJuKFxuICAgICAgYFdBUk5JTkc6IE5vIHN0b3B3b3JkcyBmaWxlIGZvdW5kIGZvciAnJHtsYW5nfScgLSBkZWZhdWx0aW5nIHRvIEVuZ2xpc2ghYCxcbiAgICApO1xuICAgIGZpbGVQYXRoID0gZ2V0RmlsZVBhdGgoXCJlblwiKTtcbiAgfVxuXG4gIGlmIChPYmplY3QucHJvdG90eXBlLmhhc093blByb3BlcnR5LmNhbGwoY2FjaGUsIGxhbmcpKSB7XG4gICAgc3RvcFdvcmRzID0gY2FjaGVbbGFuZ107XG4gIH0gZWxzZSB7XG4gICAgc3RvcFdvcmRzID0gZnNcbiAgICAgIC5yZWFkRmlsZVN5bmMoZmlsZVBhdGgpXG4gICAgICAudG9TdHJpbmcoKVxuICAgICAgLnNwbGl0KFwiXFxuXCIpXG4gICAgICAuZmlsdGVyKChzdHIpID0+IHtcbiAgICAgICAgcmV0dXJuIHN0ci5sZW5ndGggPiAwO1xuICAgICAgfSk7XG4gICAgY2FjaGVbbGFuZ10gPSBzdG9wV29yZHM7XG4gIH1cblxuICBjb25zdCBzdHJpcHBlZElucHV0ID0gcmVtb3ZlUHVuY3R1YXRpb24oY29udGVudCk7XG4gIGNvbnN0IHdvcmRzID0gY2FuZGlhdGVXb3JkcyhzdHJpcHBlZElucHV0KTtcbiAgY29uc3Qgb3ZlcmxhcHBpbmdTdG9wd29yZHM6IHN0cmluZ1tdID0gW107XG4gIGxldCBjb3VudCA9IDA7XG5cbiAgd29yZHMuZm9yRWFjaCgod29yZCkgPT4ge1xuICAgIGNvdW50Kys7XG4gICAgaWYgKHN0b3BXb3Jkcy5pbmRleE9mKHdvcmQudG9Mb3dlckNhc2UoKSkgPiAtMSkge1xuICAgICAgb3ZlcmxhcHBpbmdTdG9wd29yZHMucHVzaCh3b3JkLnRvTG93ZXJDYXNlKCkpO1xuICAgIH1cbiAgfSk7XG5cbiAgcmV0dXJuIHtcbiAgICBzdG9wV29yZENvdW50OiBvdmVybGFwcGluZ1N0b3B3b3Jkcy5sZW5ndGgsXG4gICAgc3RvcFdvcmRzOiBvdmVybGFwcGluZ1N0b3B3b3JkcyxcbiAgICB3b3JkQ291bnQ6IGNvdW50LFxuICB9O1xufTtcblxuZXhwb3J0IGRlZmF1bHQgc3RvcHdvcmRzO1xuIl19