unfluff

Version:

A web page content extractor

48 lines (36 loc) • 1.34 kB

text/coffeescript

path = require('path')
fs = require('fs')
_ = require('lodash')

# Per-language cache of stop word lookup tables. Created without a prototype
# so that a language code can never collide with an inherited Object property.
cache = Object.create(null)

# Builds the path of the stop word list for `language`, located at
# ../data/stopwords/stopwords-<language>.txt relative to this module.
getFilePath = (language) ->
  path.join(__dirname, "..", "data", "stopwords", "stopwords-#{language}.txt")

# Returns the stop word lookup table for `language` (keys are the stop words,
# every value is `true`), reading and caching it on first use.
#
# When no stop word file exists for `language`, a warning is written to stderr
# and the English list is used instead. The table is cached under the
# requested language, so the file system is only touched, and the warning only
# printed, on the first request for that language.
loadStopWords = (language) ->
  return cache[language] if cache[language]?

  filePath = getFilePath(language)
  unless fs.existsSync(filePath)
    console.error("WARNING: No stopwords file found for '#{language}' - defaulting to English!")
    filePath = getFilePath('en')

  lookup = Object.create(null)
  # Split on both LF and CRLF so that a list checked out with Windows line
  # endings does not leave a trailing '\r' on every word.
  for line in fs.readFileSync(filePath).toString().split(/\r?\n/) when line.length > 0
    lookup[line] = true
  cache[language] = lookup

# Given a language, loads a list of stop words for that language
# and then returns which of those words exist in the given content.
#
# content  - String of text to analyse.
# language - Language code selecting the stop word list (default: 'en').
#
# Returns an object with:
#   wordCount     - number of space-separated tokens in the stripped content
#   stopwordCount - number of tokens that are stop words
#   stopWords     - the matching tokens, lower-cased, in order of appearance
#                   (duplicates are kept)
module.exports = stopwords = (content, language = 'en') ->
  stopWords = loadStopWords(language)

  words = candidateWords(removePunctuation(content))
  lowered = _.map(words, (w) -> w.toLowerCase())
  # Table lookup instead of scanning the whole stop word list for every word.
  overlappingStopwords = _.filter(lowered, (w) -> stopWords[w] is true)

  {
    wordCount: words.length
    stopwordCount: overlappingStopwords.length
    stopWords: overlappingStopwords
  }

# Strips punctuation characters from `content`. Note that `,-\/` inside the
# character class is a range, which covers `,`, `-`, `.` and `/`.
removePunctuation = (content) ->
  content.replace(/[\|\@\<\>\[\]\"\'\.,-\/#\?!$%\^&\*\+;:{}=\-_`~()]/g, "")

# Splits the punctuation-free text into candidate words on single spaces.
# Consecutive spaces therefore produce empty-string entries, which are still
# counted in `wordCount`.
candidateWords = (strippedInput) ->
  strippedInput.split(' ')