UNPKG

text-readability

Version:

npm package to calculate statistics from text to determine readability, complexity and grade level of a particular corpus.

369 lines (345 loc) 13.1 kB
import { syllable } from 'syllable';
import pluralize from 'pluralize';
import easyWords from './easy_words.js';

// Characters stripped before counting letters, words and syllables.
const punctuationRE = /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-./:;<=>?@[\]^_`{|}~]/g;

// Lookup set of the "easy" words used by the difficult-word based formulas.
const easyWordSet = new Set(easyWords);

/**
 * Returns `x` scaled by the sign of `y`; yields 0 when `y` is zero.
 *
 * @param {number} x - Magnitude to sign.
 * @param {number} y - Value supplying the sign.
 * @returns {number} `x`, `-x`, or 0 when `y === 0`.
 */
function copySign(x, y) {
  if (y === 0) return 0;
  return x * (y / Math.abs(y));
}

/**
 * Rounds `number` to `points` decimal places by adding a signed 0.5 and
 * flooring (the rounding rule this library uses for every score).
 * Non-finite input produces NaN.
 *
 * @param {number} number - Value to round.
 * @param {number} [points=0] - Number of decimal places to keep.
 * @returns {number} The rounded value.
 */
function legacyRound(number, points = 0) {
  const p = 10 ** points;
  return Math.floor(number * p + copySign(0.5, number)) / p;
}

/**
 * Rounds with `legacyRound` and maps a NaN result (e.g. 0/0 or x/0 on text
 * without words) to 0.
 *
 * @param {number} value - Value to round.
 * @param {number} points - Number of decimal places to keep.
 * @returns {number} The rounded value, or 0 when it is NaN.
 */
function roundOrZero(value, points) {
  const rounded = legacyRound(value, points);
  return Number.isNaN(rounded) ? 0.0 : rounded;
}

// The original module installed these helpers on the global Math object.
// They are kept for backward compatibility; the class itself only uses the
// module-local functions above.
Math.copySign = copySign;
Math.legacyRound = legacyRound;

/**
 * Text statistics and readability formulas (Flesch, SMOG, Coleman-Liau,
 * ARI, Dale-Chall, Gunning Fog, LIX/RIX, Linsear Write) plus consensus
 * helpers built on top of them.
 */
class Readability {
  /**
   * Returns the English ordinal suffix for a grade number.
   * 11, 12 and 13 (and 111, 112, ...) take "th"; otherwise the last digit
   * decides: 1 -> "st", 2 -> "nd", 3 -> "rd", anything else -> "th".
   * Negative and NaN inputs yield "th".
   *
   * @param {number} grade - Grade number; floored before use.
   * @returns {string} One of "st", "nd", "rd", "th".
   */
  static getGradeSuffix(grade) {
    const whole = Math.floor(grade);
    const lastTwo = whole % 100;
    if (lastTwo >= 11 && lastTwo <= 13) return 'th';
    switch (whole % 10) {
      case 1:
        return 'st';
      case 2:
        return 'nd';
      case 3:
        return 'rd';
      default:
        return 'th';
    }
  }

  /**
   * Counts characters in `text`.
   *
   * @param {string} text - Input text.
   * @param {boolean} [ignoreSpaces=true] - Drop space characters first.
   * @returns {number} Number of UTF-16 code units counted.
   */
  charCount(text, ignoreSpaces = true) {
    const source = ignoreSpaces ? text.replace(/ /g, '') : text;
    return source.length;
  }

  /**
   * Counts characters in `text` after punctuation has been removed.
   *
   * @param {string} text - Input text.
   * @param {boolean} [ignoreSpaces=true] - Drop space characters first.
   * @returns {number} Number of remaining characters.
   */
  letterCount(text, ignoreSpaces = true) {
    const source = ignoreSpaces ? text.replace(/ /g, '') : text;
    return this.removePunctuation(source).length;
  }

  /**
   * Strips every character matched by `punctuationRE`.
   *
   * @param {string} text - Input text.
   * @returns {string} Text without punctuation.
   */
  removePunctuation(text) {
    return text.replace(punctuationRE, '');
  }

  /**
   * Splits text on commas, spaces, newlines and carriage returns and drops
   * empty tokens.
   *
   * @param {string} text - Input text.
   * @returns {string[]} Non-empty tokens.
   */
  static split(text) {
    return text.split(/,| |\n|\r/g).filter((token) => token);
  }

  /**
   * Counts the words in `text`.
   *
   * @param {string} text - Input text.
   * @param {boolean} [removePunctuation=true] - Strip punctuation first.
   * @returns {number} Number of tokens.
   */
  lexiconCount(text, removePunctuation = true) {
    const source = removePunctuation ? this.removePunctuation(text) : text;
    return Readability.split(source).length;
  }

  /**
   * Counts syllables using the `syllable` package after lower-casing and
   * removing punctuation.
   *
   * @param {string} text - Input text.
   * @param {string} [lang='en-US'] - Locale used for lower-casing.
   * @returns {number} Syllable count; 0 when nothing is left to count.
   */
  syllableCount(text, lang = 'en-US') {
    const cleaned = this.removePunctuation(text.toLocaleLowerCase(lang));
    if (!cleaned) return 0;
    return syllable(cleaned);
  }

  /**
   * Counts sentences. Text is split at `.`, `?` or `!` (optionally followed
   * by closing quotes/brackets) when the next character after the separator
   * is an upper-case A-Z letter. Fragments of two words or fewer are ignored.
   *
   * @param {string} text - Input text.
   * @returns {number} Sentence count, never less than 1.
   */
  sentenceCount(text) {
    const sentences = text.split(/ *[.?!]['")\]]*[ |\n](?=[A-Z])/g);
    const valid = sentences.filter((sentence) => this.lexiconCount(sentence) > 2).length;
    return Math.max(valid, 1);
  }

  /**
   * @param {string} text - Input text.
   * @returns {number} Words per sentence, rounded to 1 decimal place.
   */
  averageSentenceLength(text) {
    return roundOrZero(this.lexiconCount(text) / this.sentenceCount(text), 1);
  }

  /**
   * @param {string} text - Input text.
   * @returns {number} Syllables per word, rounded to 1 decimal place
   *   (0 when the text has no words).
   */
  averageSyllablePerWord(text) {
    return roundOrZero(this.syllableCount(text) / this.lexiconCount(text), 1);
  }

  /**
   * @param {string} text - Input text.
   * @returns {number} Characters per word, rounded to 2 decimal places
   *   (0 when the text has no words).
   */
  averageCharacterPerWord(text) {
    return roundOrZero(this.charCount(text) / this.lexiconCount(text), 2);
  }

  /**
   * @param {string} text - Input text.
   * @returns {number} Letters per word, rounded to 2 decimal places
   *   (0 when the text has no words).
   */
  averageLetterPerWord(text) {
    return roundOrZero(this.letterCount(text) / this.lexiconCount(text), 2);
  }

  /**
   * @param {string} text - Input text.
   * @returns {number} Sentences per word, rounded to 2 decimal places
   *   (0 when the text has no words).
   */
  averageSentencePerWord(text) {
    return roundOrZero(this.sentenceCount(text) / this.lexiconCount(text), 2);
  }

  /**
   * Flesch Reading Ease: 206.835 - 1.015 * ASL - 84.6 * ASW.
   *
   * @param {string} text - Input text.
   * @returns {number} Score rounded to 2 decimal places.
   */
  fleschReadingEase(text) {
    const sentenceLength = this.averageSentenceLength(text);
    const syllablesPerWord = this.averageSyllablePerWord(text);
    const flesch = 206.835 - (1.015 * sentenceLength) - (84.6 * syllablesPerWord);
    return legacyRound(flesch, 2);
  }

  /**
   * Maps a Flesch Reading Ease score to a grade level.
   *
   * @param {number} score - Flesch Reading Ease score.
   * @returns {number} Grade level (5 .. 16); 16 for anything below 30.
   */
  fleschReadingEaseToGrade(score) {
    if (score >= 90) return 5;
    if (score >= 80) return 6;
    if (score >= 70) return 7;
    if (score >= 60) return 8.5;
    if (score >= 50) return 11;
    if (score >= 40) return 13; // college
    if (score >= 30) return 15;
    return 16;
  }

  /**
   * Flesch-Kincaid Grade: 0.39 * ASL + 11.8 * ASW - 15.59.
   *
   * @param {string} text - Input text.
   * @returns {number} Grade rounded to 1 decimal place.
   */
  fleschKincaidGrade(text) {
    const sentenceLength = this.averageSentenceLength(text);
    const syllablePerWord = this.averageSyllablePerWord(text);
    const flesch = 0.39 * sentenceLength + 11.8 * syllablePerWord - 15.59;
    return legacyRound(flesch, 1);
  }

  /**
   * Counts words with three or more syllables.
   *
   * @param {string} text - Input text.
   * @returns {number} Number of polysyllabic words.
   */
  polySyllableCount(text) {
    return Readability.split(text).filter((word) => this.syllableCount(word) >= 3).length;
  }

  /**
   * SMOG index. Only computed for texts with at least three sentences.
   *
   * @param {string} text - Input text.
   * @returns {number} Index rounded to 1 decimal place, or 0 when the text
   *   has fewer than three sentences.
   */
  smogIndex(text) {
    const sentences = this.sentenceCount(text);
    if (sentences < 3) return 0.0;
    const polySyllab = this.polySyllableCount(text);
    const smog = 1.043 * (30 * (polySyllab / sentences)) ** 0.5 + 3.1291;
    return roundOrZero(smog, 1);
  }

  /**
   * Coleman-Liau index: 0.058 * L - 0.296 * S - 15.8, where L and S are the
   * average letters and sentences per 100 words.
   *
   * @param {string} text - Input text.
   * @returns {number} Index rounded to 2 decimal places.
   */
  colemanLiauIndex(text) {
    const letters = legacyRound(this.averageLetterPerWord(text) * 100, 2);
    const sentences = legacyRound(this.averageSentencePerWord(text) * 100, 2);
    const coleman = 0.058 * letters - 0.296 * sentences - 15.8;
    return legacyRound(coleman, 2);
  }

  /**
   * Automated Readability Index:
   * 4.71 * chars/word + 0.5 * words/sentence - 21.43.
   *
   * @param {string} text - Input text.
   * @returns {number} Index rounded to 1 decimal place (0 when the text has
   *   no words).
   */
  automatedReadabilityIndex(text) {
    const characters = this.charCount(text);
    const words = this.lexiconCount(text);
    const sentences = this.sentenceCount(text);
    const readability = (
      (4.71 * legacyRound(characters / words, 2)) +
      (0.5 * legacyRound(words / sentences, 2)) -
      21.43
    );
    return roundOrZero(readability, 1);
  }

  /**
   * Linsear Write formula over the first 100 words: words under three
   * syllables score 1, the rest score 3; the total is divided by the
   * sentence count of that sample and then halved (after subtracting 2 when
   * the quotient is 20 or less).
   *
   * @param {string} text - Input text.
   * @returns {number} Score rounded to 1 decimal place.
   */
  linsearWriteFormula(text) {
    const sample = Readability.split(text).slice(0, 100);
    let easyWord = 0;
    let difficultWord = 0;
    for (const word of sample) {
      if (this.syllableCount(word) < 3) {
        easyWord += 1;
      } else {
        difficultWord += 1;
      }
    }
    const number = (easyWord * 1 + difficultWord * 3) / this.sentenceCount(sample.join(' '));
    const returnVal = number <= 20 ? (number - 2) / 2 : number / 2;
    return legacyRound(returnVal, 1);
  }

  /**
   * Heuristically strips "-ed" / "-ing" from words of six or more characters.
   * Good enough for most long words -- only "difficult" words matter here.
   * Does not work for words ending in "e" that are not in the easy-word set.
   *
   * @param {string} word - Lower-cased word.
   * @returns {string} Approximate present-tense form.
   */
  presentTense(word) {
    if (word.length < 6) return word;
    if (word.endsWith('ed')) {
      const withoutD = word.slice(0, -1);
      // An "easy" word ending in "e" only had "d" appended.
      if (easyWordSet.has(withoutD)) return withoutD;
      return word.slice(0, -2); // assume plain "ed" was appended
    }
    if (word.endsWith('ing')) {
      const stem = word.slice(0, -3);
      const suffixIngToE = `${stem}e`; // e.g. forcing -> force
      return easyWordSet.has(suffixIngToE) ? suffixIngToE : stem;
    }
    return word;
  }

  /**
   * Counts the distinct difficult words in `text`.
   *
   * @param {string} text - Input text.
   * @param {number} [syllableThreshold] - Minimum syllables for a word to be
   *   difficult; when omitted the default of `difficultWordsSet` applies.
   * @returns {number} Number of distinct difficult words.
   */
  difficultWords(text, syllableThreshold) {
    return this.difficultWordsSet(text, syllableThreshold).size;
  }

  /**
   * Collects the distinct words that are not in the easy-word set (after
   * singularising and `presentTense` normalisation) and have at least
   * `syllableThreshold` syllables. Words are stored as they appear in the
   * text (original casing).
   *
   * @param {string} text - Input text.
   * @param {number} [syllableThreshold=2] - Minimum syllable count.
   * @returns {Set<string>} The difficult words.
   */
  difficultWordsSet(text, syllableThreshold = 2) {
    const diffWordsSet = new Set();
    const textList = text.match(/[\w=‘’]+/g);
    if (textList === null) return diffWordsSet;
    for (const word of textList) {
      const normalized = this.presentTense(pluralize(word.toLocaleLowerCase(), 1));
      if (!easyWordSet.has(normalized) && this.syllableCount(word) >= syllableThreshold) {
        diffWordsSet.add(word);
      }
    }
    return diffWordsSet;
  }

  /**
   * Dale-Chall readability score:
   * 0.1579 * (% difficult words) + 0.0496 * ASL, plus 3.6365 when more than
   * 5% of the words are difficult.
   *
   * @param {string} text - Input text.
   * @returns {number} Score rounded to 2 decimal places (0 when the
   *   percentage cannot be computed, e.g. no words).
   */
  daleChallReadabilityScore(text) {
    const wordCount = this.lexiconCount(text);
    const count = wordCount - this.difficultWords(text);
    const per = (count / wordCount * 100);
    if (Number.isNaN(per)) return 0.0;
    const difficultWords = 100 - per;
    let score = (0.1579 * difficultWords) + (0.0496 * this.averageSentenceLength(text));
    if (difficultWords > 5) score += 3.6365;
    return legacyRound(score, 2);
  }

  /**
   * Maps a Dale-Chall score to a grade level.
   *
   * @param {number} score - Dale-Chall readability score.
   * @returns {number} Grade level (4 .. 16).
   */
  daleChallToGrade(score) {
    if (score <= 4.9) return 4;
    if (score < 5.9) return 5;
    if (score < 6.9) return 7;
    if (score < 7.9) return 9;
    if (score < 8.9) return 11;
    if (score < 9.9) return 13;
    return 16;
  }

  /**
   * Gunning Fog index: 0.4 * (ASL + % of difficult words with 3+ syllables).
   *
   * @param {string} text - Input text.
   * @returns {number} Index rounded to 2 decimal places (0 when the
   *   percentage cannot be computed, e.g. no words).
   */
  gunningFog(text) {
    const perDiffWords = (this.difficultWords(text, 3) / this.lexiconCount(text) * 100);
    if (Number.isNaN(perDiffWords)) return 0.0;
    const grade = 0.4 * (this.averageSentenceLength(text) + perDiffWords);
    return legacyRound(grade, 2);
  }

  /**
   * LIX: average sentence length plus the percentage of words longer than
   * six characters.
   *
   * @param {string} text - Input text.
   * @returns {number} Score rounded to 2 decimal places; 0 when the text
   *   contains no words (previously NaN from 0/0).
   */
  lix(text) {
    const words = Readability.split(text);
    if (words.length === 0) return 0.0;
    const longWords = words.filter((wrd) => wrd.length > 6).length;
    const perLongWords = longWords * 100 / words.length;
    return legacyRound(this.averageSentenceLength(text) + perLongWords, 2);
  }

  /**
   * RIX: number of words longer than six characters per sentence.
   *
   * @param {string} text - Input text.
   * @returns {number} Score rounded to 2 decimal places.
   */
  rix(text) {
    const longWordsCount = Readability.split(text).filter((wrd) => wrd.length > 6).length;
    const rix = longWordsCount / this.sentenceCount(text);
    return !Number.isNaN(rix) ? legacyRound(rix, 2) : 0.0;
  }

  /**
   * Readability consensus: collects the rounded and ceiled value of each
   * grade formula (plus the Flesch Reading Ease grade) and picks the most
   * frequent value. On a tie the value that first appeared latest wins.
   *
   * @param {string} text - Input text.
   * @param {boolean|null} [floatOutput=null] - When truthy, return the
   *   numeric grade instead of a sentence.
   * @returns {number|string} The grade, or e.g. "7th and 8th grade".
   */
  textStandard(text, floatOutput = null) {
    const grade = [];
    // Each metric is evaluated once; both its rounded and ceiled values vote.
    const pushBounds = (value) => {
      grade.push(Math.floor(legacyRound(value)));
      grade.push(Math.floor(Math.ceil(value)));
    };

    pushBounds(this.fleschKincaidGrade(text));
    grade.push(this.fleschReadingEaseToGrade(this.fleschReadingEase(text)));
    pushBounds(this.smogIndex(text));
    pushBounds(this.colemanLiauIndex(text));
    pushBounds(this.automatedReadabilityIndex(text));
    pushBounds(this.daleChallToGrade(this.daleChallReadabilityScore(text)));
    pushBounds(this.linsearWriteFormula(text));
    pushBounds(this.gunningFog(text));

    // Tally votes; Map keeps first-appearance order, which drives tie-breaks.
    const votes = new Map();
    for (const value of grade) {
      votes.set(value, (votes.get(value) || 0) + 1);
    }
    let score;
    let bestCount = -1;
    for (const [value, count] of votes) {
      if (count >= bestCount) {
        score = value;
        bestCount = count;
      }
    }

    if (floatOutput) return score;

    const lowerScore = Math.floor(score) - 1;
    const upperScore = lowerScore + 1;
    return `${lowerScore}${Readability.getGradeSuffix(lowerScore)} and ${upperScore}${Readability.getGradeSuffix(upperScore)} grade`;
  }

  /**
   * Median of the grade estimates produced by the individual formulas.
   * For an even number of estimates the two middle values are averaged.
   *
   * @param {string} text - Input text.
   * @returns {number} Median grade.
   */
  textMedian(text) {
    const grade = [
      this.fleschKincaidGrade(text),
      this.fleschReadingEaseToGrade(this.fleschReadingEase(text)),
      this.smogIndex(text),
      this.colemanLiauIndex(text),
      this.automatedReadabilityIndex(text),
      this.daleChallToGrade(this.daleChallReadabilityScore(text)),
      this.linsearWriteFormula(text),
      this.gunningFog(text),
    ];
    grade.sort((a, b) => a - b);
    const half = Math.floor(grade.length / 2);
    // Parity of the list length (not of the midpoint index) decides whether
    // the two middle values must be averaged.
    if (grade.length % 2 === 0) return (grade[half - 1] + grade[half]) / 2;
    return grade[half];
  }
}

const readability = new Readability();
export default readability;