UNPKG

@jti/antora-extension-repeated_words

Version:

An Antora extension that checks AsciiDoc source files for repeated words in prose.

274 lines (234 loc) 7.14 kB
// An Antora extension that checks for repeated words in Asciidoc
// content.

'use strict'

const extensionName = 'repeated_words-extension'

const chalk = require('chalk')
const path = require('path')
const u = require('@jti/utils')

// Words that may legitimately appear twice in a row ("had had",
// "that that", ...). Used whenever no `repeats` option is supplied.
const validRepeats = {
  had: true,
  that: true,
  can: true,
  blah: true,
  beep: true,
  sapiens: true,
  tse: true,
  mau: true,
}

// Inline macros are reduced to their bracketed text before the words of a
// line are compared. The long and short patterns are global so that every
// macro on a line is rewritten, not just the first one.
const longMacroRE = /(audio|footnote|https?|icon|link|mailto|menu|pass|video|xref):[^\[]+\[([^\]]*)]/g
const shortMacroRE = /(btn|kbd|pass):\[([^\]]*)]/g
// An image macro is reduced to its quoted first attribute. The pattern
// consumes the rest of the line, so a global flag would change nothing.
const imageRE = /image:[^\[]+\["([^"]*)".*(]|$)/

// Findings collected by `register`, keyed by report path. This is
// module-level state: it is filled during `contentClassified` and read by
// `report` and by the `documentsConverted` handler.
const problems = {}

/**
 * Scan a document for words that are immediately repeated, either within
 * a line or across a line break (last word of one line, first word of the
 * next).
 *
 * Lines from a `[source...]`, `[shell...]` or `[verbatim...]` marker up to
 * the end of that block are not checked.
 *
 * @param {string} contents - Full text of the document.
 * @param {Object} [repeats=validRepeats] - Object whose own keys are
 *   lower-case words that are allowed to repeat.
 * @returns {Array<{line: number, offset: number, repeated: string, source: string}>}
 *   One entry per finding: 1-based line number, index of the repeated word
 *   among the space-separated words of the (macro-stripped) line, the word
 *   itself, and the original text of the line it was found on.
 */
const check = (contents, repeats = validRepeats) => {
  u.debug('Check repeated_words')
  if (!contents || !contents.length) return []

  const results = []
  let inSource = false
  let delimiter = ''
  let mg

  u.debug('Starting line processing...')
  const lines = contents.split(/\r?\n/)
  // A trailing newline leaves one empty element at the end; drop only
  // that. Popping unconditionally would discard the real last line of a
  // document that does not end with a newline.
  if (lines.length > 1 && lines[lines.length - 1] === '') lines.pop()

  for (let index = 0; index < lines.length; index++) {
    let line = lines[index]
    u.debug(`Line ${index + 1}: -=-${line}=-=`)
    u.debug(u.myTypeOf(line))

    // skip blank lines
    if (!line) continue
    u.debug(`not a blank line`)

    // avoid checking for repeats inside source blocks
    if (!inSource && line.match(/\[(source|shell|verbatim)[^\]]*/)) {
      inSource = true
      continue
    }
    if (inSource) {
      mg = line.match(/^([-=]+)$/)
      if (mg) {
        if (delimiter.length > 0 && delimiter === mg[1]) {
          // end of source block reached
          delimiter = ''
          inSource = false
          continue
        }
        if (delimiter === '') {
          // initial delimiter found
          delimiter = mg[1]
          continue
        }
      }
      // handle non-delimited blocks: they end at the first line that
      // contains only whitespace
      if (delimiter === '' && line.match(/^\s*$/)) {
        inSource = false
        continue
      }
      // just skip source block content
      continue
    }

    // Remove Asciidoc macros, keeping only the text they carry.
    const origLine = line
    line = line.replace(imageRE, '$1')
    line = line.replace(longMacroRE, '$2')
    // Group 2 is the bracketed text; group 1 is only the macro name
    // (btn/kbd/pass), which is not part of the prose.
    line = line.replace(shortMacroRE, '$2')
    u.debug('After macro removal', line)
    u.debug(`splitting into words`)

    // try to find repeats
    const words = line.split(' ')
    if (!words || !words.length) continue
    u.debug(`has ${words.length} words`)

    let previous = ''
    for (let i = 0; i < words.length; i++) {
      const word = words[i]
      u.debug(`checking word ${word}`)
      if (
        previous &&
        previous.toLowerCase() === word.toLowerCase() &&
        !skip(word, repeats)
      ) {
        // word has been repeated
        results.push({
          line: index + 1,
          offset: i,
          repeated: word,
          source: origLine,
        })
      }
      previous = word
    }

    // check whether the last word of the current line repeats as the
    // first word of the next line
    u.debug(`starting next line check`)
    if (index < lines.length - 1) {
      u.debug(`Has a next line...`)
      const word1 = words[words.length - 1]
      const word2 = lines[index + 1].split(/ /)[0]
      if (
        word1.toLowerCase() === word2.toLowerCase() &&
        !skip(word2, repeats)
      ) {
        u.debug('Repeat found on second line!')
        // word has been repeated on the next line
        results.push({
          line: index + 2,
          offset: 0,
          repeated: word2,
          source: lines[index + 1],
        })
      }
    }
  }

  u.debug('Check complete, returning:', results)
  u.debug('')
  return results
}

/**
 * Decide whether a repeated word should be ignored.
 *
 * @param {string} word - The repeated word exactly as it appears.
 * @param {Object} repeats - Object whose own keys are lower-case words
 *   that are allowed to repeat.
 * @returns {boolean} true when the repetition must not be reported.
 */
const skip = (word, repeats) => {
  // If it is a known valid repeat, skip. Only own keys count: the `in`
  // operator would also accept inherited names such as "constructor".
  if (Object.prototype.hasOwnProperty.call(repeats, word.toLowerCase())) {
    return true
  }

  // Is it an initialism? Note that the comparison below also holds for a
  // first character without case (digit, punctuation, empty string).
  const first = word.charAt(0)
  if (first === first.toUpperCase()) {
    // Skip, for example, "D. D."
    if (word.length === 2 && word.charAt(1) === '.') return true
    // Skip simple, capitalized words. Think "Duran Duran"
    const tail = word.slice(1)
    if (tail === tail.toLowerCase()) return true
  }

  const mg = word.match(/^\|\s*(.*)$/)
  if (mg) {
    u.debug('Possibly in table cell')
    // skip empty table cells
    if (mg[1].length === 0) {
      u.debug('Nothing after cell marker')
      return true
    }
    // skip cells with only one word
    if (!mg[1].match(/\s/)) {
      u.debug('Only one word after cell marker')
      return true
    }
  }

  return false
}

/**
 * Print every collected finding, grouped by file in sorted order, followed
 * by a summary line. Reads the module-level `problems` map.
 */
const report = () => {
  const files = Object.keys(problems).sort()
  let total = 0

  files.forEach((file) => {
    const findings = problems[file]
    total += findings.length
    u.log(chalk.magenta(file))

    findings.forEach((issue) => {
      let line = issue.source
      // Step over `offset` whitespace-separated tokens and capture the
      // following one so that it can be highlighted.
      const RE = new RegExp(
        '^((\\b[^\\b]+?\\s+){' + (issue.offset) + '})([^\\s]+)(.*)$'
      )
      const mg = line.match(RE)
      if (mg) {
        line = mg[1] + chalk.bold.yellow(mg[3]) + mg[4]
      }
      u.log(`${issue.line}: ${line}`)
    })
  })

  if (files.length) {
    // Report the number of findings, not the number of files, as the
    // issue count. `u.s` presumably returns a plural suffix - see @jti/utils.
    console.log(`\n${chalk.bold(total)} repeated word issue${u.s(total)} in ${files.length} file${u.s(files.length)} found!`)
  }
}

/**
 * Extension entry point. Registers two listeners on `this`:
 * `contentClassified` checks every non-synthetic `.adoc` file of the
 * content catalog and reports findings; `documentsConverted` calls
 * `this.stop(1)` when any finding was recorded.
 *
 * Recognized `config` keys:
 *   - repeats: object whose keys are words allowed to repeat
 *     (default: the built-in `validRepeats`)
 *   - debug: value assigned to `u.DEBUG` (default: false)
 *   - remote: when false, files whose origin has no worktree are skipped
 *     (default: true)
 *
 * @throws {Error} when `config` contains any other key.
 */
function register ({ config: { repeats = validRepeats, debug = false, remote = true, ...unknownOptions } }) {
  if (Object.keys(unknownOptions).length) {
    const keys = Object.keys(unknownOptions)
    throw new Error(
      `Unrecognized option${u.s(keys.length)}` +
      ` specified for ${extensionName}: ${keys.join(', ')}`
    )
  }

  // During contentClassified, check all the pages one by one
  this.on('contentClassified', ({ contentCatalog }) => {
    u.DEBUG = debug
    u.DEBUG_PREFIX = 'RWE'

    for (const file of contentCatalog.getFiles()) {
      if (file.src.extname !== '.adoc' || file.synthetic) continue
      if (!file.src.origin.worktree && !remote) continue
      u.debug(`worktree: ${file.src.origin.worktree}`)
      u.debug(`startPath: ${file.src.origin.startPath}`)
      u.debug(`path: ${file.src.path}`)

      // Use one base directory for both paths. The origin may have no
      // worktree (such files are only skipped when `remote` is false),
      // and path.relative() throws when its first argument is not a
      // string.
      const baseDir =
        file.src.origin.worktree || process.env.INIT_CWD || process.cwd()
      const pagePath = path.join(
        baseDir,
        file.src.origin.startPath || '',
        file.src.path
      )
      u.debug(`pagePath: ${chalk.magenta(pagePath)}`)
      const reportPath = path.relative(baseDir, pagePath)
      u.debug(`reportPath: ${chalk.magenta(reportPath)}`)

      const results = check(file.contents.toString(), repeats)
      if (results.length) {
        u.debug(`Found repeats in ${reportPath}:`, results)
        problems[reportPath] = results
      }
    }

    if (Object.keys(problems).length) {
      u.log(chalk.red(`Repeated words? ${chalk.bold.red('Found!')}`))
      report()
    } else {
      u.log(`Repeated words? ${chalk.bold.green('None found')}`)
    }
  })

  this.on('documentsConverted', () => {
    if (Object.keys(problems).length) {
      u.log(chalk.bold('Problems reported, stopping build!'))
      this.stop(1)
    }
  })
}

module.exports = { register: register, check }

// vim: tw=0 ai et ts=2 sw=2