// @jti/antora-extension-repeated_words
// Version:
// An Antora extension that checks AsciiDoc source files for repeated words in prose.
// 274 lines (234 loc) • 7.14 kB
// JavaScript
// An Antora extension that checks for repeated words in Asciidoc
// content.
const extensionName = 'repeated_words-extension'
const chalk = require('chalk')
const path = require('path')
const u = require('@jti/utils')
// Words that may legitimately occur twice in a row. This table is the
// default `repeats` argument of check() and the default `repeats` option of
// register(); skip() only tests whether a key is present, not its value.
const validRepeats = {
had: true,
that: true,
can: true,
blah: true,
beep: true,
sapiens: true,
tse: true,
mau: true,
}
// Matches a macro written as `name:target[text]` for the listed macro names.
// Capture group 1 is the macro name, group 2 the text between the brackets.
const longMacroRE = /(audio|footnote|https?|icon|link|mailto|menu|pass|video|xref):[^\[]+\[([^\]]*)]/
// Matches a macro written as `name:[text]` (btn, kbd or pass).
// Capture group 1 is the macro name, group 2 the text between the brackets.
const shortMacroRE = /(btn|kbd|pass):\[([^\]]*)]/
// Matches `image:target["text"...`. Capture group 1 is the double-quoted text
// that directly follows the opening bracket.
const imageRE = /image:[^\[]+\["([^"]*)".*(]|$)/
// Repeats found so far, keyed by report path. Filled in by the
// contentClassified handler in register() and printed by report().
const problems = {}
/**
 * Scan AsciiDoc text for words that appear twice in a row.
 *
 * The text is processed line by line. Lines belonging to a source block
 * (introduced by a line containing `[source`, `[shell` or `[verbatim`) are
 * not checked. On every other line the first image macro, long macro and
 * short macro are replaced by their text before the line is split on single
 * spaces. Adjacent words are compared case-insensitively, and the last word
 * of a line is also compared with the first word of the following line.
 *
 * @param {string} contents - Text to check.
 * @param {Object} [repeats=validRepeats] - Table whose keys are lower-case
 *   words that are allowed to repeat; it is passed through to skip().
 * @returns {Array<{line: number, offset: number, repeated: string, source: string}>}
 *   One entry per repeat. `line` is 1-based, `offset` is the index of the
 *   repeated word among the space-separated words of the line, and `source`
 *   is the unmodified text of the line the repeated word is on.
 */
const check = (contents, repeats = validRepeats) => {
  u.debug('Check repeated_words')
  if (!contents || !contents.length) return []
  const results = []
  let inSource = false
  let delimiter = ''
  u.debug('Starting line processing...')
  const lines = contents.split(/\r?\n/)
  // A trailing newline produces one empty final element. Drop only that
  // element, so that the last line of a file without a trailing newline is
  // still checked.
  if (lines.length > 1 && lines[lines.length - 1] === '') lines.pop()
  for (let index = 0; index < lines.length; index++) {
    let line = lines[index]
    u.debug(`Line ${index + 1}: -=-${line}=-=`)
    u.debug(u.myTypeOf(line))
    if (!line) {
      // An empty line terminates a source block that has no delimiter line.
      // Inside a delimited block it is just more content to skip.
      if (inSource && delimiter === '') inSource = false
      continue
    }
    u.debug(`not a blank line`)
    // avoid checking for repeats inside source blocks
    if (!inSource && line.match(/\[(source|shell|verbatim)[^\]]*/)) {
      inSource = true
      continue
    }
    if (inSource) {
      const fence = line.match(/^([-=]+)$/)
      if (fence && delimiter === fence[1]) {
        // closing delimiter: identical to the one that opened the block
        delimiter = ''
        inSource = false
      } else if (fence && delimiter === '') {
        // opening delimiter
        delimiter = fence[1]
      } else if (delimiter === '' && /^\s*$/.test(line)) {
        // whitespace-only line ends a block that has no delimiter
        inSource = false
      }
      // everything seen while in a source block is skipped
      continue
    }
    // Replace AsciiDoc macros by the text they carry
    const origLine = line
    line = line.replace(imageRE, '$1')
    line = line.replace(longMacroRE, '$2')
    // Group 2 is the bracket text; group 1 would insert the macro name.
    line = line.replace(shortMacroRE, '$2')
    u.debug('After macro removal', line)
    u.debug(`splitting into words`)
    const words = line.split(' ')
    u.debug(`has ${words.length} words`)
    // Empty strings (from consecutive spaces) never count as a previous
    // word because of the truthiness test below.
    let previous = ''
    for (let i = 0; i < words.length; i++) {
      const word = words[i]
      u.debug(`checking word ${word}`)
      if (
        previous &&
        previous.toLowerCase() === word.toLowerCase() &&
        !skip(word, repeats)
      ) {
        results.push({
          line: index + 1,
          offset: i,
          repeated: word,
          source: origLine,
        })
      }
      previous = word
    }
    // check whether the last word of the current line repeats as the
    // first word of the next line
    u.debug(`starting next line check`)
    if (index < lines.length - 1) {
      u.debug(`Has a next line...`)
      const word1 = words[words.length - 1]
      const word2 = lines[index + 1].split(/ /)[0]
      if (
        word1.toLowerCase() === word2.toLowerCase() &&
        !skip(word2, repeats)
      ) {
        u.debug('Repeat found on second line!')
        // reported against the following line, where the repeat sits first
        results.push({
          line: index + 2,
          offset: 0,
          repeated: word2,
          source: lines[index + 1]
        })
      }
    }
  }
  u.debug('Check complete, returning:', results)
  u.debug('')
  return results
}
/**
 * Decide whether a repeated word should be ignored.
 *
 * A word is ignored when:
 *  - its lower-case form is an own key of `repeats`;
 *  - its first character equals its own upper-case form (this also holds for
 *    digits, punctuation and the empty string) and the word is either two
 *    characters ending in "." (for example "D. D.") or has an all-lower-case
 *    remainder (for example "Duran Duran");
 *  - it starts with the table cell marker "|" and what follows the marker is
 *    empty or contains no whitespace.
 *
 * @param {string} word - The repeated word.
 * @param {Object} repeats - Table whose keys are lower-case words that are
 *   allowed to repeat.
 * @returns {boolean} true when the repeat should not be reported.
 */
const skip = (word, repeats) => {
  // Own-property lookup: the `in` operator also finds inherited names, which
  // made a plain-object table treat "constructor" as an allowed repeat.
  if (Object.prototype.hasOwnProperty.call(repeats, word.toLowerCase())) {
    return true
  }
  // Is it an initialism?
  const first = word.charAt(0)
  if (first === first.toUpperCase()) {
    // Skip, for example, "D. D."
    if (word.length === 2 && word.charAt(1) === '.') return true
    // Skip simple, capitalized words. Think "Duran Duran"
    const tail = word.slice(1)
    if (tail === tail.toLowerCase()) return true
  }
  const mg = word.match(/^\|\s*(.*)$/)
  if (mg) {
    u.debug('Possibly in table cell')
    // skip empty table cells
    if (mg[1].length === 0) {
      u.debug('Nothing after cell marker')
      return true
    }
    // skip cells with only one word
    if (!mg[1].match(/\s/)) {
      u.debug('Only one word after cell marker')
      return true
    }
  }
  return false
}
/**
 * Print every recorded repeat, grouped by file, followed by a summary line.
 *
 * Reads the module-level `problems` table. Files are listed in sorted order;
 * each issue is printed as `<line>: <source>` with the repeated word
 * highlighted when it can be located in the source line. The summary states
 * the total number of issues across all files and the number of files.
 * Nothing is printed when `problems` is empty.
 */
const report = () => {
  const files = Object.keys(problems).sort()
  let total = 0
  for (const file of files) {
    const fileIssues = problems[file]
    // count the individual issues, not the files they were found in
    total += fileIssues.length
    u.log(chalk.magenta(file))
    for (const issue of fileIssues) {
      let line = issue.source
      // Skip `offset` whitespace-terminated words, then capture the next
      // word (group 3) and the rest of the line (group 4).
      const RE = new RegExp(
        '^((\\b[^\\b]+?\\s+){'
        + (issue.offset)
        + '})([^\\s]+)(.*)$'
      )
      const mg = line.match(RE)
      if (mg) {
        line = mg[1] + chalk.bold.yellow(mg[3]) + mg[4]
      }
      u.log(`${issue.line}: ${line}`)
    }
  }
  if (files.length) {
    console.log(`\n${chalk.bold(total)} repeated word issue${u.s(total)} in ${files.length} file${u.s(files.length)} found!`)
  }
}
/**
 * Extension entry point. Must be called with `this` bound to an object that
 * provides `on(eventName, handler)` and `stop(code)`.
 *
 * On `contentClassified`, every non-synthetic `.adoc` file in the content
 * catalog is passed to check() and any repeats are stored in `problems`
 * under the file's path relative to its base directory; the findings are
 * then reported. On `documentsConverted`, the build is stopped with exit
 * code 1 if any problems were recorded.
 *
 * @param {Object} context
 * @param {Object} context.config - Extension options.
 * @param {Object} [context.config.repeats=validRepeats] - Words that are
 *   allowed to repeat.
 * @param {boolean} [context.config.debug=false] - Value assigned to
 *   `u.DEBUG` before files are checked.
 * @param {boolean} [context.config.remote=true] - When false, files whose
 *   origin has no worktree are not checked.
 * @throws {Error} If `config` contains any option other than the above.
 */
function register ({
  config: {
    repeats = validRepeats,
    debug = false,
    remote = true,
    ...unknownOptions
  }
}) {
  const unknownKeys = Object.keys(unknownOptions)
  if (unknownKeys.length) {
    throw new Error(
      `Unrecognized option${u.s(unknownKeys.length)}` +
      ` specified for ${extensionName}: ${unknownKeys.join(', ')}`
    )
  }
  // During contentClassified, check all the pages one by one
  this.on('contentClassified', ({ contentCatalog }) => {
    u.DEBUG = debug
    u.DEBUG_PREFIX = 'RWE'
    for (const file of contentCatalog.getFiles()) {
      if (file.src.extname !== '.adoc' || file.synthetic) continue
      const { worktree, startPath } = file.src.origin
      if (!worktree && !remote) continue
      u.debug(`worktree: ${worktree}`)
      u.debug(`startPath: ${startPath}`)
      u.debug(`path: ${file.src.path}`)
      // Files without a worktree fall back to the invoking directory. The
      // same base is used for path.relative(), which throws a TypeError
      // when given a non-string `from` argument.
      const baseDir = worktree || process.env.INIT_CWD || process.cwd()
      const pagePath = path.join(baseDir, startPath || '', file.src.path)
      u.debug(`pagePath: ${chalk.magenta(pagePath)}`)
      const reportPath = path.relative(baseDir, pagePath)
      u.debug(`reportPath: ${chalk.magenta(reportPath)}`)
      const results = check(file.contents.toString(), repeats)
      if (results.length) {
        u.debug(`Found repeats in ${reportPath}:`, results)
        problems[reportPath] = results
      }
    }
    if (Object.keys(problems).length) {
      u.log(chalk.red(`Repeated words? ${chalk.bold.red('Found!')}`))
      report()
    } else {
      u.log(`Repeated words? ${chalk.bold.green('None found')}`)
    }
  })
  this.on('documentsConverted', () => {
    if (Object.keys(problems).length) {
      u.log(chalk.bold('Problems reported, stopping build!'))
      this.stop(1)
    }
  })
}
module.exports = { register: register, check }
// vim: tw=0 ai et ts=2 sw=2