// Package: @redpanda-data/docs-extensions-and-macros
// Description: Antora extensions and macros developed for Redpanda documentation.
// Listing metadata: 640 lines (561 loc), 23.4 kB, JavaScript. Version: not captured.
const path = require('path')
const os = require('os')
const yaml = require('js-yaml')
const { toMarkdownUrl } = require('../extension-utils/url-utils')
const { formatLlmsDirective } = require('../extension-utils/llms-utils')
const TurndownService = require('turndown')
const turndownPluginGfm = require('turndown-plugin-gfm')
const { gfm } = turndownPluginGfm
/**
 * Decode HTML entities in a string.
 *
 * Supports decimal and hexadecimal numeric character references as well as the
 * named entities quot, amp, lt, gt and apos. All references are decoded in a
 * single pass, so text produced by one replacement is never decoded a second
 * time (an escaped ampersand followed by "lt;" stays an entity instead of
 * turning into "<"). Unknown entities and numeric references beyond U+10FFFF
 * are left untouched.
 *
 * @param {string} str - String with potential HTML entities
 * @returns {string} Decoded string; empty or non-string input is returned as-is
 */
function decodeHtmlEntities (str) {
  if (!str || typeof str !== 'string') return str
  const namedEntities = { quot: '"', amp: '&', lt: '<', gt: '>', apos: "'" }
  const entityPattern = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(quot|amp|lt|gt|apos));/g
  return str.replace(entityPattern, (match, dec, hex, name) => {
    if (name !== undefined) return namedEntities[name]
    const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16)
    // String.fromCodePoint throws a RangeError above U+10FFFF; keep such references verbatim
    if (codePoint > 0x10ffff) return match
    // fromCodePoint (unlike fromCharCode) handles astral code points such as emoji
    return String.fromCodePoint(codePoint)
  })
}
/**
 * Converts AsciiDoc page attributes to YAML frontmatter.
 *
 * `title` and `navtitle` are taken from `page.asciidoc` and entity-decoded.
 * Every other field is copied from `page.asciidoc.attributes` when its key is
 * allowlisted or starts with `learning-objective-`. Derived `support-status`
 * and `release-status` fields are added from the EOL and beta attributes, and
 * an explanatory YAML comment is inserted before the first field of each group.
 *
 * @param {Object} page - The page object with asciidoc attributes
 * @returns {string} YAML frontmatter block delimited by `---` and followed by a
 *   blank line, or an empty string if no fields were collected
 */
function generateFrontmatter(page) {
  const frontmatter = {}
  const doctitle = page.asciidoc?.doctitle
  const navtitle = page.asciidoc?.navtitle
  // Add title (decode HTML entities from AsciiDoc processing)
  if (doctitle) {
    frontmatter.title = decodeHtmlEntities(doctitle)
  }
  // Add navigation title if different from doctitle
  if (navtitle && navtitle !== doctitle) {
    frontmatter.navtitle = decodeHtmlEntities(navtitle)
  }
  // Get all page attributes
  const attrs = page.asciidoc?.attributes || {}
  // Allowlist of attributes to include in frontmatter
  // Explicitly opt-in to attributes that are useful for AI consumption
  const allowedAttributes = new Set([
    'title',
    'navtitle',
    'description',
    // Note: 'categories' is deliberately absent - it's a massive object (all
    // Connect components) that bloats frontmatter.
    'page-component-name',
    'page-component-title',
    'page-component-version',
    'page-version',
    'page-relative-src-path',
    'page-edit-url',
    'page-topic-type',
    'personas',
    'docname',
    'page-beta',
    'page-beta-text',
    'page-is-nearing-eol',
    'page-is-past-eol',
    'page-eol-date',
    'page-git-created-date',
    'page-git-modified-date',
    // Component-specific version attributes (from antora.yml)
    'latest-redpanda-tag', // Redpanda docker tag (e.g., v25.3.5)
    'latest-console-tag',
    'latest-operator-version',
    'latest-connect-version',
  ])
  // Blocklist of large data attributes that should never be in frontmatter
  // These are component-level data used by macros but shouldn't pollute markdown
  const blockedAttributes = new Set([
    'categories', // Massive nested object with all Connect components
    'connectCategoriesData', // Raw Connect categories data
    'flatComponentsData', // Flattened Connect component data
    'driverSupportData', // SQL driver support matrix
    'cacheSupportData', // Cache support matrix
    'csvData', // Raw CSV data from generate-rp-connect-info
    'commercialNamesMap', // Commercial names mapping
  ])
  // A bare `:page-beta:` flag arrives as '' and is emitted below as `true`, so it
  // must also count as "set" when deciding whether to keep page-beta-text.
  const betaAttr = attrs['page-beta']
  const isBetaFlagSet = betaAttr === '' || Boolean(betaAttr)
  // Fallback used when a version attribute is present but empty
  const sourceVersion = page.src?.version || 'master'
  // Add allowed page attributes to frontmatter
  for (const [key, value] of Object.entries(attrs)) {
    // Skip blocked attributes (large data objects)
    if (blockedAttributes.has(key)) continue
    // Allow all learning-objective-* attributes (learning-objective-1, -2, -3, etc.)
    const isLearningObjective = key.startsWith('learning-objective-')
    // Only include attributes in our allowlist or learning objectives
    if (!allowedAttributes.has(key) && !isLearningObjective) continue
    // Only include page-beta-text if page-beta is set
    if (key === 'page-beta-text' && !isBetaFlagSet) continue
    // Empty attributes are AsciiDoc boolean flags
    if (value === '') {
      // Special handling for version fields - use actual version from page source
      if (key === 'page-version' || key === 'page-component-version') {
        frontmatter[key] = sourceVersion
      } else if (key.startsWith('page-')) {
        // Preserve important boolean flags
        frontmatter[key] = true
      }
      continue
    }
    // Include the attribute
    frontmatter[key] = value
  }
  // Transform EOL fields to be more user-friendly
  const pastEol = frontmatter['page-is-past-eol']
  const nearingEol = frontmatter['page-is-nearing-eol']
  if (nearingEol || pastEol) {
    let eolStatus = 'supported'
    if (pastEol === 'true' || pastEol === true) {
      eolStatus = 'past end-of-life'
    } else if (nearingEol === 'true' || nearingEol === true) {
      eolStatus = 'nearing end-of-life'
    }
    // Original fields are kept alongside the derived one for compatibility
    frontmatter['support-status'] = eolStatus
  }
  // Transform beta fields to be more user-friendly
  if (frontmatter['page-beta'] === 'true' || frontmatter['page-beta'] === true) {
    const betaText = frontmatter['page-beta-text']
    // An empty page-beta-text flag is stored as `true` above; that carries no
    // text, so it must not be rendered as "beta - true".
    const hasBetaText = Boolean(betaText) && betaText !== true
    frontmatter['release-status'] = hasBetaText ? `beta - ${betaText}` : 'beta'
  }
  // Return empty string if no frontmatter
  if (Object.keys(frontmatter).length === 0) return ''
  // Convert to YAML format using js-yaml library for proper escaping
  let yamlContent = yaml.dump(frontmatter, {
    lineWidth: -1, // Disable line wrapping
    noRefs: true, // Disable anchors/aliases
    quotingType: '"', // Use double quotes
    forceQuotes: false, // Only quote when necessary
  })
  // Add helpful comments for EOL (end-of-life) fields
  // Find the first EOL-related field and add comment before it
  if (frontmatter['page-is-nearing-eol'] || frontmatter['page-is-past-eol'] || frontmatter['support-status']) {
    const eolFieldRegex = /^(page-is-nearing-eol:|page-is-past-eol:|support-status:)/m
    if (!yamlContent.includes('# EOL =')) {
      yamlContent = yamlContent.replace(
        eolFieldRegex,
        '# EOL = End-of-Life (support lifecycle status)\n$1'
      )
    }
  }
  // Add helpful comments for beta fields
  if (frontmatter['page-beta'] || frontmatter['release-status']) {
    const betaFieldRegex = /^(page-beta:|release-status:)/m
    if (!yamlContent.includes('# Beta release')) {
      yamlContent = yamlContent.replace(
        betaFieldRegex,
        '# Beta release status\n$1'
      )
    }
  }
  return `---\n${yamlContent}---\n\n`
}
module.exports.register = function () {
// Logger obtained from the extension context; every handler below logs through it.
const logger = this.getLogger('convert-to-markdown-extension')
// Holds the playbook; it is assigned when the 'pagesComposed' handler runs.
let playbook
// Shared Turndown configuration: these options are passed to every
// TurndownService constructed by createTurndownBase().
const baseConfig = {
headingStyle: 'atx',
codeBlockStyle: 'fenced',
bulletListMarker: '-',
linkReferenceStyle: 'full',
}
// Factory: builds a TurndownService from the shared config, enables the GFM
// plugin, and registers a rule that drops page chrome (scripts, navigation,
// tracking images, feedback widgets, ...) from the converted output.
function createTurndownBase() {
  const service = new TurndownService(baseConfig)
  service.use(gfm)

  // Elements removed purely because of their tag name
  const strippedTags = ['script', 'style', 'footer', 'nav']
  // Substrings that mark an element for removal when found in its class or id
  const strippedMarkers = [
    'thumbs',
    'back-to-top',
    'contributors-modal',
    'feedback-section',
    'feedback-toast',
    'pagination',
    'footer',
    'nav-expand',
    'banner-container',
    'markdown-dropdown',
    'version-selector', // Version dropdown (not relevant for LLMs)
    'component-indicator', // Component header bar
    'product-switcher', // Product switcher dropdown
    'breadcrumb', // Breadcrumb navigation
    'chat-panel', // AI chat interface
    'kapa', // Kapa AI widget
  ]

  // Decides whether a node is unwanted global chrome
  const isUnwanted = (node) => {
    // Nodes without getAttribute cannot be inspected, so they are kept
    if (!node || !node.getAttribute) return false
    const classes = (node.getAttribute('class') || '').toLowerCase()
    const id = (node.getAttribute('id') || '').toLowerCase()
    const tagName = node.nodeName.toLowerCase()

    if (strippedTags.includes(tagName)) return true

    // Tracking or hidden images
    if (tagName === 'img') {
      const isHiddenOrTracking =
        classes.includes('tracking') ||
        id.includes('scarf') ||
        node.getAttribute('role') === 'presentation' ||
        node.style?.display === 'none'
      if (isHiddenOrTracking) return true
    }

    // Substring match against both class and id
    return strippedMarkers.some((marker) => classes.includes(marker) || id.includes(marker))
  }

  service.addRule('remove-unwanted', {
    filter: isUnwanted,
    replacement: () => '',
  })

  // Keep critical content blocks only
  // NOTE(review): Turndown documents keep() filters as tag names or functions;
  // confirm these selector-style strings match the intended elements.
  service.keep(['div.openblock.tabs', 'article.doc'])
  return service
}
// Factory: create page-specific Turndown converter.
// Builds two base converters, registers the custom rules (Asciidoctor tabs,
// admonitions, tables) on both, and returns the outer one. The `page` argument
// is accepted for the caller's convenience but is not read here.
function createTurndownForPage(page) {
  const outerTurndown = createTurndownBase()
  const nestedTurndown = createTurndownBase()

  // Registers the custom rules on one converter instance.
  // `isInner` is true only for the nested converter; inside the rules the
  // expression `isInner ? nestedTurndown : turndownInstance` therefore always
  // resolves to the instance the rule was registered on.
  function addCustomRules(turndownInstance, isInner = false) {
    // Determine heading depth for tab conversion: nearest preceding sibling
    // heading, else the last heading found inside the closest ancestor that
    // contains any, else level 2.
    function findNearestHeadingLevel(el) {
      let current = el.previousElementSibling
      while (current) {
        if (/^H[1-6]$/i.test(current.nodeName)) {
          return Number.parseInt(current.nodeName.substring(1), 10)
        }
        current = current.previousElementSibling
      }
      let parent = el.parentElement
      while (parent) {
        const headings = Array.from(
          parent.querySelectorAll('h1,h2,h3,h4,h5,h6')
        )
        if (headings.length > 0) {
          const last = headings[headings.length - 1]
          return Number.parseInt(last.nodeName.substring(1), 10)
        }
        parent = parent.parentElement
      }
      return 2
    }

    // Asciidoctor tab conversion: each tab becomes a heading one level below
    // the nearest heading (capped at h6) followed by the converted panel.
    turndownInstance.addRule('asciidoctor-tabs', {
      filter: (node) => {
        if (node.nodeName !== 'DIV') return false
        const classAttr = node.getAttribute?.('class') || node.className || ''
        return classAttr.includes('openblock') && classAttr.includes('tabs')
      },
      replacement: function (_, node) {
        function processTabGroup(group, parentHeadingLevel = null) {
          const contentDiv = group.querySelector('.content') || group
          const tabList = contentDiv.querySelectorAll('li.tab')
          if (!tabList.length) return ''
          const nearestLevel =
            parentHeadingLevel != null
              ? parentHeadingLevel + 1
              : findNearestHeadingLevel(group) + 1
          const tabHeadingLevel = Math.min(nearestLevel, 6)
          const headingPrefix = '#'.repeat(tabHeadingLevel)
          let markdown = ''
          tabList.forEach((tab) => {
            const title =
              tab.querySelector('p')?.textContent.trim() ||
              tab.textContent.trim()
            let panelId = tab.getAttribute('aria-controls')
            if (!panelId && tab.id) panelId = tab.id + '--panel'
            const panel = group.querySelector(`#${panelId}`)
            if (!panel) return
            const nestedTabs = panel.querySelectorAll('.openblock.tabs')
            let nestedMdCombined = ''
            nestedTabs.forEach((nested) => {
              // The list is a static snapshot. A group nested two or more
              // levels deep has already been converted and detached by the
              // recursive call for its parent group, so skip it here to avoid
              // emitting its content a second time.
              if (!nested.parentNode) return
              nestedMdCombined +=
                '\n' + processTabGroup(nested, tabHeadingLevel) + '\n'
              // Detach so the nested group is not converted again with the panel
              nested.remove()
            })
            const innerHtml = panel.innerHTML || ''
            let md = ''
            try {
              const converter = isInner ? nestedTurndown : turndownInstance
              md = converter.turndown(innerHtml)
            } catch (e) {
              logger.warn(`Turndown failed in nested tab: ${e.message}`)
            }
            markdown += `${headingPrefix} ${title}\n\n${md.trim()}\n${nestedMdCombined.trim()}\n\n`
          })
          return markdown.trim()
        }
        return '\n' + processTabGroup(node, null) + '\n'
      },
    })

    // Admonition block conversion: tables with an icon cell and a content cell
    // become a blockquote with an emoji + type header.
    turndownInstance.addRule('admonition', {
      filter: (node) =>
        node.nodeName === 'TABLE' &&
        node.querySelector('td.icon') &&
        node.querySelector('td.content'),
      replacement: function (_, node) {
        const iconCell = node.querySelector('td.icon')
        const contentCell = node.querySelector('td.content')
        if (!iconCell || !contentCell) return ''
        const iconEl = iconCell.querySelector('i')
        const classAttr = iconEl?.className || ''
        const match = classAttr.match(/icon-([a-z]+)/i)
        // Admonition type comes from the icon's `icon-<type>` class; default NOTE
        const type = match ? match[1].toUpperCase() : 'NOTE'
        // Either a title element or, failing that, the icon's title attribute (a string)
        const titleEl =
          node.querySelector('.title') ||
          contentCell.querySelector('.title') ||
          iconEl?.getAttribute('title')
        const customTitle =
          typeof titleEl === 'string'
            ? titleEl.trim()
            : titleEl?.textContent?.trim() || ''
        const emojiMap = {
          CAUTION: '⚠️',
          WARNING: '⚠️',
          TIP: '💡',
          NOTE: '📝',
          IMPORTANT: '❗',
        }
        const emoji = emojiMap[type] || '📘'
        const innerHtml = contentCell.innerHTML || ''
        let innerMd = ''
        try {
          const converter = isInner ? nestedTurndown : turndownInstance
          innerMd = converter.turndown(innerHtml).trim()
        } catch (e) {
          logger.warn(`Turndown failed in admonition: ${e.message}`)
        }
        const titleLower = customTitle.toLowerCase()
        const typeLower = type.toLowerCase()
        // Only show the custom title when it adds something beyond the type name
        const header =
          customTitle && titleLower !== typeLower
            ? `${emoji} **${type}: ${customTitle}**`
            : `${emoji} **${type}**`
        const quoted = innerMd
          .split('\n')
          .map((line) => (line.startsWith('>') ? line : `> ${line}`))
          .join('\n')
        return `\n> ${header}\n>\n${quoted}\n`
      },
    })

    // Markdown table conversion for every table that is not an admonition.
    turndownInstance.addRule('tables', {
      filter: (node) => {
        if (node.nodeName !== 'TABLE') return false
        if (node.querySelector('td.icon') && node.querySelector('td.content')) {
          return false
        }
        return true
      },
      replacement: function (content, node) {
        const rows = Array.from(node.querySelectorAll('tr'))
        if (!rows.length) return content
        const tableRows = []
        rows.forEach((row) => {
          const cells = Array.from(row.querySelectorAll('th, td'))
          const cellContents = cells.map((cell) =>
            (cell.textContent || '')
              .trim()
              .replace(/\s+/g, ' ')
              // A literal pipe would be read as a column delimiter; escape it
              .replace(/\|/g, '\\|')
          )
          if (!cellContents.length) return
          tableRows.push('| ' + cellContents.join(' | ') + ' |')
          // The delimiter row follows the first row actually emitted, even
          // when earlier <tr> elements had no cells and were skipped.
          if (tableRows.length === 1) {
            tableRows.push(
              '| ' + cellContents.map(() => '---').join(' | ') + ' |'
            )
          }
        })
        return '\n' + tableRows.join('\n') + '\n'
      },
    })
  }

  addCustomRules(outerTurndown, false)
  addCustomRules(nestedTurndown, true)
  return outerTurndown
}
// Runs on 'documentsConverted': flags every page with an empty
// 'page-has-markdown' attribute so UI templates can detect that a markdown
// equivalent is available.
this.on('documentsConverted', ({ contentCatalog }) => {
  const allPages = contentCatalog.findBy({ family: 'page' })
  logger.info(`Marking ${allPages.length} pages as having markdown equivalents...`)
  for (const doc of allPages) {
    // Create the asciidoc/attributes containers when they are missing
    if (!doc.asciidoc) doc.asciidoc = {}
    const asciidoc = doc.asciidoc
    if (!asciidoc.attributes) asciidoc.attributes = {}
    // Marker that UI templates can check
    asciidoc.attributes['page-has-markdown'] = ''
  }
})
// Conversion pipeline: on 'pagesComposed', turn the <article class="doc"> part
// of every composed page into Markdown and store it on page.markdownContents.
this.on('pagesComposed', async ({ playbook: pb, contentCatalog }) => {
  playbook = pb
  const siteUrl = playbook.site?.url || ''
  const pages = contentCatalog.getPages()
  const siteUrlNote = siteUrl ? ` (site.url=${siteUrl})` : ''
  logger.info(`Converting ${pages.length} pages to Markdown${siteUrlNote}...`)

  const concurrency = Math.max(2, Math.floor(os.cpus().length / 2))
  const pending = [...pages]
  let convertedCount = 0

  // Builds the canonical markdown URL for a page, or '' when it cannot be built
  const buildCanonicalUrl = (page) => {
    try {
      if (siteUrl && page.pub?.url) {
        const target = new URL(page.pub.url, siteUrl)
        // Convert HTML URL to markdown URL using shared utility
        target.pathname = toMarkdownUrl(target.pathname)
        return target.toString()
      }
    } catch (e) {
      logger.debug(
        `Failed to build canonical URL for ${page.src?.path}: ${e.message}`
      )
    }
    return ''
  }

  // Regular pages: H1 → llms.txt directive (blockquote) → frontmatter → source → content
  const decorateRegularPage = (page, body, canonicalUrl) => {
    const frontmatter = generateFrontmatter(page)
    if (frontmatter) {
      logger.debug(`Generated frontmatter for ${page.src?.path}`)
    }
    // Extract H1 heading if present (only at document start)
    const h1Match = body.match(/^(#\s+.+?)(\n|$)/)
    const h1Heading = h1Match ? h1Match[0] : ''
    const restOfMarkdown = h1Match
      ? body.substring(h1Match[0].length).trimStart()
      : body
    if (canonicalUrl) {
      const componentName = page.src?.component || ''
      const llmsDirective = formatLlmsDirective(componentName)
      return `${h1Heading}\n${llmsDirective}\n\n${frontmatter}<!-- Source: ${canonicalUrl} -->\n\n${restOfMarkdown}`
    }
    if (frontmatter) {
      // No canonical URL but frontmatter exists: still add the directive after H1
      const llmsDirective = formatLlmsDirective()
      return `${h1Heading}\n${llmsDirective}\n\n${frontmatter}${restOfMarkdown}`
    }
    return body
  }

  // Rewrites root-relative and ../-relative link targets as absolute URLs
  const absolutizeLinks = (page, body) => {
    if (!(siteUrl && page.pub?.url)) return body
    let result = body
    try {
      const baseUrl = new URL(siteUrl)
      const pageUrl = new URL(page.pub.url, baseUrl)
      // Convert absolute paths: [text](/path) → [text](https://domain/path)
      result = result.replace(/\[([^\]]+)\]\(\/([^)]+)\)/g, (whole, text, rest) => {
        try {
          return `[${text}](${new URL('/' + rest, baseUrl).toString()})`
        } catch (e) {
          return whole // Keep original if URL construction fails
        }
      })
      // Convert relative paths: [text](../../path) → [text](https://domain/resolved/path)
      result = result.replace(/\[([^\]]+)\]\((\.\.\/[^)]+)\)/g, (whole, text, relativePath) => {
        try {
          // Resolve relative path against the current page URL
          return `[${text}](${new URL(relativePath, pageUrl).toString()})`
        } catch (e) {
          return whole // Keep original if URL construction fails
        }
      })
    } catch (e) {
      logger.debug(`Failed to resolve relative URLs for ${page.src?.path}: ${e.message}`)
    }
    return result
  }

  // Converts a single page; returns early when there is nothing to convert
  const convertPage = (page) => {
    const html = page.contents.toString().trim()
    if (!html) return
    // Extract only the <article class="doc"> portion
    const articleMatch = html.match(
      /<article[^>]*class=["'][^"']*\bdoc\b[^"']*["'][^>]*>([\s\S]*?)<\/article>/i
    )
    if (!articleMatch || !articleMatch[1]) {
      logger.info(`No <article class="doc"> found for ${page.src?.path}`)
      return
    }
    // Convert with Turndown
    const converter = createTurndownForPage(page)
    let markdown = converter.turndown(articleMatch[1]).trim()
    // Canonical source link
    const canonicalUrl = buildCanonicalUrl(page)
    // Remove escaped underscores from headings (TurndownService escapes them unnecessarily)
    markdown = markdown.replace(/^(#{1,6}\s+.+)$/gm, (heading) =>
      heading.replace(/\\_/g, '_')
    )
    if (page.isFieldOnlyPage === true) {
      // Field-only pages (marked by generate-fields-only-pages extension): basic
      // markdown only. Strip anchor links from headings: [](#anchor-id)heading → heading
      markdown = markdown.replace(/\[]\(#[^)]+\)/g, '').trim()
    } else {
      markdown = decorateRegularPage(page, markdown, canonicalUrl)
    }
    // Convert relative URLs to absolute URLs (after directive is added)
    markdown = absolutizeLinks(page, markdown)
    // Clean up unnecessary whitespace
    if (markdown) {
      markdown = markdown
        .replace(/\n{3,}/g, '\n\n') // collapse runs of 3+ newlines
        .replace(/[ \t]+$/gm, '') // strip trailing whitespace per line
        .trim()
    }
    if (!markdown) return
    page.markdownContents = Buffer.from(markdown, 'utf8')
    convertedCount++
  }

  // Worker loop: pulls pages off the shared queue until it is empty.
  // It contains no await, so each call runs to completion synchronously.
  async function processQueue() {
    while (pending.length) {
      const page = pending.shift()
      if (!page?.contents) continue
      try {
        convertPage(page)
      } catch (err) {
        logger.error(
          `Error converting ${page.src?.path || 'unknown'}: ${err.message}`
        )
        logger.debug(err.stack)
      }
    }
  }

  const workers = Array.from({ length: concurrency }, () => processQueue())
  await Promise.all(workers)
  logger.info(`Converted ${convertedCount} Markdown files.`)
})
// Runs on 'beforePublish': registers one site-catalog file per page that
// carries markdownContents, at the markdown counterpart of its HTML out path.
this.on('beforePublish', ({ siteCatalog, contentCatalog }) => {
  const convertedPages = contentCatalog.getPages((candidate) => candidate.markdownContents)
  if (!convertedPages.length) {
    logger.info('No Markdown files to publish.')
    return
  }
  logger.info(`Adding ${convertedPages.length} Markdown files to site catalog...`)
  convertedPages.forEach((entry) => {
    const htmlOut = entry.out?.path
    // Pages without an output path cannot be published
    if (!htmlOut) return
    // Convert HTML path to markdown path using shared utility
    const mdOutPath = toMarkdownUrl(htmlOut)
    siteCatalog.addFile({
      contents: entry.markdownContents,
      out: { path: mdOutPath },
    })
    logger.debug(`Added Markdown: ${mdOutPath}`)
  })
})
}