@redpanda-data/docs-extensions-and-macros
Version:
Antora extensions and macros developed for Redpanda documentation.
275 lines (219 loc) • 7.66 kB
JavaScript
/**
* Antora extension that generates markdown versions of sitemap.xml files.
*
* For each sitemap.xml in the published site, creates a corresponding sitemap.md
* with a human-readable and AI-friendly markdown format.
*
* Usage in playbook:
* ```yaml
* antora:
* extensions:
* - require: '@redpanda-data/docs-extensions-and-macros/extensions/convert-sitemap-to-markdown'
* ```
*/
const path = require('path')
const { parseStringPromise } = require('xml2js')
module.exports.register = function ({ config }) {
const logger = this.getLogger('convert-sitemap-to-markdown')
this
.on('beforePublish', async ({ playbook, siteCatalog }) => {
const startTime = Date.now()
logger.info('Sitemap to markdown converter starting...')
try {
// Find all sitemap files in the site catalog
const sitemapFiles = siteCatalog.getFiles().filter(file =>
/^sitemap(-[^/]+)?\.xml$/.test(path.basename(file.out.path))
)
if (sitemapFiles.length === 0) {
logger.info('No sitemap files found in site catalog')
return
}
logger.info(`Found ${sitemapFiles.length} sitemap file(s)`)
// Convert each sitemap and collect all URLs
const allUrls = []
for (const sitemapFile of sitemapFiles) {
const urls = await convertSitemapToMarkdown(sitemapFile, siteCatalog, logger)
if (urls) {
allUrls.push(...urls)
}
}
// Create combined master sitemap if we have multiple sitemaps
if (sitemapFiles.length > 1 && allUrls.length > 0) {
await createMasterSitemap(sitemapFiles, allUrls, siteCatalog, logger)
}
const duration = ((Date.now() - startTime) / 1000).toFixed(2)
logger.info(`Generated ${sitemapFiles.length} sitemap markdown file(s) in ${duration}s`)
} catch (error) {
logger.error(`Failed to generate sitemap markdown: ${error.message}`)
throw error
}
})
}
/**
* Create a master combined sitemap from all individual sitemaps
*/
async function createMasterSitemap(sitemapFiles, allUrls, siteCatalog, logger) {
const sitemapCount = sitemapFiles.length
const sitemapWord = sitemapCount === 1 ? 'sitemap' : 'sitemaps'
let markdown = '# Complete documentation sitemap\n\n'
markdown += `> Combined view of all ${allUrls.length.toLocaleString()} documentation pages from ${sitemapCount} ${sitemapWord}\n\n`
// Add overview of source sitemaps
markdown += '## Source sitemaps\n\n'
for (const sitemapFile of sitemapFiles) {
const basename = path.basename(sitemapFile.out.path)
const mdName = basename.replace(/\.xml$/, '.md')
markdown += `- [${basename}](${mdName})\n`
}
markdown += '\n'
// Add all URLs grouped by component
if (allUrls.length > 0) {
markdown += convertUrlsetToMarkdown(allUrls)
}
// Add to site catalog
siteCatalog.addFile({
contents: Buffer.from(markdown, 'utf8'),
out: { path: 'sitemap-all.md' },
})
logger.info(`Generated master sitemap: sitemap-all.md (${allUrls.length} pages)`)
}
/**
* Convert a sitemap file to markdown
* Returns the URLs found in this sitemap for aggregation
*/
async function convertSitemapToMarkdown(sitemapFile, siteCatalog, logger) {
const xmlContent = sitemapFile.contents.toString('utf8')
const basename = path.basename(sitemapFile.out.path)
const outputPath = basename.replace(/\.xml$/, '.md')
try {
const parsed = await parseStringPromise(xmlContent)
let markdown = '# Sitemap\n\n'
markdown += `> Documentation sitemap generated from ${basename}\n\n`
let urls = []
// Handle standard sitemap
if (parsed.urlset && parsed.urlset.url) {
urls = parsed.urlset.url
markdown += convertUrlsetToMarkdown(urls)
}
// Handle sitemap index
if (parsed.sitemapindex && parsed.sitemapindex.sitemap) {
markdown += convertSitemapIndexToMarkdown(parsed.sitemapindex.sitemap)
}
// Add to site catalog
siteCatalog.addFile({
contents: Buffer.from(markdown, 'utf8'),
out: { path: outputPath },
})
logger.debug(`Generated ${outputPath}`)
return urls
} catch (error) {
logger.warn(`Failed to parse ${basename}: ${error.message}`)
return []
}
}
/**
* Convert URL entries to markdown
*/
function convertUrlsetToMarkdown(urls) {
let markdown = '## Pages\n\n'
markdown += `Total pages: ${urls.length.toLocaleString()}\n\n`
// Group URLs by component/path
const groups = groupUrlsByPath(urls)
for (const [groupName, groupUrls] of Object.entries(groups)) {
markdown += `### ${groupName}\n\n`
for (const url of groupUrls) {
const loc = url.loc ? url.loc[0] : ''
const lastmod = url.lastmod ? url.lastmod[0] : ''
const changefreq = url.changefreq ? url.changefreq[0] : ''
const priority = url.priority ? url.priority[0] : ''
if (loc) {
markdown += `- [${extractPageTitle(loc)}](${loc})`
const metadata = []
if (lastmod) metadata.push(`modified: ${lastmod}`)
if (changefreq) metadata.push(`frequency: ${changefreq}`)
if (priority) metadata.push(`priority: ${priority}`)
if (metadata.length > 0) {
markdown += ` (${metadata.join(', ')})`
}
markdown += '\n'
}
}
markdown += '\n'
}
return markdown
}
/**
* Convert sitemap index to markdown
*/
function convertSitemapIndexToMarkdown(sitemaps) {
const count = sitemaps.length
const sitemapWord = count === 1 ? 'sub-sitemap' : 'sub-sitemaps'
let markdown = '## Sitemap index\n\n'
markdown += `This sitemap index contains ${count} ${sitemapWord}:\n\n`
for (const sitemap of sitemaps) {
const loc = sitemap.loc ? sitemap.loc[0] : ''
const lastmod = sitemap.lastmod ? sitemap.lastmod[0] : ''
if (loc) {
// Convert .xml URL to .md for markdown version
const mdUrl = loc.replace(/\.xml$/, '.md')
const basename = path.basename(loc)
markdown += `- [${basename}](${mdUrl})`
if (lastmod) {
markdown += ` (modified: ${lastmod})`
}
markdown += '\n'
}
}
markdown += '\n'
return markdown
}
/**
* Group URLs by their path prefix for better organization
*/
function groupUrlsByPath(urls) {
const groups = {}
for (const url of urls) {
const loc = url.loc ? url.loc[0] : ''
if (!loc) continue
// Extract component from URL path
const urlPath = new URL(loc).pathname
const parts = urlPath.split('/').filter(p => p)
let groupName = 'Root'
if (parts.length > 0) {
// Use first meaningful path segment as group
groupName = parts[0]
// Capitalize and format
groupName = groupName
.split('-')
.map(word => word.charAt(0).toUpperCase() + word.slice(1))
.join(' ')
}
if (!groups[groupName]) {
groups[groupName] = []
}
groups[groupName].push(url)
}
// Sort groups alphabetically
const sortedGroups = {}
Object.keys(groups).sort().forEach(key => {
sortedGroups[key] = groups[key]
})
return sortedGroups
}
/**
* Extract a readable page title from URL
*/
function extractPageTitle(url) {
const urlPath = new URL(url).pathname
const parts = urlPath.split('/').filter(p => p)
if (parts.length === 0) return 'Home'
// Get the last meaningful part
let title = parts[parts.length - 1]
// Remove .html extension
title = title.replace(/\.html$/, '')
// Convert dashes to spaces and capitalize
title = title
.split('-')
.map(word => word.charAt(0).toUpperCase() + word.slice(1))
.join(' ')
return title
}