@redpanda-data/docs-extensions-and-macros
Version:
Antora extensions and macros developed for Redpanda documentation.
168 lines (140 loc) • 5.47 kB
JavaScript
/**
* Adds markdown URL entries to sitemap.xml files for AI-friendly documentation.
*
* This extension enhances Antora's generated sitemaps by adding <url> entries
* for markdown versions of pages alongside the HTML versions. This improves
* compatibility with agent-friendly documentation tools that expect markdown
* URLs to be discoverable in sitemaps.
*
* The extension:
* - Finds all sitemap XML files in the site catalog
* - For each HTML URL entry, adds a corresponding .md URL entry
* - Preserves lastmod dates from the HTML versions
* - Works with both main sitemaps and component-specific sitemaps
*
* @see https://agentdocsspec.com/spec/#llms-txt-freshness
*/
const { Builder, parseStringPromise } = require('xml2js')
const { toMarkdownUrl } = require('../extension-utils/url-utils')
/**
 * Antora extension entry point.
 *
 * Subscribes to the `beforePublish` event and, for every sitemap file found in
 * the site catalog, delegates to `addMarkdownUrlsToSitemap`. Any error raised
 * while doing so is logged and swallowed so the handler itself never throws.
 */
module.exports.register = function () {
  const logger = this.getLogger('add-markdown-urls-to-sitemap-extension')

  // A catalog file is treated as a sitemap when its output path ends in `.xml`
  // and is either exactly `sitemap.xml` or begins with `sitemap-`.
  const isSitemapFile = (file) => {
    const outPath = file.out.path
    if (!outPath.endsWith('.xml')) return false
    return outPath === 'sitemap.xml' || outPath.startsWith('sitemap-')
  }

  // Handler for `beforePublish`: processes the sitemaps one after another and
  // reports the combined number of entries added.
  const enhanceSitemaps = async ({ siteCatalog }) => {
    try {
      const sitemaps = siteCatalog.getFiles().filter(isSitemapFile)
      const sitemapCount = sitemaps.length
      if (sitemapCount === 0) {
        logger.info('No component sitemap files found')
        return
      }

      logger.info(`Processing ${sitemapCount} sitemap file(s)...`)

      let markdownUrlCount = 0
      for (const sitemap of sitemaps) {
        markdownUrlCount += await addMarkdownUrlsToSitemap(sitemap, logger)
      }

      logger.info(`Added ${markdownUrlCount} markdown URL entries across ${sitemapCount} sitemap(s)`)
    } catch (err) {
      // Deliberately not rethrown - sitemap enhancement is not critical
      logger.error(`Failed to add markdown URLs to sitemaps: ${err.message}`)
    }
  }

  this.on('beforePublish', enhanceSitemaps)
}
/**
 * Read the text of the first `<loc>` of a parsed sitemap `<url>` entry.
 *
 * The parser may hand back the element either as a plain string or as an
 * object carrying the text under `_`; both shapes are unwrapped here. String
 * values are trimmed so surrounding whitespace cannot defeat suffix checks or
 * duplicate detection.
 *
 * @param {Object} entry - One element of `parsed.urlset.url`
 * @returns {*} The trimmed loc string, the raw value when it is not a string,
 *   or `undefined` when the entry has no usable `<loc>`
 */
function readSitemapLoc(entry) {
  const first = entry.loc && entry.loc[0]
  if (!first) return undefined
  const value = typeof first === 'object' && first._ ? first._ : first
  if (typeof value !== 'string') return value
  const trimmed = value.trim()
  return trimmed === '' ? undefined : trimmed
}

/**
 * Add markdown URL entries to a single sitemap file.
 *
 * Parses the sitemap XML and, for every `<url>` whose `<loc>` does not already
 * end in `.md`, `.txt` or `.xml`, appends a sibling `<url>` whose `<loc>` is the
 * origin of the original URL plus `toMarkdownUrl(pathname)`. The `<lastmod>` of
 * the original entry is carried over when present. `sitemapFile.contents` is
 * replaced only when at least one entry was added.
 *
 * A `<loc>` that cannot be parsed as an absolute URL is skipped on its own; it
 * does not prevent the remaining entries of the sitemap from being processed.
 * Any other failure is logged through `logger.error` and reported as `0`.
 *
 * @param {Object} sitemapFile - The sitemap file from site catalog
 * @param {Object} logger - Logger instance
 * @returns {Promise<number>} Number of markdown URLs added
 */
async function addMarkdownUrlsToSitemap(sitemapFile, logger) {
  try {
    const xmlContent = sitemapFile.contents.toString('utf8')
    const parsed = await parseStringPromise(xmlContent, {
      explicitArray: true,
      xmlns: false, // Don't create namespace objects
      tagNameProcessors: [], // Keep tag names as-is
    })

    // Files without a <urlset> root (or with no <url> children) are left alone
    if (!parsed || !parsed.urlset || !parsed.urlset.url) {
      logger.debug(`No URLs found in ${sitemapFile.out.path}`)
      return 0
    }

    const urlEntries = parsed.urlset.url
    const skippedSuffixes = ['.md', '.txt', '.xml']

    // Collect all existing loc strings to prevent duplicates
    const existingLocs = new Set()
    for (const entry of urlEntries) {
      const loc = readSitemapLoc(entry)
      if (typeof loc === 'string') existingLocs.add(loc)
    }

    // For each HTML URL, create a markdown URL entry
    const newEntries = []
    for (const entry of urlEntries) {
      const htmlUrl = readSitemapLoc(entry)
      if (htmlUrl === undefined) continue

      if (typeof htmlUrl !== 'string') {
        logger.debug(`Skipping non-string URL: ${JSON.stringify(htmlUrl)}`)
        continue
      }

      // Skip if it's already a markdown URL or special file
      if (skippedSuffixes.some((suffix) => htmlUrl.endsWith(suffix))) continue

      // `new URL` throws on anything that is not an absolute URL; contain that
      // to this entry instead of abandoning the whole sitemap.
      let urlObj
      try {
        urlObj = new URL(htmlUrl)
      } catch (error) {
        logger.debug(`Skipping invalid URL: ${htmlUrl} (${error.message})`)
        continue
      }

      // Convert HTML URL to markdown URL
      const mdUrl = `${urlObj.origin}${toMarkdownUrl(urlObj.pathname)}`

      // Skip if this markdown URL already exists in the sitemap or was already added
      if (existingLocs.has(mdUrl)) {
        logger.debug(`Skipping duplicate markdown URL: ${mdUrl}`)
        continue
      }

      // Create new entry for markdown URL with same lastmod
      const mdEntry = { loc: [mdUrl] }
      if (entry.lastmod && entry.lastmod[0]) {
        mdEntry.lastmod = entry.lastmod
      }

      newEntries.push(mdEntry)
      existingLocs.add(mdUrl) // Track this URL to prevent duplicates within this run
    }

    if (newEntries.length === 0) {
      logger.debug(`No markdown URLs to add for ${sitemapFile.out.path}`)
      return 0
    }

    // Add markdown entries to the sitemap
    urlEntries.push(...newEntries)

    // Rebuild XML with xml2js builder
    const builder = new Builder({
      xmldec: { version: '1.0', encoding: 'UTF-8' },
      xmlns: true,
      renderOpts: {
        pretty: true,
        indent: ' ',
      },
    })
    const newXml = builder.buildObject(parsed)

    // Update the file contents
    sitemapFile.contents = Buffer.from(newXml, 'utf8')
    logger.debug(`Added ${newEntries.length} markdown URLs to ${sitemapFile.out.path}`)
    return newEntries.length
  } catch (error) {
    logger.error(`Error processing ${sitemapFile.out.path}: ${error.message}`)
    return 0
  }
}