/*
 * @antora/lunr-extension
 *
 * An Antora extension that adds offline, full-text search powered by Lunr to your Antora documentation site.
 *
 * Source listing: 248 lines (218 loc), 9.55 kB, JavaScript.
 */
const lunr = require('lunr')
const {
parseDocument,
DomUtils: { find, isTag, removeElement, textContent },
} = require('htmlparser2')
// Tag names that extractIndexContent treats as headings when collecting (and removing) section titles.
const HEADING_TAG_NAMES = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
// Options passed to every parseDocument call in this file. Both lower-casing flags are turned off,
// presumably so tag and attribute names keep their original case (confirm against the htmlparser2 docs).
const PARSE_OPTS = { lowerCaseTags: false, lowerCaseAttributeNames: false }
/**
* Generate a Lunr index.
*
* Iterates over the specified pages and creates a Lunr index.
*
* @memberof lunr-extension
*
* @param {Object} playbook - The configuration object for Antora.
* @param {Object} contentCatalog - The Antora content catalog, with pages and metadata.
* @param {Object} [config={}] - Configuration options
* @param {Boolean} config.indexLatestOnly - If true, only index the latest version of any given page.
* @param {Array<String>} config.languages - List of index languages
* @param {Array<String>} config.extraStopWords - List of extra stop words
* @param {Object} config.logger - Logger to use
* @typedef {Object} SearchIndexData
* @property {lunr.Index} index - a Lunr index
* @property {Object} store - the documents store
* @returns {SearchIndexData} A data object that contains the Lunr index and documents store
*/
function generateIndex (playbook, contentCatalog, config = {}) {
let { indexLatestOnly = false, languages = ['en'], extraStopWords = [], logger } = config
if (!logger) logger = process.env.NODE_ENV === 'test' ? { info: () => undefined } : console
logger.info('Building search index with the language(s): %s', languages.join(', '))
// Select indexable pages
const pages = contentCatalog.getPages((page) => {
if (!page.out || page.asciidoc?.attributes?.noindex != null) return
if (indexLatestOnly) {
const component = contentCatalog.getComponent(page.src.component)
if (contentCatalog.getComponentVersion(component, page.src.version) !== component.latest) return
}
return true
})
if (!pages.length) return {}
// Use short numerical identifiers (as ref) to keep the Lunr index as small as possible.
// Since it's an identifier (and not an index) we start at 1.
let id = 1
// Extract document objects from indexable pages
const documents = pages.reduce((accum, page) => {
const dom = parseDocument(String(page.contents ?? ''), PARSE_OPTS)
const html = dom.children.find(isTagNamed.bind(null, 'html'))
if (html) {
const head = html.children.find(isTagNamed.bind(null, 'head'))
if (head) {
const noindex = head.children.find(
(it) => isTagNamed('meta', it) && it.attribs.name === 'robots' && it.attribs.content === 'noindex'
)
// Only index page if not marked as "noindex" by "robots" meta tag
if (noindex) return accum
}
}
return accum.push({ id: id++, ...extractIndexContent(page, dom) }) && accum
}, [])
// Extracts the language codes from the langauge-stop words object
const languageCodes = languages.map((it) => (it[Object.keys(it)] ? Object.keys(it)[0] : it))
if (languageCodes.length > 1 || !languageCodes.includes('en')) {
if (languageCodes.length > 1 && !lunr.multiLanguage) {
// required, otherwise lunr.multiLanguage will be undefined
require('lunr-languages/lunr.multi')(lunr)
}
// required, to load additional languages
require('lunr-languages/lunr.stemmer.support')(lunr)
languageCodes.forEach((languageCode) => {
if (languageCode === 'ja' && !lunr.TinySegmenter) {
require('lunr-languages/tinyseg')(lunr) // needed for Japanese Support
}
if (languageCode === 'th' && !lunr.wordcut) {
lunr.wordcut = require('lunr-languages/wordcut') // needed for Thai support
}
if (languageCode !== 'en' && !lunr[languageCode]) {
require(`lunr-languages/lunr.${languageCode}`)(lunr)
}
})
}
// Map of Lunr ref (id) to document
const store = {
documents: {},
components: {},
}
// If extra stop words defined create new pipeline function to add them
const addExtraStopWords = extraStopWords.length > 0 ? lunr.generateStopWordFilter(extraStopWords) : undefined
// If extra stop words function defined register it with the pipeline
if (addExtraStopWords != null) lunr.Pipeline.registerFunction(addExtraStopWords, 'extraStopWords')
// Filters out langauge definitions without any stop words defined
// Creates a new pipeline stop words function for each language that has stop words defined
const languageStopWordsFunctions = languages
.filter((it) => it[Object.keys(it)[0]].extraStopWords != null)
.map((it) => lunr.generateStopWordFilter(it[Object.keys(it)[0]].extraStopWords))
// Registers each of the language specific stop words functions as a pipeline function
languageStopWordsFunctions.forEach((fn, idx) =>
lunr.Pipeline.registerFunction(fn, 'extraStopWords-' + languageCodes[idx])
)
// Construct the Lunr index from the extracted content
const index = lunr(function () {
if (languageCodes.length > 1) {
this.use(lunr.multiLanguage(...languageCodes))
} else if (!languageCodes.includes('en')) {
this.use(lunr[languageCodes[0]])
}
// Finds the stop words stage so the functions with extra stop words can be inserted after
// If 'en' is being used as a language find the default 'en' stop words stage else find the
// stop words stage of the first other langauge passed in
const stopWordFilter = languageCodes.includes('en') ? lunr.stopWordFilter : lunr[languageCodes[0]].stopWordFilter
// If there are global stop words add the function to the pipeline after the default stop words functions
if (addExtraStopWords != null) this.pipeline.after(stopWordFilter, addExtraStopWords)
// Adds the language specific stop words functions to the pipeline after the default stop words functions
languageStopWordsFunctions.forEach((fn) => this.pipeline.after(lunr.stopWordFilter, fn))
this.ref('id')
this.field('title', { boost: 10 })
this.field('name')
this.field('text')
this.field('component')
this.field('keyword', { boost: 5 })
documents.forEach((doc) => {
doc.titles.forEach((title) => {
this.add({ id: `${doc.id}-${title.id}`, title: title.text })
})
this.add(doc)
store.documents[doc.id] = doc
})
})
const componentVersions = {}
const components = contentCatalog.getComponents()
for (const component of components) {
for (const version of component.versions) {
componentVersions[`${component.name}/${version.version}`] = version
}
}
store.componentVersions = componentVersions
return { index, store }
}
/**
 * Extract the index content for a given page.
 *
 * Note that this function mutates the given DOM: the pagination navigation, the page title
 * and all headings are removed from it so that they do not end up in the indexed text.
 *
 * @param {Object} page - The page to index. Reads `title`, `src.component`, `src.version`,
 *   `src.stem`, `pub.url` and (if present) `asciidoc.attributes.keywords`.
 * @param {Object} dom - The document returned by `parseDocument` for the page contents.
 * @returns {Object} Indexable content for the page: `text`, `title`, `component`, `version`,
 *   `name`, `url`, `titles` (one `{ text, hash, id }` entry per heading) and `keyword`.
 */
function extractIndexContent (page, dom) {
  // Fetch just the article content, so we don't index the TOC and other on-page text
  let contextNodes = dom.children
  const article = find((it) => isTagNamedWithClass('article', 'doc', it), contextNodes, true, 1)[0]
  if (article) {
    contextNodes = article.children
    // don't index navigation elements for pagination on each page
    // as these are the titles of other pages and it would otherwise pollute the index.
    const pagination = find((it) => isTagNamedWithClass('nav', 'pagination', it), contextNodes, true, 1)[0]
    if (pagination) removeElement(pagination)
  }

  // Locate the page title: an h1 with the "page" class wins (and ends the search);
  // otherwise fall back to the first h1 that was encountered.
  let pageTitle
  find(
    (it) => {
      if (!isTagNamed('h1', it)) return false
      if (hasClass('page', it)) return !!(pageTitle = it)
      pageTitle ??= it
      return false
    },
    contextNodes,
    true,
    1
  )
  let documentTitle
  if (pageTitle) {
    // Detach the title from the tree so it is neither indexed as a section title nor as text
    removeElement(pageTitle)
    documentTitle = textContent(pageTitle.children)
  } else {
    // No h1 in the content; fall back to the page's title (which may contain markup)
    const rawTitle = page.title ?? ''
    documentTitle = rawTitle && textContent(parseDocument(rawTitle, PARSE_OPTS).children)
  }

  // Collect every heading first and only then remove them. Removing a node while the tree is still
  // being traversed shifts its siblings, which makes the traversal skip the node that follows it.
  const headings = find((it) => isTagNamed(HEADING_TAG_NAMES, it), contextNodes, true, Infinity)
  const titles = headings.map((heading, idx) => ({
    text: textContent(heading.children),
    hash: heading.attribs.id,
    id: idx + 1,
  }))
  // Remove the found headings, to improve search results
  headings.forEach((heading) => removeElement(heading))

  // Not every page carries AsciiDoc metadata, so guard the lookup
  const keywords = page.asciidoc?.attributes?.keywords

  // Pull the text from the article
  const text = textContent(contextNodes).replace(/\s+/g, ' ').trim()

  // Return the indexable content, organized by type
  return {
    text: text,
    title: documentTitle,
    component: page.src.component,
    version: page.src.version,
    name: page.src.stem,
    url: page.pub.url,
    titles: titles, // TODO get title id to be able to use fragment identifier
    keyword: keywords,
  }
}
/**
 * Helper function allowing Antora to create a site asset containing the index.
 *
 * @param {Object} index - The data to serialize (as JSON) into the generated script.
 * @returns {Object} A file-like object (mediaType, contents, src, out, pub) whose contents is a
 *   script that passes the serialized data to `antoraSearch.initSearch`.
 */
function createIndexFile (index) {
  const serializedIndex = JSON.stringify(index)
  const script = `antoraSearch.initSearch(lunr, ${serializedIndex})`
  const file = {
    mediaType: 'application/javascript',
    contents: Buffer.from(script),
  }
  file.src = { stem: 'search-index' }
  file.out = { path: 'search-index.js' }
  file.pub = { url: '/search-index.js', rootPath: '' }
  return file
}
/**
 * Check whether the class attribute of a node contains the given class name.
 *
 * @param {String} className - The class name to look for.
 * @param {Object} node - A node with an `attribs` object.
 * @returns {Boolean|undefined} Whether the whitespace-separated class list includes the name,
 *   or undefined if the node has no class attribute.
 */
function hasClass (className, node) {
  const classAttr = node.attribs.class
  if (classAttr == null) return undefined
  const classNames = classAttr.split(/\s+/)
  return classNames.includes(className)
}
/**
 * Check whether a node is a tag with the given name, or with one of the given names.
 *
 * @param {String|Array<String>} tagName - A single tag name or a list of accepted tag names.
 * @param {Object} node - The node to test.
 * @returns {Boolean} True if `isTag` accepts the node and its name matches.
 */
function isTagNamed (tagName, node) {
  const nodeIsTag = isTag(node)
  if (!nodeIsTag) return nodeIsTag
  if (Array.isArray(tagName)) return tagName.includes(node.name)
  return node.name === tagName
}
/**
 * Check whether a node is a tag with the given name that also carries the given class.
 *
 * @param {String|Array<String>} tagName - Tag name (or names) forwarded to isTagNamed.
 * @param {String} className - Class name forwarded to hasClass.
 * @param {Object} node - The node to test.
 * @returns {Boolean|undefined} The isTagNamed result if it is falsy, otherwise the hasClass result.
 */
function isTagNamedWithClass (tagName, className, node) {
  const matchesTag = isTagNamed(tagName, node)
  return matchesTag ? hasClass(className, node) : matchesTag
}
// The module's export is the generateIndex function itself;
// createIndexFile is exposed as a property of that function.
module.exports = generateIndex
module.exports.createIndexFile = createIndexFile