/*
 * @antora/lunr-extension
 * Version:
 * An Antora extension that adds offline, full-text search powered by Lunr to your Antora documentation site.
 * 318 lines (283 loc) • 11.8 kB
 * JavaScript
 */
const lunr = require('lunr')
const {
parseDocument,
DomUtils: { find, innerText, isTag, removeElement },
} = require('htmlparser2')
// Tag names matched (via isTagNamed) when the content extractor looks for heading elements.
const HEADING_TAG_NAMES = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
// Options handed to parseDocument; both lower-casing switches (tags and attribute names) are turned off.
const PARSE_OPTS = { lowerCaseTags: false, lowerCaseAttributeNames: false }
/**
* Generate a Lunr index.
*
* Iterates over the specified pages and creates a Lunr index.
*
* @memberof lunr-extension
*
* @param {Object} playbook - The configuration object for Antora.
* @param {Object} contentCatalog - The Antora content catalog, with pages and metadata.
* @param {Object} [config={}] - Configuration options
* @param {Boolean} config.indexLatestOnly - If true, only index the latest version of any given page.
* @param {Array<String>} config.languages - List of index languages
* @param {Array<String>} config.extraStopWords - List of extra stop words
* @param {Boolean} config.indexByHeading - If true, index each heading as a separate searchable chunk instead of indexing the whole page.
* @param {Object} config.logger - Logger to use
* @typedef {Object} SearchIndexData
* @property {lunr.Index} index - a Lunr index
* @property {Object} store - the documents store
* @returns {SearchIndexData} A data object that contains the Lunr index and documents store
*/
function generateIndex (playbook, contentCatalog, config = {}) {
let { indexLatestOnly = false, languages = ['en'], extraStopWords = [], indexByHeading, logger } = config
if (!logger) logger = playbook.env.NODE_ENV === 'test' ? { info: () => undefined } : console
logger.info('Building search index with the language(s): %s', languages.join(', '))
// Select indexable pages
const pages = contentCatalog.getPages((page) => {
if (!page.out || page.asciidoc?.attributes?.noindex != null) return
if (indexLatestOnly) {
const component = contentCatalog.getComponent(page.src.component)
if (contentCatalog.getComponentVersion(component, page.src.version) !== component.latest) return
}
return true
})
if (!pages.length) return {}
// Use short numerical identifiers (as ref) to keep the Lunr index as small as possible.
// Since it's an identifier (and not an index) we start at 1.
let id = 1
// Extract document objects from indexable pages
const documents = pages.reduce((accum, page) => {
const dom = parseDocument(String(page.contents ?? ''), PARSE_OPTS)
const html = dom.children.find(isTagNamed.bind(null, 'html'))
if (html) {
const head = html.children.find(isTagNamed.bind(null, 'head'))
if (head) {
const noindex = head.children.find(
(it) => isTagNamed('meta', it) && it.attribs.name === 'robots' && it.attribs.content === 'noindex'
)
// Only index page if not marked as "noindex" by "robots" meta tag
if (noindex) return accum
}
}
return accum.push({ id: id++, ...extractIndexContent(page, dom, indexByHeading) }) && accum
}, [])
// Extracts the language codes from the language-stop words object
const languageCodes = languages.map((it) => (it[Object.keys(it)] ? Object.keys(it)[0] : it))
if (languageCodes.length > 1 || !languageCodes.includes('en')) {
if (languageCodes.length > 1 && !lunr.multiLanguage) {
// required, otherwise lunr.multiLanguage will be undefined
require('lunr-languages/lunr.multi')(lunr)
}
// required, to load additional languages
require('lunr-languages/lunr.stemmer.support')(lunr)
languageCodes.forEach((languageCode) => {
if (languageCode === 'ja' && !lunr.TinySegmenter) {
require('lunr-languages/tinyseg')(lunr) // needed for Japanese Support
}
if (languageCode === 'th' && !lunr.wordcut) {
lunr.wordcut = require('lunr-languages/wordcut') // needed for Thai support
}
if (languageCode !== 'en' && !lunr[languageCode]) {
require(`lunr-languages/lunr.${languageCode}`)(lunr)
}
})
}
// Map of Lunr ref (id) to document
const store = {
documents: {},
components: {},
}
// If extra stop words defined create new pipeline function to add them
const addExtraStopWords = extraStopWords.length > 0 ? lunr.generateStopWordFilter(extraStopWords) : undefined
// If extra stop words function defined register it with the pipeline
if (addExtraStopWords != null) lunr.Pipeline.registerFunction(addExtraStopWords, 'extraStopWords')
// Filters out langauge definitions without any stop words defined
// Creates a new pipeline stop words function for each language that has stop words defined
const languageStopWordsFunctions = languages
.filter((it) => it[Object.keys(it)[0]].extraStopWords != null)
.map((it) => lunr.generateStopWordFilter(it[Object.keys(it)[0]].extraStopWords))
// Registers each of the language specific stop words functions as a pipeline function
languageStopWordsFunctions.forEach((fn, idx) => {
lunr.Pipeline.registerFunction(fn, 'extraStopWords-' + languageCodes[idx])
})
// Construct the Lunr index from the extracted content
const index = lunr(function () {
if (languageCodes.length > 1) {
this.use(lunr.multiLanguage(...languageCodes))
} else if (!languageCodes.includes('en')) {
this.use(lunr[languageCodes[0]])
}
// Finds the stop words stage so the functions with extra stop words can be inserted after
// If 'en' is being used as a language find the default 'en' stop words stage else find the
// stop words stage of the first other langauge passed in
const stopWordFilter = languageCodes.includes('en') ? lunr.stopWordFilter : lunr[languageCodes[0]].stopWordFilter
// If there are global stop words add the function to the pipeline after the default stop words functions
if (addExtraStopWords != null) this.pipeline.after(stopWordFilter, addExtraStopWords)
// Adds the language specific stop words functions to the pipeline after the default stop words functions
languageStopWordsFunctions.forEach((fn) => {
this.pipeline.after(lunr.stopWordFilter, fn)
})
this.ref('id')
this.field('title', { boost: 10 })
this.field('name')
this.field('text')
this.field('component')
this.field('keyword', { boost: 5 })
documents.forEach((doc) => {
doc.titles.forEach((title) => {
indexByHeading
? this.add({ id: `${doc.id}-${title.id}`, title: title.title, text: title.text })
: this.add({ id: `${doc.id}-${title.id}`, title: title.text })
})
this.add(doc)
store.documents[doc.id] = doc
})
})
const componentVersions = {}
const components = contentCatalog.getComponents()
for (const component of components) {
for (const version of component.versions) {
componentVersions[`${component.name}/${version.version}`] = version
}
}
store.componentVersions = componentVersions
return { index, store }
}
/**
 * Extract the index content for a given page.
 *
 * The parsed document is modified in place: the pagination nav and the page title element are removed,
 * and, unless indexByHeading is set, every heading element is removed as well.
 *
 * @param {Object} page - The page being indexed. Its `title`, `asciidoc.attributes.keywords`, `src` and
 *   `pub.url` are read.
 * @param {Object} dom - The parsed document for the page contents (the value returned by `parseDocument`).
 * @param {Boolean} indexByHeading - If true, index each heading as a separate searchable chunk instead of indexing the whole page.
 * @returns {Object} Indexable content for a given page.
 */
function extractIndexContent (page, dom, indexByHeading) {
  // Fetch just the article content, so we don't index the TOC and other on-page text
  let contextNodes = dom.children
  const article = find((it) => isTagNamedWithClass('article', 'doc', it), contextNodes, true, 1)[0]
  if (article) {
    contextNodes = article.children
    // don't index navigation elements for pagination on each page
    // as these are the titles of other pages and it would otherwise pollute the index.
    const pagination = find((it) => isTagNamedWithClass('nav', 'pagination', it), contextNodes, true, 1)[0]
    if (pagination) removeElement(pagination)
  }

  // Prefer the h1 carrying the "page" class; otherwise fall back to the first h1 encountered.
  let pageTitle
  find(
    (it) => {
      if (!isTagNamed('h1', it)) return false
      if (hasClass('page', it)) {
        pageTitle = it
        return true // stops the search (limit is 1)
      }
      pageTitle ??= it
      return false
    },
    contextNodes,
    true,
    1
  )

  let documentTitle
  if (pageTitle) {
    // Detach the title so it is not indexed again as a heading or as body text
    removeElement(pageTitle)
    documentTitle = innerText(pageTitle.children)
  } else {
    documentTitle = (page.title ?? '') && innerText(parseDocument(page.title, PARSE_OPTS).children)
  }

  const titles = []
  const keywords = page.asciidoc?.attributes?.keywords
  let text = ''
  if (indexByHeading) {
    // One entry per heading, carrying the text that follows it up to the next heading
    collectSections(contextNodes, titles)
  } else {
    // Remove any found headings, to improve search results
    let id = 1
    find(
      (it) => {
        if (!isTagNamed(HEADING_TAG_NAMES, it)) return false
        titles.push({ text: innerText(it.children), hash: it.attribs.id, id: id++ })
        removeElement(it)
        // nothing is returned on purpose: the headings are gathered in titles, not in the find result
      },
      contextNodes,
      true
    )
    // Pull the text from the article
    text = innerText(contextNodes).replace(/\s+/g, ' ').trim()
  }

  // Return the indexable content, organized by type
  return {
    text,
    title: documentTitle,
    component: page.src.component,
    version: page.src.version,
    name: page.src.stem,
    url: page.pub.url,
    titles, // TODO get title id to be able to use fragment identifier
    keyword: keywords,
  }
}

// Walks nodes depth-first and appends one { title, text, hash, id } entry to sections per heading element.
function collectSections (nodes, sections) {
  for (let i = 0; i < nodes.length; i++) {
    const node = nodes[i]
    if (isTagNamed(HEADING_TAG_NAMES, node)) {
      const sectionNodes = collectNodesUntilHeading(nodes, i + 1)
      sections.push({
        title: innerText(node.children),
        text: innerText(sectionNodes).replace(/\s+/g, ' ').trim(),
        hash: node.attribs.id,
        id: sections.length + 1, // 1-based, in document order
      })
    }
    if (node.children?.length) collectSections(node.children, sections)
  }
}

// Collects the nodes from startIdx onward, in document order, stopping at the first heading at any depth.
function collectNodesUntilHeading (startNodes, startIdx) {
  let stop = false
  const traverse = (nodes) => {
    const collected = []
    for (const node of nodes) {
      if (isTagNamed(HEADING_TAG_NAMES, node)) {
        stop = true
        break
      }
      if (node.children?.length) {
        const childCollected = traverse(node.children)
        if (stop) {
          // A heading was met inside this node: keep a shallow copy holding only the children seen before it
          collected.push({ ...node, children: childCollected })
          break
        }
      }
      collected.push(node)
    }
    return collected
  }
  return traverse(startNodes.slice(startIdx))
}
/**
 * Helper function allowing Antora to create a site asset containing the index.
 *
 * @param {Object} index - The data to serialize as JSON into the generated script.
 * @returns {Object} A file-like object (mediaType, contents, src, out, pub) for search-index.js.
 */
function createIndexFile (index) {
  const serializedIndex = JSON.stringify(index)
  const script = `antoraSearch.initSearch(lunr, ${serializedIndex})`
  const file = {
    mediaType: 'application/javascript',
    contents: Buffer.from(script),
    src: { stem: 'search-index' },
    out: { path: 'search-index.js' },
    pub: { url: '/search-index.js', rootPath: '' },
  }
  return file
}
/**
 * Tests whether the node's class attribute contains the given class name.
 *
 * @param {String} className - The class name to look for.
 * @param {Object} node - A node with an `attribs` object.
 * @returns {Boolean|undefined} Whether the whitespace-separated class list includes className, or
 *   undefined when the node has no class attribute.
 */
function hasClass (className, node) {
  const { class: classAttr } = node.attribs
  if (classAttr == null) return undefined
  const classNames = classAttr.split(/\s+/)
  return classNames.includes(className)
}
/**
 * Tests whether the node is a tag whose name matches.
 *
 * @param {String|Array<String>} tagName - A tag name, or a list of acceptable tag names.
 * @param {Object} node - The node to test.
 * @returns {Boolean} True if the node is a tag and its name equals tagName (or is one of the listed names).
 */
function isTagNamed (tagName, node) {
  if (!isTag(node)) return false
  if (Array.isArray(tagName)) return tagName.includes(node.name)
  return node.name === tagName
}
/**
 * Tests whether the node is a tag with the given name that also carries the given class.
 *
 * @param {String|Array<String>} tagName - The tag name(s) to match, as accepted by isTagNamed.
 * @param {String} className - The class name to look for, as accepted by hasClass.
 * @param {Object} node - The node to test.
 * @returns {Boolean|undefined} False when the tag name does not match; otherwise the result of hasClass.
 */
function isTagNamedWithClass (tagName, className, node) {
  if (!isTagNamed(tagName, node)) return false
  return hasClass(className, node)
}
// generateIndex is the module's export; createIndexFile is attached to it as a property.
module.exports = generateIndex
module.exports.createIndexFile = createIndexFile