UNPKG

@antora/lunr-extension

Version:

An Antora extension that adds offline, full-text search powered by Lunr to your Antora documentation site.

318 lines (283 loc) • 11.8 kB

JavaScript

'use strict'

const lunr = require('lunr')
const {
  parseDocument,
  DomUtils: { find, innerText, isTag, removeElement },
} = require('htmlparser2')

const HEADING_TAG_NAMES = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
const PARSE_OPTS = { lowerCaseTags: false, lowerCaseAttributeNames: false }

/**
 * Generate a Lunr index.
 *
 * Iterates over the specified pages and creates a Lunr index.
 *
 * @memberof lunr-extension
 *
 * @param {Object} playbook - The configuration object for Antora.
 * @param {Object} contentCatalog - The Antora content catalog, with pages and metadata.
 * @param {Object} [config={}] - Configuration options
 * @param {Boolean} config.indexLatestOnly - If true, only index the latest version of any given page.
 * @param {Array<String|Object>} config.languages - List of index languages. Each entry is either a
 *   language code or a single-key object of the form `{ <code>: { extraStopWords: [...] } }`.
 * @param {Array<String>} config.extraStopWords - List of extra stop words
 * @param {Boolean} config.indexByHeading - If true, index each heading as a separate searchable chunk
 *   instead of indexing the whole page.
 * @param {Object} config.logger - Logger to use
 * @typedef {Object} SearchIndexData
 * @property {lunr.Index} index - a Lunr index
 * @property {Object} store - the documents store
 * @returns {SearchIndexData} A data object that contains the Lunr index and documents store
 *   (an empty object if there are no indexable pages)
 */
function generateIndex (playbook, contentCatalog, config = {}) {
  const { indexLatestOnly = false, languages = ['en'], extraStopWords = [], indexByHeading } = config
  const logger = config.logger || (playbook.env.NODE_ENV === 'test' ? { info: () => undefined } : console)

  // Normalize the language entries once so the code and its stop words always stay paired
  const languageEntries = languages.map((it) => parseLanguageEntry(it))
  const languageCodes = languageEntries.map(({ code }) => code)
  logger.info('Building search index with the language(s): %s', languageCodes.join(', '))

  // Select indexable pages
  const pages = contentCatalog.getPages((page) => {
    if (!page.out || page.asciidoc?.attributes?.noindex != null) return false
    if (indexLatestOnly) {
      const component = contentCatalog.getComponent(page.src.component)
      if (contentCatalog.getComponentVersion(component, page.src.version) !== component.latest) return false
    }
    return true
  })
  if (!pages.length) return {}

  // Use short numerical identifiers (as ref) to keep the Lunr index as small as possible.
  // Since it's an identifier (and not an index) we start at 1.
  let id = 1

  // Extract document objects from indexable pages
  const documents = []
  for (const page of pages) {
    const dom = parseDocument(String(page.contents ?? ''), PARSE_OPTS)
    // Only index page if not marked as "noindex" by "robots" meta tag
    if (isExcludedByRobotsMeta(dom)) continue
    documents.push({ id: id++, ...extractIndexContent(page, dom, indexByHeading) })
  }

  loadLanguageSupport(languageCodes)

  // Map of Lunr ref (id) to document
  const store = {
    documents: {},
    components: {},
  }

  // If extra stop words defined create new pipeline function to add them
  const addExtraStopWords = extraStopWords.length > 0 ? lunr.generateStopWordFilter(extraStopWords) : undefined
  // If extra stop words function defined register it with the pipeline
  if (addExtraStopWords != null) lunr.Pipeline.registerFunction(addExtraStopWords, 'extraStopWords')

  // Create and register a stop-word function for each language that defines extra stop words.
  // The label is derived from the entry itself, not from its position in the filtered list,
  // so each function is registered under its own language code.
  const languageStopWordsFunctions = []
  for (const { code, extraStopWords: words } of languageEntries) {
    if (words == null) continue
    const fn = lunr.generateStopWordFilter(words)
    lunr.Pipeline.registerFunction(fn, 'extraStopWords-' + code)
    languageStopWordsFunctions.push(fn)
  }

  // Construct the Lunr index from the extracted content
  const index = lunr(function () {
    if (languageCodes.length > 1) {
      this.use(lunr.multiLanguage(...languageCodes))
    } else if (!languageCodes.includes('en')) {
      this.use(lunr[languageCodes[0]])
    }
    // Finds the stop words stage so the functions with extra stop words can be inserted after it.
    // If 'en' is being used as a language find the default 'en' stop words stage else find the
    // stop words stage of the first other language passed in
    const stopWordFilter = languageCodes.includes('en') ? lunr.stopWordFilter : lunr[languageCodes[0]].stopWordFilter
    // If there are global stop words add the function to the pipeline after the default stop words function
    if (addExtraStopWords != null) this.pipeline.after(stopWordFilter, addExtraStopWords)
    // Anchor the language specific functions on the same stage as the global one;
    // lunr.stopWordFilter is not expected to be in the pipeline when 'en' is not configured.
    for (const fn of languageStopWordsFunctions) this.pipeline.after(stopWordFilter, fn)

    this.ref('id')
    this.field('title', { boost: 10 })
    this.field('name')
    this.field('text')
    this.field('component')
    this.field('keyword', { boost: 5 })

    for (const doc of documents) {
      // Each heading gets its own entry, keyed by "<document id>-<heading id>"
      for (const title of doc.titles) {
        if (indexByHeading) {
          this.add({ id: `${doc.id}-${title.id}`, title: title.title, text: title.text })
        } else {
          this.add({ id: `${doc.id}-${title.id}`, title: title.text })
        }
      }
      this.add(doc)
      store.documents[doc.id] = doc
    }
  })

  const componentVersions = {}
  for (const component of contentCatalog.getComponents()) {
    for (const version of component.versions) {
      componentVersions[`${component.name}/${version.version}`] = version
    }
  }
  store.componentVersions = componentVersions

  return { index, store }
}

/**
 * Normalize one entry of the languages configuration.
 *
 * @param {String|Object} language - A language code, or a single-key object whose key is the
 *   language code and whose value may carry an `extraStopWords` list.
 * @returns {{code: *, extraStopWords: (Array<String>|undefined)}} The language code (taken from the
 *   first key for object entries) and its extra stop words, if any.
 */
function parseLanguageEntry (language) {
  if (language === null || typeof language !== 'object') return { code: language, extraStopWords: undefined }
  const [code] = Object.keys(language)
  // The value may be missing (e.g. `{ fr: null }`), hence the optional chaining
  return { code, extraStopWords: language[code]?.extraStopWords }
}

/**
 * Check whether the parsed page carries `<meta name="robots" content="noindex">` directly inside
 * a `<head>` that is a direct child of a top-level `<html>` element.
 *
 * @param {Object} dom - Document returned by htmlparser2's parseDocument.
 * @returns {Boolean} true if the page must not be indexed.
 */
function isExcludedByRobotsMeta (dom) {
  const html = dom.children.find((it) => isTagNamed('html', it))
  if (!html) return false
  const head = html.children.find((it) => isTagNamed('head', it))
  if (!head) return false
  return head.children.some(
    (it) => isTagNamed('meta', it) && it.attribs.name === 'robots' && it.attribs.content === 'noindex'
  )
}

/**
 * Load the lunr-languages plugins needed for the given language codes.
 * Does nothing when English is the only configured language.
 *
 * @param {Array<String>} languageCodes - Language codes to enable.
 */
function loadLanguageSupport (languageCodes) {
  if (languageCodes.length <= 1 && languageCodes.includes('en')) return
  if (languageCodes.length > 1 && !lunr.multiLanguage) {
    // required, otherwise lunr.multiLanguage will be undefined
    require('lunr-languages/lunr.multi')(lunr)
  }
  // required, to load additional languages
  require('lunr-languages/lunr.stemmer.support')(lunr)
  for (const languageCode of languageCodes) {
    if (languageCode === 'ja' && !lunr.TinySegmenter) {
      require('lunr-languages/tinyseg')(lunr) // needed for Japanese Support
    }
    if (languageCode === 'th' && !lunr.wordcut) {
      lunr.wordcut = require('lunr-languages/wordcut') // needed for Thai support
    }
    if (languageCode !== 'en' && !lunr[languageCode]) {
      require(`lunr-languages/lunr.${languageCode}`)(lunr)
    }
  }
}

/**
 * Extract the index content for a given page.
 *
 * Note that this function modifies the given DOM: the pagination nav, the page title and
 * (when not indexing by heading) all headings are removed from it.
 *
 * @param {Object<Page>} page - The page to extract indexable content from.
 * @param {Object} dom - Document for the page contents, as returned by htmlparser2's parseDocument.
 * @param {Boolean} indexByHeading - If true, index each heading as a separate searchable chunk
 *   instead of indexing the whole page.
 * @returns {Object} Indexable content for a given page.
 */
function extractIndexContent (page, dom, indexByHeading) {
  // Fetch just the article content, so we don't index the TOC and other on-page text
  let contextNodes = dom.children
  const article = find((it) => isTagNamedWithClass('article', 'doc', it), contextNodes, true, 1)[0]
  if (article) {
    contextNodes = article.children
    // don't index navigation elements for pagination on each page
    // as these are the titles of other pages and it would otherwise pollute the index.
    const pagination = find((it) => isTagNamedWithClass('nav', 'pagination', it), contextNodes, true, 1)[0]
    if (pagination) removeElement(pagination)
  }

  // Prefer the first <h1 class="page">; otherwise fall back to the first <h1> encountered
  let pageTitle
  find(
    (it) => {
      if (!isTagNamed('h1', it)) return false
      if (hasClass('page', it)) return !!(pageTitle = it)
      pageTitle ??= it
      return false
    },
    contextNodes,
    true,
    1
  )
  let documentTitle
  if (pageTitle) {
    // Detach the title so it is not indexed as a heading or as part of the text
    removeElement(pageTitle)
    documentTitle = innerText(pageTitle.children)
  } else {
    documentTitle = (page.title ?? '') && innerText(parseDocument(page.title, PARSE_OPTS).children)
  }

  const titles = []
  const keywords = page.asciidoc?.attributes?.keywords
  let text = ''
  if (indexByHeading) {
    titles.push(...extractSections(contextNodes))
  } else {
    // Collect the headings first and remove them afterwards. Removing a node while the tree is
    // still being walked alters the array under iteration, which makes the walk skip the sibling
    // that follows each removed heading.
    const headings = find((it) => isTagNamed(HEADING_TAG_NAMES, it), contextNodes, true)
    let id = 1
    for (const heading of headings) {
      titles.push({ text: innerText(heading.children), hash: heading.attribs.id, id: id++ })
      // Remove the heading from the text, to improve search results
      removeElement(heading)
    }
    // Pull the text from the article
    text = innerText(contextNodes).replace(/\s+/g, ' ').trim()
  }

  // Return the indexable content, organized by type
  return {
    text: text,
    title: documentTitle,
    component: page.src.component,
    version: page.src.version,
    name: page.src.stem,
    url: page.pub.url,
    titles: titles, // TODO get title id to be able to use fragment identifier
    keyword: keywords,
  }
}

/**
 * Build one searchable section per heading found anywhere below the given nodes.
 *
 * @param {Array<Object>} contextNodes - Nodes to scan for headings.
 * @returns {Array<Object>} Sections of the form `{ title, text, hash, id }`, in document order,
 *   where `text` is the normalized text that follows the heading up to the next heading.
 */
function extractSections (contextNodes) {
  const sections = []
  let idCounter = 1
  const processNodes = (nodes) => {
    for (let i = 0; i < nodes.length; i++) {
      const node = nodes[i]
      if (isTagNamed(HEADING_TAG_NAMES, node)) {
        const sectionNodes = collectNodesUntilHeading(nodes, i + 1)
        // Drop any node that is nested inside another collected node
        const filtered = sectionNodes.filter(
          (n, idx, arr) => !arr.some((other, j) => j !== idx && isDescendant(n, other))
        )
        sections.push({
          title: innerText(node.children),
          text: innerText(filtered).replace(/\s+/g, ' ').trim(),
          hash: node.attribs.id,
          id: idCounter++,
        })
      }
      if (node.children?.length) processNodes(node.children)
    }
  }
  processNodes(contextNodes)
  return sections
}

/**
 * Collect the nodes that follow a heading, stopping at the next heading at any depth.
 *
 * @param {Array<Object>} startNodes - The sibling list that contains the heading.
 * @param {Number} startIdx - Index of the first sibling after the heading.
 * @returns {Array<Object>} The collected nodes. A node that contains the next heading is replaced
 *   by a shallow copy holding only the children that precede that heading.
 */
function collectNodesUntilHeading (startNodes, startIdx) {
  let stop = false
  const traverse = (nodes) => {
    const collected = []
    for (let i = 0; i < nodes.length && !stop; i++) {
      const node = nodes[i]
      if (isTagNamed(HEADING_TAG_NAMES, node)) {
        stop = true
        return collected
      }
      if (node.children?.length) {
        const childCollected = traverse(node.children)
        if (stop) {
          // The next heading lives inside this node; keep only what comes before it
          collected.push({ ...node, children: childCollected })
          return collected
        }
      }
      collected.push(node)
    }
    return collected
  }
  return traverse(startNodes.slice(startIdx))
}

/**
 * Tell whether `ancestor` appears in the parent chain of `node`.
 *
 * @param {Object} node - Node whose parents are walked.
 * @param {Object} ancestor - Candidate ancestor, compared by identity.
 * @returns {Boolean} true if `ancestor` is a parent (at any level) of `node`.
 */
function isDescendant (node, ancestor) {
  for (let p = node.parent; p; p = p.parent) {
    if (p === ancestor) return true
  }
  return false
}

/**
 * Helper function allowing Antora to create a site asset containing the index.
 *
 * @param {Object} index - The data to serialize as JSON into the generated script.
 * @returns {Object} A file-like object (mediaType, contents, src, out, pub) for search-index.js.
 */
function createIndexFile (index) {
  return {
    mediaType: 'application/javascript',
    contents: Buffer.from(`antoraSearch.initSearch(lunr, ${JSON.stringify(index)})`),
    src: { stem: 'search-index' },
    out: { path: 'search-index.js' },
    pub: { url: '/search-index.js', rootPath: '' },
  }
}

/**
 * Check whether the class attribute of the node contains the given class name.
 * Returns undefined (falsy) when the node has no class attribute.
 */
function hasClass (className, node) {
  return node.attribs.class?.split(/\s+/).includes(className)
}

/**
 * Check whether the node is a tag with the given name, or with one of the given names
 * when `tagName` is an array.
 */
function isTagNamed (tagName, node) {
  return isTag(node) && (Array.isArray(tagName) ? tagName.includes(node.name) : node.name === tagName)
}

/**
 * Check whether the node is a tag with the given name that also has the given class.
 */
function isTagNamedWithClass (tagName, className, node) {
  return isTagNamed(tagName, node) && hasClass(className, node)
}

module.exports = generateIndex
module.exports.createIndexFile = createIndexFile