wiki-server
Version:
A Federated Wiki Server
395 lines (369 loc) • 11.9 kB
JavaScript
/*
* Federated Wiki : Node Server
*
* Copyright Ward Cunningham and other contributors
* Licensed under the MIT license.
* https://github.com/fedwiki/wiki-server/blob/master/LICENSE.txt
*/
// **search.js**
import fs from 'node:fs'
import path from 'node:path'
import events from 'node:events'
import url from 'node:url'
import writeFileAtomic from 'write-file-atomic'
import miniSearch from 'minisearch'
export default argv => {
// Hostname of this wiki; used to label log output.
const { hostname: wikiName } = new URL(argv.url)

// Files kept in the status directory: the serialised index and the "index updated" marker.
const siteIndexLoc = path.join(argv.status, 'site-index.json')
const indexUpdateFlag = path.join(argv.status, 'index-updated')

// The in-memory index. An empty array stands for "not loaded"
// (the public methods test for it with Array.isArray).
let siteIndex = []
// Pending { action, slug, page } requests, consumed in order by serial().
const queue = []
// Page handler remembered by createIndex so the index can be rebuilt later.
let searchPageHandler = null

// Idle time in ms after which the index is dropped from memory.
// Originally 1200000; temporarily reduced to 2 minutes.
const searchTimeoutMs = 2 * 60 * 1000
// Handle of the timer that drops the index from memory.
let searchTimeoutHandler = null

// True between itself.start() and itself.stop().
let working = false
// Make sure `file` exists, creating an empty file when it is missing.
// A file that already exists is left as it is.
//
// file - path of the file to create
// cb   - called exactly once: with no error when the file exists or was created,
//        with the error when it could not be created or closed
const touch = (file, cb) => {
  fs.stat(file, statErr => {
    // stat succeeded, so there is nothing to create
    if (statErr === null) return cb()
    fs.open(file, 'w', (openErr, fd) => {
      // Stop here on failure: there is no descriptor to close, and the
      // caller must not be notified a second time.
      if (openErr) return cb(openErr)
      fs.close(fd, closeErr => {
        cb(closeErr)
      })
    })
  })
}
// Store the current text of a page in the in-memory index under its slug.
// A slug that is already indexed has its entry replaced, otherwise a new entry is added.
//
// slug - id of the page in the index
// page - page object; `title` and `story` are read from it
// cb   - called with no arguments once the index has been changed
const searchPageUpdate = (slug, page, cb) => {
  // A page whose story cannot be reduced to text is still indexed, with empty content.
  const content = (() => {
    try {
      return page.story.reduce(extractPageText, '')
    } catch (err) {
      console.log(`SITE INDEX *** ${wikiName} reduce to extract the text on ${slug} failed`, err.message)
      return ''
    }
  })()

  const alreadyIndexed = siteIndex.has(slug)
  const entry = { id: slug, title: page.title, content }
  if (alreadyIndexed) {
    siteIndex.replace(entry)
  } else {
    siteIndex.add(entry)
  }
  cb()
}
// Drop a page from the in-memory index.
//
// slug - id of the page to discard
// cb   - always called with no arguments, whether or not the discard worked
const searchPageRemove = (slug, cb) => {
  try {
    siteIndex.discard(slug)
  } catch (err) {
    // A page that was never indexed is not worth reporting; anything else is logged.
    const wasNotIndexed = err.message.includes('not in the index')
    if (!wasNotIndexed) {
      console.log(`removing ${slug} from index ${wikiName} failed`, err)
    }
  }
  cb()
}
// Write the index to the status directory and mark it as updated.
//
// siteIndex - the index to serialise with JSON.stringify
// cb        - called with an error when the status directory cannot be created or the
//             index cannot be written; called with no arguments otherwise
const searchSave = (siteIndex, cb) => {
  // A recursive mkdir succeeds when the directory already exists, so no separate
  // existence check (and no second copy of the write logic) is needed.
  fs.mkdir(argv.status, { recursive: true }, mkdirErr => {
    if (mkdirErr) return cb(mkdirErr)
    writeFileAtomic(siteIndexLoc, JSON.stringify(siteIndex), writeErr => {
      if (writeErr) return cb(writeErr)
      // The update flag is best effort: the index is already on disk, so a
      // failure to create the flag is not reported as a failed save.
      touch(indexUpdateFlag, () => {
        cb()
      })
    })
  })
}
// Load the saved index from disk back into memory.
//
// On success the in-memory index is replaced and the request queue is resumed on the
// next tick; `cb` is NOT called in that case.
// On any failure - the file is missing, cannot be read, or cannot be parsed - `cb` is
// called with the error so the caller can fall back to rebuilding the index.
const searchRestore = cb => {
  // Reading directly covers the "file does not exist" case too. Previously a missing
  // file ended the function without ever notifying the caller.
  fs.readFile(siteIndexLoc, 'utf8', (readErr, data) => {
    if (readErr) return cb(readErr)
    try {
      siteIndex = miniSearch.loadJSON(data, {
        fields: ['title', 'content'],
      })
    } catch (parseErr) {
      return cb(parseErr)
    }
    process.nextTick(() => {
      serial(queue.shift())
    })
  })
}
// Process one queued request, then move on to the next one on a later tick.
//
// item - a { action, slug, page } request taken from the queue, or undefined when the
//        queue is empty; in that case the index is saved and the run is stopped.
const serial = item => {
  if (!item) {
    // nothing left to do: persist the index and signal the end of the run
    searchSave(siteIndex, e => {
      if (e) console.log('SITE INDEX *** save failed: ' + e)
      itself.stop()
    })
    return
  }

  // Continue with whatever is at the head of the queue, on the next tick.
  const next = () => {
    process.nextTick(() => {
      serial(queue.shift())
    })
  }

  switch (item.action) {
    case 'update':
      itself.start()
      searchPageUpdate(item.slug, item.page, next)
      break
    case 'remove':
      itself.start()
      searchPageRemove(item.slug, next)
      break
    default:
      // Report the slug (a page object would print as "[object Object]") and skip the
      // request. The next item must be *taken* from the queue: passing the shift
      // function itself would land here again on every tick, forever.
      console.log(`SITE INDEX *** unexpected action ${item.action} for ${item.slug}`)
      next()
  }
}
// Reduce the text of one story item to plain, space-separated words for indexing.
//
// text - the raw item text (wiki markup, markdown or HTML)
// Returns the text with markup, tags, punctuation, symbols and emoji replaced by single
// spaces and URLs reduced to their host name. Letters and digits are kept.
const extractItemText = text => {
  return (
    text
      // [label](target) and [label][target]: keep only the label
      .replace(/\[([^\]]*?)\][[(].*?[\])]/g, ' $1 ')
      // drop "[[", a "[" together with the non-space run that follows it, and "]" / "]]"
      .replace(/\[{2}|\[(?:[\S]+)|\]{1,2}/g, ' ')
      .replace(/\n/g, ' ')
      // style elements, then any remaining tags
      .replace(/<style.*?<\/style>/g, ' ')
      .replace(/<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>/g, ' ')
      .replace(/<(?:[^>])+>/g, ' ')
      // a URL contributes only its host name; one that cannot be parsed contributes nothing
      .replace(/(https?:.*?)(?=\p{White_Space}|\p{Quotation_Mark}|$)/gu, match => {
        try {
          const myUrl = new URL(match)
          return myUrl.hostname
        } catch {
          return ' '
        }
      })
      // Punctuation, symbols and pictographic emoji become spaces.
      // \p{Extended_Pictographic} is used rather than \p{Emoji} because the Emoji
      // property also covers the ASCII digits, which would strip every number.
      .replace(/[\p{P}\p{Extended_Pictographic}\p{Symbol}]+/gu, ' ')
      // collapse every run of white space to one space
      .replace(/\p{White_Space}+/gu, ' ')
  )
}
// Item types whose whole text is indexed.
const plainTextItemTypes = new Set([
  'paragraph',
  'markdown',
  'html',
  'reference',
  'image',
  'pagefold',
  'math',
  'mathjax',
  'code',
])
// Item types whose text mixes markup lines with free text; only the free text is indexed.
const mediaItemTypes = new Set(['audio', 'video', 'frame'])

// Reducer over a page's story: appends the indexable text of one item to the text
// gathered so far.
//
// pageText     - text accumulated from the previous items
// currentItem  - the story item; its `text` and `type` are read
// currentIndex - position of the item in the story, used in the error message
// array        - the story being reduced (not used)
// Returns the accumulated text. Items without text, or of any other type, leave it unchanged.
// Throws an Error describing the item when anything goes wrong while extracting.
const extractPageText = (pageText, currentItem, currentIndex, array) => {
  try {
    const itemText = currentItem.text
    if (!itemText) return pageText

    const itemType = currentItem.type
    if (plainTextItemTypes.has(itemType)) {
      return pageText + ' ' + extractItemText(itemText)
    }
    if (mediaItemTypes.has(itemType)) {
      // A line counts as markup when its first word is a URL, is unchanged by
      // upper-casing (this includes an empty first word), or starts with "//".
      const isMarkupLine = line => {
        const [firstWord] = line.split(/\p{White_Space}/u)
        return firstWord.startsWith('http') || firstWord.toUpperCase() === firstWord || firstWord.startsWith('//')
      }
      const freeText = itemText
        .split(/\r\n?|\n/)
        .map(line => (isMarkupLine(line) ? '' : line))
        .join(' ')
      return pageText + ' ' + extractItemText(freeText)
    }
    return pageText
  } catch (err) {
    throw new Error(`Error extracting text from ${currentIndex}, ${JSON.stringify(currentItem)} ${err}, ${err.stack}`)
  }
}
// #### Public stuff ####
// The emitter handed back to the caller; the public methods are attached to it below.
const itself = new events.EventEmitter()

// Mark the beginning of an indexing run: sets the working flag, cancels the pending
// "drop index from memory" timer and emits 'indexing'.
// Returns the result of the emit call.
itself.start = () => {
  working = true
  clearTimeout(searchTimeoutHandler)
  return itself.emit('indexing')
}
// Mark the end of an indexing run: schedules the index to be dropped from memory after
// searchTimeoutMs, clears the working flag and emits 'indexed'.
// Returns the result of the emit call.
itself.stop = () => {
  const clearsearch = () => {
    console.log(`SITE INDEX ${wikiName} : removed from memory`)
    // an empty array is the "index not loaded" marker
    siteIndex = []
    searchTimeoutHandler = null
  }
  // Cancel a timer left over from an earlier stop(). Without this, two stops in a row
  // would overwrite the handle and leave a timer that start() can no longer cancel,
  // which could then drop the index in the middle of a later run.
  clearTimeout(searchTimeoutHandler)
  searchTimeoutHandler = setTimeout(clearsearch, searchTimeoutMs)
  working = false
  return itself.emit('indexed')
}
// Reports whether an indexing run is in progress (true between start() and stop()).
itself.isWorking = () => working
// Build a fresh index from every page the page handler knows about, then resume the
// request queue (which ends with the index being saved).
//
// pagehandler - object providing slugs(cb) and get(slug, cb)
itself.createIndex = pagehandler => {
  itself.start()
  // we save the pagehandler, so we can recreate the site index if it is removed
  searchPageHandler = searchPageHandler ?? pagehandler
  if (!pagehandler) {
    // Without a handler there is nothing to read pages from; end the run cleanly
    // instead of throwing with the working flag still set.
    console.log(`SITE INDEX *** createIndex ${wikiName} error: no page handler available`)
    itself.stop()
    return
  }
  pagehandler.slugs((e, slugs) => {
    if (e) {
      console.log(`SITE INDEX *** createIndex ${wikiName} error:`, e)
      itself.stop()
      return e
    }
    siteIndex = new miniSearch({
      fields: ['title', 'content'],
    })
    // one promise per page; each one settles when its page has been dealt with
    const indexPromises = slugs.map(slug => {
      return new Promise(resolve => {
        pagehandler.get(slug, (err, page) => {
          if (err) {
            console.log(`SITE INDEX *** ${wikiName}: error reading page`, slug)
            // Settle anyway: an unreadable page is skipped. Leaving the promise
            // pending would keep Promise.all below from ever completing.
            resolve()
            return
          }
          let pageText
          try {
            pageText = page.story.reduce(extractPageText, '')
          } catch (err) {
            console.log(`SITE INDEX *** ${wikiName} reduce to extract text on ${slug} failed`, err.message)
            // the page is still indexed by title, with empty content
            pageText = ''
          }
          siteIndex.add({
            id: slug,
            title: page.title,
            content: pageText,
          })
          resolve()
        })
      })
    })
    Promise.all(indexPromises).then(() => {
      process.nextTick(() => {
        serial(queue.shift())
      })
    })
  })
}
// Queue the removal of a page from the index and make sure the queue gets processed.
//
// slug - id of the page to remove
itself.removePage = slug => {
  queue.push({ action: 'remove', slug })

  // a run is in progress: leave the request in the queue
  if (working) return

  // index is in memory: process the queue straight away
  if (!Array.isArray(siteIndex)) {
    serial(queue.shift())
    return
  }

  // index is not in memory: reload it from disk, rebuilding it if that fails
  itself.start()
  searchRestore(e => {
    if (e) console.log(`SITE INDEX *** Problems restoring search index ${wikiName}:` + e)
    itself.createIndex(searchPageHandler)
  })
}
// Queue the (re)indexing of a page and make sure the queue gets processed.
//
// slug - id of the page
// page - the page object to index
itself.update = (slug, page) => {
  queue.push({ action: 'update', slug, page })

  // a run is in progress: leave the request in the queue
  if (working) return

  // index is in memory: process the queue straight away
  if (!Array.isArray(siteIndex)) {
    serial(queue.shift())
    return
  }

  // index is not in memory: reload it from disk, rebuilding it if that fails
  itself.start()
  searchRestore(e => {
    if (e) console.log(`SITE INDEX *** Problems restoring search index ${wikiName}:` + e)
    itself.createIndex(searchPageHandler)
  })
}
// Called on server startup. The index is rebuilt only when it is needed:
//   - there is no saved index,
//   - the update flag exists (the index was saved since the flag was last removed),
//   - the saved index cannot be read, or
//   - the saved index is not serialization version 2.
// A saved index that is current is left alone.
//
// pagehandler - object providing slugs(cb) and get(slug, cb), passed on to createIndex
itself.startUp = pagehandler => {
  console.log(`SITE INDEX ${wikiName} : StartUp`)
  // Keep the handler even when nothing is rebuilt now, so that a rebuild triggered
  // later by update/removePage has one to work with.
  searchPageHandler = searchPageHandler ?? pagehandler

  // Rebuild the index, and remove the update flag once the index has been created.
  const rebuildIndex = () => {
    itself.createIndex(pagehandler)
    itself.once('indexed', () => {
      fs.unlink(indexUpdateFlag, err => {
        if (err) console.log(`+++ SITE INDEX ${wikiName} : unable to delete update flag`)
      })
    })
  }

  fs.stat(siteIndexLoc, indexStatErr => {
    // index does not exist, so create it
    if (indexStatErr !== null) return rebuildIndex()

    // site index exists, but has it been updated?
    fs.stat(indexUpdateFlag, flagStatErr => {
      // index has been updated, so recreate it
      if (!flagStatErr) return rebuildIndex()

      // not been updated, but is it the correct version?
      fs.readFile(siteIndexLoc, (readErr, data) => {
        if (readErr) {
          console.log(`+++ SITE INDEX ${wikiName} : error reading index - attempting creating`)
          return rebuildIndex()
        }
        let version
        try {
          // optional chaining guards against a file that parses to null
          version = JSON.parse(data)?.serializationVersion
        } catch {
          // unparsable content counts as "wrong version"
          version = undefined
        }
        // Only a version mismatch triggers the rebuild; previously the rebuild ran
        // unconditionally because it sat outside the unbraced `if`.
        if (version !== 2) {
          console.log(`+++ SITE INDEX ${wikiName} : updating to latest version.`)
          rebuildIndex()
        }
      })
    })
  })
}
return itself
}