UNPKG

@telios/nebula

Version:

Real-time distributed file and data storage.

217 lines (171 loc) 4.99 kB
const natural = require('natural')
const Corestore = require('corestore')
const Hyperbee = require('hyperbee')
const tokenizer = new natural.AggressiveTokenizer()
const pump = require('pump')
const concat = require('concat-stream')
const BSON = require('bson')
const { ObjectID } = BSON

/**
 * This implementation of full text search on hypercore was refactored from
 * Paul Frazee's Hyper search experiment (https://github.com/pfrazee/hyper-search-experiments)
 *
 * Each indexed document contributes one key per token, shaped
 * `idx:<token>:<docIdHex>`, to a Hyperbee named after the index. The list of
 * indexed property names is stored in the same bee under the key `idxProps`.
 */
class HyperFTS {
  /**
   * @param {*} storage - Passed straight through to the Corestore constructor.
   * @param {*} encryptionKey - Passed to `store.get()` for every index core.
   */
  constructor(storage, encryptionKey) {
    this.store = new Corestore(storage)
    this.encryptionKey = encryptionKey
    // Most recently created Hyperbee per index name (written by _getDB).
    this.indexes = new Map()
    // Only one index() and one deIndex() run at a time; requests arriving
    // while one is running are parked in the matching queue.
    this.indexInProgress = false
    this.deIndexInProgress = false
    this.indexQueue = []
    this.deIndexQueue = []
    // NOTE(review): not referenced anywhere else in this class.
    this.stopwords = [
      "the",
      "a",
      "an",
      "and",
      "as",
      "i",
      "if",
      "it",
      "its",
      "it's"
    ]
  }

  /** Resolves once the underlying Corestore is ready. */
  async ready() {
    await this.store.ready()
  }

  /**
   * Adds documents to a named full text index.
   *
   * If another index() call is still running, the request is queued and this
   * call returns immediately; the running call processes it afterwards.
   *
   * @param {Object} params
   * @param {string} params.name - Index name.
   * @param {string[]} params.props - Document properties whose values are indexed.
   * @param {Object[]} params.docs - Documents to index; each needs an `_id`.
   * @throws {string} 'Property to index cannot be null' when a document has
   *   no text in any of `props`.
   */
  async index({ name, props, docs }) {
    if (this.indexInProgress) {
      this.indexQueue.push({ name, props, docs })
      return
    }

    this.indexInProgress = true

    try {
      await this._indexDocs({ name, props, docs })

      // Drain requests that arrived while we were busy, in arrival order.
      // Items are shifted one at a time so a failure cannot discard the
      // requests queued behind it.
      while (this.indexQueue.length > 0) {
        await this._indexDocs(this.indexQueue.shift())
      }
    } finally {
      // Always release the flag: leaving it set after an error would make
      // every later call queue forever. Requests still queued at this point
      // are picked up by the next call that completes.
      this.indexInProgress = false
    }
  }

  /** Writes the index keys for one index() request. */
  async _indexDocs({ name, props, docs }) {
    const bee = this._getDB(name)

    await bee.put('idxProps', props)

    // Validate and tokenize everything before opening the batch, so a bad
    // document cannot leave a half-filled, unflushed batch behind.
    const keys = []

    for (const doc of docs) {
      let text = ''

      for (const prop of props) {
        if (doc[prop]) text += doc[prop]
      }

      const id = ObjectID(doc._id).toHexString()

      if (!text) {
        // Thrown as a plain string on purpose: callers may already match on it.
        throw 'Property to index cannot be null'
      }

      for (const token of this._toTokens(text)) {
        keys.push(`idx:${token}:${id}`)
      }
    }

    const tx = bee.batch()

    await Promise.all(keys.map(key => tx.put(key, {})))
    await tx.flush()
  }

  /**
   * Removes one document's tokens from a named index.
   *
   * The document is looked up with `db.findOne(query)` and re-tokenized using
   * the property list stored under `idxProps`. Nothing happens when the index
   * has no `idxProps` entry or when no document matches.
   *
   * If another deIndex() call is still running, the request is queued and
   * this call returns immediately.
   *
   * @param {Object} params
   * @param {Object} params.db - Collection exposing `findOne(query)`.
   * @param {string} params.name - Index name.
   * @param {Object} params.query - Query identifying the document to remove.
   */
  async deIndex({ db, name, query }) {
    if (this.deIndexInProgress) {
      this.deIndexQueue.push({ db, name, query })
      return
    }

    this.deIndexInProgress = true

    try {
      await this._deIndexDoc({ db, name, query })

      // Drain requests that arrived while we were busy, in arrival order.
      while (this.deIndexQueue.length > 0) {
        await this._deIndexDoc(this.deIndexQueue.shift())
      }
    } finally {
      // Released on every exit path, including the "nothing to do" returns
      // inside _deIndexDoc, so later calls are never blocked.
      this.deIndexInProgress = false
    }
  }

  /** Deletes the index keys for one deIndex() request. */
  async _deIndexDoc({ db, name, query }) {
    const bee = this._getDB(name)
    const props = await bee.get('idxProps')

    // Nothing was ever indexed under this name.
    if (!props) return

    const doc = await db.findOne(query)

    // No matching document, so there are no keys to derive.
    if (!doc) return

    let _id = doc._id

    if (typeof _id !== 'string') _id = doc._id.toHexString()

    let text = ''

    for (const prop of props.value) {
      if (doc[prop]) text += doc[prop]
    }

    const batch = bee.batch()

    for (const token of this._toTokens(text)) {
      await batch.del(`idx:${token}:${_id}`)
    }

    await batch.flush()
  }

  /**
   * Searches a named index and returns the matching documents, best first.
   *
   * Candidates are ranked by how many query tokens they matched, the top
   * `limit` ids are loaded with `db.find()`, and each loaded document's score
   * is then increased by the Jaro-Winkler distance between the query and
   * every string property of the document.
   *
   * @param {Object} params
   * @param {Object} params.db - Collection exposing `find(query)`.
   * @param {string} params.name - Index name.
   * @param {string|string[]} params.query - Search text, or pre-split words.
   * @param {Object} [params.opts]
   * @param {number} [params.opts.limit=10] - Maximum number of documents loaded.
   * @returns {Promise<Object[]>} Documents with an added `hits` score, sorted
   *   by descending score.
   */
  async search({ db, name, query, opts }) {
    const bee = this._getDB(name)
    const queryTokens = this._toTokens(query)
    const limit = (opts && opts.limit) || 10

    const listsResults = await Promise.all(
      queryTokens.map(qt => bee.list({ gt: `idx:${qt}:\x00`, lt: `idx:${qt}:\xff` }))
    )

    // Token-match count per document id. The count is zero-based: a document
    // matched by a single token scores 0, by two tokens 1, and so on.
    const docIdHits = {}

    for (const listResults of listsResults) {
      for (const item of listResults) {
        // Keys look like `idx:<token>:<docId>`.
        const docId = item.key.split(':')[2]

        if (docIdHits[docId] === undefined) {
          docIdHits[docId] = 0
        } else {
          docIdHits[docId] += 1
        }
      }
    }

    const docIdsSorted = Object.keys(docIdHits).sort((a, b) => docIdHits[b] - docIdHits[a])

    const results = await db.find({ _id: { $in: docIdsSorted.slice(0, limit) } })

    // Lower-cased once up front. An array query is joined so that it can be
    // compared as text, matching the array support in _toTokens.
    const needle = (Array.isArray(query) ? query.join(' ') : query).toLowerCase()

    return results
      .map(result => {
        let hits = docIdHits[result._id]

        for (const prop in result) {
          if (typeof result[prop] === 'string') {
            hits += natural.JaroWinklerDistance(result[prop], needle)
          }
        }

        return { ...result, hits }
      })
      .sort((a, b) => b.hits - a.hits)
  }

  /** Closes the underlying Corestore. */
  async close() {
    await this.store.close()
  }

  /**
   * Builds a Hyperbee over the core named `name` and records it in
   * `this.indexes`. A new Hyperbee is constructed on every call.
   *
   * The returned bee gets an extra `list(opts)` method that reads the whole
   * `createReadStream(opts)` range and resolves with the collected entries.
   *
   * @param {string} name - Index name.
   * @returns {Hyperbee}
   */
  _getDB(name) {
    const core = this.store.get({ name, encryptionKey: this.encryptionKey })
    const bee = new Hyperbee(core, {
      keyEncoding: 'utf-8',
      valueEncoding: 'json'
    })

    bee.list = async (opts) => {
      const stream = bee.createReadStream(opts)

      return new Promise((resolve, reject) => {
        pump(
          stream,
          concat(resolve),
          err => {
            if (err) reject(err)
          }
        )
      })
    }

    this.indexes.set(name, bee)

    return bee
  }

  /**
   * Converts text into the set of phonetic tokens used as index keys.
   *
   * @param {string|string[]} str - Text to tokenize, or an array of words
   *   that is used as-is.
   * @returns {string[]} Unique Metaphone codes, in first-seen order.
   */
  _toTokens (str) {
    const words = Array.isArray(str) ? str : tokenizer.tokenize(str)
    const codes = words.map(word => natural.Metaphone.process(word.toLowerCase()))

    // De-duplicate after the phonetic step: different words can share a code,
    // and a repeated code would be written twice and counted twice in search.
    return [...new Set(codes)]
  }
}

module.exports = HyperFTS