UNPKG

surch

Version:

Create and query searchable document indices.

gitlab.com/philbooth/surch

366 lines (310 loc) • 10.5 kB

JavaScript

'use strict' const { assert } = require('check-types') const INSIGNIFICANT_CHARACTERS = [ { from: 0, to: 47 }, { from: 58, to: 64 }, { from: 91, to: 96 }, { from: 123, to: 191 }, { from: 215, to: 215 }, { from: 247, to: 247 } ].reduce((chars, range) => { for (let code = range.from; code <= range.to; ++code) { chars.add(String.fromCharCode(code)) } return chars }, new Set()) const WHITESPACE = new Set([ ' ', '\u00a0', '\t', '\v', '\f', '\r', '\n' ]) module.exports = { /** * Create a searchable document index. * * @param {String} targetKey * The target property key to be indexed for each document. * * @param {Object} [options] * Optional arguments. * * @param {String} [options.idKey] * The identity key for each document. Default is '_id'. * * @param {String} [options.minLength] * The minimum queryable substring length. Default is 3. * * @param {Boolean} [options.caseSensitive] * Indicates whether queries should be case sensitive. Default is false. * * @param {Boolean} [options.strict] * Indicates whether queries should be strictly matched. Default is false. * * @param {Boolean} [options.fuzzy] * Indicates whether queries should be fuzzy matched. Default is false. * * @param {Function} [options.coerceId] * Coercion function for sane handling of object-based ids. Default is id => id. * * @returns {Index} */ create ( targetKey, { idKey = '_id', minLength = 3, caseSensitive = false, strict = false, fuzzy = false, coerceId = id => id } = {} ) { assert.nonEmptyString(targetKey, 'Invalid argument, "targetKey".') assert.nonEmptyString(idKey, 'Invalid option, "idKey".') assert.integer(minLength, 'Invalid option, "minLength".') assert(minLength > 0, 'Invalid option, "minLength".') assert.boolean(caseSensitive, 'Invalid option, "caseSensitive".') assert.boolean(strict, 'Invalid option, "strict".') assert.boolean(fuzzy, 'Invalid option, "fuzzy".') assert.function(coerceId, 'Invalid option, "coerceId".') assert.hasLength(coerceId, 1, 'Invalid option, "coerceId".') const FULL_STRINGS = new Map() const N_GRAMS = new Map() /** * @typedef Index * @property {Function} add * @property {Function} delete * @property {Function} update * @property {Function} search * @property {Function} clear */ return { /** * Add a document to the index. * * @param {Object} document * The document to be added to the index. */ add (document) { const value = document[targetKey] const documentId = coerceId(document[idKey]) if (! value) { return } assert.string(value, 'Invalid property.') assert.assigned(documentId, 'Invalid document id.') assert.equal(FULL_STRINGS.has(documentId), false, 'Duplicate document id.') const characters = Array.from(value.normalize('NFKC')) if (characters.length < minLength) { return } FULL_STRINGS.set(documentId, value) split(characters).forEach(item => { const substring = item.substring const ngram = N_GRAMS.get(substring) const index = { documentId, position: item.position, index: item.index } if (ngram) { ngram.push(index) } else { N_GRAMS.set(substring, [ index ]) } }) }, /** * Delete a document from the index. * * @param documentId * Id of the document to be removed from the index. */ delete (documentId) { documentId = coerceId(documentId) assert.string(FULL_STRINGS.get(documentId), 'Invalid document id.') FULL_STRINGS.delete(documentId) N_GRAMS.forEach((indices, key) => { indices = indices.filter(index => index.documentId !== documentId) if (indices.length === 0) { N_GRAMS.delete(key) } else { N_GRAMS.set(key, indices) } }) }, /** * Update a document in the index. * * @param {Object} document * The document to be updated. */ update (document) { this.delete(document[idKey]) this.add(document) }, /** * Search the index for documents matching a query string. * * @param {String} query * The query string. * * @returns {Array} results * Matching documents. The most relevant documents will be at the * beginning of the array and the least relevant will be at the end. */ search (query) { assert.string(query, 'Invalid argument, "query".') const characters = Array.from(query.normalize('NFKC')) assert(characters.length >= minLength, 'Invalid argument length, "query".') return filter(split(characters)) .reduce(dedupe.bind(new Map(), characters), []) .sort((lhs, rhs) => { if (lhs.score === rhs.score) { return lhs.indices[0] - rhs.indices[0] } return rhs.score - lhs.score }) }, /** * Delete all documents from the index. */ clear () { FULL_STRINGS.clear() N_GRAMS.clear() } } function split (characters, index = 0, skipCount = 0, substrings = []) { const stringLength = characters.length if (index + skipCount > stringLength - minLength) { return substrings } const position = index + skipCount let character = normalise(characters[position]) if (INSIGNIFICANT_CHARACTERS.has(character)) { return split(characters, index, skipCount + 1, substrings) } substrings[index] = { substring: character, index, position, tokenStart: ! strict && WHITESPACE.has(characters[position - 1]) } let i = 1, substringSkipCount = 0 while (i < minLength) { if (position + i + substringSkipCount < stringLength) { character = normalise(characters[position + i + substringSkipCount]) if (strict || ! WHITESPACE.has(character)) { if (INSIGNIFICANT_CHARACTERS.has(character)) { ++substringSkipCount continue } substrings[index].substring += character ++i continue } } substrings.pop() return split(characters, index, skipCount + 1, substrings) } return split(characters, index + 1, skipCount, substrings) } function normalise (string) { if (caseSensitive || ! string) { return string } return string.toLowerCase() } function filter (subqueries, documentId = null, results = []) { let candidates for (let i = 0; subqueries.length > 0; ++i) { const subquery = subqueries[0] const matches = N_GRAMS.get(subquery.substring) || [] if (i > 0) { if (candidates.length === 0 && !fuzzy) { return [] } if (subquery.tokenStart) { if (documentId) { return filter(subqueries.slice(), documentId, results.concat(candidates)) } let groupedCandidates if (fuzzy) { groupedCandidates = groupByDocumentId(candidates.concat(matches)) } else { groupedCandidates = groupByDocumentId(candidates, matches) } // eslint-disable-next-line no-loop-func return [ ...groupedCandidates ].reduce((res, [ id, documentCandidates ]) => { return res.concat(filter(subqueries.slice(), id, results.concat(documentCandidates))) }, []) } if (fuzzy) { if (documentId) { candidates = candidates.concat(matches.filter(match => match.documentId === documentId)) } else { candidates = candidates.concat(matches) } } else { candidates = candidates.filter(candidate => { return matches.some(match => { return match.documentId === candidate.documentId && match.index === candidate.index + i }) }) } if (candidates.length === 0 && !fuzzy) { results = [] } } else if (documentId) { candidates = matches.filter(match => match.documentId === documentId) } else { candidates = matches } subqueries.shift() } return results.concat(candidates) } function groupByDocumentId (candidates, matches) { if (matches) { matches = groupByDocumentId(matches) } return candidates.reduce((documentIds, candidate) => { const documentId = candidate.documentId if (! matches || matches.has(documentId)) { const documentCandidates = documentIds.get(documentId) if (documentCandidates) { documentCandidates.push(candidate) } else { documentIds.set(documentId, [ candidate ]) } } return documentIds }, new Map()) } function dedupe (characters, deduped, result) { const ids = this const { documentId, position } = result const match = FULL_STRINGS.get(documentId) let score if (fuzzy) { const normalised = Array.from(match.normalize('NFKC')) score = Math.round(Array.from(characters).filter((c) => normalised.includes(c)).length / normalised.length * 100) } else { score = Math.round(characters.length / Array.from(match.normalize('NFKC')).length * 100) } if (ids.has(documentId)) { const canonicalResult = deduped[ids.get(documentId)] if (canonicalResult.indices.every((index) => index !== position)) { canonicalResult.indices.push(position) if (strict) { canonicalResult.score += score } } } else { ids.set(documentId, deduped.length) deduped.push({ id: documentId, indices: [ position ], match, score }) } return deduped } } }