// hyperbeedeebee
// A MongoDB-like database built on top of Hyperbee with support for indexing.
// Source listing: JavaScript, 900 lines (741 loc), 22.5 kB.
const BSON = require('bson')
const { ObjectID } = BSON
const cbor = require('cbor')
// Version of the indexing algorithm.
// Used as the default `version` recorded on indexes made by `createIndex`.
// Will be incremented for breaking changes to the index key encoding.
// TODO: In the future we'll want to support multiple versions?
const INDEX_VERSION = '2.0'
// Previous index version. Cursors still accept indexes tagged with it and
// compare against it to decide whether the V1 key helpers should be used.
const OLD_INDEX_VERSION = '1.0'
// Dispatch table for query operators.
// Each key is an operator name as it appears in a query object and each value
// is the function called as `compare(docValue, operand)` by `queryCompare`.
// An operator missing from this table makes `queryCompare` throw.
const QUERY_TYPES = {
// Math stuff (ordering comparisons)
$gt: compareGt,
$lt: compareLt,
$gte: compareGte,
$lte: compareLte,
// Array stuff (operand must be an array)
$in: compareIn,
$all: compareAll,
// Equality
$eq: compareEq,
// Presence check: operand is compared against "value is not undefined"
$exists: compareExists
}
// Dispatch table for update operators.
// Each key is an operator name as it appears in an update object and each
// value is called as `apply(doc, operand)` by `performUpdate`; the handlers
// modify `doc` in place. Keys of an update object that are not listed here
// are assigned to the document as plain fields by `performUpdate`.
const UPDATE_TYPES = {
// Field set/unset
$set: updateSet,
$unset: updateUnset,
$rename: updateRename,
// Math stuff
$inc: updateInc,
$mul: updateMul,
// Array stuff
$addToSet: updateAddToSet,
$pop: updatePop,
$pull: updatePull,
$push: updatePush
}
/**
 * Top-level database handle wrapping a single bee instance.
 * Hands out `Collection` objects and memoises them by name.
 */
class DB {
  /**
   * @param {object} bee - Backing store; must expose `sub(name)` and `feed.close()`.
   */
  constructor (bee) {
    this.bee = bee
    // name -> Collection, filled lazily by `collection()`
    this.collections = new Map()
  }

  /**
   * Get the collection called `name`, creating it on first use.
   * Repeated calls with the same name return the same instance.
   * @param {string} name
   * @returns {Collection}
   */
  collection (name) {
    if (this.collections.has(name)) return this.collections.get(name)
    const created = new Collection(name, this.bee.sub(name))
    this.collections.set(name, created)
    return created
  }

  /**
   * Close the feed underneath the bee.
   * @returns {Promise<*>} Whatever `feed.close()` resolves to.
   */
  async close () {
    // TODO: This looks kinda strange. PR a close method on bee?
    const { feed } = this.bee
    return feed.close()
  }
}
/**
 * A named set of documents plus its index definitions and index entries.
 *
 * Three sub-databases are derived from the bee passed to the constructor:
 *  - `doc`  : documents, keyed by `doc._id.id`, stored as serialized BSON
 *  - `idxs` : index definitions, keyed by index name, stored as serialized BSON
 *  - `idx`  : one nested sub per index name holding that index's entries
 */
class Collection {
  /**
   * @param {string} name - Collection name.
   * @param {object} bee - Store for this collection; must expose `sub(name)`.
   */
  constructor (name, bee) {
    this.name = name
    this.bee = bee
    this.docs = bee.sub('doc')
    this.idxs = bee.sub('idxs')
    this.idx = bee.sub('idx')
  }

  /**
   * Store a new document and add it to every existing index.
   * A document without a truthy `_id` is copied and given a fresh ObjectID;
   * the caller's object is not modified.
   * @param {object} rawDoc
   * @returns {Promise<object>} The stored document (including its `_id`).
   * @throws {TypeError} When no document is supplied.
   * @throws {Error} When a document with the same `_id` already exists.
   */
  async insert (rawDoc) {
    let doc = rawDoc
    if (!doc) throw new TypeError('No Document Supplied')
    if (!doc._id) {
      doc = {
        ...doc,
        _id: new ObjectID()
      }
    }

    // Get _id as buffer
    const key = doc._id.id

    const exists = await this.docs.get(key)
    if (exists) throw new Error('Duplicate Key error, try using .update?')

    const value = BSON.serialize(doc)
    await this.docs.put(key, value)

    const indexes = await this.listIndexes()
    for (const { fields, name } of indexes) {
      // TODO: Cache index subs
      const bee = this.idx.sub(name)
      await this._indexDocument(bee, fields, doc)
    }

    return doc
  }

  /**
   * Apply `update` to the documents matching `query`.
   * @param {object} [query={}]
   * @param {object|object[]} [update={}] - Passed to `performUpdate`.
   * @param {object} [options]
   * @param {boolean} [options.upsert=false] - Insert a document when nothing was modified.
   * @param {boolean} [options.multi=false] - Update every match instead of only the first.
   * @param {string|null} [options.hint=null] - Index name forwarded to the cursor.
   * @returns {Promise<{nMatched: number, nUpserted: number, nModified: number}>}
   */
  async update (query = {}, update = {}, options = {}) {
    const {
      upsert = false,
      multi = false,
      hint = null
    } = options

    let nMatched = 0
    let nUpserted = 0
    let nModified = 0

    let cursor = this.find(query)
    if (hint) cursor = cursor.hint(hint)
    if (!multi) cursor = cursor.limit(1)

    const indexes = await this.listIndexes()

    for await (const doc of cursor) {
      nMatched++
      const newDoc = performUpdate(doc, update)
      const key = doc._id.id
      const value = BSON.serialize(newDoc)
      await this.docs.put(key, value)

      for (const { fields, name } of indexes) {
        // TODO: Cache index subs
        const bee = this.idx.sub(name)
        // Drop the entries derived from the old document before adding the new ones
        await this._deIndexDocument(bee, fields, doc)
        await this._indexDocument(bee, fields, newDoc)
      }
      nModified++
    }

    if (!nModified && upsert) {
      // Seed the upserted document with the equality constraints of the query
      const initialDoc = {}
      for (const queryField of Object.keys(query)) {
        const queryValue = query[queryField]
        // The `in` operator throws on primitives and null, so only probe real objects
        const isObject = queryValue !== null && typeof queryValue === 'object'
        if (isObject && '$eq' in queryValue) initialDoc[queryField] = queryValue.$eq
        else if (!isObject || !isQueryObject(queryValue)) initialDoc[queryField] = queryValue
      }

      const newDoc = performUpdate(initialDoc, update)
      await this.insert(newDoc)
      nUpserted++
    }

    return {
      nMatched,
      nUpserted,
      nModified
    }
  }

  /**
   * Fetch the first document matching `query`.
   * @param {object} [query={}]
   * @returns {Promise<object>}
   * @throws {Error} 'not found' when nothing matches.
   */
  async findOne (query = {}) {
    const results = await (this.find(query).limit(1))
    const [doc] = results
    if (!doc) throw new Error('not found')
    return doc
  }

  /**
   * Build a cursor over the documents matching `query`.
   * Nothing is read until the cursor is awaited or iterated.
   * @param {object} [query={}]
   * @returns {Cursor}
   */
  find (query = {}) {
    return new Cursor(query, this)
  }

  /**
   * Create (or upgrade / rebuild) an index over `fields`.
   * The index name is the field names joined with commas.
   * @param {string[]} fields
   * @param {object} [options]
   * @param {boolean} [options.rebuild=false] - Re-create even if the index already exists.
   * @param {string} [options.version=INDEX_VERSION] - Version tag stored with the index.
   * @returns {Promise<string|undefined>} The index name, or `undefined` when an
   *   index with the same version already existed and no rebuild was requested.
   */
  async createIndex (fields, { rebuild = false, version = INDEX_VERSION, ...opts } = {}) {
    const name = fields.join(',')
    const exists = await this.indexExists(name)
    // Don't rebuild index if it's already set
    if (exists && !rebuild) {
      const existing = await this.getIndex(name)
      // If the existing index is an older version, we should upgrade it
      // If it's the same version, don't bother re building
      if (existing.version === version) {
        return
      }
    }

    const index = {
      version,
      name,
      fields,
      opts
    }

    await this.idxs.put(name, BSON.serialize(index))
    await this.reIndex(name)

    return name
  }

  /**
   * @param {string} name - Index name.
   * @returns {Promise<boolean>} Whether a definition is stored under `name`.
   */
  async indexExists (name) {
    const exists = await this.idxs.get(name)
    return exists !== null
  }

  /**
   * Load the stored definition of an index.
   * @param {string} name - Index name.
   * @returns {Promise<object>} The deserialized index definition.
   * @throws {Error} 'Invalid index' when no definition is stored under `name`.
   */
  async getIndex (name) {
    const data = await this.idxs.get(name)
    if (!data) throw new Error('Invalid index')
    return BSON.deserialize(data.value)
  }

  /**
   * Add every document in the collection to the index called `name`.
   * Existing entries are not cleared first.
   * @param {string} name - Index name.
   * @returns {Promise<void>}
   */
  async reIndex (name) {
    const { fields } = await this.getIndex(name)
    // TODO: Cache index subs
    const bee = this.idx.sub(name)

    for await (const doc of this.find()) {
      await this._indexDocument(bee, fields, doc)
    }
  }

  // This is a private API, don't depend on it
  // Writes one entry per flattened variant of `doc` into `bee`, mapping the
  // V2 index key to the document's `_id.id`. Documents lacking any of the
  // indexed fields are left out of the index.
  async _indexDocument (bee, fields, doc) {
    if (!hasFields(doc, fields)) return
    const idxValue = doc._id.id

    const batch = bee.batch()

    for (const flattened of flattenDocument(doc, fields)) {
      const idxKey = makeIndexKeyV2(flattened, fields)
      await batch.put(idxKey, idxValue)
    }

    await batch.flush()
  }

  // Private counterpart of `_indexDocument`: deletes the V2 index keys that
  // `doc` would have produced.
  async _deIndexDocument (bee, fields, doc) {
    if (!hasFields(doc, fields)) return

    const batch = bee.batch()

    for (const flattened of flattenDocument(doc, fields)) {
      const idxKey = makeIndexKeyV2(flattened, fields)
      await batch.del(idxKey)
    }

    await batch.flush()
  }

  /**
   * Read every stored index definition.
   * @returns {Promise<object[]>} Deserialized definitions in stream order.
   */
  // TODO: Cache indexes?
  async listIndexes () {
    const stream = this.idxs.createReadStream()
    const indexes = []

    for await (const { value } of stream) {
      const index = BSON.deserialize(value)
      indexes.push(index)
    }

    return indexes
  }
}
/**
 * Lazy, immutable query handle.
 *
 * `hint`, `limit`, `skip` and `sort` each return a new Cursor with the option
 * changed. The query only runs when the cursor is iterated with `for await`
 * or awaited (it is thenable and resolves to an array of documents).
 */
class Cursor {
  /**
   * @param {object} [query={}] - Field -> value or field -> operator object.
   * @param {object} collection - Collection exposing `docs`, `idx`, `getIndex` and `listIndexes`.
   * @param {object} [opts] - `{ limit, skip, sort, hint }`.
   */
  constructor (query = {}, collection, opts = {
    limit: Infinity,
    skip: 0,
    sort: null,
    hint: null
  }) {
    this.query = query
    this.collection = collection
    // TODO: Validate opts
    this.opts = opts
  }

  /**
   * Run the query and count the documents it yields.
   * @returns {Promise<number>}
   */
  async count () {
    let count = 0
    // Item isn't being used but eslint will complain about it
    for await (const item of this) { // eslint-disable-line
      count++
    }

    return count
  }

  /**
   * @param {string} hint - Name of the index the query must use.
   * @returns {Cursor}
   */
  hint (hint) {
    return new Cursor(this.query, this.collection, { ...this.opts, hint })
  }

  /**
   * @param {number} limit - Maximum number of documents to yield.
   * @returns {Cursor}
   */
  limit (limit) {
    return new Cursor(this.query, this.collection, { ...this.opts, limit })
  }

  /**
   * @param {number} skip - Number of matching documents to pass over first.
   * @returns {Cursor}
   */
  skip (skip) {
    return new Cursor(this.query, this.collection, { ...this.opts, skip })
  }

  /**
   * @param {string} field - Field to sort by; needs a suitable index.
   * @param {number} [direction=1] - `-1` reads the index in reverse.
   * @returns {Cursor}
   */
  sort (field, direction = 1) {
    return new Cursor(this.query, this.collection, {
      ...this.opts,
      sort: {
        field,
        direction
      }
    })
  }

  /**
   * Pick the index to use for this query.
   * @returns {Promise<{index: object, eqS: string[], prefixFields: string[]}|null>}
   *   `null` when no stored index is usable. `eqS` lists the query fields
   *   matched by equality; `prefixFields` is the leading run of index fields
   *   that are all in `eqS`.
   * @throws {Error} When a hinted index is missing or cannot satisfy the sort.
   */
  async getIndex () {
    const { sort, hint } = this.opts
    const query = this.query

    const queryFields = Object.keys(query)
    // Filter out fields with `$exists: false` since we can't index non-existance
    const existingFields = queryFields.filter((field) => {
      return isQueryObject(query[field]) ? query[field].$exists !== false : true
    })
    const eqS = existingFields.filter((name) => {
      const queryValue = query[name]
      if (!isQueryObject(queryValue)) return true
      return ('$eq' in queryValue)
    })

    if (hint) {
      const hintIndex = await this.collection.getIndex(hint)
      const { fields } = hintIndex
      if (sort) {
        const sortIndex = fields.indexOf(sort.field)
        if (sortIndex === -1) throw new Error("Hinted Index doesn't match required sort")
        // Every index field ahead of the sort field must be pinned by equality
        const consecutive = consecutiveSubset(fields, eqS)
        if (consecutive !== sortIndex) throw new Error("Hinted index doesn't match required sort")
      }

      const prefixFields = fields.slice(0, consecutiveSubset(fields, eqS))

      return {
        index: hintIndex,
        prefixFields,
        eqS
      }
    }

    const allIndexes = await this.collection.listIndexes()
    const matchingIndexes = allIndexes
      .filter(({ fields, version }) => {
        if (version !== INDEX_VERSION && version !== OLD_INDEX_VERSION) {
          // Only select indexes we support
          return false
        }
        if (sort) {
          // At the very least we _need_ to have the sort field
          const sortIndex = fields.indexOf(sort.field)
          if (sortIndex === -1) return false
          // All the fields before the sort should be $eq fields
          const consecutive = consecutiveSubset(fields, eqS)
          return consecutive === sortIndex
        } else {
          // Ensure the fields have _some_ of the query fields
          return fields.some((field) => existingFields.includes(field))
        }
      })
      // Sort by most $eq fields at the beginning
      .sort(({ fields: fieldsA }, { fields: fieldsB }) => {
        return consecutiveSubset(fieldsB, eqS) - consecutiveSubset(fieldsA, eqS)
      })

    // The best is the one with the most eqS
    const index = matchingIndexes[0]

    if (!index) {
      return null
    }

    const { fields } = index
    // TODO: Use $gt/$lt fields in the prefix if after $eqs (and doesn't conflict with sort)
    const prefixFields = fields.slice(0, consecutiveSubset(fields, eqS))

    return {
      index,
      eqS,
      prefixFields
    }
  }

  /**
   * Thenable hook so that `await cursor` resolves to an array of all results.
   * @param {Function} resolve
   * @param {Function} reject
   */
  async then (resolve, reject) {
    try {
      const results = []
      for await (const item of this) {
        results.push(item)
      }
      return Promise.resolve(resolve(results))
    } catch (e) {
      reject(e)
    }
  }

  /**
   * Run the query, yielding matching documents one at a time.
   * A query whose `_id` is an ObjectID is answered with a single lookup;
   * otherwise the best index is scanned, or every document when there is no
   * usable index and no sort.
   * @throws {Error} When a sort is requested and no index can provide it.
   */
  async * [Symbol.asyncIterator] () {
    if (this.query._id && (this.query._id instanceof ObjectID)) {
      // Doc IDs are unique, so we can query against them without doing a search
      const key = this.query._id.id

      const found = await this.collection.docs.get(key)

      // Exit prematurely
      if (!found) return

      const { value: rawDoc } = found
      if (!rawDoc) {
        // Not found?
        return
      }

      const doc = BSON.deserialize(rawDoc)

      // The rest of the query still has to hold for the fetched document
      if (!matchesQuery(doc, this.query)) {
        return
      }

      yield doc
    } else {
      const {
        limit = Infinity,
        skip = 0,
        sort
      } = this.opts
      const query = this.query
      const seen = new Set()
      let count = 0
      let skipped = 0
      const toSkip = skip

      const bestIndex = await this.getIndex()

      // Applies dedupe / match / skip / limit bookkeeping to one candidate
      function processDoc (doc) {
        let shouldYield = null
        let shouldBreak = false
        // If we've seen this document before, ignore it
        if (!seen.has(doc._id.toString())) {
          if (matchesQuery(doc, query)) {
            if (toSkip > skipped) {
              skipped++
            } else {
              seen.add(doc._id.toString())
              count++
              shouldYield = doc
              if (count >= limit) shouldBreak = true
            }
          }
        }

        return {
          shouldBreak,
          shouldYield
        }
      }

      // If there is an index we should use
      if (bestIndex) {
        const { index, prefixFields } = bestIndex
        // The version tag lives on the index definition itself, not on the
        // wrapper object returned by getIndex()
        const isOldIndex = index.version === OLD_INDEX_VERSION
        const makeIndexKey = isOldIndex ? makeIndexKeyV1 : makeIndexKeyV2
        const makeDocFromIndex = isOldIndex ? makeDocFromIndexV1 : makeDocFromIndexV2

        // TODO: Support $all and $in more efficiently
        // $all can't be used with just the fields in the index
        // We need to fetch the entire document to test this field
        const subQueryFields = index.fields.filter((field) => {
          return isQueryObject(query[field]) ? !('$all' in query[field]) : true
        })
        const subQuery = getSubset(query, subQueryFields)

        const gt = makeIndexKeyFromQuery(query, prefixFields, index.fields, makeIndexKey)

        const opts = {
          reverse: (sort?.direction === -1)
        }
        if (gt && gt.length) {
          opts.gt = gt
          // Add a `less than` range to constrain the search
          const lt = Buffer.alloc(gt.length)
          opts.lt = lt
          gt.copy(lt)

          // Set to MAX byte to only use keys with this prefix
          lt[lt.length - 1] = 0xFF
        }

        const stream = this.collection.idx.sub(index.name).createReadStream(opts)

        for await (const { key, value: rawId } of stream) {
          const keyDoc = makeDocFromIndex(key, index.fields)

          // Test the fields against the index to avoid fetching the doc
          if (!matchesQuery(keyDoc, subQuery)) continue

          const found = await this.collection.docs.get(rawId)
          // An index entry without a document is stale; skip it instead of crashing
          if (!found) continue
          const doc = BSON.deserialize(found.value)

          // TODO: Avoid needing to double-process the values
          // TODO: Support "projection" when the fields are all in the index
          const { shouldYield, shouldBreak } = processDoc(doc)
          if (shouldYield) yield shouldYield
          if (shouldBreak) break
        }
      } else if (!sort) {
        // If we aren't sorting, and don't have an index, iterate over all docs
        // (`!sort` also covers opts objects that simply omit `sort`)
        const stream = this.collection.docs.createReadStream()

        for await (const { value: rawDoc } of stream) {
          // TODO: Can we avoid iterating over keys that should be skipped?
          const doc = BSON.deserialize(rawDoc)

          const { shouldYield, shouldBreak } = processDoc(doc)
          if (shouldYield) yield shouldYield
          if (shouldBreak) break
        }
      } else {
        throw new Error(`No indexes found to sort for field "${sort.field}"`)
      }
    }
  }
}
/**
 * Produce an updated copy of `doc`.
 *
 * Keys of `update` that name an entry in `UPDATE_TYPES` are applied through
 * that handler; every other key is assigned as a plain field. An array of
 * updates is applied left to right, each step starting from the previous
 * step's result.
 *
 * @param {object} doc - Source document; neither it nor its top-level arrays are modified.
 * @param {object|object[]} update
 * @returns {object} The new document.
 */
function performUpdate (doc, update) {
  if (Array.isArray(update)) {
    return update.reduce((current, step) => performUpdate(current, step), doc)
  }

  const newDoc = { ...doc }
  // The array handlers edit arrays in place. Give the copy its own arrays so
  // the caller's document still reflects the pre-update state (it is used
  // afterwards to remove the old index entries).
  for (const key of Object.keys(newDoc)) {
    if (Array.isArray(newDoc[key])) newDoc[key] = [...newDoc[key]]
  }

  for (const key of Object.keys(update)) {
    // Own-property check: a field called e.g. `toString` must not resolve to
    // an inherited Object.prototype member and be treated as an operator
    if (Object.prototype.hasOwnProperty.call(UPDATE_TYPES, key)) {
      UPDATE_TYPES[key](newDoc, update[key])
    } else {
      newDoc[key] = update[key]
    }
  }

  return newDoc
}
/**
 * Check a document against a query.
 * @param {object} doc
 * @param {object} query - Each key names a field of `doc` to test.
 * @returns {boolean} `true` only when every queried field passes `queryCompare`;
 *   an empty query matches everything.
 */
function matchesQuery (doc, query) {
  // `every` stops at the first field that fails
  return Object.keys(query).every((field) => queryCompare(doc[field], query[field]))
}
/**
 * Test one document value against one query value.
 * A query object (one carrying `$` keys) must satisfy every operator it
 * lists; any other query value is compared with `compareEq`.
 * @param {*} docValue
 * @param {*} queryValue
 * @returns {boolean}
 * @throws {Error} When the query object uses a key missing from `QUERY_TYPES`.
 */
function queryCompare (docValue, queryValue) {
  if (!isQueryObject(queryValue)) return compareEq(docValue, queryValue)

  for (const [queryType, operand] of Object.entries(queryValue)) {
    const compare = QUERY_TYPES[queryType]
    // TODO: Validate somewhere else?
    if (!compare) throw new Error('Invalid Query Type ' + queryType)
    if (!compare(docValue, operand)) return false
  }

  return true
}
/**
 * `$all`: the document value must be an array containing every queried item.
 * @param {*} docValue
 * @param {Array} queryValue
 * @returns {boolean} `false` whenever `docValue` is not an array.
 * @throws {Error} When `queryValue` is not an array.
 */
function compareAll (docValue, queryValue) {
  // TODO: Add query validator function to detect this early.
  if (!Array.isArray(queryValue)) throw new Error('$all must be set to an array')
  if (!Array.isArray(docValue)) return false

  const docHas = (wanted) => docValue.some((candidate) => compareEq(candidate, wanted))
  return queryValue.every((wanted) => docHas(wanted))
}
/**
 * `$in`: the document value (or, for an array, at least one of its items)
 * must equal one of the queried candidates.
 * @param {*} docValue
 * @param {Array} queryValue
 * @returns {boolean}
 * @throws {Error} When `queryValue` is not an array.
 */
function compareIn (docValue, queryValue) {
  // TODO: Add query validator function to detect this early.
  if (!Array.isArray(queryValue)) throw new Error('$in must be set to an array')

  const isListed = (candidate) => queryValue.some((allowed) => compareEq(candidate, allowed))

  return Array.isArray(docValue)
    ? docValue.some((fromDoc) => isListed(fromDoc))
    : isListed(docValue)
}
/**
 * `$gt`: is the document value strictly greater than the query value?
 * Both sides go through `ensureComparable` first.
 * @returns {boolean}
 */
function compareGt (docValue, queryValue) {
  const left = ensureComparable(docValue)
  const right = ensureComparable(queryValue)
  return left > right
}
/**
 * `$lt`: is the document value strictly less than the query value?
 * Both sides go through `ensureComparable` first.
 * @returns {boolean}
 */
function compareLt (docValue, queryValue) {
  const left = ensureComparable(docValue)
  const right = ensureComparable(queryValue)
  return left < right
}
/**
 * `$gte`: is the document value greater than or equal to the query value?
 * Both sides go through `ensureComparable` first.
 * @returns {boolean}
 */
function compareGte (docValue, queryValue) {
  const left = ensureComparable(docValue)
  const right = ensureComparable(queryValue)
  return left >= right
}
/**
 * `$lte`: is the document value less than or equal to the query value?
 * Both sides go through `ensureComparable` first.
 * @returns {boolean}
 */
function compareLte (docValue, queryValue) {
  const left = ensureComparable(docValue)
  const right = ensureComparable(queryValue)
  return left <= right
}
/**
 * Turn a value into something the relational operators can order.
 * @param {*} value
 * @returns {*} The millisecond timestamp for a `Date`; the value itself otherwise.
 */
function ensureComparable (value) {
  return value instanceof Date ? value.getTime() : value
}
/**
 * Equality test used by `$eq` and by plain (non-operator) query values.
 *
 *  - An array document value matches when any of its items matches.
 *  - Two `Date` objects match when they denote the same instant.
 *  - A value exposing an `equals()` method is compared through that method.
 *  - Everything else uses strict equality.
 *
 * @param {*} docValue
 * @param {*} queryValue
 * @returns {boolean}
 */
function compareEq (docValue, queryValue) {
  if (Array.isArray(docValue)) {
    return docValue.some((item) => compareEq(item, queryValue))
  }

  // Distinct Date objects are never `===`, so compare their timestamps
  if (docValue instanceof Date && queryValue instanceof Date) {
    return docValue.getTime() === queryValue.getTime()
  }

  if (typeof docValue?.equals === 'function') {
    return docValue.equals(queryValue)
  }

  return queryValue === docValue
}
/**
 * `$exists`: compare "the value is not undefined" against the requested flag.
 * @param {*} docValue
 * @param {boolean} queryValue - `true` to require the field, `false` to require its absence.
 * @returns {boolean}
 */
function compareExists (docValue, queryValue) {
  const isPresent = docValue !== undefined
  return isPresent === queryValue
}
/**
 * `$pull`: drop from each named array the items that match the given query.
 * Fields that are not arrays are left alone.
 * @param {object} doc - Modified in place (the field receives a new array).
 * @param {object} fields - Field name -> query tested with `queryCompare`.
 */
function updatePull (doc, fields) {
  for (const [key, query] of Object.entries(fields)) {
    const current = doc[key]
    if (Array.isArray(current)) {
      doc[key] = current.filter((item) => !queryCompare(item, query))
    }
  }
}
/**
 * `$pop`: remove one item from the end (positive direction) or the start
 * (negative direction) of each named array. A direction of `0` and fields
 * that are not arrays are ignored.
 * @param {object} doc - Its arrays are modified in place.
 * @param {object} fields - Field name -> direction.
 */
function updatePop (doc, fields) {
  Object.keys(fields).forEach((key) => {
    const list = doc[key]
    if (!Array.isArray(list)) return

    const direction = fields[key]
    if (direction > 0) list.pop()
    else if (direction < 0) list.shift()
  })
}
/**
 * `$push`: append a value, or every item of `{ $each: [...] }`, to each
 * named array.
 *
 * When the field is absent it is created as an array: a scalar becomes a
 * one-item array and `$each` becomes a copy of its items. A bare array passed
 * for an absent field is stored as given, as before. An existing field that
 * is not an array is left untouched.
 *
 * @param {object} doc - Modified in place.
 * @param {object} fields - Field name -> value or `{ $each: iterable }`.
 */
function updatePush (doc, fields) {
  for (const key of Object.keys(fields)) {
    const toPush = fields[key]
    // Optional chaining keeps `null` usable as a pushed value
    const each = toPush?.$each

    if (!(key in doc)) {
      if (each) doc[key] = [...each]
      else doc[key] = Array.isArray(toPush) ? toPush : [toPush]
      continue
    }

    const value = doc[key]
    if (!Array.isArray(value)) continue

    if (each) {
      for (const item of each) {
        value.push(item)
      }
    } else {
      value.push(toPush)
    }
  }
}
/**
 * `$addToSet`: append a value, or every item of `{ $each: [...] }`, to each
 * named array unless `Array.prototype.includes` already finds it there.
 *
 * When the field is absent it is created as an array: a scalar becomes a
 * one-item array and `$each` becomes the de-duplicated list of its items. A
 * bare array passed for an absent field is stored as given, as before. An
 * existing field that is not an array is left untouched.
 *
 * @param {object} doc - Modified in place.
 * @param {object} fields - Field name -> value or `{ $each: iterable }`.
 */
function updateAddToSet (doc, fields) {
  for (const key of Object.keys(fields)) {
    const toAdd = fields[key]
    // Optional chaining keeps `null` usable as an added value
    const each = toAdd?.$each

    if (!(key in doc)) {
      if (!each) {
        doc[key] = Array.isArray(toAdd) ? toAdd : [toAdd]
        continue
      }
      // Start empty so the loop below also removes duplicates inside $each
      doc[key] = []
    }

    const value = doc[key]
    if (!Array.isArray(value)) continue

    for (const item of (each || [toAdd])) {
      if (!value.includes(item)) value.push(item)
    }
  }
}
/**
 * `$unset`: delete every named field from the document.
 * @param {object} doc - Modified in place.
 * @param {object} fields - Only the keys are used; the values are ignored.
 */
function updateUnset (doc, fields) {
  Object.keys(fields).forEach((key) => {
    delete doc[key]
  })
}
/**
 * `$set`: assign every given field onto the document, overwriting existing values.
 * @param {object} doc - Modified in place.
 * @param {object} fields - Field name -> new value.
 */
function updateSet (doc, fields) {
  for (const [key, value] of Object.entries(fields)) {
    doc[key] = value
  }
}
/**
 * `$rename`: move the value of each named field to a new field name.
 * Fields the document does not have are skipped.
 * @param {object} doc - Modified in place.
 * @param {object} fields - Current field name -> new field name.
 */
function updateRename (doc, fields) {
  for (const [oldName, newName] of Object.entries(fields)) {
    if (oldName in doc) {
      const moved = doc[oldName]
      delete doc[oldName]
      doc[newName] = moved
    }
  }
}
/**
 * `$inc`: add the given amount to each named field.
 * A field the document does not have yet is set to the amount itself.
 * @param {object} doc - Modified in place.
 * @param {object} fields - Field name -> amount to add.
 */
function updateInc (doc, fields) {
  Object.entries(fields).forEach(([key, amount]) => {
    doc[key] = (key in doc) ? doc[key] + amount : amount
  })
}
/**
 * `$mul`: multiply each named field by the given factor.
 * A field the document does not have yet is set to `0`.
 * @param {object} doc - Modified in place.
 * @param {object} fields - Field name -> factor.
 */
function updateMul (doc, fields) {
  Object.entries(fields).forEach(([key, factor]) => {
    doc[key] = (key in doc) ? doc[key] * factor : 0
  })
}
/**
 * Does the document carry a usable value for every listed field?
 * A field counts only when it is present on the document and its value is
 * not `undefined`.
 * @param {object} doc
 * @param {string[]} fields
 * @returns {boolean} `true` for an empty field list.
 */
function hasFields (doc, fields) {
  // Test the stored value, not the field name (which is always defined)
  return fields.every((field) => (field in doc) && (doc[field] !== undefined))
}
/**
 * Build a version-1 index key: the BSON serialization of
 * `[...indexed field values, _id]` with the leading 4 bytes removed.
 * @param {object} doc
 * @param {string[]} fields - Indexed field names, in index order.
 * @returns {Buffer}
 */
function makeIndexKeyV1 (doc, fields) {
  // TODO: Does BSON array work well for ordering?
  // TODO: Maybe use a custom encoding?
  // Take all the indexed fields
  const values = fields.map((field) => doc[field])
  // Add the document ID (nothing is appended when the doc has no _id)
  const withId = values.concat(doc._id || [])
  // Serialize the data into a BSON array
  const serialized = BSON.serialize(withId)
  // Get rid of the length prefix, we don't need it.
  return serialized.slice(4)
}
/**
 * Decode a version-1 index key back into a partial document.
 * Decoded positions are mapped onto `fields` by index; positions beyond the
 * field list are stored under `_id`.
 * @param {Buffer} key - Key as produced by `makeIndexKeyV1`.
 * @param {string[]} fields - Indexed field names, in index order.
 * @returns {object}
 */
function makeDocFromIndexV1 (key, fields) {
  const framed = Buffer.alloc(key.length + 4)
  // Write a valid length prefix to the buffer for BSON decoding
  framed.writeInt32LE(framed.length)
  key.copy(framed, 4)

  // Should be a JSON object with numbered key (a BSON array)
  const parsed = BSON.deserialize(framed)

  return Object.keys(parsed).reduce((doc, position) => {
    const field = fields[position] || '_id'
    doc[field] = parsed[position]
    return doc
  }, {})
}
/**
 * Build a version-2 index key: the CBOR encoding of
 * `[...indexed field values, _id bytes]`.
 *
 * When fewer values are available than the full index holds (for example a
 * query prefix, which has no `_id`), the list is padded with zeros before
 * encoding and one byte per pad item is cut from the end again, leaving a
 * prefix of a full key.
 *
 * @param {object} doc
 * @param {string[]} fields - Field names to encode, in index order.
 * @param {string[]} [allFields=fields] - Every field of the index.
 * @returns {Buffer}
 */
function makeIndexKeyV2 (doc, fields, allFields = fields) {
  // CBOR encode fields
  const keyValues = []
  for (const field of fields) {
    const value = doc[field]
    // ObjectIDs are stored as their raw bytes
    keyValues.push(value instanceof ObjectID ? value.id : value)
  }

  if (doc._id) keyValues.push(doc._id.id)

  // Number of slots (indexed fields + the _id) that have no value yet
  const padding = Math.max(0, (allFields.length + 1) - keyValues.length)
  for (let i = 0; i < padding; i++) {
    keyValues.push(0)
  }

  const encoded = cbor.encode(keyValues)

  // Remove the pad bytes again to get the real prefix
  return padding ? encoded.subarray(0, encoded.length - padding) : encoded
}
/**
 * Decode a version-2 index key back into a partial document.
 * Decoded positions are mapped onto `fields` by index; positions beyond the
 * field list are stored under `_id`. A 12-byte buffer is converted to an
 * ObjectID when the constructor accepts it and kept as a buffer otherwise.
 * @param {Buffer} key - Key as produced by `makeIndexKeyV2`.
 * @param {string[]} fields - Indexed field names, in index order.
 * @returns {object}
 */
function makeDocFromIndexV2 (key, fields) {
  const revive = (value) => {
    if (!Buffer.isBuffer(value) || value.length !== 12) return value
    try {
      return new ObjectID(value)
    } catch {
      return value
    }
  }

  // CBOR decode fields
  const decoded = cbor.decode(key)
  const doc = {}
  for (const [position, value] of decoded.entries()) {
    const field = fields[position] || '_id'
    doc[field] = revive(value)
  }
  return doc
}
/**
 * Copy the listed fields out of an object.
 * @param {object} doc
 * @param {string[]} fields
 * @returns {object} New object holding only those listed fields `doc` has.
 */
function getSubset (doc, fields) {
  const subset = {}
  for (const field of fields) {
    if (field in doc) subset[field] = doc[field]
  }
  return subset
}
/**
 * Expand array-valued indexed fields into one document per combination.
 *
 * For every indexed field holding a non-empty array, the yielded documents
 * carry a single item of that array instead. Each combination is yielded
 * exactly once. A document with no such field is yielded unchanged, as the
 * same object. Empty arrays are not expanded.
 *
 * @param {object} doc
 * @param {string[]} fields - Indexed field names.
 * @yields {object}
 */
function * flattenDocument (doc, fields) {
  const arrayField = fields.find((field) => {
    const values = doc[field]
    return Array.isArray(values) && values.length > 0
  })

  if (arrayField === undefined) {
    yield doc
    return
  }

  const values = doc[arrayField]
  const rest = { ...doc }
  delete rest[arrayField]
  const remainingFields = fields.filter((field) => field !== arrayField)

  // The recursion expands the other array fields, so only the first one is
  // handled at this level; visiting the rest here too would repeat every
  // combination once per array field
  for (const value of values) {
    for (const flattened of flattenDocument(rest, remainingFields)) {
      yield { ...flattened, [arrayField]: value }
    }
  }
}
/**
 * Build an index key (prefix) from the values a query asks for.
 * For each listed field the value is the query's `$eq` operand, else its
 * `$gt` operand, else (for a non-operator query value) the value itself.
 * An operator object carrying neither contributes nothing.
 * @param {object} query
 * @param {string[]} fields - Fields to put in the key.
 * @param {string[]} indexFields - Every field of the index.
 * @param {Function} makeIndexKey - Called as `makeIndexKey(doc, fields, indexFields)`.
 * @returns {*} Whatever `makeIndexKey` returns.
 */
function makeIndexKeyFromQuery (query, fields, indexFields, makeIndexKey) {
  // TODO: Account for $eq and $gt fields
  const doc = {}
  for (const field of fields) {
    const value = query[field]
    if (!isQueryObject(value)) doc[field] = value
    else if ('$eq' in value) doc[field] = value.$eq
    else if ('$gt' in value) doc[field] = value.$gt
  }

  return makeIndexKey(doc, fields, indexFields)
}
/**
 * Is the value an operator object, i.e. a non-null object with at least one
 * key starting with `$`?
 * @param {*} object
 * @returns {boolean}
 */
function isQueryObject (object) {
  // `typeof null` is also 'object', so rule null out before inspecting keys
  return (object !== null) && (typeof object === 'object') && has$Keys(object)
}
/**
 * Does the object have at least one own enumerable key starting with `$`?
 * @param {object} object
 * @returns {boolean}
 */
function has$Keys (object) {
  for (const key of Object.keys(object)) {
    if (key.startsWith('$')) return true
  }
  return false
}
/**
 * Count how many leading items of `origin` are all contained in `values`.
 * @param {Array} origin
 * @param {Array} values
 * @returns {number} Length of the leading run; `0` when the first item is not in `values`.
 */
function consecutiveSubset (origin, values) {
  const firstMiss = origin.findIndex((item) => !values.includes(item))
  // No miss at all means the whole list is covered
  return firstMiss === -1 ? origin.length : firstMiss
}
// Public surface of the module: the three classes plus the `bson` module
// that was required at the top of this file.
module.exports = {
DB,
Collection,
Cursor,
BSON
}