UNPKG

sieving

Version:

Query-Based Item-List Reduction for JavaScript

github.com/rse/sieving

521 lines (481 loc) • 19.9 kB

JavaScript

/* ** Sieving -- Query-Based Item-List Reduction ** Copyright (c) 2018-2024 Dr. Ralf S. Engelschall <rse@engelschall.com> ** ** Permission is hereby granted, free of charge, to any person obtaining ** a copy of this software and associated documentation files (the ** "Software"), to deal in the Software without restriction, including ** without limitation the rights to use, copy, modify, merge, publish, ** distribute, sublicense, and/or sell copies of the Software, and to ** permit persons to whom the Software is furnished to do so, subject to ** the following conditions: ** ** The above copyright notice and this permission notice shall be included ** in all copies or substantial portions of the Software. ** ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* external requirements */ const ASTY = require("asty-astq") const PEG = require("pegjs-otf") const PEGUtil = require("pegjs-util") const chalk = require("chalk") const objectHash = require("object-hash") const minimatch = require("minimatch") const dice = require("dice-coefficient") const levenshtein = require("fast-levenshtein") const Tokenizr = require("tokenizr") /* pre-parse PEG grammar (replaced by browserify) */ const PEGparser = PEG.generateFromFile(`${__dirname}/sieving.pegjs`, { optimize: "size", trace: false }) /* the API class */ class Sieving { /* create API instance */ constructor (options = {}) { /* determine options */ this.options = { fieldsVal: [ "value" ], fieldId: "id", ...options } /* initialize internal state */ this.query = null this.lts = null this.ast = null } /* parse query string into Abstract Syntax Tree (AST) */ parse (query, options = {}) { /* sanity check argument */ if (typeof query !== "string") throw new Error("parse: invalid \"query\" argument") if (typeof options !== "object") throw new Error("parse: invalid \"options\" argument") /* determine options */ options = { lts: true, ast: true, ...options } /* store query */ this.query = query /* * ==== tokenize query string into a Linear Token Stream (LTS) ==== */ this.lts = null if (options.lts) { /* define tokenizer */ const tokenizer = new Tokenizr() tokenizer.rule(/\s+/, (ctx, match) => { ctx.accept("ws") }) tokenizer.rule(/\+/, (ctx, match) => { ctx.accept("union", "+") }) tokenizer.rule(/-/, (ctx, match) => { ctx.accept("subtraction", "-") }) tokenizer.rule(/,/, (ctx, match) => { ctx.accept("union", ",") }) tokenizer.rule(/[$#%@&]/, (ctx, match) => { ctx.accept("namespace") }) tokenizer.rule(/([a-zA-Z_][a-zA-Z_0-9]*):/, (ctx, match) => { ctx.accept("namespace", match[1]) }) tokenizer.rule(/"((?:\\\"|[^"])*)"/, (ctx, match) => { const value = match[1] .replace(/\\\\/g, "\\").replace(/\\"/g, "\"").replace(/\\"/g, "\"") .replace(/\\b/g, "\b").replace(/\\v/g, "\v").replace(/\\f/g, "\f") .replace(/\\t/g, "\t").replace(/\\n/g, "\n").replace(/\\r/g, "\r") .replace(/\\e/g, "\e") .replace(/\\x([0-9a-fA-f]{2})/g, (_, num) => String.fromCharCode(parseInt(num, 16))) .replace(/\\u([0-9a-fA-f]{4})/g, (_, num) => String.fromCharCode(parseInt(num, 16))) ctx.accept("dquoted", value) }) tokenizer.rule(/'((?:\\\'|[^'])*)'/, (ctx, match) => { const value = match[1].replace(/\\'/g, "'") ctx.accept("squoted", value) }) tokenizer.rule(/\/((?:\\\/|[^/])*)\//, (ctx, match) => { let value = null try { value = new RegExp(match[1]) } catch (ex) { value = null } if (value !== null) ctx.accept("regexp", value) else ctx.reject() }) tokenizer.rule(/[^*?\[\]{}\r\n\t\v\f (),^+-]*[*?\[\]{}][^\r\n\t\v\f (),^+-]*/, (ctx, match) => { ctx.accept("glob") }) /* tokenizer.rule(/[^*?\[\]{}\r\n\t\v\f (),^+-]+/, (ctx, match) => { ctx.accept("bareword") }) */ tokenizer.rule(/[a-zA-ZäöüÄÖÜß0-9_]+/, (ctx, match) => { ctx.accept("bareword") }) tokenizer.rule(/\^(\d*)/, (ctx, match) => { ctx.accept("boost", match[1] ? parseInt(match[1]) : 1) }) tokenizer.rule(/[()]/, (ctx, match) => { ctx.accept("group") }) tokenizer.rule(/.+$/, (ctx, match) => { ctx.accept("error") }) /* tokenize the query */ try { tokenizer.input(query) this.lts = tokenizer.tokens() } catch (err) { throw new Error(`parse: query tokenizing failed: ${err}`) } } /* * ==== parse query string into Abstract Syntax Tree (AST) ==== */ this.ast = null if (options.ast) { /* parse specification into Abstract Syntax Tree (AST) */ const asty = new ASTY() const result = PEGUtil.parse(PEGparser, query, { startRule: "root", makeAST: (line, column, offset, args) => { return asty.create.apply(asty, args).pos(line, column, offset) } }) if (result.error !== null) throw new Error("parse: query parsing failure:\n" + PEGUtil.errorMessage(result.error, true).replace(/^/mg, "ERROR: ") + "\n") this.ast = result.ast /* post-process AST: sanity check structure */ let nodes = this.ast.query(` .// term [ @op == "subtraction" && @boost ] `) if (nodes.length > 0) { const node = nodes[0] const { line, column } = node.pos() throw new Error("parse: boosting not allowed on negated term " + `(line ${line}, column ${column}): "${node.get("value")}"`) } nodes = this.ast.query(` .// query [ / term [ @op == "subtraction" ] && count(/ term [ @op != "subtraction" ]) == 0 ] `) if (nodes.length > 0) { const node = nodes[0] const { line, column } = node.pos() throw new Error("parse: negated terms only not allowed " + `(line ${line}, column ${column})`) } nodes = this.ast.query(` .// term / namespace `) if (nodes.length > 0) { for (const node of nodes) { const ns = node.get("value") if (this.options.fieldsVal.indexOf(ns) < 0) { const { line, column } = node.pos() if (ns.match(/^[$#%@&]$/)) throw new Error(`parse: namespace symbol "${ns}" not allowed ` + `(line ${line}, column ${column})`) else throw new Error(`parse: namespace identifier "${ns}" not allowed ` + `(line ${line}, column ${column})`) } } } } return this } /* dump the Linear Token Stream (LTS) and Abstract Syntax Tree (AST) with colorization */ dump (colorize = true) { let output = "" /* dump query */ let title = "Query String:" output += `${colorize ? chalk.inverse.bold(title) : title}\n` if (colorize) output += chalk.blue(this.query) else output += this.query output += "\n\n" /* dump LST */ title = "Linear Token Stream:" output += `${colorize ? chalk.inverse.bold(title) : title}\n` if (this.lts === null) output += "(still no LTS available)" else { for (const token of this.lts) { output += token.toString((type, text) => { if (colorize) { switch (type) { case "type": text = chalk.blue(text); break case "value": text = chalk.green(text); break case "text": text = chalk.yellow(text); break case "pos": text = chalk.yellow(text); break case "line": text = chalk.yellow(text); break case "column": text = chalk.yellow(text); break default: } } return text }) + "\n" } output += "\n" } /* dump AST */ title = "Abstract Syntax Tree:" output += `${colorize ? chalk.inverse.bold(title) : title}\n` if (this.ast === null) output += "(still no AST available)" else { output += this.ast.dump(Infinity, (type, text) => { if (colorize) { switch (type) { case "tree": text = chalk.grey(text); break case "type": text = chalk.blue(text); break case "value": text = chalk.yellow(text); break case "position": text = chalk.grey(text); break default: } } return text }) } return output } /* format query string into a linear token stream */ format (format = "text") { /* sanity check argument */ if (!(typeof format === "string" && format.match(/^(?:text|xml|html|json)/))) throw new Error("format: invalid \"format\" argument") /* sanity check context */ if (this.lts === null) throw new Error("format: still no LTS of query available") /* iterate over all tokens */ let output = "" const xmlEscape = (text) => { return text .replace(/\&/g, "&") .replace(/</g, "<") .replace(/>/g, ">") } for (const token of this.lts) { if (token.type === "EOF") continue if (format === "text") output += token.text else if (format === "xml") output += `<${token.type}>${xmlEscape(token.text)}</${token.type}>\n` else if (format === "html") output += `<span class="${token.type}">${xmlEscape(token.text)}</span>\n` else if (format === "json") output += `{ type: "${token.type}", text: ${JSON.stringify(token.text)} },\n` } if (format === "xml") output = `<query>\n${output}</query>\n` else if (format === "html") output = `<span class="query">\n${output}</span>\n` else if (format === "json") output = `[\n${output}]\n` return output } /* evaluate the Abstract Syntax Tree (AST) */ evaluate (queryResults) { /* sanity check arguments */ if (typeof queryResults !== "function") throw new Error("evaluate: invalid argument") /* sanity check context */ if (this.ast === null) throw new Error("evaluate: still no AST of query available") /* initialize internal state */ const mapId = new Map() const mapPrio = new Map() /* perform set-operations on result lists */ const listUnion = (a, b) => { const r = [] const idx = {} a.forEach((x) => { r.push(x) idx[mapId.get(x)] = true }) b.forEach((x) => { if (!idx[mapId.get(x)]) r.push(x) }) return r } const listIntersection = (a, b) => { const r = [] const idx = {} b.forEach((x) => { idx[mapId.get(x)] = x }) a.forEach((x) => { if (idx[mapId.get(x)]) r.push(x) }) return r } const listSubtraction = (a, b) => { const r = [] const idx = {} b.forEach((x) => { idx[mapId.get(x)] = x }) a.forEach((x) => { if (!idx[mapId.get(x)]) r.push(x) }) return r } /* evaluate an AST node */ const evaluateNode = (node) => { let result = null if (node.type() === "union") { const [ n1, n2 ] = node.childs() const r1 = evaluateNode(n1) /* RECURSION */ const r2 = evaluateNode(n2) /* RECURSION */ result = listUnion(r1, r2) } else if (node.type() === "intersection") { const [ n1, n2 ] = node.childs() const r1 = evaluateNode(n1) /* RECURSION */ const r2 = evaluateNode(n2) /* RECURSION */ result = listIntersection(r1, r2) } else if (node.type() === "subtraction") { const [ n1, n2 ] = node.childs() const r1 = evaluateNode(n1) /* RECURSION */ const r2 = evaluateNode(n2) /* RECURSION */ result = listSubtraction(r1, r2) } else if (node.type() === "group") { const n1 = node.child(0) result = evaluateNode(n1) /* RECURSION */ } else if (node.type() === "term") { const nN = node.query("/ namespace") const nM = node.query("/ squoted, / dquoted, / regexp, / glob, / bareword") const nB = node.query("/ boost") /* gather information */ const ns = nN.length === 1 ? nN[0].get("value") : this.options.fieldsVal[0] const type = nM[0].type() const value = nM[0].get("value") const boost = nB.length === 1 ? nB[0].get("value") : 0 /* retrieve single result list via callback */ result = queryResults(ns, type, value) /* post-process result */ result.forEach((item) => { /* remember id */ if (item[this.options.fieldId] !== undefined) mapId.set(item, item[this.options.fieldId]) else mapId.set(item, objectHash(item)) /* remember priority, optionally boosted */ mapPrio.set(item, 1 + boost) }) } if (result === null) result = [] return result } /* evaluate AST from the root node */ let result = evaluateNode(this.ast) /* sort result according to (boosted) priority */ result = result.sort((a, b) => mapPrio.get(b) - mapPrio.get(a)) return result } /* sieve items by evaluating against the Abstract Syntax Tree (AST) */ sieve (items, options = {}) { /* sanity check arguments */ if (!(typeof items === "object" && items instanceof Array)) throw new Error("sieve: invalid items argument (expected array)") if (typeof options !== "object") throw new Error("sieve: invalid options argument (expected object)") /* sanity check context */ if (this.ast === null) throw new Error("sieve: still no AST of query available") /* determine options */ options = { fuzzy: false, nocase: false, maxLS: 2, minDC: 0.50, ...options } /* evaluate the AST */ return this.evaluate((ns, type, value) => { const valueOfItem = (item) => { if (typeof item === "string") return item else if (typeof item === "object" && ns !== "" && item[ns] !== undefined) return item[ns] else return undefined } const cache = new Map() return items.filter((item) => { const itemValue = valueOfItem(item) if (itemValue === undefined) return false if (type === "regexp" && !options.nocase) return value.exec(itemValue) else if (type === "regexp" && options.nocase) { let regexp = cache.get(value) if (regexp === undefined) { regexp = new RegExp(value, "i") cache.set(value, regexp) } return regexp.exec(itemValue) } else if (type === "glob") return minimatch(itemValue, `*${value}*`, { nocase: options.nocase }) else if (type === "dquoted" || type === "squoted") return ((!options.nocase && itemValue === value) || (options.nocase && itemValue.toLowerCase() === value.toLowerCase()) || (options.fuzzy && (dice(itemValue, value) >= options.minDC || levenshtein.get(itemValue, value) <= options.maxLS))) else if (type === "bareword") return ((!options.nocase && itemValue.indexOf(value) >= 0) || (options.nocase && itemValue.toLowerCase().indexOf(value.toLowerCase()) >= 0) || (options.fuzzy && (dice(itemValue, value) >= options.minDC || levenshtein.get(itemValue, value) <= options.maxLS))) else throw new Error("sieve: invalid type") }) }) } /* static function for all-in-one sieving */ static sieve (items, query, options = {}) { const sieving = new Sieving(options) sieving.parse(query) if (options.debug) console.log(sieving.dump()) return sieving.sieve(items, options) } } /* export the API class */ module.exports = Sieving