UNPKG

js-solr-highlighter

Version:

A JavaScript library for highlighting HTML text based on a query written in the Lucene/Solr query syntax

188 lines (187 loc) 7.13 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.highlightByQuery = exports.isStopWord = exports.STOP_WORDS = void 0;
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
const text_annotator_1 = require("text-annotator");

/** Lower-case words that are never highlighted on their own. */
const STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 's', 'such', 't', 'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was', 'will', 'with'];
exports.STOP_WORDS = STOP_WORDS;

/**
 * Tell whether a word is one of STOP_WORDS, ignoring case.
 *
 * @param {string} string - The word to test.
 * @returns {boolean} True when the lower-cased word is listed in STOP_WORDS.
 */
function isStopWord(string) {
    return STOP_WORDS.includes(string.toLowerCase());
}
exports.isStopWord = isStopWord;

/**
 * Expand a wildcard prefix (the term without its trailing `*`) to the text
 * that follows its first occurrence in `content`, up to the next space or
 * the end of `content`.
 *
 * The lookup is a plain, case-sensitive `indexOf`. When the prefix does not
 * occur at all, the prefix itself is returned unchanged.
 *
 * @param {string} prefix - Term with the trailing `*` already removed.
 * @param {string} content - Text in which to look the prefix up.
 * @returns {string} The expanded word, or `prefix` when there is no match.
 */
function expandWildcard(prefix, content) {
    const start = content.indexOf(prefix);
    if (start === -1) {
        return prefix;
    }
    let end = start + prefix.length;
    // Bounded by content.length so a match at the very end of the content
    // terminates, and the final character is included in the expansion.
    while (end < content.length && content[end] !== ' ') {
        end += 1;
    }
    return content.slice(start, end);
}

/**
 * Highlight, inside `content`, the terms of a Lucene/Solr-style `query`.
 *
 * The query is pre-processed (colons of non-valid fields and all slashes are
 * backslash-escaped), parsed with the `lucene` package, and the terms taken
 * from the top levels of the resulting tree are highlighted with
 * `text-annotator`. Stop words and the bare operators AND / OR / NOT are
 * never highlighted, and a match is only highlighted when it is not directly
 * surrounded by ASCII letters or digits.
 *
 * @param {string} query - Query in Lucene/Solr syntax.
 * @param {string} content - Content to highlight.
 * @param {Object} [options]
 * @param {string[]} [options.validFields] - Names that are parsed as fields.
 *   If undefined, everything shaped like `x:x` is parsed as a field.
 * @param {string[]} [options.highlightedFields] - The valid fields whose
 *   values are highlighted. If undefined, the values of all valid fields are
 *   highlighted. The array is not modified.
 * @param {boolean} [options.highlightAll] - When undefined or truthy every
 *   occurrence is looked up (`searchAll`); otherwise a single `search` is done.
 * @param {string} [options.highlightClass] - Forwarded to the highlighter.
 * @param {string} [options.highlightIdPattern] - Forwarded to the highlighter.
 * @param {boolean} [options.caseSensitive] - Forwarded as the direct-search
 *   `caseSensitive` option; treated as false when undefined.
 * @returns {string} The content with highlights applied, or `content` itself
 *   when nothing is highlighted.
 */
function highlightByQuery(query, content, options = {}) {
    // can allow more options of text-annotator***
    const { validFields, highlightAll, highlightClass, highlightedFields, highlightIdPattern, caseSensitive } = options;
    // An empty query has no term to highlight. Returning early also avoids
    // dereferencing a missing `left` node further down.
    if (query.trim() === '') {
        return content;
    }
    const searchFunc = highlightAll === undefined || highlightAll ? 'searchAll' : 'search';
    let words = [];
    const lucene = require('lucene');

    // [\+\-\!\(\)\{\}\[\]\^\"\?\:\\\&\|\'\/\s\*\~]
    // Prefix every occurrence of `c` in `s` with a backslash.
    const esc = (s, c) => s.replace(new RegExp(c, 'g'), char => '\\' + char);
    // Drop the backslash in front of every occurrence of `c` in `s`.
    const unesc = (s, c) => s.replace(new RegExp('\\\\([' + c + '])', 'g'), (match, char) => char);

    // escape invalid fields
    let q = query;
    const fieldVals = []; // [from, to] pairs: field values whose stray quote is removed
    const fieldVals2 = []; // field:value snippets of non-valid fields, to be escaped
    let res;

    // /([^:\s]+):([^:\s]+)/g
    // deal with cases like xxx:xxx, xxx: xxx
    const regex = /([^(\s]+):\s?([^\s)"]+)/g;
    while ((res = regex.exec(q)) !== null) {
        const field = res[1];
        const fieldVal = res[0];
        if (validFields !== undefined && !validFields.includes(field)) {
            fieldVals2.push(fieldVal);
        }
    }

    // /([a-zA-Z]+)(\s+):(\s+)([a-zA-Z]+)/g
    // deal with cases like xxx:"xxx", xxx:"xxx
    const regex2 = /([^\s(]+):\s?("[^"]+"?[^)])/g;
    while ((res = regex2.exec(q)) !== null) {
        const field = res[1];
        const fieldVal = res[0];
        if (validFields === undefined || validFields.includes(field)) {
            // remove invalid "
            if (res[2].startsWith('"') && !res[2].endsWith('"')) {
                fieldVals.push([fieldVal, res[1] + ':' + res[2].substring(1)]);
            } else if (!res[2].startsWith('"') && res[2].endsWith('"')) {
                // regex2 always captures a leading quote, so this branch is
                // not reachable today; it still has to push a [from, to]
                // pair because the replace loop below reads fv[0] / fv[1].
                fieldVals.push([fieldVal, res[1] + ':' + res[2].substring(0, res[2].length - 1)]);
            }
        } else {
            fieldVals2.push(fieldVal);
        }
    }
    fieldVals.forEach(fv => {
        q = q.replace(fv[0], fv[1]);
    });
    fieldVals2.forEach(fv => {
        q = q.replace(fv, esc(fv, ':'));
    });
    q = esc(q, '/');

    // parse the query
    const ast = lucene.parse(q);

    // add terms to be highlighted
    const { start, left, right, operator } = ast;
    const addTerm = (collected, term, quoted) => {
        const plain = unesc(unesc(term, ':'), '/');
        // if quoted, should change nothing inside
        if (quoted) {
            return collected.concat([plain]);
        }
        // remove any char that is neither letter nor number at the start and
        // end of each term (a trailing * is kept for wildcard expansion)
        const terms = plain.split(/\s/).map(t => t.replace(/^[^a-zA-Z0-9]+/, '').replace(/[^a-zA-Z0-9\*]+$/, ''));
        return collected.concat(terms);
    };

    const astString = JSON.stringify(ast);
    const allOperators = astString.match(/"operator":"([^(,)]+)"/g);
    const allFields = astString.match(/"field":"([^(,)]+)"/g);
    // the !left.quoted condition is not elegant***
    if (allOperators && allOperators.every(operator => operator === '"operator":"<implicit>"') && allFields && allFields.every(field => field === '"field":"<implicit>"') && !left.quoted) {
        // Only implicit operators and implicit fields: treat the whole query
        // as a list of space-separated terms.
        words = addTerm(words, q, false);
    } else {
        const allParentheses = astString.match(/"parenthesized":true/g);
        // Terms without an explicit field are always eligible. Work on a copy
        // so that the caller's options.highlightedFields is left untouched.
        let fieldsToHighlight = highlightedFields;
        if (highlightedFields !== undefined && !highlightedFields.includes('<implicit>')) {
            fieldsToHighlight = [...highlightedFields, '<implicit>'];
        }
        // When no list is given the field itself is returned on purpose: a
        // node without a `field` (undefined) is then still reported as falsy.
        const canHighlight = field => fieldsToHighlight === undefined ? field : fieldsToHighlight.includes(field);
        // not an elegant solution***
        if (!canHighlight(left.field) && operator === '<implicit>' && right && right.field === '<implicit>' && !allParentheses) {
            words = addTerm(words, q, true);
        } else {
            if (start !== 'NOT') {
                if (canHighlight(left.field)) {
                    words = addTerm(words, left.term, left.quoted);
                } else {
                    if (left.left && canHighlight(left.left.field)) {
                        words = addTerm(words, left.left.term, left.left.quoted);
                    }
                    if (left.operator !== 'NOT' && left.right && canHighlight(left.right.field)) {
                        words = addTerm(words, left.right.term, left.right.quoted);
                    }
                }
            }
            if (operator !== 'NOT' && right) {
                if (canHighlight(right.field)) {
                    words = addTerm(words, right.term, right.quoted);
                } else if ((!right.right || !canHighlight(right.right.field)) && right.left && canHighlight(right.left.field)) {
                    words = addTerm(words, right.left.term, right.left.quoted);
                }
            }
        }
    }

    // some filters may be moved up***
    words = words.filter(word => word.length && !isStopWord(word) && !['AND', 'OR', 'NOT'].includes(word));

    // Replace each wildcard term (xxx*) by the concrete word found in content.
    words = words.map(word => (word.endsWith('*') ? expandWildcard(word.slice(0, word.length - 1), content) : word));

    // highlight one word by another
    let newContent = content;
    if (words.length) {
        const highlighter = new text_annotator_1.default({ content });
        // True for any character that is not an ASCII letter or digit
        // (including the empty string returned past the end of the text).
        const isBoundaryChar = c => !c.match(/^[0-9a-zA-Z]+$/);
        words.forEach(word => {
            let res = highlighter[searchFunc](word, {
                directSearchOptions: {
                    caseSensitive: caseSensitive !== undefined && caseSensitive
                }
            });
            res = searchFunc === 'search' ? [res] : res;
            res.forEach(highlightIndex => {
                const highlight = highlighter.highlights[highlightIndex];
                // NOTE(review): a single `search` presumably reports "not
                // found" with an index that has no entry in `highlights`
                // (confirm against text-annotator); skip it instead of
                // throwing on `.loc`.
                if (highlight === undefined) {
                    return;
                }
                const loc = highlight.loc;
                const text = highlighter.stripedHTML;
                // make sure we do not highlight part of a word
                // this logic may be moved up***
                const prevCharValid = loc[0] === 0 || isBoundaryChar(text.charAt(loc[0] - 1));
                const nextCharValid = loc[1] === text.length - 1 || isBoundaryChar(text.charAt(loc[1]));
                if (prevCharValid && nextCharValid) {
                    newContent = highlighter.highlight(highlightIndex, {
                        highlightIdPattern,
                        highlightClass
                    });
                }
            });
        });
    }
    return newContent;
}
exports.highlightByQuery = highlightByQuery;