js-solr-highlighter
Version:
A JavaScript library for highlighting HTML text based on the query in the lucene/solr query syntax
234 lines (233 loc) • 8.33 kB
JavaScript
;
// Flag the CommonJS exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the three public exports as undefined (`void 0`); each one is
// assigned its real value further down, right after its definition.
exports.highlightByQuery = exports.isStopWord = exports.STOP_WORDS = void 0;
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
const text_annotator_1 = require("text-annotator");
// Lower-case words that highlightByQuery never highlights; query terms are
// compared against this list case-insensitively through isStopWord().
const STOP_WORDS = [
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'but', 'by',
    'for',
    'if', 'in', 'into', 'is', 'it',
    'no', 'not',
    'of', 'on', 'or',
    's', 'such',
    't', 'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to',
    'was', 'will', 'with',
];
exports.STOP_WORDS = STOP_WORDS;
/**
 * Tells whether a word is listed in STOP_WORDS, ignoring letter case.
 *
 * @param {string} string - Word to look up; it is lower-cased before the comparison.
 * @returns {boolean} true when the lower-cased word equals one of the STOP_WORDS entries.
 */
function isStopWord(string) {
    const lowered = string.toLowerCase();
    return STOP_WORDS.some((stopWord) => stopWord === lowered);
}
exports.isStopWord = isStopWord;
/**
 * Highlights, inside `content`, the terms found in a lucene/solr `query`.
 *
 * The query is pre-processed (invalid fields and `/` are escaped), parsed with
 * the `lucene` package, and the terms picked from the top levels of the parsed
 * tree are searched and highlighted with `text-annotator`. Stop words and the
 * bare operators AND / OR / NOT are never highlighted, and a match is only
 * highlighted when it is not part of a longer alphanumeric word.
 *
 * @param {string} query - Query in lucene/solr syntax.
 * @param {string} content - Text handed to text-annotator as `content`.
 * @param {object} [options]
 * @param {string[]} [options.validFields] - Names that are parsed as fields. If
 *   undefined, everything shaped like `x:x` is parsed as a field.
 * @param {string[]} [options.highlightedFields] - Those among the valid fields
 *   whose values are highlighted. If undefined, the values of all valid fields
 *   are highlighted. The array is not modified.
 * @param {boolean} [options.highlightAll] - When undefined or truthy the
 *   highlighter's `searchAll` is used, otherwise `search`.
 * @param {*} [options.highlightClass] - Forwarded to `highlighter.highlight`.
 * @param {*} [options.highlightIdPattern] - Forwarded to `highlighter.highlight`.
 * @param {boolean} [options.caseSensitive] - Forwarded as
 *   `directSearchOptions.caseSensitive`; `false` when undefined.
 * @returns {*} The value returned by the last `highlighter.highlight` call, or
 *   `content` itself when nothing was highlighted.
 */
function highlightByQuery(query, content, options = {}) {
    // can allow more options of text-annotator***
    const { validFields, highlightAll, highlightClass, highlightedFields, highlightIdPattern, caseSensitive, } = options;
    const searchFunc = highlightAll === undefined || highlightAll ? 'searchAll' : 'search';
    let words = [];
    const lucene = require('lucene');
    // [\+\-\!\(\)\{\}\[\]\^\"\?\:\\\&\|\'\/\s\*\~]
    // put a backslash in front of every occurrence of c in s
    const esc = (s, c) => {
        const regex = new RegExp(c, 'g');
        return s.replace(regex, (char) => {
            return '\\' + char;
        });
    };
    // remove the backslash in front of every occurrence of c in s
    const unesc = (s, c) => {
        const regex = new RegExp('\\\\([' + c + '])', 'g');
        return s.replace(regex, (match, char) => {
            return char;
        });
    };
    // escape invalid fields
    let q = query;
    // [original, rewritten] pairs for valid fields whose value has an unbalanced "
    const fieldVals = [];
    // field:value strings whose field is not valid; their ':' gets escaped
    const fieldVals2 = [];
    // /([^:\s]+):([^:\s]+)/g
    // deal with cases like xxx:xxx, xxx: xxx
    const regex = /([^(\s]+):\s?([^\s)"]+)/g;
    let res;
    while ((res = regex.exec(q)) !== null) {
        const field = res[1];
        const fieldVal = res[0];
        if (validFields !== undefined && !validFields.includes(field)) {
            fieldVals2.push(fieldVal);
        }
    }
    // /([a-zA-Z]+)(\s+):(\s+)([a-zA-Z]+)/g
    // deal with cases like xxx:"xxx", xxx:"xxx
    const regex2 = /([^\s(]+):\s?("[^"]+"?[^)])/g;
    while ((res = regex2.exec(q)) !== null) {
        const field = res[1];
        const fieldVal = res[0];
        if (validFields === undefined || validFields.includes(field)) {
            // remove invalid "
            if (res[2].startsWith('"') && !res[2].endsWith('"')) {
                fieldVals.push([fieldVal, res[1] + ':' + res[2].substring(1)]);
            }
            else if (!res[2].startsWith('"') && res[2].endsWith('"')) {
                // stored as a pair, like the branch above, so the replacement loop below can use it
                fieldVals.push([fieldVal, res[1] + ':' + res[2].substring(0, res[2].length - 1)]);
            }
        }
        else {
            fieldVals2.push(fieldVal);
        }
    }
    // replacer functions keep '$' sequences in the replacement text literal
    fieldVals.forEach(([from, to]) => {
        q = q.replace(from, () => to);
    });
    fieldVals2.forEach((fv) => {
        q = q.replace(fv, () => esc(fv, ':'));
    });
    q = esc(q, '/');
    // parse the query
    const ast = lucene.parse(q);
    // add terms to be highlighted
    const { start, left, right, operator } = ast;
    if (!left) {
        // nothing was parsed (e.g. an empty query), so there is nothing to highlight
        return content;
    }
    // returns a new list made of `list` plus the word(s) contained in `term`
    const addTerm = (list, term, quoted) => {
        term = unesc(term, ':');
        term = unesc(term, '/');
        // if quoted, should change nothing inside
        if (quoted) {
            return list.concat([term]);
        }
        // remove any char that is neither letter nor number at the start and end of each term
        // (a trailing * is kept so that wildcard terms can be expanded below)
        const terms = term
            .split(/\s/)
            .map((t) => t.replace(/^[^a-zA-Z0-9]+/, '').replace(/[^a-zA-Z0-9\*]+$/, ''));
        return list.concat(terms);
    };
    const astString = JSON.stringify(ast);
    const allOperators = astString.match(/"operator":"([^(,)]+)"/g);
    const allFields = astString.match(/"field":"([^(,)]+)"/g);
    // the !left.quoted condition is not elegant***
    if (allOperators &&
        allOperators.every((op) => op === '"operator":"<implicit>"') &&
        allFields &&
        allFields.every((field) => field === '"field":"<implicit>"') &&
        !left.quoted) {
        // only implicit operators and implicit fields: treat the whole query as plain words
        words = addTerm(words, q, false);
    }
    else {
        const allParentheses = astString.match(/"parenthesized":true/g);
        // terms without an explicit field are always eligible; work on a copy so
        // that the caller's highlightedFields array is left untouched
        const fieldsToHighlight = highlightedFields === undefined || highlightedFields.includes('<implicit>')
            ? highlightedFields
            : highlightedFields.concat(['<implicit>']);
        const canHighlight = (field) => fieldsToHighlight === undefined
            ? field
            : fieldsToHighlight.includes(field);
        // not an elegant solution***
        if (!canHighlight(left.field) &&
            operator === '<implicit>' &&
            right &&
            right.field === '<implicit>' &&
            !allParentheses) {
            words = addTerm(words, q, true);
        }
        else {
            if (start !== 'NOT') {
                if (canHighlight(left.field)) {
                    words = addTerm(words, left.term, left.quoted);
                }
                else {
                    if (left.left && canHighlight(left.left.field)) {
                        words = addTerm(words, left.left.term, left.left.quoted);
                    }
                    if (left.operator !== 'NOT' &&
                        left.right &&
                        canHighlight(left.right.field)) {
                        words = addTerm(words, left.right.term, left.right.quoted);
                    }
                }
            }
            if (operator !== 'NOT' && right) {
                if (canHighlight(right.field)) {
                    words = addTerm(words, right.term, right.quoted);
                }
                else if ((!right.right || !canHighlight(right.right.field)) &&
                    right.left &&
                    canHighlight(right.left.field)) {
                    words = addTerm(words, right.left.term, right.left.quoted);
                }
            }
        }
    }
    // some filters may be moved up***
    words = words.filter((word) => word.length && !isStopWord(word) && !['AND', 'OR', 'NOT'].includes(word));
    // expand every trailing-* term to the first word of content that starts with its prefix
    const expandedWords = [];
    words.forEach((word) => {
        if (!word.endsWith('*')) {
            expandedWords.push(word);
            return;
        }
        const prefix = word.slice(0, word.length - 1);
        if (!prefix.length) {
            // a bare * has no prefix to look for
            return;
        }
        const prefixStart = content.indexOf(prefix);
        if (prefixStart === -1) {
            // prefix is not in content (with this exact casing): search for the prefix itself
            expandedWords.push(prefix);
            return;
        }
        // extend up to the next space or the end of content, whichever comes first
        let wordEnd = prefixStart + prefix.length;
        while (wordEnd < content.length && content[wordEnd] !== ' ') {
            wordEnd++;
        }
        expandedWords.push(content.slice(prefixStart, wordEnd));
    });
    words = expandedWords;
    // highlight one word by another
    let newContent = content;
    if (words.length) {
        const highlighter = new text_annotator_1.default({
            content,
        });
        // true for anything that is not a letter or digit, including '' (no character at all)
        const isBoundary = (c) => !/^[0-9a-zA-Z]+$/.test(c);
        words.forEach((word) => {
            const found = highlighter[searchFunc](word, {
                directSearchOptions: {
                    caseSensitive: caseSensitive !== undefined && caseSensitive,
                },
            });
            const highlightIndexes = searchFunc === 'search' ? [found] : found;
            highlightIndexes.forEach((highlightIndex) => {
                const highlight = highlighter.highlights[highlightIndex];
                if (!highlight) {
                    // no highlight is registered under this index (e.g. `search` found no match)
                    return;
                }
                const loc = highlight.loc;
                const text = highlighter.stripedHTML;
                // make sure we do not highlight part of a word
                // this logic may be moved up***
                // loc[1] is used as the index right after the match; charAt yields ''
                // past the end of text, which counts as a boundary
                const prevCharValid = loc[0] === 0 || isBoundary(text.charAt(loc[0] - 1));
                const nextCharValid = isBoundary(text.charAt(loc[1]));
                if (prevCharValid && nextCharValid) {
                    newContent = highlighter.highlight(highlightIndex, {
                        highlightIdPattern,
                        highlightClass,
                    });
                }
            });
        });
    }
    return newContent;
}
exports.highlightByQuery = highlightByQuery;