UNPKG

scv-bilara

Version:

SuttaCentral bilara-data library

1,356 lines (1,298 loc) 40.8 kB
(function (exports) { const fs = require("fs"); const path = require("path"); const { logger } = require("log-instance"); const { exec } = require("child_process"); const util = require("util"); const execPromise = util.promisify(exec); const { ScApi } = require("suttacentral-api"); const { MerkleJson } = require("merkle-json"); const { Memoizer, Files } = require("memo-again"); const FuzzyWordSet = require("./fuzzy-word-set"); const { BilaraPath, SuttaRef, AuthorsV2 } = require("scv-esm"); const MLDoc = require("./ml-doc"); const Pali = require("./pali"); const Unicode = require("./unicode"); const English = require("./english"); const BilaraData = require("./bilara-data"); const { SuttaCentralId } = require("scv-esm"); const BILARA_PATH = path.join(Files.LOCAL_DIR, "bilara-data"); const TRANSLATION_PATH = path.join(BILARA_PATH, "translation"); const MAXBUFFER = 10 * 1024 * 1024; const TCMAP = require("./seeker-tcmap.json"); const MAX_DOC = 50; // Hard limit for reasonable use of resources const { DBG, DBG_GREP, } = require("./defines.cjs"); var wscount = 0; class Seeker { constructor(opts = {}) { const msg = "Seeker.ctor()"; const dbg = DBG.SEEKER; (opts.logger || logger).logInstance(this, opts); let bilaraData = (this.bilaraData = opts.bilaraData || new BilaraData(opts)); let root = (this.root = opts.root || bilaraData.root || BILARA_PATH); dbg && console.log(msg, '[1]root', root, this.bilaraData.root); this.includeUnpublished = opts.includeUnpublished || this.bilaraData.includeUnpublished; this.lang = opts.lang || "en"; this.author = opts.author; this.languages = opts.languages || ["pli", "en"]; this.scApi = opts.scApi || new ScApi(); this.unicode = opts.unicode || new Unicode(); this.paliWords = opts.paliWords; this.patPrimary = opts.patPrimary || "/sutta/"; this.mj = new MerkleJson(); this.exampleCache = opts.exampleCache; this.memoizer = opts.memoizer || new Memoizer({ writeMem: false, // avoid monotonic increasing memory usage writeFile: opts.writeFile == null ? true : opts.writeFile, // only cache examples! readFile: opts.readFile, serialize: Seeker.serialize, deserialize: Seeker.deserialize, storeName: opts.memoStore, logger: this, }); this.enWords = opts.enWords; this.matchColor = opts.matchColor == null ? 121 : opts.matchColor; this.matchHighlight = opts.matchHighlight === undefined ? `\u001b[38;5;${this.matchColor}m$&\u001b[0m` : ""; this.matchWordEnd = opts.matchWordEnd; this.maxResults = opts.maxResults == null ? 1000 : opts.maxResults; this.maxDoc = opts.maxDoc == null ? MAX_DOC : opts.maxDoc; this.minLang = opts.minLang || 2; this.trilingual = opts.trilingual || false; } static reWord(lang = this.lang) { if (lang === "jpn") { return ""; } return "\\b"; } static sanitizePattern(pattern) { if (!pattern) { throw new Error("search pattern is required"); } const MAX_PATTERN = 1024; var excess = pattern.length - MAX_PATTERN; if (excess > 0) { throw new Error(`Search text too long by ${excess} characters.`); } // replace quotes (code injection on grep argument) pattern = pattern.replace(/["']/g, "."); // eliminate tabs, newlines and carriage returns pattern = pattern.replace(/\s/g, " "); // remove control characters pattern = pattern.replace(/[\u0000-\u001f\u007f]+/g, ""); // must be valid new RegExp(pattern); return pattern; } static normalizePattern(pattern) { // normalize white space to space pattern = pattern.trim().replace(/[\s]+/g, " ").toLowerCase(); return pattern; } get initialized() { return ( this.paliWords != null && this.enWords != null && this.bilaraData.initialized ); } initialize(msg = "") { var that = this; var { paliWords, enWords } = this; if (paliWords && enWords) { return Promise.resolve(that); } return new Promise((resolve, reject) => { (async function () { try { var p_pali = !paliWords && Pali.wordSet(); var p_en = !enWords && English.wordSet(); p_pali && (paliWords = await p_pali); p_en && (enWords = await p_en); that.paliWords = paliWords; that.enWords = enWords; await that.bilaraData.initialize(); //that.log(`Seeker.initialize resolve ${msg}`); resolve(that); } catch (e) { reject(e); } })(); }); } static buildExampleCache(examples) { let exampleCache = {}; let keys = Object.keys(examples).filter( (k) => k !== "authors" && k !== "comment" ); keys.map((lang) => { let eg = examples[lang]; return eg.reduce((a, e) => { let eLower = e.toLowerCase(); a[eLower] = 1; let eClean = Seeker.normalizePattern(Seeker.sanitizePattern(e)); a[eClean] = 1; return a; }, exampleCache); }); return exampleCache; } isExample(pattern = "") { var examples = this.bilaraData.examples; let { exampleCache } = this; if (!exampleCache) { this.exampleCache = exampleCache = Seeker.buildExampleCache(examples); } return !!exampleCache[pattern.toLowerCase()]; } patternLanguage(pattern, lang = this.lang) { const msg = "Seeker.patternLanguage() "; this.validate(); if (SuttaCentralId.test(pattern)) { var langs = SuttaCentralId.languages(pattern); return langs.length === 0 || langs.indexOf(lang) >= 0 ? lang : langs[0]; } var keywords = pattern.split(/ +/); let searchLang = keywords.reduce((a, k) => { if (this.enWords.contains(k)) { (!a || a === "pli") && (a = "en"); } else if (this.paliWords.contains(k)) { a = a || "pli"; } else { a = lang; } return a; }, null); return searchLang || lang; } validate() { if (!this.initialized) { throw new Error(`initialize() is required`); } return this; } grepComparator(a, b) { var cmp = b.count - a.count; if (cmp === 0) { cmp = a.fpath.localeCompare(b.fpath); } return cmp; } patternKeywords(pattern) { // + was inserted by normalizePattern() return pattern.split(/ \+?/); } keywordPattern(keyword, lang) { var pat = `\\b${keyword}`; if (this.paliWords.contains(keyword)) { var romKeyword = this.unicode.romanize(keyword); pat = keyword === romKeyword ? `\\b${Pali.romanizePattern(keyword)}` : keyword; } if (this.matchWordEnd === true) { pat += "\\b"; } return pat; } tipitakaRegExp(tc = "sutta") { var tcParts = tc.toLowerCase().split(","); //console.log({tc}); var pats = tcParts.reduce((a, p) => { let pat = TCMAP[p]; if (pat == null) { throw new Error(`tipitakaRegExp(): invalid category:${p}`); } a.push(pat); return a; }, []); let re; if (pats.length) { re = new RegExp(`(${pats.join("|")})`, "iu"); } re && this.debug(`tiptakaCategories`, re.toString()); return re; } grep(opts = {}) { const msg = "Seeker.grep() "; const dbg = DBG_GREP; var { author, pattern, maxResults, lang, language, // DEPRECATED searchMetadata, // TODO tipitakaCategories, patPrimary, } = opts; if (!author) { let emsg = `${msg} author is required`; console.trace(emsg); throw new Error(emsg); } var reTipCat = this.tipitakaRegExp(tipitakaCategories); lang = lang || language || this.lang; var root = this.root.replace(`${Files.APP_DIR}/`, ""); var slowOpts = { author, pattern, maxResults, lang, language, // DEPRECATED searchMetadata, // TODO tipitakaCategories, reTipCat, root, patPrimary, }; dbg && console.log(msg, slowOpts); var msStart = Date.now(); var result; var { memoizer, grepMemo } = this; if (grepMemo == null) { this.grepMemo = grepMemo = memoizer.memoize(Seeker.slowGrep, Seeker); } result = grepMemo(slowOpts); var msElapsed = Date.now() - msStart; // about 20ms return result; } static orderPrimary(lines, patPrimary) { let rePrimary = new RegExp(patPrimary, "ui"); let { primary, secondary } = lines.reduce( (a, line) => { if (rePrimary.test(line)) { a.primary.push(line); } else { a.secondary.push(line); } return a; }, { primary: [], secondary: [] } ); return primary.concat(secondary); } static async slowGrep(opts) { const msg = "Seeker.slowGrep "; const dbg = DBG_GREP; try { var { author, pattern, maxResults, lang, language, // DEPRECATED searchMetadata, // TODO reTipCat, root, patPrimary, } = opts; if (!root.startsWith("/")) { root = `${Files.APP_DIR}/${root}`; } logger.info(msg, { pattern, lang, root }); if (searchMetadata) { return Promise.reject(new Error(`searchMetadata not supported`)); } var grex = pattern; var cwd = lang === "pli" ? path.join(root, "root/pli") : path.join(root, `translation/${lang}`); var rgGlob2 = [ `-g='*-${lang}-${author}.json'`, ]; var rgGlob1 = [ `-g='!atthakatha' `, // exclude pli/vri `-g='!_*' `, // top-level JSON files `-g '!name'`, // exclude name `-g '!blurb'`, // exclude blurbs `-g '!ea'`, // exclude Chinese `-g '!ka'`, // exclude Chinese `-g '!sa'`, // exclude Chinese `-g '!ma'`, // exclude Chinese ]; var rgGlob = rgGlob2; var cmd = [ `rg -c -i -e '${grex}' `, ...rgGlob, `./`, // Must be explicit for Node // (https://github.com/BurntSushi/ripgrep/issues/2227) `|sort -k 2rn -k 1rd -t ':'`, ].join(" "); maxResults && (cmd += `|head -${maxResults}`); var pathPrefix = cwd.replace(root, "").replace(/^\/?/, ""); var cwdMsg = cwd.replace(`${root}/`, ""); logger.info(msg, `(${cwdMsg}) ${cmd}`); var execOpts = { cwd, shell: "/bin/bash", maxBuffer: MAXBUFFER, }; let { stdout, stderr } = await execPromise(cmd, execOpts); let lines = (stdout && stdout.trim().split("\n")) || []; let raw = Seeker.orderPrimary(lines, patPrimary); let rawTipCat = reTipCat ? raw.filter((f) => reTipCat.test(f)) : raw; let paths = rawTipCat.map((f) => path.join(pathPrefix, f)); dbg && console.log(msg, {cmd, execOpts, lines, paths}); return paths; } catch (e) { logger.warn(`slowGrep()`, JSON.stringify(opts), e.message, cmd); throw e; } } async phraseSearch(args) { const msg = "Seeker.phraseSearch() "; this.validate(); var { author, searchLang, searchAuthor, lang, language, pattern, maxResults, tipitakaCategories, patPrimary, } = args; lang = lang || language || this.lang; patPrimary = patPrimary || this.patPrimary; maxResults = maxResults == null ? this.maxResults : maxResults; if (pattern == null) { throw new Error(`${msg} requires pattern`); } lang = searchLang == null ? this.patternLanguage(pattern, lang) : searchLang; if (lang === "pli") { var romPat = this.unicode.romanize(pattern); var pat = romPat === pattern ? `\\b${Pali.romanizePattern(pattern)}` : pattern; } else { var pat = `${Seeker.reWord(lang)}${pattern}`; } author = author || AuthorsV2.langAuthor(lang, { category:tipitakaCategories, }); this.info(msg, `(${pat},${lang},${author})`); var grepArgs = Object.assign({}, args, { author: searchAuthor || author, pattern: pat, lang, maxResults, tipitakaCategories, patPrimary, }); var lines = await this.grep(grepArgs); return { method: "phrase", lang, pattern: pat, lines, }; } async keywordSearch(args) { const msg = "Seeker.keywordSearch() "; try { var { pattern, author, maxResults, lang, searchLang, language, // DEPRECATED searchMetadata, comparator, tipitakaCategories, patPrimary, } = args; comparator = comparator || this.grepComparator; patPrimary = patPrimary || this.patPrimary; lang = lang || language || this.lang; maxResults = maxResults == null ? this.maxResults : maxResults; var keywords = this.patternKeywords(pattern); lang = searchLang == null ? this.patternLanguage(pattern, lang || language) : searchLang; var wordArgs = Object.assign({}, args, { maxResults: 0, // don't clip prematurely lang, patPrimary, }); this.info(msg, `(${keywords}) lang:${lang}`); var mrgOut = []; var mrgIn = []; for (var i = 0; i < keywords.length; i++) { var keyword = keywords[i]; wordArgs.pattern = this.keywordPattern(keyword, lang); //console.log(msg, wordArgs); var wordlines = await this.grep(wordArgs); wordlines.sort(); // sort for merging path mrgOut = []; for (var iw = 0; iw < wordlines.length; iw++) { var lineparts = wordlines[iw].split(":"); var fpath = lineparts[0]; var count = Number(lineparts[1]); if (i === 0) { mrgOut.push({ fpath, count, }); } else if (mrgIn.length) { var cmp = mrgIn[0].fpath.localeCompare(fpath); if (cmp === 0) { var newItem = { fpath, count: Math.min(mrgIn[0].count, count), }; mrgOut.push(newItem); mrgIn.shift(); } else if (cmp < 0) { mrgIn.shift(); // discard left if (mrgIn.length === 0) { break; } iw--; // re-compare } else { // discard right } } } mrgIn = mrgOut; } var lines = mrgOut.sort(comparator) .map((v) => `${v.fpath}:${v.count}`); lines = Seeker.orderPrimary(lines, patPrimary); if (maxResults) { lines = lines.slice(0, maxResults); } return { method: "keywords", resultPattern: keywords .map((k) => this.keywordPattern(k, lang)) .join("|"), lang, maxResults, lines, }; } catch (e) { this.warn(msg, JSON.stringify(args), e.message); throw e; } } findArgs(args) { const msg = "Seeker.findArgs() "; const dbg = DBG.SEEKER || DBG.FINDARGS; const dbgv = DBG.VERBOSE; if (!(args instanceof Array)) { throw new Error("findArgs(?ARRAY-OF-ARGS?)"); } if (typeof args[0] === "string") { var opts = { pattern: args[0], maxResults: args[1], }; } else { var opts = args[0]; } var { author, docAuthor, docLang = opts.lang, includeUnpublished = this.includeUnpublished, lang, langAuthor, language, // DEPRECATED languages, matchHighlight, maxDoc, // maximum number of returned documents maxResults, // maximum number of grep files minLang, // minimum number of languages pattern: rawPattern, refAuthor, refLang, searchLang, searchAuthor, showMatchesOnly, sortLines, tipitakaCategories, trilingual = this.trilingual, types, } = opts; if (rawPattern == null) { throw new Error(`pattern is required`); } // STEP 1. Transform Pali diacriticals rawPattern = rawPattern.replace(/ṃ/gi, "ṁ"); // STEP 2. extract embeddable options var argv = rawPattern.split(" "); var pattern = ""; for (var i = 0; i < argv.length; i++) { var arg = argv[i]; if (arg === "-d" || arg === "--maxDoc") { let n = Number(argv[++i]); if (!isNaN(n) && 0 < n) { maxDoc = n; } } else if (arg === "-mr" || arg === "--maxResults") { let n = Number(argv[++i]); if (!isNaN(n) && 0 < n && n < 4000) { maxResults = n; } } else if (arg.startsWith("-tc:")) { tipitakaCategories = arg.substring("-tc:".length); } else if (arg === "-ml1") { minLang = 1; } else if (arg === "-ml2") { minLang = 2; } else if (arg === "-ml3") { minLang = 3; } else if (arg === "-ml" || arg === "--minLang") { let n = Number(argv[++i]); if (!isNaN(n) && 0 < n && n <= 3) { minLang = n; } } else if (arg === "-da" || arg === "--doc-author") { docAuthor = argv[++i]; trilingual = true; } else if (arg === "-dl" || arg === "--doc-lang") { docLang = argv[++i]; lang = docLang; // override legacy trilingual = true; } else if (arg === "-ra" || arg === "--ref-author") { refAuthor = argv[++i]; trilingual = true; } else if (arg === "-rl" || arg === "--ref-lang") { refLang = argv[++i]; trilingual = true; } else if (arg === "-l" || arg === "--lang") { if ((arg = argv[++i])) { lang = arg; docLang = arg; } } else if (arg === "-sl" || arg === "--searchLang") { (arg = argv[++i]) && (searchLang = arg); } else { pattern = pattern ? `${pattern} ${arg}` : arg; } } // STEP 3. Assign default values if (trilingual) { author = author || docAuthor; // override legacy } if (refLang == null) { let info = AuthorsV2.authorInfo(refAuthor); refLang = info && info.lang || 'en'; } if (refAuthor == null) { refAuthor = AuthorsV2.langAuthor(refLang); } lang = lang || language || docLang || this.lang; langAuthor = langAuthor || author || AuthorsV2.langAuthor(lang, {tipitakaCategories}); if (searchLang == null) { switch (docLang) { case undefined: case null: case 'de': case 'en': searchLang = this.patternLanguage(pattern, lang) dbgv && console.log(msg, '[1]searchLang', searchLang); break; default: searchLang = docLang; dbgv && console.log(msg, '[2]searchLangDoc', searchLang); break; } } minLang = minLang || 2; pattern = Seeker.sanitizePattern(pattern); pattern = Seeker.normalizePattern(pattern); showMatchesOnly == null && (showMatchesOnly = true); languages = languages || this.languages.slice() || []; lang && !languages.includes(lang) && languages.push(lang); refLang === searchLang && !languages.includes('ref') && languages.push('ref'); let isSuttaRef = SuttaCentralId.test(pattern); maxResults = Number(maxResults == null ? this.maxResults : maxResults); if (isNaN(maxResults)) { throw new Error("maxResults must be a number"); } maxDoc = isSuttaRef ? MAX_DOC : Number(maxDoc == null ? this.maxDoc : maxDoc); if (isNaN(maxDoc)) { throw new Error("maxDoc must be a number"); } matchHighlight == null && (matchHighlight = this.matchHighlight); if (!author) { author = searchLang && AuthorsV2.langAuthor(searchLang, { category: tipitakaCategories, }); } if (!author) { author = this.author; } if (trilingual) { if (isSuttaRef) { let [ patSuid, patLang, patAuthor ] = pattern.split('/'); if (patLang) { author = docAuthor; patAuthor = patAuthor || AuthorsV2.langAuthor(patLang); docAuthor = patAuthor || docAuthor; author = docAuthor; docLang = patLang; } docAuthor = docAuthor || patAuthor; } docAuthor = docAuthor || AuthorsV2.langAuthor(docLang) || 'sujato'; if (docLang == null) { let info = AuthorsV2.authorInfo(docAuthor); docLang = info && info.lang; } } types = types || ["root", "translation"]; //console.log(msg, {docLang, docAuthor, isSuttaRef}); if (docLang == null) { if (isSuttaRef) { let pats = pattern.split(','); let [ segref, patLang, patAuthor ] = pats[0].split("/"); patLang && (docLang = patLang); docLang = patLang || lang; patAuthor && (docAuthor = patAuthor); } else { docLang = lang; } } docAuthor = docAuthor || AuthorsV2.langAuthor(docLang);; searchAuthor = searchAuthor || docLang === searchLang && docAuthor || refLang === searchLang && refAuthor || AuthorsV2.langAuthor(searchLang) || docAuthor; let result = { author, docLang, docAuthor, includeUnpublished, lang, langAuthor, languages, matchHighlight, maxDoc, maxResults, minLang, pattern, refAuthor, refLang, searchLang, searchAuthor, showMatchesOnly, sortLines, tipitakaCategories, trilingual, types, }; dbg && console.log(msg, '[4]=>', result); return result; } clearMemo(name) { var cache = this.memoizer.cache; if (name === "find") { return cache.clearVolume(`Seeker.callSlowFind`); } else if (name === "grep") { return cache.clearVolume(`Seeker.slowGrep`); } } find(...args) { const msg = "Seeker.find() "; let dbg = DBG.FIND; var { findMemo, memoizer } = this; var findArgs = this.findArgs(args); var that = this; var callSlowFind = (args) => { return that.slowFind.call(that, args); }; var msStart = Date.now(); //var pattern = typeof args === 'string' //? args //: args[0].pattern; let { pattern, trilingual } = findArgs; if (this.isExample(pattern)) { dbg && console.log(msg, '[1]example', { trilingual, pattern, }); if (findMemo == null) { that.findMemo = findMemo = memoizer.memoize(callSlowFind, Seeker); } var promise = findMemo(findArgs); this.debug(`${msg} example:${pattern}`); } else { this.info(`${msg} non-example:${pattern}`); dbg && console.log(msg, '[2]!example', { trilingual, pattern, }); var promise = callSlowFind(findArgs); } return promise; } slowFindId(opts={}) { const msg = "Seeker.slowFindId() "; const dbg = DBG.SEEKER || DBG.SLOWFIND || DBG.SLOWFINDID; const dbgv = DBG.VERBOSE && dbg; let { lang='en', languages=['pli','en'], maxResults, pattern, author, docLang, docAuthor, refLang, refAuthor, trilingual, } = opts; var bd = this.bilaraData; var examples = bd.examples; var resultPattern = pattern; let method, uids, suttaRefs; if (!SuttaCentralId.test(pattern)) { dbg && console.log(msg, '[1]!sutta-id', ({pattern})); this.debug(msg, 'not sutta id', {pattern}); return undefined; } maxResults = maxResults || this.maxResults; if (pattern.indexOf("/") < 0) { pattern = pattern .split(",") .map((p) => `${p}/${lang}`) .join(","); } pattern = pattern.replace(/:[^/,]*/g, ''); // remove segment refs if (trilingual) { // trilingual always uses pli, so remove language bias // triglingual relies on docAuthor, docLang, refLang, refAuthor pattern = pattern.replace(/\/[-a-z0-9.]*/ig, ''); } let res = bd.sutta_uidSearch(pattern, maxResults); dbgv && console.log(msg, '[2]sutta_uidSearch', res); method = res.method; uids = res.uids; suttaRefs = res.suttaRefs; res.lang && (lang = res.lang); if (!languages.includes(lang)) { languages = [...languages.filter((l) => l !== "en"), lang]; } let result = { lang, maxResults, pattern, method, uids, suttaRefs, languages, docLang, docAuthor, refLang, refAuthor, trilingual, }; dbg && console.log(msg, '=>', result); return result; } async slowFind(findArgs) { const msg = "Seeker.slowFind() "; const dbg = DBG.SLOWFIND; try { var msStart = Date.now(); let result; if (findArgs.trilingual) { dbg && console.log(msg, "[1]slowFind", findArgs.pattern); result = this.slowFindTrilingual(findArgs) } else { dbg && console.log(msg, "[2]slowFindLegacy", findArgs.pattern); result = this.slowFindLegacy(findArgs); } var msElapsed = Date.now() - msStart; let secs = `${(msElapsed/1000).toFixed(3)}s`; logger.info(msg, findArgs.pattern, secs); return result; } catch (e) { this.warn(msg, JSON.stringify(findArgs), e.message); throw e; } } async slowFindLegacy(findArgs) { const msg = "Seeker.slowFindLegacy() "; const dbg = DBG.SEEKER || DBG.SLOWFIND; var msStart = Date.now(); var { author, includeUnpublished, docLang, docAuthor, lang, languages, matchHighlight, maxDoc, maxResults, minLang=2, pattern, refAuthor = "sujato", refLang, searchLang, showMatchesOnly, sortLines, tipitakaCategories, trilingual, types, } = findArgs; var bd = this.bilaraData; var examples = bd.examples; var resultPattern = pattern; var scoreDoc = true; let method, uids, suttaRefs; let isSuidPattern = SuttaCentralId.test(pattern); if (examples[lang] && examples[lang].indexOf(pattern) >= 0) { searchLang = lang; } if (isSuidPattern) { dbg && console.log(msg, '[1]slowFindId', pattern); let res = this.slowFindId({ author, lang, languages, maxResults, pattern, docLang, docAuthor, refLang, refAuthor, trilingual, }); lang = res.lang; maxResults = res.maxResults; method = res.method; uids = res.uids; suttaRefs = res.suttaRefs; languages = res.languages; scoreDoc = false; } else { dbg && console.log(msg, '[2]slowFindPhrase', pattern); let res = await this.slowFindPhrase({ author, lang, maxResults, pattern, searchLang, showMatchesOnly, sortLines, tipitakaCategories, }); method = res.method; resultPattern = res.resultPattern; sortLines = res.sortLines; suttaRefs = res.suttaRefs; } var mlDocs = []; var segsMatched = 0; var bilaraPaths = []; var matchingRefs = []; var msStart = Date.now(); for (var i = 0; i < suttaRefs.length; i++) { let suttaRef = suttaRefs[i]; let [suid, srLang, authorId] = suttaRef.split("/"); author = authorId || author; let suttaInfo = bd.suttaInfo(suttaRef); if (!suttaInfo) { this.info(`skipping ${suttaRef}`); continue; } let isBilDoc = bd.isBilaraDoc({ suid, lang: srLang || docLang || lang, author: author, includeUnpublished, }); let mld; if (isBilDoc) { let mldOpts = { suid, languages, lang, types, }; if (method==="sutta_uid" && author!=null && author!=="ms") { mldOpts.author = author; } mld = await bd.loadMLDoc(mldOpts); var mldBilaraPaths = mld.bilaraPaths.sort(); if (mldBilaraPaths.length < minLang) { //console.log(msg, `skipping ${mld.suid} ${mld.title}`); this.debug( `skipping ${mld.suid} minLang`, `${mldBilaraPaths.length}<${minLang} [${languages}]` ); continue; } bilaraPaths = [...bilaraPaths, ...mldBilaraPaths]; var resFilter = mld.filterSegments({ pattern, resultPattern, languages: [searchLang], showMatchesOnly, method, }); mld.segsMatched = resFilter.matched; segsMatched += mld.segsMatched; if (matchHighlight) { mld.highlightMatch(resultPattern, matchHighlight); } if (resFilter.matched === 0) { this.info(`Ignoring ${mld.suid} ${pattern}`); } else if (mld.bilaraPaths.length >= minLang) { let segIds = Object.keys(mld.segMap); if (segIds.length) { mlDocs.push(mld); matchingRefs.push(suttaRef); } else { this.info(`skipping ${mld.suid} segments:0`); } } else { this.info(`skipping ${mld.suid} minLang:${minLang}`); } } else { let isBilDocUnpub = bd.isBilaraDoc({ suid, lang: refLang || lang, author, includeUnpublished: true, }); if (isBilDocUnpub) { this.debug( `slowFind() -> unpublished:`, `${suid}/${refLang || lang}/${author}` ); } else { this.warn(`NOT SUPPORTED: legacy ${suid}`); } } } scoreDoc && mlDocs.sort(MLDoc.compare); mlDocs = mlDocs.slice(0, maxDoc); var result = { author, lang, // embeddable option searchLang, // embeddable option minLang, // embeddable option maxDoc, // embeddable option maxResults, // embeddable option pattern, method, resultPattern, segsMatched, bilaraPaths, suttaRefs: matchingRefs, mlDocs, refLang, refAuthor, docLang, docAuthor, }; return result; } async slowFindTrilingual(findArgs) { const msg = "Seeker.slowFindTrilingual()"; const dbg = DBG.SEEKER || DBG.SLOWFIND || DBG.SLOWFINDID; const dbgv = DBG.VERBOSE && dbg; var msStart = Date.now(); var { author, includeUnpublished, docLang, docAuthor, lang, languages, matchHighlight, maxDoc, maxResults, minLang=2, pattern, refAuthor = "sujato", refLang, searchLang, showMatchesOnly, sortLines, tipitakaCategories, trilingual, types, } = findArgs; var bd = this.bilaraData; var examples = bd.examples; var resultPattern = pattern; var scoreDoc = true; let method, uids, suttaRefs; let isSuidPattern = SuttaCentralId.test(pattern); if (examples[lang] && examples[lang].indexOf(pattern) >= 0) { searchLang = lang; } if (isSuidPattern) { dbg && console.log(msg, '[1]slowFindId'); let res = this.slowFindId({ author, lang, languages, maxResults, pattern, docLang, docAuthor, refLang, refAuthor, trilingual, }); lang = res.lang; maxResults = res.maxResults; method = res.method; uids = res.uids; suttaRefs = res.suttaRefs; languages = res.languages; scoreDoc = false; } else { dbg && console.log(msg, '[2]slowFindPhrase'); let res = await this.slowFindPhrase({ author, lang, maxResults, pattern, searchLang, showMatchesOnly, sortLines, tipitakaCategories, }); method = res.method; resultPattern = res.resultPattern; sortLines = res.sortLines; suttaRefs = res.suttaRefs; } var mlDocs = []; var segsMatched = 0; var bilaraPaths = []; var matchingRefs = []; var msStart = Date.now(); dbg && console.log(msg, '[3]suttaRefs', suttaRefs); for (var i = 0; i < suttaRefs.length; i++) { let suttaRef = suttaRefs[i]; let [suid, srLang, authorId] = suttaRef.split("/"); author = authorId || author; let suttaInfo = bd.suttaInfo(suttaRef); if (!suttaInfo) { this.info(`skipping ${suttaRef}`); continue; } let isBilDoc = bd.isBilaraDoc({ suid, lang: 'pli', author: 'ms', includeUnpublished, }); let mld; if (isBilDoc) { let mldOpts = { refLang, refAuthor, docLang, docAuthor, trilingual, } mld = await bd.trilingualDoc(suttaRef, mldOpts); dbg && console.log(msg, '[3]trilingualDoc', suttaRef, { score: mld?.score, langSegs: mld?.langSegs, }); var mldBilaraPaths = mld.bilaraPaths.sort(); if (mldBilaraPaths.length < minLang) { dbg && console.log(msg, '[4]!minLang', mld.suid, `${mldBilaraPaths.length}<${minLang} [${languages}]` ); continue; } bilaraPaths = [...bilaraPaths, ...mldBilaraPaths]; let filterLangs = [searchLang]; if (searchLang === refLang) { filterLangs.push('ref'); } let filterOpts = { pattern, resultPattern, languages: filterLangs, showMatchesOnly, method, } var resFilter = mld.filterSegments(filterOpts); dbg && console.log(msg, '[4]filterSegments', resFilter.matched, filterOpts); mld.segsMatched = resFilter.matched; segsMatched += mld.segsMatched; if (matchHighlight) { mld.highlightMatch(resultPattern, matchHighlight); } if (resFilter.matched === 0) { dbg && console.log(msg, '[5]ignoring', suttaRef, resFilter); this.info(`Ignoring ${mld.suid} ${pattern}`); } else if (mld.bilaraPaths.length >= minLang) { let segIds = Object.keys(mld.segMap); if (segIds.length) { dbg && console.log(msg, '[6]mlDocs', suttaRef); mlDocs.push(mld); matchingRefs.push(suttaRef); } else { dbg && console.log(msg, '[7]skipping', suttaRef); this.info(`skipping ${mld.suid} segments:0`); } } else { dbg && console.log(msg, '[8]ignoring', suttaRef); this.info(`skipping ${mld.suid} minLang:${minLang}`); } } else { let isBilDocUnpub = bd.isBilaraDoc({ suid, lang: refLang || lang, author, includeUnpublished: true, }); dbg && console.log(msg, '[9]isBilDocUnpub', isBilDocUnpub); if (isBilDocUnpub) { dbg && console.log(msg, `slowFind() -> unpublished:`, `${suid}/${refLang || lang}/${author}` ); } } } scoreDoc && mlDocs.sort(MLDoc.compare); mlDocs = mlDocs.slice(0, maxDoc); var result = { author, lang, // embeddable option searchLang, // embeddable option minLang, // embeddable option maxDoc, // embeddable option maxResults, // embeddable option pattern, method, resultPattern, segsMatched, bilaraPaths, suttaRefs: matchingRefs, mlDocs, refLang, refAuthor, docLang, docAuthor, trilingual, }; return result; } async slowFindPhrase(args = {}) { const msg = "Seeker.slowFindPhrase() "; const dbg = DBG.SEEKER; let { author, lang, maxResults, pattern, searchLang = args.lang, searchAuthor = args.author, showMatchesOnly, sortLines, tipitakaCategories, } = args; author = author || AuthorsV2.langAuthor(searchLang, { category: tipitakaCategories, }); try { let msStart = Date.now(); let bd = this.bilaraData; let examples = bd.examples; var resultPattern = pattern; let scoreDoc = true; let method = "phrase"; let uids, suttaRefs; let searchOpts = { author, pattern, searchLang, maxResults, lang, showMatchesOnly, tipitakaCategories, }; var { lines, pattern: resultPattern } = await this.phraseSearch( searchOpts ); if (lines.length) { dbg && console.log(msg, {resultPattern, lines}); this.debug(msg, `phrase`, { resultPattern, lines: lines.length }); } else { method = "keywords"; let data = await this.keywordSearch(searchOpts); var { lines, resultPattern } = data; this.debug(msg, `keywords`, { resultPattern, lines: lines.length, }); } sortLines && lines.sort(sortLines); suttaRefs = lines.map((line) => BilaraPath.pathParts(line).suttaRef); dbg && console.log(msg, `suttaRefs`, suttaRefs); return { method, resultPattern, sortLines, suttaRefs, }; } catch (e) { this.warn('logLevel', this.logLevel); this.warn(msg, JSON.stringify({ lang, maxResults, pattern, searchLang, showMatchesOnly, sortLines, tipitakaCategories, }), e.message ); throw e; } } static serialize(obj) { return JSON.stringify(obj, null, 2); } static deserialize(buf) { var json = JSON.parse(buf); var { volume, args, value } = json; if (volume === "Seeker.callSlowFind") { json.value.mlDocs = json.value.mlDocs.map((m) => new MLDoc(m)); } return json; } } module.exports = exports.Seeker = Seeker; })(typeof exports === "object" ? exports : (exports = {}));