UNPKG

scv-bilara

Version:
827 lines (747 loc) 25.8 kB
#!/usr/bin/env node
/**
 * search - command line search of bilara-data root text and translations.
 *
 * Parses command line options into module-level settings, creates a
 * BilaraData/Seeker pair, runs Seeker.find() with the assembled pattern
 * and writes the result to stdout in the requested output format.
 * Run with "--help" for the list of options.
 */
const fs = require('fs');
const path = require('path');
const { logger } = require('log-instance');
const { Files } = require("memo-again");
const {
  BilaraData,
  English,
  ExampleV2,
  ExecGitMock,
  Seeker,
  SuttaCentralId,
  Verse,
} = require('../../index');
const {
  DBG,
} = require('../../src/defines.cjs');
const {
  AuthorsV2,
  BilaraPath,
} = require("scv-esm");

const LOCAL = path.join(__dirname, '../../local');
const BILARA_DATA = path.join(LOCAL, '/bilara-data');

// ANSI 256-colour number used when "--color auto" decides to highlight.
const AUTO_MATCH_COLOR = 201;

// ---------------------------------------------------------------------
// Settings (defaults, overridden by command line options below)
// ---------------------------------------------------------------------
let bdName = 'bilara-data';
let bilaraData;
let gitAccount = "suttacentral";
let pattern;
let root;               // data directory actually used; resolved in main
let maxResults = 1000;
let logLevel = 'warn';
let color = 201;
let outFormat = 'human';
let showMatchesOnly = true;
let includeUnpublished = false;
const isTTY = process.stdout.isTTY;
let readFile = true;
let sync = undefined;
let execGit = undefined;
let groupBy = 'line';
let linebreak = ' ';
let branch = 'unpublished';
const trilingual = true;
let exampleSuttas = false;
let exampleSuttaD3 = false;
let exampleSuttaKeywords = false;
let exampleSuttaPhrases = false;
let docLang = 'en';
let docAuthor;

/**
 * Print the manual page to stdout and terminate the process
 * with exit code 0.
 */
function help() {
  console.log(`
NAME
        search - search bilara-data root text and translations

SYNOPSIS
        search [OPTIONS] PATTERN_KEYWORDS

DESCRIPTION
        Searches bilara-data for root text or translations. Writes
        output results in JSON to stdout, highlighting matches if
        output is console.

    -b0, --break0
        By default, verse grouped lines are joined separated by
        a single space.

    -b1, --break1
        Separate verse grouped lines by two spaces followed by
        a newline. In Markdown blockquotes, each line will
        stand alone.

    -c, --color COLORNUMBER
        Display matches with colors. The default color is 201.
        Use "--color auto" to remove color when stdout is not a console.
        Use "--color none" to remove color.
        See https://misc.flogisoft.com/bash/tip_colors_and_formatting

    -bd, --bilara-data DATA_NAME
        specifiy name of Bilara data directory (e.g., "ebt-data").
        Default is 'bilara-data'

    -d, --maxDoc NUMBER
        specify maximum number of documents to display. Default is 50.

    -da, --doc-author AUTHOR
        Specify document author (e.g., sujato)

    -dl, --doc-lang ISO_LANG_2
        Specify document language (e.g., en)

    -es, --exampleSuttas
        Return JSON map of examples to matching suttas. Use examples
        for given docAuthor and docLang values. Matching suttas are
        listed in descending order of relevant scores. Suttas with
        the highest score are "definitional suttas" that provide the
        most explanation for a given example.

    -esk, --exampleSuttaKeywords
        Like "-es" option but only returns results for keyword
        searches only.

    -esp, --exampleSuttaPhrases
        Like "-es" option but only returns results for phrase
        searches only.

    -esd3, --exampleSuttaD3
        Return JSON D3 graph of examples and matching suttas.
        Use examples for given docAuthor and docLang values.

    -f, --filter MODE
        Filter segments according to mode: "pattern", "none".
        If mode is "pattern", then only segments matching pattern
        will be shown. If mode is "none", segments will not be filtered.

    -ga ACCOUNT, --gitAccount ACCOUNT
        Choose GitHub account name. Default is "suttacentral".

    -gb BRANCH, --gitBranch BRANCH
        Choose git branch for bilara-data. Default is "unpublished".

    --gitMock
        Ignore all git operations. This option is for containers
        with fixed content.

    -l, --lang ISO_LANG_2
        Specify ISO 2-letter language code for primary translation
        language. Default is "en" for English.

    -ll, --logLevel LOGLEVEL
        Logging is normally turned off, but you can specificy a LOGLEVEL:
        debug, warn, info, error. The most useful will be "info".
        The default is "warn".

    -ml, --minLang NUMBER
        Only show segments from documents with at least minLang languages.
        Default is 3 unless the pattern language is 'en', in which case
        it is 2.

    -mr, --maxResults NUMBER
        Maximum number of grep result files to work with (default 1000).

    -nm, --no-memo
        Don't use memoizer cache (slow search for new content)

    -oc, --outCSV
        Output comma-separated values.

    -oj, --outJSON
        Output JSON

    -oh, --outHuman
        Output human format (default).

    -oh1, --outHuman1
        Output human format in translation language.

    -oh2, --outHuman2
        Output human format in Pali and translation language.

    -oh3, --outHuman3
        Output human format in Pali, English and translation language.

    -ol, --outLines
        Output lines only.

    -ol1
        Output translation lines only.

    -ol2
        Output matching lines as well as the corresponding
        translation or root text.

    -ol3
        Output trilingual lines.

    -om, --outMarkdown
        Output Markdown for matching segments.

    -om1, --outMarkdown1
        Output translation for matching segments, formatted with Markdown.

    -om2, --outMarkdown2
        Output Pali and translation for matching segments,
        formatted with Markdown.

    -om3, --outMarkdown3
        Output matching segments, formatted with tri-lingual Markdown.

    -op, --outPaths
        Output file paths of matching suttas

    -os, --outScore
        Output sutta references and score

    -ot, --outTrans
        Output translation only for matching segments.

    -ov, --outVerse
        Output text by verse (vs. by line)

    -ov1, --outVerse1
        Output matching monoligual translation verses (e.g., English)

    -ov2, --outVerse2
        Output matching bilingual verses (e.g., Pali, English)

    -ov3, --outVerse3
        Output matching trilingual verses (e.g., Pali, English, German)

    --outLegacy
        Output legacy format. (NO LONGER SUPPORTED)

    -ra, --ref-author AUTHOR
        Specify reference author (e.g., sujato)

    -rl, --ref-lang ISO_LANG_2
        Specify reference language (e.g., en)

    -sl, --searchLang ISO_LANG_2
        Specify ISO 2-letter language code for language to search.
        Default is determined from pattern language.

    -sy, --sync
        Fetch the latest bilara-data

    -up, --unpublished
        Search unpublished documents in current branch

    -tc:CATEGORIES
        Restrict searches to listed categories. For example, "-tc:bi,pj"
        will search for information in the Bhikkhuni Pārājika.
        To see only suttas, use "-tc:sutta" or "-tc:su"
`);
  process.exit(0);
}

// ---------------------------------------------------------------------
// Command line parsing
// ---------------------------------------------------------------------

// Options that do nothing but select the output format.
const OUT_FORMAT_FLAGS = new Map([
  ['-os', 'score'], ['--outScore', 'score'],
  ['-oj', 'json'], ['--outJSON', 'json'],
  ['-om', 'markdown'], ['--outMarkdown', 'markdown'],
  ['-om1', 'markdown1'], ['--outMarkdown1', 'markdown1'],
  ['-om2', 'markdown2'], ['--outMarkdown2', 'markdown2'],
  ['-om3', 'markdown3'], ['--outMarkdown3', 'markdown3'],
  ['-ot', 'trans'], ['--outTrans', 'trans'],
  ['-ol', 'lines'], ['--outLines', 'lines'],
  ['-ol1', 'lines1'],
  ['-ol2', 'lines2'],
  ['-ol3', 'lines3'],
  ['-op', 'paths'], ['--outPaths', 'paths'],
  ['-oh', 'human'], ['--outHuman', 'human'],
  ['-oh1', 'human1'], ['--outHuman1', 'human1'],
  ['-oh2', 'human2'], ['--outHuman2', 'human2'],
  ['-oh3', 'human3'], ['--outHuman3', 'human3'],
  ['-oc', 'csv'], ['--outCSV', 'csv'],
]);

// Verse options select verse output, a grouping mode, and turn off
// the "matching segments only" filter.
const VERSE_FLAGS = new Map([
  ['-ov', 'verse'], ['--outVerse', 'verse'],
  ['-ov1', 'verse1'], ['--outVerse1', 'verse1'],
  ['-ov2', 'verse2'], ['--outVerse2', 'verse2'],
  ['-ov3', 'verse3'], ['--outVerse3', 'verse3'],
]);

const argv = process.argv;
if (argv.length < 3) {
  help();
}

for (let i = 2; i < argv.length; i++) {
  const arg = argv[i];

  // Peek: remember document language/author for the example reports.
  // These options are NOT consumed here; they and their values fall
  // through to the pattern below (presumably so that Seeker.find() can
  // interpret them as well -- confirm against Seeker). Both the
  // documented hyphenated spelling and the camelCase spelling are accepted.
  if (arg === '-dl' || arg === '--docLang' || arg === '--doc-lang') {
    docLang = argv[i + 1];
  } else if (arg === '-da' || arg === '--docAuthor' || arg === '--doc-author') {
    docAuthor = argv[i + 1];
  }

  if (OUT_FORMAT_FLAGS.has(arg)) {
    outFormat = OUT_FORMAT_FLAGS.get(arg);
    continue;
  }
  if (VERSE_FLAGS.has(arg)) {
    groupBy = VERSE_FLAGS.get(arg);
    outFormat = 'verse';
    showMatchesOnly = false;
    continue;
  }

  switch (arg) {
    case '-?':
    case '--help':
      help();
      break;
    case '-b0':
    case '--break0':
      linebreak = ' ';
      break;
    case '-b1':
    case '--break1':
      // Two spaces + newline, as documented (a Markdown hard line break).
      linebreak = '  \n';
      break;
    case '-bd':
    case '--bilara-data':
      bdName = argv[++i];
      if (bdName === 'ebt-data') {
        branch = 'published';
      }
      break;
    case '-esd3':
    case '--exampleSuttaD3':
      exampleSuttaD3 = true;
      break;
    case '-esk':
    case '--exampleSuttaKeywords':
      exampleSuttaKeywords = true;
      break;
    case '-esp':
    case '--exampleSuttaPhrases':
      exampleSuttaPhrases = true;
      break;
    case '-es':
    case '--exampleSuttas':
      exampleSuttas = true;
      break;
    case '-ll':
    case '--logLevel':
      logLevel = argv[++i];
      break;
    case '-f':
    case '--filter':
      showMatchesOnly = argv[++i] === 'pattern';
      break;
    case '-sy':
    case '--sync':
      sync = true;
      break;
    case '-ga':
    case '--gitAccount':
      gitAccount = argv[++i];
      break;
    case '-gb':
    case '--gitBranch':
      branch = argv[++i];
      break;
    case '-nm':
    case '--no-memo':
      readFile = false;
      break;
    case '-c':
    case '--color':
      color = argv[++i];
      break;
    case '--gitMock':
      execGit = new ExecGitMock();
      break;
    case '--outLegacy':
      // No longer supported: show the manual page and exit.
      outFormat = 'legacy';
      help();
      break;
    case '-mr':
    case '--maxResults':
      maxResults = Number(argv[++i]);
      break;
    case '-up':
    case '--unpublished':
      includeUnpublished = true;
      break;
    default:
      // Everything else is part of the search pattern.
      pattern = pattern ? `${pattern} ${arg}` : arg;
      break;
  }
}

pattern = pattern || `wurzel des leidens`;

// ---------------------------------------------------------------------
// Output helpers
// ---------------------------------------------------------------------

/**
 * Build a String.prototype.replace() replacement that wraps the matched
 * text ($&) in an ANSI 256-colour foreground escape sequence.
 * @param {number} colorNumber - ANSI 256-colour number
 * @returns {string} replacement string
 */
function ansiHighlight(colorNumber) {
  return `\u001b[38;5;${colorNumber}m$&\u001b[0m`;
}

/**
 * Return the replacement string used to highlight matches.
 * "none" never highlights. "auto" highlights only when stdout is a
 * console and the output is not JSON (outJSON() applies its own
 * highlighting after serialization). Any other value is used as an
 * ANSI 256-colour number.
 * @param {string|number} color - "none", "auto" or a colour number
 * @returns {string} replacement string ("$&" means "no highlighting")
 */
function matchBash(color) {
  if (color === 'none') {
    return `$&`;
  }
  if (color === 'auto') {
    return (!isTTY || outFormat === 'json')
      ? `$&`
      : ansiHighlight(AUTO_MATCH_COLOR);
  }
  return ansiHighlight(Number(color));
}

/**
 * Resolve a path from the search result against the data directory in
 * use. Falls back to the bundled local/bilara-data directory when the
 * root has not been resolved yet.
 * @param {string} bilaraPath - path relative to the data directory
 * @returns {string} joined path
 */
function dataPath(bilaraPath) {
  return path.join(root || BILARA_DATA, bilaraPath);
}

/**
 * Quote a value as a CSV field, doubling embedded double quotes.
 * @param {*} value - field value (converted to string)
 * @returns {string} quoted field
 */
function csvField(value) {
  return `"${String(value).replaceAll('"', '""')}"`;
}

/**
 * Write one CSV row (scid,lang,text) for every non-scid property of
 * every segment of every document in the result.
 * @param {object} res - search result with mlDocs
 */
function outCSV(res) {
  const { mlDocs } = res;
  console.log(`scid,lang,text`);
  for (const mld of mlDocs) {
    for (const seg of mld.segments()) {
      for (const key of Object.keys(seg)) {
        if (key === 'scid') {
          continue;
        }
        const row = [seg.scid, key, seg[key]].map(csvField);
        console.log(row.join(','));
      }
    }
  }
}

/**
 * Write the search result as indented JSON. Unless colour is "none"
 * (or "auto" without a console), text matching res.resultPattern in the
 * serialized JSON is highlighted.
 * @param {object} res - search result
 */
function outJSON(res) {
  const { resultPattern } = res;
  let text = JSON.stringify(res, null, 2);
  if (color !== 'none' && (isTTY || color !== 'auto')) {
    const rex = new RegExp(resultPattern, "giu");
    const colorNumber = color === 'auto' ? AUTO_MATCH_COLOR : Number(color);
    text = text.replace(rex, ansiHighlight(colorNumber));
  }
  console.log(text);
}

/**
 * Print a segment as Pali, English and (when different from both) the
 * search language or else the document language.
 * Used by outHuman() when trilingual output is disabled.
 */
function printPliEnSegment(seg, scidText, searchLang, lang) {
  console.log(`scid: ${scidText}`);
  console.log(` pli: ${seg.pli || ''}`);
  console.log(` en: ${seg.en || ''}`);
  if (searchLang !== 'pli' && searchLang !== 'en') {
    console.log(` ${searchLang}: ${seg[searchLang] || ''}`);
  } else if (lang !== 'pli' && lang !== 'en') {
    console.log(` ${lang}: ${seg[lang] || ''}`);
  }
}

/**
 * Write a human readable report: a summary header followed by the
 * segments of each document and the document footer (HTML tags removed).
 * @param {object} res - search result
 * @param {string} pattern - search pattern (unused; the header shows res.pattern)
 * @param {number} [nLang=1] - 1: search language only; 2: two languages;
 *   any other value: three languages
 */
function outHuman(res, pattern, nLang=1) {
  const {
    mlDocs,
    elapsed,
    searchLang,
    method,
    segsMatched,
    refLang = '',
    refAuthor = '',
    docLang = '',
    docAuthor = '',
  } = res;
  const refs = res.suttaRefs
    .map(ref => ref.split('/')[0])
    .sort(SuttaCentralId.compareLow)
    .join(',');
  const nRefs = res.suttaRefs.length;
  const nDocs = mlDocs.length;

  console.log([
    `pattern : "${res.pattern}" grep:${res.resultPattern}`,
    `source : ${root}@${branch}`,
    `languages : translation:${res.lang} search:${searchLang} minLang:${res.minLang}`,
    `output : ${outFormat} color:${color} elapsed:${elapsed}s maxDoc:${res.maxDoc}`,
    `found : segs:${segsMatched} by:${method} mlDocs:${nDocs} docs:${nRefs} ${refs}`,
  ].join('\n'));
  if (trilingual) {
    console.log(
      `trilingual:`,
      `doc:${docLang}/${docAuthor}`,
      `ref:${refLang}/${refAuthor}`,
    );
  }

  mlDocs.forEach((mld, iDoc) => {
    // Per-document values; mldAuthor is compared with the result-level
    // refAuthor below, docLang remains the result-level value.
    const {
      suid,
      docAuthor: mldAuthor,
      docAuthorName,
      lang: mldLang,
    } = mld;

    mld.segments().forEach((seg, iSeg) => {
      if (iSeg === 0) {
        const sep = '---';
        const title = [
          `doc:${iDoc + 1}/${nDocs}`,
          `${docAuthorName} ${suid}/${mldLang}/${mldAuthor}`,
          `score:${mld.score.toFixed(3)}`,
        ].join(' ');
        console.log(`${sep} ${title} ${sep}`);
      }

      // The segment id is always shown in ANSI colour 80.
      const scidText = `\u001b[38;5;80m${seg.scid}\u001b[0m`;

      if (nLang === 1) {
        console.log(`${scidText}: ${seg[searchLang]}`);
      } else if (!trilingual) {
        printPliEnSegment(seg, scidText, searchLang, mldLang);
      } else if (nLang === 2) {
        console.log(`scid: ${scidText}`);
        // Show the reference text when the document author differs from
        // the reference author, otherwise the Pali root text.
        if (mldAuthor !== refAuthor) {
          console.log(` ref: ${seg.ref || ''}`);
        } else {
          console.log(` pli: ${seg.pli || ''}`);
        }
        console.log(` ${docLang}: ${seg[docLang] || ''}`);
      } else {
        console.log(`scid: ${scidText}`);
        console.log(` pli: ${seg.pli || ''}`);
        console.log(` ref: ${seg.ref || ''}`);
        console.log(` ${docLang}: ${seg[docLang] || ''}`);
      }
    });

    const footer = (mld.docFooter || '').replaceAll(/<[^>]*>/g, '');
    console.log('\n', footer);
  });
}

/**
 * Write "score<TAB>suid" for each document in the result.
 * @param {object} res - search result
 * @param {string} [pattern] - unused
 */
function outScore(res, pattern) {
  for (const mld of res.mlDocs) {
    console.log(`${mld.score}\t${mld.suid}`);
  }
}

/**
 * Write the file path of every bilara path in the result, resolved
 * against the data directory in use.
 * @param {object} res - search result with bilaraPaths
 * @param {string} [pattern] - unused
 */
function outPaths(res, pattern) {
  for (const bilaraPath of res.bilaraPaths) {
    console.log(dataPath(bilaraPath));
  }
}

/**
 * Build a Markdown link to a segment on suttacentral.net.
 * With lang and author_uid the link addresses the segment in that
 * translation; otherwise it addresses the sutta.
 * @param {string} scid - segment id ("suid:segment")
 * @param {string} [lang] - translation language
 * @param {string} [author_uid] - author id(s); for a ", " separated list
 *   the first entry is used
 * @returns {string} Markdown link
 */
function suttacentralLink(scid, lang, author_uid) {
  const suid = scid.split(':')[0];
  const linkText = new SuttaCentralId(scid).standardForm();
  let link = `https://suttacentral.net/${suid}`;
  if (lang && author_uid) {
    const author = author_uid.split(', ')[0] || author_uid;
    link = `https://suttacentral.net/${suid}/${lang}/${author}#${scid}`;
  }
  return `[${linkText}](${link})`;
}

/**
 * Write each document as verses produced by Verse.versify().
 * @param {object} res - search result
 * @param {string} pattern - unused
 * @param {number} [n=0] - number of languages; controls whether Pali
 *   and English are shown in addition to the translation
 */
function outVerse(res, pattern, n=0) {
  const { lang, searchLang } = res;
  n = Number(n);
  const showPli = n === 2 && searchLang === lang ||
    n > 2 && searchLang !== 'pli' && lang !== 'pli';
  const showEn = n > 2 && searchLang !== 'en' && lang !== 'en';
  const verse = new Verse({
    linebreak,
    lang,
    searchLang,
    showPli,
    showEn,
  });
  for (const mld of res.mlDocs) {
    const lines = verse.versify(mld.segments(), mld.lang, mld.author_uid);
    console.log(lines.join('\n'));
  }
}

/**
 * Write segments line by line, each prefixed with the segment id and a
 * tag: "---" (Pali), "ref" (reference text) or "doc" (document text).
 * A segment with no Pali, search-language or translation text to show
 * is dumped as an object.
 * @param {object} res - search result
 * @param {string} pattern - unused
 * @param {number} [n=0] - number of languages
 */
function outLines(res, pattern, n=0) {
  const {
    lang,
    searchLang,
    docLang,
    docAuthor,
    docAuthorName,
    refLang,
    refAuthor,
  } = res;
  console.log(
    `---:pli/ms`,
    `doc:${docLang}/${docAuthor} ${docAuthorName}`,
    `ref:${refLang}/${refAuthor}`
  );
  n = Number(n);

  const wantSearch = n === 0 && searchLang !== lang ||
    n === 2 && searchLang !== lang ||
    n > 2 && searchLang !== lang && searchLang !== 'pli';
  const wantPli = (
    n === 2 && searchLang === lang ||
    n > 2 && searchLang !== 'pli' && lang !== 'pli'
  ) && lang !== 'pli';

  for (const mld of res.mlDocs) {
    for (const seg of mld.segments()) {
      const { scid } = seg;
      const langText = seg[lang] || '';
      const searchText = wantSearch && seg[searchLang] || '';
      const pliText = wantPli && seg.pli || '';
      const docText = seg[docLang] || '';
      const refText = n > 2 && seg[refLang] || '';

      pliText && console.log(`${scid} ---: ${pliText}`);
      refText && console.log(`${scid} ref: ${refText}`);
      docText && console.log(`${scid} doc: ${docText}`);
      if (!pliText && !searchText && !langText) {
        console.log(seg);
      }
    }
  }
}

/**
 * Write segments as Markdown blockquote lines, each starting with a
 * suttacentral.net link to the segment.
 * @param {object} res - search result
 * @param {string} pattern - unused
 * @param {number} [nLang=3] - above 1 adds the Pali line; above 2 also
 *   adds the English line unless the translation language is English
 */
function outMarkdown(res, pattern, nLang=3) {
  for (const mld of res.mlDocs) {
    for (const seg of mld.segments()) {
      const langText = (seg[res.lang] || '').trim();
      const scLink = suttacentralLink(seg.scid, mld.lang, mld.author_uid);
      if (nLang > 1) {
        console.log(`> ${scLink}: ${seg.pli} `);
      }
      if (nLang > 2 && res.lang !== 'en') {
        console.log(`> ${scLink}: ${seg.en} `);
      }
      langText && console.log(`> ${scLink}: ${langText} `);
    }
  }
}

/**
 * Write only the translation text of each segment, with a Markdown
 * link header before the first segment of every document.
 * @param {object} res - search result
 * @param {string} pattern - unused
 */
function outTrans(res, pattern) {
  for (const mld of res.mlDocs) {
    const { suid } = mld;
    mld.segments().forEach((seg, i) => {
      const text = (seg[res.lang] || '').trim();
      if (i === 0) {
        console.log(`--- [${suid}](https://suttacentral.net/${suid}) ---`);
      }
      text && console.log(text);
    });
  }
}

/**
 * Write local/bls_edit.<editor>: the given editor arguments followed by
 * the paths of the result files in the search language, space separated.
 * @param {object} res - search result
 * @param {string} args - editor arguments placed before the paths
 * @param {string} editor - file extension naming the editor
 */
function write_editor(res, args, editor) {
  const searchPaths = res.bilaraPaths
    .filter(p => BilaraPath.pathParts(p).lang === res.searchLang)
    .map(dataPath);
  const script = [args, ...searchPaths].join(' ');
  const epath = path.join(LOCAL, `bls_edit.${editor}`);
  fs.writeFileSync(epath, script);
}

/**
 * Write editor argument files for "subl" and "vi". The vi file starts
 * with a "+/pattern" search argument derived from res.resultPattern.
 * @param {object} res - search result
 * @param {string} pattern - unused
 */
function scriptEditor(res, pattern) {
  write_editor(res, '', 'subl');
  // Translate the grep pattern for vi: first "\b" becomes "\<" and
  // alternation/grouping characters are escaped.
  const vipat = res.resultPattern
    .replace(/\\b/, '\\<')
    .replace(/[|()]/g, '\\$&');
  write_editor(res, `'+/${vipat}'`, 'vi');
}

/**
 * Create an initialized ExampleV2 for the document language/author
 * given on the command line and load its non-empty examples.
 * @returns {Promise<{ev2: object, examples: Array}>}
 */
async function loadExamples() {
  const lang = docLang;
  const author = docAuthor || AuthorsV2.langAuthor(lang);
  const memoize = readFile;
  const ev2 = await new ExampleV2({ lang, author, memoize }).initialize();
  const examples = (await ev2.examples()).filter(eg => !!eg);
  return { ev2, examples };
}

/**
 * Write a JSON graph ({nodes, links}) linking each example to its
 * matching suttas. Example nodes belong to group "Examples"; sutta
 * nodes carry their best (lowest) rank and the number of examples
 * linking to them.
 * @param {object} [opts={}] - options passed to ExampleV2.suttasOfExamples()
 */
async function outExampleSuttaD3(opts={}) {
  const { ev2, examples } = await loadExamples();
  const egSuttaMap = await ev2.suttasOfExamples(examples, opts);
  const links = [];
  const nodes = [];
  const suidLinks = {};
  const suidRank = {};

  for (const eg of examples) {
    const egSuttas = egSuttaMap[eg];
    egSuttas.forEach((suid, iSutta) => {
      const rank = iSutta + 1;
      suidRank[suid] = Math.min(rank, suidRank[suid] || rank);
      suidLinks[suid] = (suidLinks[suid] || 0) + 1;
      links.push({ source: eg, target: suid, rank });
    });
    nodes.push({
      id: eg,
      group: "Examples",
      links: egSuttas.length,
    });
  }

  for (const sutta_uid of Object.keys(suidLinks)) {
    nodes.push({
      id: sutta_uid,
      group: sutta_uid.replace(/[^a-z]+/i, ''),
      rank: suidRank[sutta_uid],
      links: suidLinks[sutta_uid],
    });
  }

  console.log(JSON.stringify({ nodes, links }, null, 2));
}

/**
 * Write a JSON map of example => space separated matching suttas.
 * @param {object} [opts={}] - options passed to ExampleV2.suttasOfExamples()
 */
async function outExampleSuttas(opts={}) {
  const { ev2, examples } = await loadExamples();
  const suttas = await ev2.suttasOfExamples(examples, opts);
  for (const key of examples) {
    suttas[key] = suttas[key].join(' ');
  }
  console.log(JSON.stringify(suttas, null, 2));
}

/**
 * Write the search result in the format selected on the command line.
 * Unrecognized formats fall back to the three-language human report.
 * @param {object} res - search result
 */
function writeResults(res) {
  switch (outFormat) {
    case 'verse': {
      const nLang = { verse1: 1, verse2: 2, verse3: 3 }[groupBy];
      if (nLang) {
        outVerse(res, pattern, nLang);
      } else {
        outVerse(res, pattern);
      }
      break;
    }
    case 'csv': outCSV(res, pattern); break;
    case 'json': outJSON(res, pattern); break;
    case 'paths': outPaths(res, pattern); break;
    case 'lines': outLines(res, pattern); break;
    case 'lines1': outLines(res, pattern, 1); break;
    case 'lines2': outLines(res, pattern, 2); break;
    case 'lines3': outLines(res, pattern, 3); break;
    case 'markdown': outMarkdown(res, pattern); break;
    case 'markdown1': outMarkdown(res, pattern, 1); break;
    case 'markdown2': outMarkdown(res, pattern, 2); break;
    case 'markdown3': outMarkdown(res, pattern, 3); break;
    case 'trans': outTrans(res, pattern); break;
    case 'human1': outHuman(res, pattern, 1); break;
    case 'human2': outHuman(res, pattern, 2); break;
    case 'score': outScore(res); break;
    case 'human3':
    default:
      outHuman(res, pattern, 3);
      break;
  }
}

// ---------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------
logger.logLevel = logLevel;

(async function() {
  try {
    const msg = "js/search() ";
    const dbg = DBG.SEARCH_SCRIPT;
    const dbgv = DBG.VERBOSE && dbg;

    logger.info(msg, `creating BilaraData ${bdName} ${branch}`);
    // Prefer a data directory under the current working directory,
    // otherwise use the one under Files.LOCAL_DIR.
    const localRoot = path.join(process.cwd(), 'local', bdName);
    const libRoot = path.join(Files.LOCAL_DIR, bdName);
    root = fs.existsSync(localRoot) ? localRoot : libRoot;
    dbgv && console.warn(msg, '[1]root', root, '@', branch);
    bilaraData = new BilaraData({
      name: bdName,
      root,
      execGit,
      branch,
      gitAccount,
      includeUnpublished,
    });
    logger.info(msg, 'initializing BilaraData', {sync});
    await bilaraData.initialize(sync);

    // Example reports replace the search entirely. They are awaited so
    // that failures are reported by the catch block below.
    if (exampleSuttaD3) {
      await outExampleSuttaD3();
      return;
    }
    if (exampleSuttaPhrases) {
      await outExampleSuttas({method:"phrase"});
      return;
    }
    if (exampleSuttaKeywords) {
      await outExampleSuttas({method:"keywords"});
      return;
    }
    if (exampleSuttas) {
      await outExampleSuttas();
      return;
    }

    logger.info(msg, 'load English.wordSet');
    const enWords = await English.wordSet({source:'file'});
    logger.info(msg, 'creating Seeker');
    const skr = await new Seeker({
      matchColor: color,
      maxResults,
      bilaraData,
      readFile,
      logger,
      enWords,
    }).initialize();

    const findOpts = {
      pattern,
      matchHighlight: matchBash(color),
      showMatchesOnly,
      trilingual,
    };
    logger.info(msg, `findOpts`, findOpts);
    const msStart = Date.now();
    const res = await skr.find(findOpts);
    const secElapsed = (Date.now() - msStart) / 1000;
    logger.info(msg, `find() ${secElapsed.toFixed(1)}s`);

    writeResults(res);
    scriptEditor(res, pattern);
  } catch(e) {
    logger.warn(e.stack);
  }
})();