scv-bilara
Version:
SuttaCentral bilara-data library
513 lines (482 loc) • 14 kB
JavaScript
(function (exports) {
const fs = require("fs");
const path = require("path");
const { logger } = require('log-instance');
const { Files } = require('memo-again');
const { AuthorsV2, BilaraPath } = require("scv-esm");
const Unicode = require('./unicode');
const Pali = require('./pali');
const { SuttaCentralId } = require('scv-esm');
const BILARA_PATH = path.join(Files.LOCAL_DIR, 'bilara-data');
const { DBG, DBG_MLD, } = require("./defines.cjs");
class MLDoc {
constructor(opts = {}) {
const msg = 'M3c.ctor:';
const dbg = DBG.MLD_CTOR;
dbg>1 && console.log(msg, '[1]opts', opts);
(opts.logger || logger).logInstance(this, opts);
var {
author,
author_uid,
bilaraPaths,
category = 'sutta',
footer = MLDoc.SC_FOOTER,
hyphen = "\u00ad",
lang,
langSegs,
maxWord = 30,
minWord = 5,
score = 0, // search relevance
segMap = {},
segsMatched,
sutta_uid,
title,
trilingual,
type = 'translation',
} = opts;
if (bilaraPaths == null) {
throw new Error(`bilaraPaths is required`);
}
lang = lang || MLDoc.bilaraPathLanguages(bilaraPaths, lang).pop();
if (author == null) {
let aInfo = AuthorsV2.authorInfo(author_uid);
author = aInfo?.name?.join(', ');
}
if (author) {
this.author = author;
}
Object.assign(this, {
segMap, // For console debugging, this is first
author,
author_uid,
bilaraPaths,
category,
footer,
hyphen,
lang,
langSegs,
maxWord,
minWord,
score,
segsMatched,
sutta_uid,
title,
type,
});
if (trilingual) {
this.trilingual = trilingual;
this.docLang = opts.docLang;
this.docAuthor = opts.docAuthor;
this.docAuthorName = opts.docAuthorName;
let docInfo = AuthorsV2.authorInfo(this.docAuthor);
if (docInfo) {
this.docAuthorName = docInfo.name?.join(', ');
this.docFooter = MLDoc.SC_FOOTER;
}
this.refLang = opts.refLang || 'en';
this.refAuthor = opts.refAuthor || 'sujato';
this.refAuthorName = opts.refAuthorName;
let refInfo = AuthorsV2.authorInfo(this.refAuthor);
if (refInfo) {
this.refAuthorName = refInfo.name?.join(', ');
this.refFooter = MLDoc.SC_FOOTER;
}
}
Object.defineProperty(this, "unicode", {
value: opts.unicode || new Unicode(),
});
}
static get SC_FOOTER() {
return [
'<a href="https://suttacentral.net/licensing" target="_blank">',
'SuttaCentral',
'</a>',
].join('');
}
static compare(m1, m2) {
var cmp = m2.score - m1.score;
return cmp ? cmp : SuttaCentralId.compareLow(m1.suid, m2.suid);
}
static langCompare_pli_en(a, b) {
if (a === b) {
return 0;
}
if (a === 'pli') { // Pali is primary source
return -1;
} else if (b === 'pli') {
return 1;
}
if (a === 'en') { // English is secondary source
return -1;
} else if (b === 'en') {
return 1;
}
return a.localeCompare(b); // arbitrary but consistent
}
get suid() {
var {
bilaraPaths,
sutta_uid,
} = this;
return sutta_uid || bilaraPaths.reduce((a, bp) => {
var suid = BilaraPath.pathParts(bp).suid;
if (a && suid !== a) {
throw new Error(`uid mismatch ` +
`expected:${a} ` +
`actual:${suid} `);
}
return a || suid;
}, null);
}
get root_text() {
return this.bilaraPaths.reduce((a, bp) => {
var parts = BilaraPath.pathParts(bp);
return parts.type === 'root' ? parts : a;
}, undefined);
}
get translations() {
let {
bilaraPaths,
author_uid,
lang,
type,
sutta_uid,
category,
} = this;
if (bilaraPaths.length) {
return bilaraPaths
.map(bp => BilaraPath.pathParts(bp))
.filter(t => t.type === 'translation');
} else {
return [{ // legacy translation
type,
lang,
author_uid,
category,
collection: sutta_uid.replace(/[-0-9.]+/, ''),
suttaRef: `${sutta_uid}/${lang}/${author_uid}`,
//bilaraPath: translationPath('an/an1/an1.1-10','en','sujato'),
sutta_uid,
}];
}
}
titles(lang = this.lang) {
const msg = 'M3c.titles:';
const dbg = DBG.MLD_TITLES;
let headSegs = this.segments().slice(0,4);
dbg && console.log(msg, {lang, headSegs});
var titles = headSegs.slice(0, 4).reduce((a, s, i) => {
if (s) {
let text = (s[lang] || s.en || s.pli || '');
let segNum = s.scid.split(':')[1];
if (segNum.match(/^0/)) {
text.length && a.push(text.trim());
} else {
this.debug(`titles() ignoring segments[${i}] with segNum:${segNum}`, s);
}
}
return a;
}, []);
if (titles.length === 0) {
titles = [`(no-title-${this.suid})`];
}
return titles;
}
scids() {
var result = Object.keys(this.segMap);
result.sort(SuttaCentralId.compareLow);
return result;
}
static bilaraPathLanguages(bilaraPaths, lang='en') {
return bilaraPaths.length
? Object.keys(bilaraPaths.reduce((a, bp) => {
a[bp.split('/')[1]] = true;
return a;
}, {})).sort(MLDoc.langCompare_pli_en)
: [lang];
}
async load(root = BILARA_PATH) {
const msg = "M3c.load() ";
const dbg = DBG.MLD_LOAD;
try {
var {
segMap,
bilaraPaths,
docLang,
docAuthor,
docAuthorName,
refLang,
refAuthor,
refAuthorName,
trilingual,
} = this;
this.langSegs = {};
let langMap = {};
var p_bp = [];
dbg && console.log(msg, '[1]',
{bilaraPaths, docLang, docAuthor, refLang, refAuthor});
for (var ip = 0; ip < bilaraPaths.length; ip++) {
var parts = BilaraPath.pathParts(bilaraPaths[ip]);
var bp = path.join(root, parts.bilaraPath);
var fh = fs.existsSync(bp) && await fs.promises.open(bp);
var isTrans = parts.type === 'translation';
var isRoot = parts.type === 'root';
var lang = isTrans || isRoot
? parts.lang
: parts.type;
if (trilingual && parts.author_uid === refAuthor) {
lang = 'ref';
}
if (langMap[lang]) {
dbg && console.log(msg, `[2]skipping: ${bilaraPaths[ip]}`);
fh && fh.close();
continue;
}
langMap[lang] = true;
if (fh) {
try {
var bpe = {
fh,
bp,
p_read: fh.readFile(),
lang,
};
p_bp.push(bpe);
} catch(e) {
this.warn(`${msg} Could not read Bilara file:`, bp);
throw e;
}
} else {
this.log(`${msg} path not found:${bp}`);
}
}
// assemble content
let sameAuthor = trilingual &&
refLang === docLang &&
refAuthor === docAuthor;
for (var ip = 0; ip < p_bp.length; ip++) {
var { fh, bp, p_read, lang, } = p_bp[ip];
let copyRefDoc = lang === 'ref' && sameAuthor;
var json = await p_read;
let header;
try {
let strings = JSON.parse(json);
header = strings.__header__;
if (header) {
delete strings.__header__;
let { refAuthor, docAuthor } = this;
this.sutta_uid = header.suid;
this.lang = this.lang || header.lang;
if (refAuthor === header.author_uid) {
dbg && console.log(msg, 'ref');
this.refAuthorName = header.author;
this.refFooter = header.footer;
}
if (docAuthor === header.author_uid) {
this.docAuthorName = header.author;
this.docFooter = header.footer;
}
if (dbg) {
let show = {
lang: this.lang,
docAuthor,
docAuthorName: this.docAuthorName,
docFooter: this.docFooter,
refAuthor,
refAuthorName: this.refAuthorName,
refFooter: this.refFooter,
}
if (dbg > 1) {
show.header = header;
}
console.log(msg, '[3]__header__', show);
}
}
fh.close();
let keys = Object.keys(strings);
this.langSegs[lang] = keys.length;
keys.forEach(k => {
var m = (segMap[k] = segMap[k] || {
scid: k,
});
m[lang] = strings[k];
if (copyRefDoc) {
m[docLang] = strings[k];
}
});
} catch(e) {
this.warn(`${msg} Could not read Bilara file:`, bp);
throw e;
}
}
this.title = this.titles().join('\n');
return this;
} catch (e) {
this.warn(msg, e.message);
throw e;
}
}
segments() {
return this.scids().map(scid => Object.assign({
scid,
}, this.segMap[scid]));
}
matchScid({ seg, scidPat }) {
var match = SuttaCentralId.match(seg.scid, scidPat);
return match;
}
matchText({ seg, languages, rexList }) {
var unicode = this.unicode;
return languages.reduce((a, l) => {
var text = seg[l];
if (!a && text) {
if (rexList.reduce((a, re) => a && re.test(text), true)) {
return true;
} else if (l === 'pli') {
var romText = unicode.romanize(text);
return rexList.reduce(
(a, re) => a && re.test(romText), true);
}
}
return a;
}, false);
}
jsPattern(pat, opts = 'ui') {
var p = pat.toString();
const quotes = `"'\u0060\u2018\u201e\u201c\u201a`;
var result = p.startsWith('\\b')
? new RegExp(`(?<=[\\s,.:;${quotes}]|^)${p.substring(2)}`, opts)
: new RegExp(p, opts);
return result;
}
hyphenate(hyphenator = new Pali()) {
var { maxWord, } = hyphenator;
var lang = "pli";
var scids = this.scids();
scids.forEach((scid, i) => {
var seg = this.segMap[scid];
var text = seg[lang];
var words = text.split(" ");
var changed = false;
var hyphenated = words.reduce((a, w) => {
if (w.length > maxWord) {
changed = true;
a.push(hyphenator.hyphenate(w));
} else {
a.push(w);
}
return a;
}, []);
changed && (seg[lang] = hyphenated.join(" "));
});
}
filterSegments(...args) {
const msg = "M3c.filterSegments()";
const dbg = DBG_MLD;
if (typeof args[0] === 'string') {
var opts = {
resultPattern: args[0],
languages: args[1],
showMatchesOnly: args[2],
}
} else {
opts = args[0];
}
var {
pattern,
resultPattern,
languages,
showMatchesOnly,
method,
} = opts;
var matchScid = SuttaCentralId.test(resultPattern);
showMatchesOnly = showMatchesOnly === undefined
? true : showMatchesOnly;
languages = languages === undefined
? this.languages() : languages;
pattern = pattern || resultPattern;
var scids = this.scids();
var suid = this.suid;
if (resultPattern instanceof RegExp) {
var rexList = [this.jsPattern(resultPattern)];
} else if (1 && method === 'phrase') {
var rexList = [this.jsPattern(resultPattern, 'imu')];
} else if (matchScid) {
// SuttaCentral.match
} else {
let resultPatterns = resultPattern.split('|');
let patterns = pattern.split(' ').map(p => {
return p.charAt(0) === '_'
? p.substring(1) // unconstrained match
: `\\b${p}`; // word start match
});
let srcPats = resultPatterns.length === patterns.length
? resultPatterns
: patterns;
var rexList = srcPats.map(p => this.jsPattern(p));
}
var unicode = this.unicode;
var matchLow = SuttaCentralId.rangeLow(resultPattern);
var matchHigh = SuttaCentralId.rangeHigh(resultPattern);
var matched = 0;
scids.forEach((scid, i) => {
var seg = this.segMap[scid];
var match;
if (matchScid) {
match = SuttaCentralId.match(seg.scid, pattern);
dbg>1 && console.log(msg, '[1]matchScid', {
pattern, seg, match});
} else {
match = this.matchText({ seg, languages, rexList });
dbg>1 && console.log(msg, '[2]matchScid', {
pattern, seg, match, languages});
}
if (match) {
matched++;
seg.matched = true;
} else {
showMatchesOnly && delete this.segMap[scid];
}
});
dbg && console.log(msg, '[3]', {
matched, pattern, suid, languages,});
var score = matchScid
? 0
: Number((matched + matched / scids.length).toFixed(3));
this.score = score;
return {
matched,
matchLow,
matchHigh,
matchScid,
rexList,
suid,
score,
}
}
highlightMatch(pattern, matchHighlight) {
var scids = this.scids();
if (SuttaCentralId.test(pattern)) {
// scids are semantic and should never be highlighted
//scids.forEach(scid => {
//var seg = this.segMap[scid];
//seg.scid = seg.scid.replace(/^.*$/, matchHighlight);
//});
} else {
var rex = pattern instanceof RegExp
? rex : new RegExp(pattern, "gui");
var rex = this.jsPattern(pattern, "gui");
scids.forEach(scid => {
var seg = this.segMap[scid];
Object.keys(seg).forEach(k => {
if (k !== 'scid' && k !== 'matched') {
seg[k] = seg[k].replace(rex, matchHighlight);
}
});
});
}
return this;
}
} // class MLDoc
module.exports = exports.MLDoc = MLDoc;
})(typeof exports === "object" ? exports : (exports = {}));