ebt-deepl
Version:
Javascript Library for EBT translation via DeepL
491 lines (432 loc) • 14.1 kB
JavaScript
import fs from "fs";
import path from 'path';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
import { BilaraData } from 'scv-bilara'
import { default as SuttaTranslator } from "../src/sutta-translator.mjs"
import pkgScvEsm from "scv-esm";
const { Tipitaka, SuttaRef, AuthorsV2 } = pkgScvEsm;
// Directory the script was launched from; used to shorten paths in
// diagnostic output.
const cwd = process.cwd();
// Author id under which DeepL-generated translations are stored.
const EBT_DEEPL = 'ebt-deepl';

// Raw command line: node executable, script path, then user arguments.
let [nodePath, scriptPath, ...args] = process.argv;
// path.basename() understands the platform's path separator, whereas
// splitting on '/' would leave a full Windows path untouched.
let script = path.basename(scriptPath);
// Prefix for messages printed by this script.
const msg = `${script}:\t`;

// Output mode selected by the -o* options ("all" prints everything).
let out = "all";
let showNextTipitaka = false;
// Sutta reference given on the command line (may include a segment id).
let suid;

// First translation source; the author is resolved later from the
// language when not given explicitly.
let srcLang1 = 'en';
let srcAuthor1;
// Optional second translation source (unset unless requested).
let srcLang2;
let srcAuthor2;

let updateGlossary = false;
let listGlossary1 = false;

// Destination of the translation.
let dstLang = 'pt-PT';
let dstAuthor = EBT_DEEPL;
let dstReplace = false;

// Reference translation shown for comparison; resolved later when unset.
let refLang;
let refAuthor;

let category = 'sutta';
/**
 * Print command-line usage to stdout, then terminate the process with
 * exit code 0.
 *
 * The defaults quoted in the text mirror the script's initial option
 * values: destination language 'pt-PT', first source language 'en',
 * and no second source unless one is requested.
 */
function help() {
  console.log(`
NAME
translate - translate ebt-data sutta
SYNOPSIS
translate [OPTIONS] sutta_uid
DESCRIPTION
Translate sutta from source language (srcLang) to destination
language (dstLang). EBT-DeepL translates from up to two sources
having consistent and extensive Pali EBT coverage.
The first source by default is Bhante Sujato's EN translations.
A second source (e.g., Ayya Sabbamitta's DE translations) is used
only when --src-lang2 or --src-author2 is given.
DeepL translations will be provided for each translation source.
Ideally, one should also designate a reference based on
consistent, segmented EBT source authored in Bilara.
For example, the default reference for PT is laera-quaresma
and the default reference for FR is noeismet.
References are not translated by DeepL--they are simply shown
in the output to aid in verification.
The Pali MS segmented text is also shown in the output
for an absolute reference of comparison.
If the sutta_uid includes a segment number (e.g., "an3.49:1.1"),
only that segment is translated.
-da, --dst-author
Destination author. Default is 'ebt-deepl'
-dl, --dst-lang
Destination language. Default is 'pt-PT'.
-dr, --dst-replace
Replace existing destination file. Default is false.
-lg1, --list-glossary1
List glossary entries for first translation source.
-nt, --next-tipitaka
Return sutta_uid of next sutta in Tipitaka
-oa, --out-all
Output Pali, source1, source2, reference, translation1,
translation2 texts. This is the default
-ob1, --out-bilara-data-deepl1
Output JSON translation from source1 to local/bilara-data-deepl
using ebt-deepl author.
-ob2, --out-bilara-data-deepl2
Output JSON translation from source2 to local/bilara-data-deepl
using ebt-deepl author.
-ocqb, --out-curly-quotes-bilara-data
Skip translation and only change straight quotes to curly
quotes in destination file. Source files are ignored.
Curly quotes are normally emitted during translation,
so this option is rarely needed.
-ocqe, --out-curly-quotes-ebt-data
Skip translation and only change straight quotes to curly
quotes in destination file. Source files are ignored.
Curly quotes are normally emitted during translation,
so this option is rarely needed.
-oe1, --out-ebt-data1
Output JSON translation from source1 to local/ebt-data
using ebt-deepl author.
-oe2, --out-ebt-data2
Output JSON translation from source2 to local/ebt-data
using ebt-deepl author.
-oj1, --out-json1
Output JSON to stdout from source 1
-oj2, --out-json2
Output JSON to stdout from source 2
-ra, --ref-author
Reference author. Default is determined from reference language.
-rl, --ref-lang
Reference language. Default is destination language.
-sa1, --src-author1
Source author #1. Default is determined from source language.
-sa2, --src-author2
Source author #2. Default is determined from source language.
-sl1, --src-lang1
Source language #1. Default is 'en'.
-sl2, --src-lang2
Source language #2. No default (second source is omitted).
-ug, --update-glossary
Update glossary file(s) before translating
`);
  process.exit(0);
}
/**
 * Return the value that follows a command-line option. When the option
 * is the last argument (no value present), report the problem on stderr
 * and exit with -1 instead of silently storing undefined.
 */
function requireValue(option, value) {
  if (value == null) {
    console.warn(`Missing value for argument: "${option}". Try:`);
    console.warn(' scripts/translate.mjs --help');
    process.exit(-1);
  }
  return value;
}

// With no arguments at all there is nothing to do but show usage.
if (args.length < 1) {
  help();
}

// Walk the arguments once. Options that take a value consume the next
// argument; any argument not starting with '-' is taken as the sutta_uid
// (the last one wins).
for (let i = 0; i < args.length; i++) {
  const arg = args[i];
  switch (arg) {
    case '-?':
    case '--help':
      help();
      break;
    case '-dl':
    case '--dst-lang':
      dstLang = requireValue(arg, args[++i]);
      break;
    case '-da':
    case '--dst-author':
      dstAuthor = requireValue(arg, args[++i]);
      break;
    case '-dr':
    case '--dst-replace':
      dstReplace = true;
      break;
    case '-lg1':
    case '--list-glossary1':
      listGlossary1 = true;
      break;
    case '-sl1':
    case '--src-lang1':
      srcLang1 = requireValue(arg, args[++i]);
      break;
    case '-sl2':
    case '--src-lang2':
      srcLang2 = requireValue(arg, args[++i]);
      break;
    case '-sa1':
    case '--src-author1':
      srcAuthor1 = requireValue(arg, args[++i]);
      break;
    case '-sa2':
    case '--src-author2':
      srcAuthor2 = requireValue(arg, args[++i]);
      break;
    case '-rl':
    case '--ref-lang':
      refLang = requireValue(arg, args[++i]);
      break;
    case '-ra':
    case '--ref-author':
      refAuthor = requireValue(arg, args[++i]);
      break;
    case '-nt':
    case '--next-tipitaka':
      showNextTipitaka = true;
      break;
    case '-oa':
    case '--out-all':
      out = 'all';
      break;
    case '-oj1':
    case '--out-json1':
      out = 'oj1';
      break;
    case '-oj2':
    case '--out-json2':
      out = 'oj2';
      break;
    case '-ob1':
    case '--out-bilara-data-deepl1':
      out = 'ob1';
      break;
    case '-ob2':
    case '--out-bilara-data-deepl2':
      out = 'ob2';
      break;
    case '-ocqb':
    case '--out-curly-quotes-bilara-data':
      out = 'ocqb';
      break;
    case '-ocqe':
    case '--out-curly-quotes-ebt-data':
      out = 'ocqe';
      break;
    case '-oe1':
    case '--out-ebt-data1':
      out = 'oe1';
      break;
    case '-oe2':
    case '--out-ebt-data2':
      out = 'oe2';
      break;
    case '-ug':
    case '--update-glossary':
      updateGlossary = true;
      break;
    default:
      if (arg.startsWith('-')) {
        console.warn(`Invalid argument: "${arg}". Try:`);
        console.warn(' scripts/translate.mjs --help');
        process.exit(-1);
      } else {
        suid = arg;
      }
  }
}
// Look up the sutta that follows the requested one; without a sutta_uid
// there is nothing to look up and nextSuid simply mirrors suid.
let tipitaka = new Tipitaka();
let nextSuid = suid ? tipitaka.nextSuid(suid) : suid;
if (showNextTipitaka) {
  // --next-tipitaka only reports the successor and stops.
  console.log(msg, {nextSuid});
  process.exit(0);
}

// Fill in whatever the command line left unspecified. Authors are
// derived from their language; a second source author is derived only
// when a second source language was given.
srcAuthor1 ||= AuthorsV2.langAuthor(srcLang1);
if (!srcAuthor2) {
  srcAuthor2 = srcLang2 && AuthorsV2.langAuthor(srcLang2);
}
refLang ||= dstLang;
refAuthor ||= AuthorsV2.langAuthor(refLang);
// For SC-Voice.net
let ebtData = await new BilaraData({ name: 'ebt-data' }).initialize();

// For bilara-data pre-translation pull requests
let bdDeepL = await new BilaraData({
  name: 'bilara-data-deepl',
  branch: 'unpublished',
}).initialize();

// One translator per translation source. The first source is always
// present; the second only when a second source author was resolved.
const sources = [{ srcLang: srcLang1, srcAuthor: srcAuthor1 }];
if (srcAuthor2) {
  sources.push({ srcLang: srcLang2, srcAuthor: srcAuthor2 });
}

// Translators are created one after another, in source order.
let xlts = [];
for (const source of sources) {
  const translator = await SuttaTranslator.create({
    ...source,
    dstLang,
    dstAuthor,
    bilaraData: ebtData,
    updateGlossary,
  });
  xlts.push(translator);
}

// Destination language as reported by the first translator; used for
// output paths and labels.
let dstLang2 = xlts[0].dstLang2;
/**
 * Print the glossary of a translator and terminate the process.
 *
 * The glossary name, entry count and id go to stderr; each entry is
 * written to stdout as "source | translation". The process then exits
 * with code 0.
 *
 * @param {object} xlt - translator whose `xltDeepL` property exposes
 *   `glossaryName`, `glossary` and `translator`
 */
async function listGlossaryEntries(xlt) {
  const deepl = xlt.xltDeepL;
  const { glossaryId, entryCount } = deepl.glossary;

  console.warn('name:', deepl.glossaryName, `[${entryCount} entries]`);
  console.warn('id :', glossaryId);

  const { implEntries } = await deepl.translator.getGlossaryEntries(glossaryId);
  for (const [term, translation] of Object.entries(implEntries)) {
    console.log(`${term} | ${translation}`);
  }

  process.exit(0);
}
// listGlossaryEntries() is async and exits the process itself once the
// entries are printed. It must be awaited: otherwise the script carries
// on and, when no sutta_uid was given, exits with an error before a
// single entry has been listed.
if (listGlossary1) {
  await listGlossaryEntries(xlts[0]);
}
// Without a sutta_uid the only useful work is a glossary update, which
// has already happened while the translators were created.
if (suid == null) {
  const glossaryOnly = updateGlossary;
  console.log(glossaryOnly ? "Glossary uploaded" : "No sutta_uid specified");
  process.exit(glossaryOnly ? 0 : -1);
}

// Parse the reference; segnum/scid are used further down to restrict
// processing to a single segment when one was requested.
let sref = SuttaRef.create(suid);
if (sref == null) {
  throw new Error(`Invalid SuttaRef ${suid}`);
}
let { sutta_uid, lang, author, segnum, scid } = sref;
// Document references for everything shown next to the translation.
let srcRef1 = { sutta_uid, lang: srcLang1, author: srcAuthor1 };
let srcRef2 = srcAuthor2
  ? { sutta_uid, lang: srcLang2, author: srcAuthor2 }
  : srcAuthor2;
// NOTE(review): the reference document is looked up under dstLang2, not
// refLang -- confirm this is intended when --ref-lang differs from the
// destination language.
let refRef = { sutta_uid, lang: dstLang2, author: refAuthor };
let pliRef = { sutta_uid, lang: 'pli', author: 'ms' };

// Pali root text is loaded through the first translator instance; the
// remaining documents go through the static loader with ebt-data.
let pliSegs = (await xlts[0].loadSutta(pliRef)).segments;
let xltOpts = { bilaraData: ebtData };
let refSegs = (await SuttaTranslator.loadSutta(refRef, xltOpts)).segments;
let srcSegs1 = (await SuttaTranslator.loadSutta(srcRef1, xltOpts)).segments;

// Segments of the optional second source; stays undefined without one.
let srcSegs2;
if (srcRef2) {
  ({ segments: srcSegs2 } = await SuttaTranslator.loadSutta(srcRef2, xltOpts));
}
// Translation results, index-aligned with xlts. The curly-quote modes
// only rewrite an existing file, so they skip translation entirely and
// leave this array empty.
let xltsOut = [];
const quotesOnly = out === 'ocqb' || out === 'ocqe';
if (!quotesOnly) {
  for (const [index, translator] of xlts.entries()) {
    xltsOut[index] = await translator.translate(suid);
  }
}

// Segment ids to report: every Pali segment, or just the requested one
// when the sutta reference carried a segment number.
let scids = Object.keys(pliSegs)
  .filter(id => !segnum || id === scid);
/**
 * Print a side-by-side report to stdout: a header naming the sutta,
 * sources, reference and target, followed by one group of lines per
 * selected segment (Pali, source text(s), reference, translation(s)).
 * Lines for the second source appear only when srcAuthor2 is set.
 */
function outAll() {
  const header = [
    `Sutta : ${sutta_uid}`,
    `Source1 : ${srcLang1}/${srcAuthor1}`,
  ];
  if (srcAuthor2) {
    header.push(`Source2 : ${srcLang2}/${srcAuthor2}`);
  }
  header.push(`Reference: ${refLang}/${refAuthor}`);
  header.push(`Target : ${dstLang2}/${dstAuthor}`);
  for (const line of header) {
    console.log(line);
  }

  // Translated segments with straight quotes converted to curly quotes,
  // one entry per translator.
  const quoted = xltsOut.map(result =>
    SuttaTranslator.curlyQuoteSegments(result.dstSegs));

  for (const segId of scids) {
    console.log('-----', segId, '-----');
    console.log(`pli:\t`, pliSegs[segId]);
    console.log(`${srcLang1}:\t`, srcSegs1[segId]);
    if (srcAuthor2) {
      console.log(`${srcLang2}:\t`, srcSegs2[segId]);
    }
    console.log(`ref:\t`, refSegs && refSegs[segId]);
    console.log(`${srcLang1}-${dstLang2}:\t`, quoted[0][segId]);
    if (srcAuthor2) {
      console.log(`${srcLang2}-${dstLang2}:\t`, quoted[1][segId]);
    }
  }
}
/**
 * Print a translation to stdout with curly quotes applied.
 *
 * When a single segment was requested, only that segment is printed as
 * an object; otherwise the whole translation is printed as indented
 * JSON.
 *
 * @param {object} xltOut - translation result providing `dstSegs`
 */
function outJson(xltOut) {
  const quoted = SuttaTranslator.curlyQuoteSegments(xltOut.dstSegs);
  if (!segnum) {
    console.log(JSON.stringify(quoted, null, 2));
    return;
  }
  console.log({
    [scid]: quoted[scid],
  });
}
/**
 * Write a translation as JSON into a Bilara data repository.
 *
 * The destination path is derived from the path of the Pali root
 * document by substituting the translation language and author. When a
 * single segment was requested, the existing destination file is read
 * and only that segment is replaced; if the file cannot be read, the
 * translation is printed to stdout and the process exits with -1.
 * An existing destination file is overwritten only when dstReplace is
 * set; otherwise the JSON is printed and nothing is written.
 *
 * @param {object} xltOut - translation result providing `dstSegs`
 * @param {object} bd - Bilara data instance providing `docPaths()`
 * @throws {Error} if no root document path is found for the sutta, or
 *   if the destination cannot be inspected or written
 */
async function outBilaraData(xltOut, bd) {
  const msg = 'translate.outBilaraData()';
  const dbg = 0; // set non-zero for diagnostics on stderr

  const pliPath = bd.docPaths(sref)[0];
  if (pliPath == null) {
    throw new Error(`${msg} no root document found for ${suid}`);
  }
  // Both the directory part and the file-name suffix encode
  // language/author, hence the two substitutions.
  const dstPath = pliPath
    .replace('root/pli/ms',
      ['translation', dstLang2, dstAuthor].join('/'))
    .replace('root-pli-ms',
      ['translation', dstLang2, dstAuthor].join('-'));
  const dstDir = path.dirname(dstPath);
  const dstBase = path.basename(dstPath);
  dbg && console.warn(msg, 'mkdir:', dstDir.replace(cwd,'').substring(1));
  await fs.promises.mkdir(dstDir, { recursive: true });

  let dstSegs;
  if (segnum) {
    // Single-segment update: patch the segment into the existing file.
    try {
      dstSegs = JSON.parse(await fs.promises.readFile(dstPath));
    } catch(e) {
      console.warn(msg, 'Cannot update non-existent translation:\n ',
        dstBase);
      let json = JSON.stringify(xltOut.dstSegs, null, 2);
      console.log(json);
      process.exit(-1);
    }
    dstSegs[scid] = xltOut.dstSegs[scid];
  } else {
    dstSegs = xltOut.dstSegs;
  }
  const scSegs = SuttaTranslator.curlyQuoteSegments(dstSegs);
  const json = JSON.stringify(scSegs, null, 2);

  let dstExists = true;
  try {
    await fs.promises.stat(dstPath);
  } catch(e) {
    // Only a missing file means "does not exist"; any other failure
    // (e.g. permissions) is a real error and must not be hidden.
    if (e.code !== 'ENOENT') {
      throw e;
    }
    dstExists = false;
  }

  if (dstExists) {
    if (!dstReplace) {
      console.log(msg, json);
      console.warn(msg, 'CANCELLED (file exists)', dstBase);
      console.warn(msg, 'override with "--dst-replace"');
      return;
    }
    dbg && console.warn(msg, 'overwriting:', dstBase, `${json.length}B`);
  } else {
    dbg && console.warn(msg, 'writing:', dstBase, `${json.length}B`);
  }
  dbg && console.warn(msg, json);
  await fs.promises.writeFile(dstPath, json);
  dbg && console.warn(msg, `translated ${dstBase}`);
}
/**
 * Rewrite an existing translation file in place with straight quotes
 * converted to curly quotes. No translation is performed.
 *
 * The destination path is derived from the path of the Pali root
 * document by substituting the translation language and author. If the
 * destination file cannot be read and parsed, a warning is printed and
 * the process exits with -1.
 *
 * @param {object} bd - Bilara data instance providing `docPaths()`
 * @throws {Error} if no root document path is found for the sutta
 */
async function outCurlyQuotes(bd) {
  const msg = 'translate.outCurlyQuotes()';
  const dbg = 0; // set non-zero for diagnostics on stderr

  const pliPath = bd.docPaths(sref)[0];
  if (pliPath == null) {
    throw new Error(`${msg} no root document found for ${suid}`);
  }
  // Both the directory part and the file-name suffix encode
  // language/author, hence the two substitutions.
  const dstPath = pliPath
    .replace('root/pli/ms',
      ['translation', dstLang2, dstAuthor].join('/'))
    .replace('root-pli-ms',
      ['translation', dstLang2, dstAuthor].join('-'));
  const dstBase = path.basename(dstPath);

  let dstSegs;
  try {
    dstSegs = JSON.parse(await fs.promises.readFile(dstPath));
  } catch(e) {
    console.warn(msg, 'file not found', dstBase);
    process.exit(-1);
  }
  const scSegs = SuttaTranslator.curlyQuoteSegments(dstSegs);
  const json = JSON.stringify(scSegs, null, 2);
  dbg && console.warn(msg, json);
  await fs.promises.writeFile(dstPath, json);
}
if (!dstAuthor) {
  throw new Error(`${out} required dstAuthor`);
}

// Output modes ending in "2" report the second translation source and
// therefore need one to have been configured.
const needsSource2 = out === 'ob2' || out === 'oe2' || out === 'oj2';
if (needsSource2 && !srcAuthor2) {
  throw new Error(`${out} requires srcAuthor2`);
}

// Produce the requested output. The file-writing helpers are async and
// are awaited so that failures surface here and so that follow-up
// messages are printed only after the write has finished.
switch (out) {
  case 'all':
    outAll();
    break;
  case 'ob1':
    await outBilaraData(xltsOut[0], bdDeepL);
    break;
  case 'ob2':
    await outBilaraData(xltsOut[1], bdDeepL);
    break;
  case 'oe1':
    await outBilaraData(xltsOut[0], ebtData);
    console.log(msg, {nextSuid});
    break;
  case 'oe2':
    await outBilaraData(xltsOut[1], ebtData);
    console.log(msg, {nextSuid});
    break;
  case 'oj1':
    outJson(xltsOut[0]);
    break;
  case 'oj2':
    outJson(xltsOut[1]);
    break;
  case 'ocqb':
    await outCurlyQuotes(bdDeepL);
    break;
  case 'ocqe':
    await outCurlyQuotes(ebtData);
    break;
}