jmdict-util
Version:
Parse JMdict XML file and export to SQLite and JSON files.
376 lines (305 loc) • 10.8 kB
JavaScript
const sqlite3 = require('sqlite3').verbose();
const fs = require('fs');
const parser = require('xml2json');
const console = require('console');
const readline = require('readline');
class JMdictUtil {
/**
* @param {string} path
*/
constructor(path, shortEntities = true) {
// Properties
/** @type {string} */
this.data = null;
/** @type {JMdict.JMdict} */
this.jmdictObj = null;
/** @type {Object.<string, number[]>} */
this.kanjiIndex = null;
/** @type {Object.<string, string>} */
this.entities = null;
/** @type {string[]} */
this.kanjiArray = null;
/** @type {Object.<string, number[]>} */
this.readingIndex = null;
/** @type {string[]} */
this.readingArray = null;
// Constructor script
this.load(path, shortEntities);
}
/**
* Load JMdict_e file
* @param {string} path
* @param {boolean} shortEntities
* If true, the entities will be the short version.
* ("adj-ix" vs "adjective (keiyoushi) - yoi/ii class")
* @returns {void}
*/
load(path, shortEntities = true) {
/** @type {string} */
this.data = fs.readFileSync(path, 'utf8');
/** @type {RegExp} */
const entityRegex = /<!ENTITY (.*?) "(.*?)">/g;
// save entities
this.entities = {};
let captures = entityRegex.exec(this.data);
while (captures !== null) {
const key = captures[1];
const value = captures[2];
this.entities[key] = value;
captures = entityRegex.exec(this.data);
}
// remove entities from data
if (shortEntities) {
this.data = this.data.replace(/<!ENTITY (.*?) "(.*?)">/g, '<!ENTITY $1 "$1">');
}
}
/**
* Get JMdict Object
* @private
* @returns {JMdict.JMdict}
*/
getJMdictObject() {
if (this.jmdictObj) return this.jmdictObj;
this.jmdictObj = /** @type {JMdict.JMdict} */ (parser.toJson(this.data, {
object: true,
arrayNotation: true,
}));
return this.jmdictObj;
}
/**
* @returns {JMdict.entry[]}
*/
getJMdictEntries() {
const jmdictObject = this.getJMdictObject();
return jmdictObject.JMdict[0].entry;
}
/**
* @return {Object.<string, string>}
*/
getEntityDefinitions() {
return this.entities;
}
/**
* @returns {Object.<string, number[]>}
*/
getKanjiIndex() {
if (this.kanjiIndex) return this.kanjiIndex;
this.kanjiIndex = {};
const jmdictEntries = this.getJMdictEntries();
jmdictEntries.forEach((jmdictEntry) => {
const entSeq = jmdictEntry.ent_seq[0];
if (jmdictEntry.k_ele) {
jmdictEntry.k_ele.forEach((kEle) => {
if (this.kanjiIndex[kEle.keb[0]]) {
this.kanjiIndex[kEle.keb[0]].push(entSeq);
} else {
this.kanjiIndex[kEle.keb[0]] = [entSeq];
}
});
}
});
return this.kanjiIndex;
}
/**
* @returns {string[]}
*/
getKanjiArray() {
return Object.keys(this.getKanjiIndex());
}
/**
* @returns {Object.<string, number[]>}
*/
getReadingIndex() {
if (this.readingIndex) return this.readingIndex;
this.readingIndex = {};
const jmdictEntries = this.getJMdictEntries();
jmdictEntries.forEach((jmdictEntry) => {
const entSeq = jmdictEntry.ent_seq[0];
if (jmdictEntry.r_ele) {
jmdictEntry.r_ele.forEach((rEle) => {
if (this.readingIndex[rEle.reb[0]]) {
this.readingIndex[rEle.reb[0]].push(entSeq);
} else {
this.readingIndex[rEle.reb[0]] = [entSeq];
}
});
}
});
return this.readingIndex;
}
/**
* @returns {string[]}
*/
getReadingArray() {
return Object.keys(this.getReadingIndex());
}
/**
* @param {Array<string>} argPriArray
* @returns {number}
*/
static priCalc(argPriArray) {
let priArray = argPriArray;
if (priArray === undefined || priArray === null) {
// eslint-disable-next-line no-param-reassign
priArray = [];
}
let priNum = 0;
// news
if (priArray.indexOf('news1') !== -1) priNum += 0;
else if (priArray.indexOf('news2') !== -1) priNum += 12001;
else priNum += 24001;
// ichi
if (priArray.indexOf('ichi1') !== -1) priNum += 0;
else if (priArray.indexOf('ichi2') !== -1) priNum += 9401;
else priNum += 9501;
// spec
if (priArray.indexOf('spec1') !== -1) priNum += 0;
else if (priArray.indexOf('spec2') !== -1) priNum += 1601;
else priNum += 3201;
// gai
if (priArray.indexOf('gai1') !== -1) priNum += 0;
else if (priArray.indexOf('gai2') !== -1) priNum += 4200;
else priNum += 4410;
// nf
const nfCheck = /** @type {[string] | []} */ (priArray.filter((el) => {
if (el.slice(0, 2) === 'nf') return true;
return false;
}));
if (nfCheck.length === 1) {
const nfNum = Number(nfCheck[0].slice(2, 4)); // Get the number from "nfxx"
priNum += ((nfNum - 1) * 500 + 1);
} else {
priNum += 23541;
}
return priNum;
}
/**
* @param {string} path target DB export
* @returns {Promise}
*/
buildSqlite(path) {
return new Promise((resolve, reject) => {
try {
if (fs.existsSync(path)) throw Error(`DB File ${path} already exists, please delete or change the path.`);
const jmdictEntries = this.getJMdictEntries();
const db = new sqlite3.Database(path);
db.serialize(() => {
db.run('CREATE TABLE jsons (`ent_seq` INTEGER, `json` TEXT, PRIMARY KEY(`ent_seq`))');
db.run('CREATE TABLE vocabs (`ent_seq` INTEGER, `kanji` TEXT, `reading` TEXT, `pri_point` INTEGER, `sense` INTEGER, PRIMARY KEY(`ent_seq`, `kanji`,`reading`))');
db.run('CREATE TABLE entities (`name` TEXT, `value` TEXT, PRIMARY KEY(`name`))');
});
db.serialize(() => {
// TABLE: jsons
db.run('BEGIN');
jmdictEntries.forEach((jmdictEntry) => {
const entSeq = jmdictEntry.ent_seq[0];
db.run('INSERT INTO jsons VALUES (?, ?)',
entSeq,
JSON.stringify(jmdictEntry, null, 0));
readline.clearLine(process.stdout, 0);
readline.cursorTo(process.stdout, 0, null);
process.stdout.write(`jsons table : ${jmdictEntry.ent_seq[0]}`);
});
let prevEntSeq = -1;
// TABLE: vocabs
jmdictEntries.forEach((jmdictEntry) => {
const entSeq = jmdictEntry.ent_seq[0];
// console.log(`==${entSeq}==`);
/** @type {[number, string, string, number, string][]}
* ent_seq, kanji, reading, pri_point, sense */
const vocabRows = [];
// Check unsorted entSeq in xml
if (entSeq < prevEntSeq) {
console.error(`entseq ${entSeq}: unsorted`);
}
prevEntSeq = entSeq;
// Calculate reading element priority points
/** @type {Object.<string, number>} */
const rElePriPoints = {};
jmdictEntry.r_ele.forEach((rEle) => {
const rElePriPoint = JMdictUtil.priCalc(rEle.re_pri);
rElePriPoints[rEle.reb[0]] = rElePriPoint;
});
// If vocab has kanji element
if (jmdictEntry.k_ele) {
jmdictEntry.k_ele.forEach((kEle) => {
const keb = kEle.keb[0];
const kElePriPoint = JMdictUtil.priCalc(kEle.ke_pri);
jmdictEntry.r_ele.forEach((rEle) => {
/** @type {string} kanji reading */
const reb = rEle.reb[0];
/** @type {number} */
const priPoint = kElePriPoint > rElePriPoints[reb]
? kElePriPoint : rElePriPoints[reb];
// If the reading has no kanji tag
if (Object.hasOwnProperty.call(rEle, 're_nokanji')) {
vocabRows.push([entSeq, null, reb, priPoint, '']);
// If the reading has reading restriction to the kanji
} else if (Object.hasOwnProperty.call(rEle, 're_restr')) {
if (rEle.re_restr.indexOf(keb) !== -1) {
vocabRows.push([entSeq, keb, reb, priPoint, '']);
}
// If the reading doesn't have tags above,
// it applies to all kanji
} else {
vocabRows.push([entSeq, keb, reb, priPoint, '']);
}
});
});
}
// If the vocab doesn't have kanji element
if (!jmdictEntry.k_ele) {
jmdictEntry.r_ele.forEach((rEle) => {
vocabRows.push([entSeq, null, rEle.reb[0], rElePriPoints[rEle.reb[0]], '']);
});
}
// Add glossaries
jmdictEntry.sense.forEach((sense) => {
let glosses = sense.gloss.reduce((ax, gloss) => `${ax}; ${gloss.$t}`, '');
glosses = glosses.slice(2);
glosses += '; ';
const stagk = sense.stagk ? sense.stagk[0] : null;
const stagr = sense.stagr ? sense.stagr[0] : null;
vocabRows.forEach((vocabRow) => {
if (stagk === null && stagr === null) {
// eslint-disable-next-line no-param-reassign
vocabRow[4] += glosses;
} else if (stagk !== null && vocabRow[1] === stagk) {
// eslint-disable-next-line no-param-reassign
vocabRow[4] += glosses;
} else if (stagr !== null && vocabRow[2] === stagr) {
// eslint-disable-next-line no-param-reassign
vocabRow[4] += glosses;
}
});
// console.log(stagk, stagr, glosses);
});
// Put in db
vocabRows.forEach((row) => {
db.run('INSERT INTO vocabs VALUES (?, ?, ?, ?, ?)', row);
readline.clearLine(process.stdout, 0);
readline.cursorTo(process.stdout, 0, null);
process.stdout.write(`jsons table : ${row[0]} ${row[1]} ${row[2]}`);
});
});
// TABLE: entities
Object.keys(this.entities).forEach((key) => {
db.run('INSERT INTO entities VALUES (?, ?)',
key,
this.entities[key]);
});
db.run('END');
readline.clearLine(process.stdout, 0);
readline.cursorTo(process.stdout, 0, null);
process.stdout.write('Inserting data to SQLite file...\n');
});
db.close(() => {
resolve();
});
} catch (err) {
reject(err);
}
});
}
}
module.exports = JMdictUtil;