UNPKG

dbay-cmudict

Version:

An SQLite version of the CMU Pronouncing Dictionary, a DBay demo project

654 lines (604 loc) 24.8 kB
(function() {
  'use strict';

  //###########################################################################################################
  // Logging helpers from `cnd`. Not all of them are used below; they are kept because they are module-level
  // bindings of the original file.
  const CND = require('cnd');
  const rpr = CND.rpr;
  const badge = 'DBAY-CMUDICT';
  const debug = CND.get_logger('debug', badge);
  const warn = CND.get_logger('warn', badge);
  const info = CND.get_logger('info', badge);
  const urge = CND.get_logger('urge', badge);
  const help = CND.get_logger('help', badge);
  const whisper = CND.get_logger('whisper', badge);
  const echo = CND.echo.bind(CND);
  //...........................................................................................................
  const PATH = require('path');
  const types = require('./types');
  const {isa, type_of, validate, validate_list_of} = types.export();
  // SQL text is tagged with `String.raw`, so backslashes inside the templates are kept verbatim.
  const SQL = String.raw;
  const guy = require('guy');
  const home = PATH.resolve(PATH.join(__dirname, '..'));
  const data_path = PATH.join(home, 'data');
  const BRITFONE = require('britfone');

  //-----------------------------------------------------------------------------------------------------------
  /**
   * Own-property test that also works for objects created with `Object.create(null)`.
   * @param {object} obj - dictionary object to probe
   * @param {string} key - property name
   * @returns {boolean} true when `key` is an own property of `obj`
   */
  const has_own = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  //===========================================================================================================
  /**
   * Builds and queries an SQLite database of pronunciation entries read from three sources: CMUdict
   * (source nick `cmu`), BEEP (`be`) and Britfone (`bf`), plus a table of ARPAbet-to-IPA transliterations.
   *
   * The database handle is taken from `cfg.db`; this class calls `open()`, `execute()`, `prepare()`,
   * `single_value()` on it and also invokes it directly as `db( sql, params )` and `db( fn )`.
   * NOTE(review): `db( fn )` presumably runs `fn` inside a transaction — confirm against the DBay docs.
   */
  class Cmud {

    //---------------------------------------------------------------------------------------------------------
    /**
     * Returns the configuration to use for `me`: the explicit `cfg` when given, otherwise `me.cfg`.
     * The original carried commented-out path / `temporary` normalization here; it is not active, so the
     * configuration is passed through unchanged.
     * @param {object} me - the instance being configured
     * @param {object|null} [cfg=null] - explicit configuration, if any
     * @returns {object} the configuration object
     */
    static cast_constructor_cfg(me, cfg = null) {
      return cfg != null ? cfg : me.cfg;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Called from the constructor via `guy.cfg.configure_with_types()`. Validates `me.cfg`, moves the `db`
     * entry out of the configuration into a non-enumerable `db` property, freezes the remaining
     * configuration (with nullish entries omitted) and installs an empty non-enumerable `cache` object.
     * @param {object} me - the instance being configured
     * @returns {null}
     */
    static declare_types(me) {
      me.cfg = this.cast_constructor_cfg(me);
      me.types.validate.constructor_cfg(me.cfg);
      const {db} = guy.obj.pluck_with_fallback(me.cfg, null, 'db');
      me.cfg = guy.lft.freeze(guy.obj.omit_nullish(me.cfg));
      guy.props.def(me, 'db', {enumerable: false, value: db});
      guy.props.def(me, 'cache', {enumerable: false, value: {}});
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Configures the instance, compiles the SQL statements and opens the database, (re)building it when
     * necessary (see `_open_cmu_db()`).
     * @param {object} cfg - settings; defaults are listed in `Cmud.C.defaults.constructor_cfg`
     */
    constructor(cfg) {
      guy.cfg.configure_with_types(this, cfg, types);
      this._compile_sql();
      this._create_sql_functions();
      this._open_cmu_db();
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Drops all indexes and tables of this module in schema `cfg.schema`, vacuums the schema and re-creates
     * the tables `entries`, `trlits`, `trlit_nicks`, `source_nicks` and the two indexes on `entries`.
     * @returns {null}
     */
    _create_db_structure() {
      const {schema} = this.cfg;
      // `abs_phones` is dropped but never created in this file — presumably a leftover of an earlier
      // version; TODO confirm before removing the statement.
      this.db.execute(SQL`
drop index if exists ${schema}.entries_word_idx;
drop index if exists ${schema}.entries_ipa_idx;
drop table if exists ${schema}.trlits;
drop table if exists ${schema}.trlit_nicks;
drop table if exists ${schema}.abs_phones;
drop table if exists ${schema}.entries;
drop table if exists ${schema}.source_nicks;
-- ...................................................................................................
vacuum ${schema};
-- ...................................................................................................
create table ${schema}.entries (
    id        integer not null primary key,
    word      text    not null,
    source    text    not null references source_nicks ( nick ),
    nr        integer not null default 1,
    ipa       text    not null,
    ipa_raw   text    not null );
create index ${schema}.entries_word_idx on entries ( word );
create index ${schema}.entries_ipa_idx  on entries ( ipa );
-- ...................................................................................................
create table ${schema}.trlits ( -- trlits: transliterations
    ipa       text    not null,
    nick      text    not null references trlit_nicks ( nick ),
    trlit     text    not null,
    example   text,
  primary key ( ipa, nick ) );
create table ${schema}.trlit_nicks (
    nick      text    not null,
    name      text    not null,
    comment   text,
  primary key ( nick ) );
create table ${schema}.source_nicks (
    nick      text    not null,
    name      text    not null,
    comment   text,
  primary key ( nick ) );`);
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Builds the SQL statement texts used by this class (with `cfg.schema` interpolated) and stores them
     * as the non-enumerable property `this.sql`.
     * @returns {null}
     */
    _compile_sql() {
      const {schema} = this.cfg;
      const sql = {
        get_db_object_count: SQL`select count(*) as count from ${schema}.sqlite_schema;`,
        truncate_entries: SQL`delete from ${schema}.entries where source = $source;`,
        insert_entry: SQL`
          insert into ${schema}.entries ( word, source, nr, ipa_raw, ipa )
            values ( $word, $source, $nr, $ipa_raw, $ipa );`,
        insert_trlit: SQL`
          insert into ${schema}.trlits ( ipa, nick, trlit, example )
            values ( $ipa, $nick, $trlit, $example );`,
        upsert_source_nick: SQL`
          insert into ${schema}.source_nicks ( nick, name, comment )
            values ( $nick, $name, $comment )
            on conflict ( nick ) do update set
              name = excluded.name, comment = excluded.comment;`,
        upsert_trlit_nick: SQL`
          insert into ${schema}.trlit_nicks ( nick, name, comment )
            values ( $nick, $name, $comment )
            on conflict ( nick ) do update set
              name = excluded.name, comment = excluded.comment;`,
        delete_arpabet_trlits: SQL`
          delete from ${schema}.trlits
            where nick in ( 'ab1', 'ab2' );`
      };
      guy.props.def(this, 'sql', {enumerable: false, value: sql});
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Placeholder for registering user-defined SQL functions; currently registers nothing (the original
     * registration of an `ipa_from_abs1` function was commented out).
     * @returns {null}
     */
    _create_sql_functions() {
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /** @returns {*} the number of rows in `sqlite_schema` of the configured schema */
    _get_db_object_count() {
      return this.db.single_value(this.sql.get_db_object_count);
    }

    /**
     * Deletes all rows of `entries` that belong to `source`.
     * @param {string} source - source nick (`cmu`, `be` or `bf` in this file)
     * @returns {*} whatever the database call returns
     */
    _truncate_entries(source) {
      return this.db(this.sql.truncate_entries, {source});
    }

    /** Deletes all `ab1` / `ab2` rows of `trlits`. @returns {*} whatever the database call returns */
    _delete_arpabet_trlits() {
      return this.db(this.sql.delete_arpabet_trlits);
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Opens the database with the instance configuration; when `cfg.rebuild` is set or the schema contains
     * no objects yet, creates the table structure and populates it.
     * @returns {null}
     */
    _open_cmu_db() {
      this.db.open(this.cfg);
      if (this.cfg.rebuild || this._get_db_object_count() === 0) {
        this._create_db_structure();
        this._populate_db();
      }
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Fills all tables. The transliterations and the spelling cache come first because the entry loaders
     * look words and phones up in them.
     * @returns {null} (the result of `_populate_bf_entries()`)
     */
    _populate_db() {
      this._populate_arpabet_trlits();
      this._cache_spellings();
      this._populate_cmu_entries();
      this._populate_beep_entries();
      return this._populate_bf_entries();
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Returns the cased spelling recorded for the lower-case `word` in `this.cache.spellings`, or `word`
     * itself when none is recorded. Only own properties count, so words that happen to be names of
     * `Object.prototype` members (e.g. `constructor`) are not replaced by inherited values.
     * @param {string} word - lower-cased word
     * @returns {string} the cased spelling or `word`
     */
    _spelling_from_lc(word) {
      const spellings = this.cache.spellings;
      if (has_own(spellings, word)) {
        const spelling = spellings[word];
        if (spelling != null) {
          return spelling;
        }
      }
      return word;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Replaces all `cmu` entries with the contents of the file at `cfg.paths.cmu`. Lines starting with
     * `;;;` are skipped; each remaining line is split at the first double space into word and ARPAbet
     * phones. Stops after `cfg.max_entry_count` entries.
     * @returns {null}
     */
    _populate_cmu_entries() {
      const source = 'cmu';
      const insert_entry = this.db.prepare(this.sql.insert_entry);
      let count = 0;
      this._truncate_entries(source);
      this.db(this.sql.upsert_source_nick, {nick: source, name: "CMUdict", comment: "v0.7b"});
      this.db(() => {
        for (const raw_line of guy.fs.walk_lines(this.cfg.paths.cmu)) {
          if (raw_line.startsWith(';;;')) {
            continue;
          }
          const line = raw_line.trimEnd();
          const [raw_word, ab] = line.split('\x20\x20');
          const bracketed_word = raw_word.trim();
          if ((bracketed_word.length === 0) || (ab == null) || (ab.length === 0)) {
            warn('^4443^', count, rpr(line));
            continue;
          }
          //...................................................................................................
          count++;
          if (count > this.cfg.max_entry_count) {
            warn('^dbay-cmudict/main@1^', `shortcutting at ${this.cfg.max_entry_count} entries`);
            break;
          }
          const bracketed = this._get_bracketed_nr(bracketed_word);
          const nr = bracketed.nr;
          /* replace LC variant with correct upper/lower case where found */
          const word = this._spelling_from_lc(bracketed.word.toLowerCase());
          const ipa_raw = this.ipa_raw_from_arpabet2(ab);
          const ipa = this.ipa_from_cmu_ipa_raw(ipa_raw);
          insert_entry.run({word, source, nr, ipa_raw, ipa});
        }
        return null;
      });
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Replaces all `be` entries with the contents of the file at `cfg.paths.beep`. Lines starting with `#`
     * and blank lines are skipped; each remaining line is split at the first run of whitespace into word
     * and ARPAbet phones. Every entry gets `nr` 1. Stops after `cfg.max_entry_count` entries.
     * @returns {null}
     */
    _populate_beep_entries() {
      const source = 'be';
      const nr = 1;
      const insert_entry = this.db.prepare(this.sql.insert_entry);
      let count = 0;
      this._truncate_entries(source);
      this.db(this.sql.upsert_source_nick, {nick: source, name: "BEEP", comment: "v1.0"});
      this.db(() => {
        for (const raw_line of guy.fs.walk_lines(this.cfg.paths.beep)) {
          if (raw_line.startsWith('#')) {
            continue;
          }
          const line = raw_line.trim();
          if (line.length === 0) {
            continue;
          }
          const match = line.match(/(?<word>\S+)\s+(?<ab>.*)$/);
          if (match == null) {
            continue;
          }
          const {word: raw_word, ab} = match.groups;
          if ((raw_word.length === 0) || (ab.length === 0)) {
            warn('^4443^', count, rpr(line));
            continue;
          }
          //...................................................................................................
          count++;
          if (count > this.cfg.max_entry_count) {
            warn('^dbay-cmudict/main@2^', `shortcutting at ${this.cfg.max_entry_count} entries`);
            break;
          }
          /* replace LC variant with correct upper/lower case where found */
          const cased_word = this._spelling_from_lc(raw_word.toLowerCase());
          const word = this._rewrite_beep_word(cased_word);
          const ipa_raw = this.ipa_raw_from_arpabet2(ab);
          const ipa = this.ipa_from_beep_ipa_raw(ipa_raw);
          insert_entry.run({word, source, nr, ipa_raw, ipa});
        }
        return null;
      });
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Replaces all `bf` entries with the contents of the file at `cfg.paths.bf_main`. Lines starting with
     * `#` and blank lines are skipped; each remaining line is split at the first comma into word and raw
     * IPA. Stops after `cfg.max_entry_count` entries.
     * @returns {null}
     */
    _populate_bf_entries() {
      const source = 'bf';
      const insert_entry = this.db.prepare(this.sql.insert_entry);
      let count = 0;
      this._truncate_entries(source);
      this.db(this.sql.upsert_source_nick, {nick: source, name: "Britfone", comment: "v3.0.1"});
      this.db(() => {
        for (const raw_line of guy.fs.walk_lines(this.cfg.paths.bf_main)) {
          if (raw_line.startsWith('#')) {
            continue;
          }
          const line = raw_line.trim();
          if (line.length === 0) {
            continue;
          }
          const match = line.match(/(?<word>[^,]+),\s*(?<ipa_raw>.*)$/);
          if (match == null) {
            continue;
          }
          const {word: raw_word, ipa_raw} = match.groups;
          if ((raw_word.length === 0) || (ipa_raw.length === 0)) {
            warn('^4443^', count, rpr(line));
            continue;
          }
          //...................................................................................................
          count++;
          if (count > this.cfg.max_entry_count) {
            // This tag used to duplicate the BEEP loader's `main@2`; it now identifies this loader.
            warn('^dbay-cmudict/main@4^', `shortcutting at ${this.cfg.max_entry_count} entries`);
            break;
          }
          const bracketed = this._get_bracketed_nr(raw_word);
          const nr = bracketed.nr;
          /* replace LC variant with correct upper/lower case where found */
          const word = this._spelling_from_lc(bracketed.word.toLowerCase());
          const ipa = this.ipa_from_britfone_ipa_raw(ipa_raw);
          insert_entry.run({word, source, nr, ipa_raw, ipa});
        }
        return null;
      });
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Reads the file at `cfg.paths.spellings` into `this.cache.spellings`, a map from the lower-cased first
     * field of each line to the rest of the line. Lines starting with `#`, blank lines and lines without
     * two whitespace-separated fields are skipped. Stops after `cfg.max_entry_count` entries.
     *
     * The map has no prototype, so arbitrary words (including `constructor` or `__proto__`) are safe keys.
     * @returns {null}
     */
    _cache_spellings() {
      if (this.cache.spellings == null) {
        this.cache.spellings = Object.create(null);
      }
      const cache = this.cache.spellings;
      let count = 0;
      for (const raw_line of guy.fs.walk_lines(this.cfg.paths.spellings)) {
        if (raw_line.startsWith('#')) {
          continue;
        }
        const line = raw_line.trim();
        if (line.length === 0) {
          continue;
        }
        const match = line.match(/(?<lc>\S+)\s+(?<spelling>.*)$/);
        if (match == null) {
          continue;
        }
        //.....................................................................................................
        count++;
        if (count > this.cfg.max_entry_count) {
          warn('^dbay-cmudict/main@3^', `shortcutting at ${this.cfg.max_entry_count} entries`);
          break;
        }
        //.....................................................................................................
        const {lc, spelling} = match.groups;
        cache[lc.toLowerCase()] = spelling.trimEnd();
      }
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Replaces the `ab1` / `ab2` rows of `trlits` with the contents of the tab-separated file at
     * `cfg.paths.abipa`. Each data line holds five fields: an unused first field, the `ab1` and `ab2`
     * transliterations, the IPA symbol and an example. Fields are trimmed and the literal `N/A` is read as
     * "no value". Blank lines and lines starting with `#` are skipped.
     *
     * A missing `ab1` or `ab2` value skips only that transliteration; a missing example is stored as
     * `null`. (Previously a missing `ab2` or example raised a `TypeError`.)
     * @returns {null}
     */
    _populate_arpabet_trlits() {
      this._delete_arpabet_trlits();
      const insert_trlit = this.db.prepare(this.sql.insert_trlit);
      this.db(this.sql.upsert_trlit_nick, {nick: 'ab1', name: "ARPAbet1", comment: null});
      this.db(this.sql.upsert_trlit_nick, {nick: 'ab2', name: "ARPAbet2", comment: null});
      this.db(() => {
        for (const raw_line of guy.fs.walk_lines(this.cfg.paths.abipa)) {
          const line = raw_line.trim();
          if ((line.length === 0) || line.startsWith('#')) {
            continue;
          }
          const fields = line.split('\t').map((raw_field) => {
            const field = raw_field.trim();
            return field === 'N/A' ? null : field;
          });
          const [, ab1, ab2, ipa, raw_example] = fields;
          const example = raw_example != null ? raw_example.replace(/\x20/g, '') : null;
          if (ab1 != null) {
            insert_trlit.run({ipa, nick: 'ab1', trlit: ab1.toLowerCase(), example});
          }
          if (ab2 != null) {
            insert_trlit.run({ipa, nick: 'ab2', trlit: ab2.toLowerCase(), example});
          }
        }
        return null;
      });
      return null;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Strips one pair of enclosing double quotes from `text`. Texts that are not enclosed in a pair of
     * double quotes — including a text that consists of a single `"` — are returned unchanged.
     * @param {string} text
     * @returns {string}
     */
    _undoublequote(text) {
      if (text.length < 2) {
        return text;
      }
      const last_idx = text.length - 1;
      if ((text[0] !== '"') || (text[last_idx] !== '"')) {
        return text;
      }
      return text.slice(1, last_idx);
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Reads the tab-separated file at `cfg.paths.xsipa` (fields: X-SAMPA, IPA, an unused field, description,
     * example), inserts one row per line and stores a frozen IPA-to-X-SAMPA map as `this.xs_by_ipa`.
     *
     * NOTE(review): this method calls `this._truncate_xsipa()` and reads `this.sql.insert_xsipa`, neither
     * of which is defined in this file, and its only call site in `_populate_db()` was commented out —
     * it looks unfinished; confirm before calling it.
     * @returns {null}
     */
    _populate_xsampa_to_ipa() {
      this._truncate_xsipa();
      let xs_by_ipa = {};
      const insert = this.db.prepare(this.sql.insert_xsipa);
      this.db(() => {
        for (const raw_line of guy.fs.walk_lines(this.cfg.paths.xsipa)) {
          const line = raw_line.trim();
          if ((line.length === 0) || line.startsWith('#')) {
            continue;
          }
          const fields = line.split('\t').map((field) => field.trim());
          const [xs, ipa, , description, raw_example] = fields;
          let example = raw_example != null ? raw_example : "(no example)";
          example = this._undoublequote(example);
          example = example.replace(/\\"/g, '"');
          xs_by_ipa[ipa] = xs;
          insert.run({description, xs, ipa, example});
        }
        return null;
      });
      xs_by_ipa = guy.lft.freeze(xs_by_ipa);
      guy.props.def(this, 'xs_by_ipa', {enumerable: false, value: xs_by_ipa});
      return null;
    }

    //=========================================================================================================
    //---------------------------------------------------------------------------------------------------------
    /**
     * Reads all `ab2` rows of `trlits` into a prototype-less map from transliteration to IPA.
     * @returns {object} map `trlit -> ipa`
     */
    _build_cache_ipa_raw_from_arpabet2() {
      const R = Object.create(null);
      const rows = this.db(SQL`select * from ${this.cfg.schema}.trlits where nick = 'ab2';`);
      for (const row of rows) {
        R[row.trlit] = row.ipa;
      }
      return R;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Converts a space-separated sequence of ARPAbet phones (optionally followed by digits, e.g. `ah0`)
     * into a space-separated "raw IPA" sequence. Each phone is looked up (case-insensitively) in the `ab2`
     * transliterations; the trailing digits are appended to every character of the IPA replacement.
     * Unknown phones yield `Cmud.C.replacement`.
     * @param {string} ab - ARPAbet phones, e.g. `'K AE1 T'`
     * @returns {string} raw IPA tokens joined by single spaces
     */
    ipa_raw_from_arpabet2(ab) {
      if (this.cache.ipa_raw_from_arpabet2 == null) {
        this.cache.ipa_raw_from_arpabet2 = this._build_cache_ipa_raw_from_arpabet2();
      }
      const cache = this.cache.ipa_raw_from_arpabet2;
      const replacement = this.constructor.C.replacement;
      const lookup = (key) => {
        const ipa = cache[key];
        return ipa != null ? ipa : replacement;
      };
      const R = [];
      for (const phone of ab.trim().toLowerCase().split(/\x20+/)) {
        const match = phone.match(/^(?<base>\D+)(?<level>\d*)$/);
        if (match == null) {
          R.push(lookup(phone));
          continue;
        }
        const {base, level} = match.groups;
        // `Array.from` splits by code point, so each character of a multi-character IPA value gets the digits.
        for (const letter of Array.from(lookup(base))) {
          R.push(letter + level);
        }
      }
      return R.join(' ');
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Turns the raw IPA of a CMUdict entry into its final form: rewrites a few vowel tokens, joins the
     * tokens, drops the digit `0` and replaces the digits `1` and `2` with combining marks.
     *
     * Token rewrites match on a leading comma and only look ahead for the closing one, so that two
     * adjacent matching tokens are both rewritten.
     * @param {string} ipa_raw - space-separated tokens as produced by `ipa_raw_from_arpabet2()`
     * @returns {string}
     */
    ipa_from_cmu_ipa_raw(ipa_raw) {
      let R = ',' + ipa_raw.replace(/\x20+/g, ',') + ',';
      R = R.replace(/,ʌ([02])(?=,)/g, ',ə$1');
      R = R.replace(/,ɝ0(?=,)/g, ',ə0,r');
      R = R.replace(/,ɝ1(?=,)/g, ',ɜ1,r');
      R = R.replace(/,ɝ2(?=,)/g, ',ɜ2,r');
      R = R.replace(/,/g, '');
      R = R.replace(/0/g, '');
      R = R.replace(/1/g, '̲');
      R = R.replace(/2/g, '̤');
      return R;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Turns the raw IPA of a BEEP entry into its final form: rewrites the token `ɝ` to `ɜ` followed by `r`
     * and joins the tokens. Adjacent matching tokens are both rewritten.
     * @param {string} ipa_raw - space-separated tokens as produced by `ipa_raw_from_arpabet2()`
     * @returns {string}
     */
    ipa_from_beep_ipa_raw(ipa_raw) {
      let R = ',' + ipa_raw.replace(/\x20+/g, ',') + ',';
      R = R.replace(/,ɝ(?=,)/g, ',ɜ,r');
      R = R.replace(/,/g, '');
      return R;
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Turns the raw IPA of a Britfone entry into its final form by removing all spaces.
     * @param {string} ipa_raw
     * @returns {string}
     */
    ipa_from_britfone_ipa_raw(ipa_raw) {
      return ipa_raw.replace(/\x20+/g, '');
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Splits a trailing bracketed number off a word, e.g. `'read(2)'` gives `{ word: 'read', nr: 2 }`.
     * Words without such a suffix get `nr` 1.
     * @param {string} word
     * @returns {{word: string, nr: number}}
     */
    _get_bracketed_nr(word) {
      let nr = 1;
      const bare_word = word.replace(/\((\d+)\)$/, (_match, digits) => {
        nr = parseInt(digits, 10);
        return '';
      });
      return {word: bare_word, nr};
    }

    //---------------------------------------------------------------------------------------------------------
    /**
     * Rewrites BEEP spelling conventions: underscores become spaces and the backslash-accent sequences
     * `\'a`, `` \`a ``, `\^a`, `\'e`, `` \`e ``, `\^e`, `\^o` become the corresponding accented letters.
     * @param {string} word
     * @returns {string}
     */
    _rewrite_beep_word(word) {
      const rewrites = [
        [/_/g, '\x20'],
        [/\\'a/g, 'á'],
        [/\\`a/g, 'à'],
        [/\\\^a/g, 'â'],
        [/\\'e/g, 'é'],
        [/\\`e/g, 'è'],
        [/\\\^e/g, 'ê'],
        [/\\\^o/g, 'ô']
      ];
      let R = word;
      for (const [pattern, replacement] of rewrites) {
        R = R.replace(pattern, replacement);
      }
      return R;
    }

  }

  //-----------------------------------------------------------------------------------------------------------
  // Class constants: the placeholder emitted for unknown phones and the default constructor settings.
  Cmud.C = guy.lft.freeze({
    replacement: '█',
    defaults: {
      //.....................................................................................................
      constructor_cfg: {
        db: null,
        prefix: 'cmud_',
        schema: 'cmud',
        path: PATH.join(home, 'cmudict.sqlite'),
        paths: {
          cmu: PATH.join(data_path, 'cmudict-0.7b'),
          beep: PATH.join(data_path, 'beep/beep-1.0'),
          bf_expansions: BRITFONE.expansions,
          bf_main: BRITFONE.main,
          bf_symbols: BRITFONE.symbols,
          spellings: PATH.join(data_path, 'beep/case.txt'),
          abipa: PATH.join(data_path, 'arpabet-to-ipa.tsv'),
          xsipa: PATH.join(data_path, 'xsampa-to-ipa.tsv')
        },
        rebuild: false,
        // No limit by default (the original literal `2e308` evaluates to `Infinity`).
        max_entry_count: Infinity
      }
    }
  });

  //===========================================================================================================
  this.Cmud = Cmud;

}).call(this);

//# sourceMappingURL=main.js.map