UNPKG

@nahanil/zhdict-lite

Version:

Access extended CC-CEDICT dictionary data

168 lines (142 loc) 5.88 kB
'use strict'

// Build script: parses a CC-CEDICT dump (cedict_ts.u8) and loads it into a
// SQLite database, enriching single-character entries with radical, stroke,
// cangjie, variant, and HSK-level data.

const fs = require('fs')
const crypto = require('crypto')
const _ = require('lodash')
const commandLineArgs = require('command-line-args')
const LineByLine = require('n-readlines')
const unihan = require('cjk-unihan')
const hsk = require('@nahanil/hsk-words')
const bushou = require('@nahanil/bushou')

// Field order matches the capture groups of `regex` below.
const keys = ['traditional', 'simplified', 'pinyin', 'english']
// CC-CEDICT line format: TRADITIONAL SIMPLIFIED [pin1 yin1] /gloss/gloss/
const regex = /(.*)\s+(.*)\s+\[([^]+)\]\s+[/](.*)[/]/

/**
 * Promisified wrapper around cjk-unihan's callback API.
 * @param {string} key - Character to look up.
 * @param {string} [field] - Optional single field; omitted returns all fields.
 * @returns {Promise<*>} Resolves with the Unihan record (or null if absent).
 */
const getUnihan = (key, field) => new Promise((resolve, reject) => {
  unihan.get(key, field, (err, res) => {
    if (err) { return reject(err) }
    resolve(res)
  })
})

/**
 * Extract the characters referenced by a Unihan variant field, e.g.
 * 'U+6C35<kMatthews,U+6C35<kMatthews' -> ['氵', '氵'].
 * BUG FIX: the original pushed into an outer array AND returned undefined,
 * and that undefined return value was itself pushed/stored by callers.
 * Now a pure function returning the parsed characters.
 * @param {string} [input] - Raw variant field value (may be undefined).
 * @returns {string[]} Referenced characters; empty array when none.
 */
const parseVariants = (input) => {
  const matches = (input && input.match(/(U[+][0-9A-F]+)/g)) || []
  // fromCodePoint, not fromCharCode: many CJK characters (and most variant
  // references in the CJK extensions) live outside the BMP.
  return matches.map((m) => String.fromCodePoint(parseInt(m.replace('U+', ''), 16)))
}

// Handle command line options
const optionDefinitions = [
  { name: 'verbose', alias: 'v', type: Boolean, defaultValue: false },
  { name: 'input', type: String, defaultValue: './cedict_ts.u8' },
  { name: 'output', type: String, defaultValue: './cedict.sqlite' }
]
const options = commandLineArgs(optionDefinitions)
options.verbose && console.log('Got command line options: ', options)

// Check the input file exists
if (!fs.existsSync(options.input)) {
  console.error('Input file not found: ' + options.input)
  process.exit(1)
}

const db = require('./models')({ dbpath: options.output })

// Once we have access to the database we can start the fun stuff
console.log('Sync database')
let i = 0
db.sequelize.sync({ force: true }).then(async () => {
  console.log(' * Done!')
  const hskWords = await hsk.getWordList('%')
  console.log(`Found ${hskWords.length} HSK Words`)

  console.log('Starting to read file')
  const liner = new LineByLine(options.input)

  // Loop through every line in the file.
  // A Set (not array + indexOf) keeps duplicate detection O(1) across the
  // ~100k dictionary entries instead of O(n) per line.
  const seen = new Set()
  let _line = liner.next()
  while (_line) {
    // Gotta be utf8
    const line = _line.toString('utf8')
    _line = liner.next()

    // Skip comments
    if (line.charAt(0) === '#') { continue }

    // Parse the line into an object. Skip malformed lines instead of
    // crashing on `null.slice` when the regex does not match.
    const match = line.match(regex)
    if (!match) { continue }
    const data = _.zipObject(keys, match.slice(1, 5))

    // Ensure the pinyin is always lowercase..
    data.pinyin = data.pinyin.toLowerCase()

    // Generate the ID
    data.id = crypto.createHash('sha1')
      .update(data.simplified + data.traditional + data.pinyin)
      .digest('hex')

    // Merge with the other variant(s), ie Jia1/jia1/家/家
    if (seen.has(data.id)) {
      const existing = await db.word.findOne({ where: { id: data.id } })
      existing.english = [].concat(data.english.split('/'), existing.english).join('/')
      await existing.save()
      options.verbose && console.log(`\nUpdated ${data.simplified} [${data.traditional}] /${data.pinyin}/`)
      continue
    }
    seen.add(data.id)

    // Add extra data for single characters
    if (data.simplified.length === 1) {
      // Add radical for single characters (keeping in mind that it may be
      // different for simplified/traditional characters)
      data.radicalSimplified = bushou.for(data.simplified)
      // BUG FIX: was bushou.for(data.simplified) — the traditional radical
      // must be derived from the traditional character.
      data.radicalTraditional = bushou.for(data.traditional)

      // Add some unihan data if we can find it
      const unihanData = {
        Simplified: await getUnihan(data.simplified),
        Traditional: await getUnihan(data.traditional)
      }

      const variants = []
      for (const variant of ['Simplified', 'Traditional']) {
        const uh = unihanData[variant]
        if (!uh) { continue }

        // Add strokes
        if (uh.kTotalStrokes) {
          data[`strokes${variant}`] = parseInt(uh.kTotalStrokes, 10) || null
        }
        if (uh.kCangjie) {
          data[`cangjie${variant}`] = uh.kCangjie || null
        }

        // Add radical
        // kRSKangXi - 40.7
        //   the 40 is the radical, ie 宀
        //   the 7 is additional strokes
        if (uh.kRSKangXi) {
          const parts = uh.kRSKangXi.split('.')
          const r = bushou.byIndex(parts[0])
          if (r) {
            data[`radical${variant}`] = r[
              variant === 'Simplified' && r.simplified ? 'simplified' : 'radical'
            ]
            data[`strokesMinRad${variant}`] = parts[1]
          }
        }

        // Collect every flavour of variant character for this entry.
        variants.push(...parseVariants(uh.kZVariant))
        variants.push(...parseVariants(uh.kSemanticVariant))
        variants.push(...parseVariants(uh.kCompatibilityVariant))
        variants.push(...parseVariants(uh.kSpecializedSemanticVariant))
        // BUG FIX: the original indexed unihanData[`k${variant}Variant`]
        // (always undefined — the field lives on the per-variant record) and
        // stored parseVariants' undefined return value in the data row.
        if (uh[`k${variant}Variant`]) {
          const vchars = parseVariants(uh[`k${variant}Variant`])
          variants.push(...vchars)
          data[`variants${variant}`] = vchars.join(',')
        }
      }

      if (variants.length) {
        data.variants = _.uniq(variants.filter(v => !!v)).join(',')
      }
    }

    // Find hsk level?
    const inHsk = hskWords.find(w => w.simplified === data.simplified && w.pronunciation === data.pinyin)
    if (inHsk) {
      data.hskLevel = inHsk.level
    }

    // TODO: Add word 'frequency' so search results can be ordered more usefully
    try {
      await db.word.create(data)
    } catch (err) {
      console.error('Failed to insert record: ', JSON.stringify(data))
      console.error(err, err.stack)
      process.exit(1)
    }
    ++i
    process.stdout.write(`\rInserted: ${i}`)
  }
})