// @nahanil/zhdict-lite
// Access extended CC-CEDICT dictionary data
// (build script: parses a CC-CEDICT text dump into an SQLite database)
const fs = require('fs')
const _ = require('lodash')
const commandLineArgs = require('command-line-args')
const LineByLine = require('n-readlines')
const crypto = require('crypto')
const unihan = require('cjk-unihan')
const hsk = require('@nahanil/hsk-words')
const bushou = require('@nahanil/bushou')
// Column names for the four capture groups of `regex`, in match order.
const keys = ['traditional', 'simplified', 'pinyin', 'english']
// CC-CEDICT entry format, e.g.:  傳統 传统 [chuan2 tong3] /traditional/
// group 1 = traditional, 2 = simplified, 3 = pinyin (inside [...]),
// 4 = '/'-separated English glosses. Note `[^]` matches any char (incl. newline).
const regex = /(.*)\s+(.*)\s+\[([^]+)\]\s+[/](.*)[/]/
/**
 * Promise adapter over the callback-based `unihan.get`.
 * @param {string} key - character to look up in the Unihan database
 * @param {string} [field] - optional field name; when omitted the whole
 *   record is resolved (callers below rely on this to read kTotalStrokes etc.)
 * @returns {Promise<*>} resolves with the lookup result, rejects on error
 */
const getUnihan = (key, field) => new Promise((resolve, reject) => {
  unihan.get(key, field, (err, res) => {
    if (err) {
      reject(err)
      return
    }
    resolve(res)
  })
})
// Handle command line options:
//   --verbose / -v  extra logging (default false)
//   --input         path to the CC-CEDICT text dump (default ./cedict_ts.u8)
//   --output        path of the SQLite file to (re)create (default ./cedict.sqlite)
const optionDefinitions = [
  { name: 'verbose', alias: 'v', type: Boolean, defaultValue: false },
  { name: 'input', type: String, defaultValue: './cedict_ts.u8' },
  { name: 'output', type: String, defaultValue: './cedict.sqlite' }
]
const options = commandLineArgs(optionDefinitions)
options.verbose && console.log('Got command line options: ', options)
// Check the input file exists; bail early with a non-zero exit code if not.
if (!fs.existsSync(options.input)) {
  console.error('Input file not found: ' + options.input)
  process.exit(1)
}
// Project-local Sequelize models, bound to the output SQLite file.
const db = require('./models')({
  dbpath: options.output
})
// Once we have access to the database we can start the fun stuff:
// recreate the schema, then stream the CEDICT dump line-by-line into it.
console.log('Sync database')
let i = 0

// Expand a Unihan variant field (e.g. 'U+6C35<kMatthews,U+6C34') into the
// characters it references. Returns [] when the field is absent/empty.
// Uses String.fromCodePoint (not fromCharCode) so supplementary-plane CJK
// characters (U+20000 and above) are decoded correctly.
const parseVariants = (input) => {
  if (!input) { return [] }
  return (input.match(/U[+][0-9A-F]+/g) || [])
    .map((cp) => String.fromCodePoint(parseInt(cp.replace('U+', ''), 16)))
}

db.sequelize.sync({ force: true }).then(async () => {
  console.log(' * Done!')
  const hskWords = await hsk.getWordList('%')
  console.log(`Found ${hskWords.length} HSK Words`)
  console.log('Starting to read file')
  const liner = new LineByLine(options.input)
  // IDs already inserted, so duplicate headwords merge their glosses.
  // A Set keeps the membership test O(1) instead of Array#indexOf's O(n)
  // (the dump has >100k entries, so the array version was O(n^2) overall).
  const seen = new Set()
  let _line = liner.next()
  while (_line) {
    // Gotta be utf8
    const line = _line.toString('utf8')
    _line = liner.next()
    // Skip comments
    if (line.charAt(0) === '#') { continue }
    // Parse the line into an object. Skip blank/malformed lines instead of
    // crashing on a null match (the old code dereferenced match() directly).
    const match = line.match(regex)
    if (!match) { continue }
    const data = _.zipObject(keys, match.slice(1, 5))
    // Ensure the pinyin is always lowercase..
    data.pinyin = data.pinyin.toLowerCase()
    // Generate a stable ID from headword + reading
    data.id = crypto.createHash('sha1')
      .update(data.simplified + data.traditional + data.pinyin)
      .digest('hex')
    // Merge with the other variant(s), ie Jia1/jia1/家/家
    if (seen.has(data.id)) {
      const existing = await db.word.findOne({ where: { id: data.id } })
      existing.english = [].concat(data.english.split('/'), existing.english).join('/')
      await existing.save()
      options.verbose && console.log(`\nUpdated ${data.simplified} [${data.traditional}] /${data.pinyin}/`)
      continue
    }
    seen.add(data.id)
    // Add extra data for single characters
    if (data.simplified.length === 1) {
      // Add radical for single characters (keeping in mind that it may be
      // different for simplified/traditional characters).
      data.radicalSimplified = bushou.for(data.simplified)
      // FIX: was bushou.for(data.simplified) — copy/paste bug that made the
      // traditional radical always equal the simplified one.
      data.radicalTraditional = bushou.for(data.traditional)
      // Add some unihan data if we can find it
      const unihanData = {
        Simplified: await getUnihan(data.simplified),
        Traditional: await getUnihan(data.traditional)
      }
      const variants = []
      ;(['Simplified', 'Traditional']).forEach((variant) => {
        const uni = unihanData[variant]
        if (!uni) { return }
        // Add strokes
        if (uni.kTotalStrokes) {
          data[`strokes${variant}`] = parseInt(uni.kTotalStrokes, 10) || null
        }
        if (uni.kCangjie) {
          data[`cangjie${variant}`] = uni.kCangjie || null
        }
        // Add radical
        // kRSKangXi - 40.7
        //   the 40 is the radical, ie 宀
        //   the 7 is additional strokes
        if (uni.kRSKangXi) {
          const parts = uni.kRSKangXi.split('.')
          const r = bushou.byIndex(parts[0])
          if (r) {
            // Prefer the simplified form of the radical when it exists
            data[`radical${variant}`] = r[variant === 'Simplified' && r.simplified ? 'simplified' : 'radical']
            data[`strokesMinRad${variant}`] = parts[1]
          }
        }
        // Collect variant characters from the Unihan variant fields.
        // (The old code pushed parseVariants' undefined return value into
        // `variants`, relying on a later filter to drop the junk.)
        variants.push(
          ...parseVariants(uni.kZVariant),
          ...parseVariants(uni.kSemanticVariant),
          ...parseVariants(uni.kCompatibilityVariant),
          ...parseVariants(uni.kSpecializedSemanticVariant)
        )
        // FIX: the old code read unihanData[`k${variant}Variant`] — a key on
        // the two-entry wrapper object, always undefined — so
        // kSimplifiedVariant/kTraditionalVariant were never actually used.
        const variantField = uni[`k${variant}Variant`]
        if (variantField) {
          const extra = parseVariants(variantField)
          variants.push(...extra)
          data[`variants${variant}`] = extra.join(',')
        }
      })
      if (variants.length) {
        data.variants = _.uniq(variants.filter(v => !!v)).join(',')
      }
    }
    // Find hsk level?
    const inHsk = hskWords.find(w => w.simplified === data.simplified && w.pronunciation === data.pinyin)
    if (inHsk) {
      data.hskLevel = inHsk.level
    }
    // TODO: Add word 'frequency' so search results can be ordered more usefully
    try {
      await db.word.create(data)
    } catch (err) {
      console.error('Failed to insert record: ', JSON.stringify(data))
      console.error(err, err.stack)
      process.exit(1)
    }
    ++i
    process.stdout.write(`\rInserted: ${i}`)
  }
}).catch((err) => {
  // The old chain had no rejection handler, so any failure during sync or
  // import died as an unhandled rejection with no exit code.
  console.error(err, err.stack)
  process.exit(1)
})