js-mdict
Version:
mdict (*.mdx, *.mdd) file reader
205 lines • 9.23 kB
JavaScript
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Mdict = void 0;
const mdict_base_js_1 = __importDefault(require("./mdict-base.js"));
const utils_js_1 = __importDefault(require("./utils.js"));
const lzo1x_wrapper_js_1 = __importDefault(require("./lzo1x-wrapper.js"));
const zlib_1 = __importDefault(require("zlib"));
const pako = {
inflate: zlib_1.default.inflateSync
};
class Mdict extends mdict_base_js_1.default {
constructor(fname, options) {
var _a, _b, _c, _d, _e, _f;
options = options || {};
// default options
options = {
passcode: (_a = options.passcode) !== null && _a !== void 0 ? _a : '',
debug: (_b = options.debug) !== null && _b !== void 0 ? _b : false,
resort: (_c = options.resort) !== null && _c !== void 0 ? _c : true,
isStripKey: (_d = options.isStripKey) !== null && _d !== void 0 ? _d : true,
isCaseSensitive: (_e = options.isCaseSensitive) !== null && _e !== void 0 ? _e : true,
encryptType: (_f = options.encryptType) !== null && _f !== void 0 ? _f : -1,
};
const passcode = options.passcode || undefined;
super(fname, passcode, options);
}
/**
* lookupKeyInfoItem lookup the `keyInfoItem`
* the `keyInfoItem` contains key-word record block location: recordStartOffset
* the `recordStartOffset` should indicate the unpacked record data relative offset
* @param word the target word phrase
*/
lookupKeyBlockByWord(word, isAssociate = false) {
// const keyBlockInfoId = this.lookupKeyInfoByWord(word);
// if (keyBlockInfoId < 0) {
// return undefined;
// }
// TODO: if the this.list length parse too slow, can decode by below code
// const list = this.lookupPartialKeyBlockListByKeyInfoId(keyInfoId);
const list = this.keywordList;
// binary search
let left = 0;
let right = list.length - 1;
let mid = 0;
while (left <= right) {
mid = left + ((right - left) >> 1);
const compRes = this.comp(word, list[mid].keyText);
if (compRes > 0) {
left = mid + 1;
}
else if (compRes == 0) {
break;
}
else {
right = mid - 1;
}
}
if (this.comp(word, list[mid].keyText) != 0) {
if (!isAssociate) {
return undefined;
}
}
return list[mid];
}
/**
* locate the record meaning buffer by `keyListItem`
* the `KeyBlockItem.recordStartOffset` should indicate the record block info location
* use the record block info, we can get the `recordBuffer`, then we need decrypt and decompress
* use decompressed `recordBuffer` we can get the total block which contains meanings
* then, use:
* const start = item.recordStartOffset - recordBlockInfo.unpackAccumulatorOffset;
* const end = item.recordEndOffset - recordBlockInfo.unpackAccumulatorOffset;
* the finally meaning's buffer is `unpackRecordBlockBuff[start, end]`
* @param item
*/
lookupRecordByKeyBlock(item) {
const recordBlockIndex = this.reduceRecordBlockInfo(item.recordStartOffset);
const recordBlockInfo = this.recordInfoList[recordBlockIndex];
const recordBuffer = this.scanner.readBuffer(this._recordBlockStartOffset + recordBlockInfo.packAccumulateOffset, recordBlockInfo.packSize);
const unpackRecordBlockBuff = this.decompressBuff(recordBuffer, recordBlockInfo.unpackSize);
const start = item.recordStartOffset - recordBlockInfo.unpackAccumulatorOffset;
const end = item.recordEndOffset - recordBlockInfo.unpackAccumulatorOffset;
return unpackRecordBlockBuff.slice(start, end);
}
/**
* lookupPartialKeyInfoListById
* decode key block by key block id, and we can get the partial key list
* the key list just contains the partial key list
* @param {number} keyInfoId key block id
* @return {KeyWordItem[]}
*/
lookupPartialKeyBlockListByKeyInfoId(keyInfoId) {
const packSize = this.keyInfoList[keyInfoId].keyBlockPackSize;
const unpackSize = this.keyInfoList[keyInfoId].keyBlockUnpackSize;
const startOffset = this.keyInfoList[keyInfoId].keyBlockPackAccumulator + this._keyBlockStartOffset;
const keyBlockPackedBuff = this.scanner.readBuffer(startOffset, packSize);
const keyBlock = this.unpackKeyBlock(keyBlockPackedBuff, unpackSize);
return this.splitKeyBlock(keyBlock, keyInfoId);
}
/**
* lookupInfoBlock reduce word find the nearest key block
* @param {string} word searching phrase
* @param keyInfoList
*/
lookupKeyInfoByWord(word, keyInfoList) {
const list = keyInfoList ? keyInfoList : this.keyInfoList;
let left = 0;
let right = list.length - 1;
let mid = 0;
// when compare the word, the uppercase words are less than lowercase words
// so we compare with the greater symbol is wrong, we need to use the `common.wordCompare` function
while (left <= right) {
mid = left + ((right - left) >> 1);
if (this.comp(word, list[mid].firstKey) >= 0 &&
this.comp(word, list[mid].lastKey) <= 0) {
return mid;
}
else if (this.comp(word, list[mid].lastKey) >= 0) {
left = mid + 1;
}
else {
right = mid - 1;
}
}
return -1;
}
decompressBuff(recordBuffer, unpackSize) {
// decompress
// 4 bytes: compression type
const rbCompType = Buffer.from(recordBuffer.subarray(0, 4));
// record_block stores the final record data
let unpackRecordBlockBuff = new Uint8Array(recordBuffer.length);
// TODO: igore adler32 offset
// Note: here ignore the checksum part
// bytes: adler32 checksum of decompressed record block
// adler32 = unpack('>I', record_block_compressed[4:8])[0]
if (rbCompType.toString('hex') === '00000000') {
unpackRecordBlockBuff = recordBuffer.slice(8);
}
else {
// decrypt
let blockBufDecrypted = null;
// if encrypt type == 1, the record block was encrypted
if (this.meta.encrypt === 1 /* || (this.meta.ext == "mdd" && this.meta.encrypt === 2 ) */) {
// const passkey = new Uint8Array(8);
// record_block_compressed.copy(passkey, 0, 4, 8);
// passkey.set([0x95, 0x36, 0x00, 0x00], 4); // key part 2: fixed data
blockBufDecrypted = utils_js_1.default.mdxDecrypt(recordBuffer);
}
else {
blockBufDecrypted = recordBuffer.subarray(8, recordBuffer.length);
}
// decompress
if (rbCompType.toString('hex') === '01000000') {
unpackRecordBlockBuff = lzo1x_wrapper_js_1.default.decompress(blockBufDecrypted, unpackSize, 1308672);
unpackRecordBlockBuff = Buffer.from(unpackRecordBlockBuff).subarray(unpackRecordBlockBuff.byteOffset, unpackRecordBlockBuff.byteOffset + unpackRecordBlockBuff.byteLength);
}
else if (rbCompType.toString('hex') === '02000000') {
// zlib decompress
unpackRecordBlockBuff = Buffer.from(pako.inflate(blockBufDecrypted));
}
}
return unpackRecordBlockBuff;
}
/**
* find record which record start locate
* @param {number} recordStart record start offset
*/
reduceRecordBlockInfo(recordStart) {
let left = 0;
let right = this.recordInfoList.length - 1;
let mid = 0;
while (left <= right) {
mid = left + ((right - left) >> 1);
if (recordStart >= this.recordInfoList[mid].unpackAccumulatorOffset) {
left = mid + 1;
}
else {
right = mid - 1;
}
}
return left - 1;
}
close() {
this.scanner.close();
this.keywordList = [];
this.keyInfoList = [];
this.recordInfoList = [];
}
}
exports.Mdict = Mdict;
/**
* 经过一系列测试, 发现mdx格式的文件存在较大的词语排序问题,存在如下情况:
* 1. 大小写的问题 比如 a-zA-Z 和 aA-zZ 这种并存的情况
* 2. 多语言的情况,存在英文和汉字比较大小的情况一般情况下 英文应当排在汉字前面
* 3. 小语种的情况
* 上述的这些情况都有可能出现,无法通过字典头中的设置实现排序,所以无法通过内部的keyInfoList进行快速索引,
* 在现代计算机的性能条件下,直接遍历全部词条也可得到较好的效果,因此目前采用的策略是全部读取词条,内部排序
*
*/
exports.default = Mdict;
//# sourceMappingURL=mdict.js.map