js-mdict
Version:
mdict (*.mdx, *.mdd) file reader. Licensed under AGPL-3.0 for better community cooperation and commercial value protection.
888 lines • 39.9 kB
JavaScript
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.MdictMeta = void 0;
const assert_1 = __importDefault(require("assert"));
const lzo1x_wrapper_js_1 = __importDefault(require("./lzo1x-wrapper.js"));
const utils_js_1 = __importDefault(require("./utils.js"));
const scanner_js_1 = require("./scanner.js");
const zlib_1 = __importDefault(require("zlib"));
const pako = {
inflate: zlib_1.default.inflateSync,
};
const UTF_16LE_DECODER = new TextDecoder('utf-16le');
const UTF16 = 'UTF-16';
const UTF_8_DECODER = new TextDecoder('utf-8');
const UTF8 = 'UTF-8';
const BIG5_DECODER = new TextDecoder('big5');
const BIG5 = 'BIG5';
const GB18030_DECODER = new TextDecoder('gb18030');
const GB18030 = 'GB18030';
class MdictMeta {
constructor() {
this.fname = '';
// mdx 密码
this.passcode = '';
// ext 文件后缀
this.ext = 'mdx';
// mdx version
this.version = 2.0;
// num width
this.numWidth = 4;
// num format
this.numFmt = utils_js_1.default.NUMFMT_UINT32;
// encoding 编码
this.encoding = '';
// decoder 解码器
this.decoder = new TextDecoder();
// 是否加密
this.encrypt = 0;
}
}
exports.MdictMeta = MdictMeta;
/**
* @class MdictBase, the basic mdict diction parser class
* @brif
* STEPS:
* 1. read mdict file header
* 2. read key header
* 3. read key block info
* 4. read key block
* 5. read record header
* 6. read record block info
* 7. read record block data
*
* 词典结构包括如下部分:
*
* Header : 记录词典的meta信息,包括名称、描述、样式、编码方式等
* KeyInfo : 记录词典的Key排列信息,设计用于索引
* KeyBlock : 记录词典的所有key列表信息,可以在 key block 中得到本词典的所有词条
* RecordHeader : 记录词典中所有record的meta信息,包括record的数量、大小等
* RecordInfo : 记录词典的所有record词条释义信息,可以加速检索
* RecordBlock: 记录词典的所有record词条释义,如果是mdd文件,则为二进制图片、音频等
*
*/
class MDictBase {
/**
* mdict constructor
* @param {string} fname
* @param {string} passcode
* @param options
*/
constructor(fname, passcode, options) {
// mdx meta
this.meta = new MdictMeta();
// the mdict file name
this.meta.fname = fname;
// the dictionary file decrypt pass code
this.meta.passcode = passcode;
// the dictionary file extension
this.meta.ext = utils_js_1.default.getExtension(fname, 'mdx');
// the file scanner
this.scanner = new scanner_js_1.FileScanner(fname);
// set options
this.options = options !== null && options !== void 0 ? options : {
passcode: passcode,
debug: false,
resort: true,
isStripKey: true,
isCaseSensitive: false,
encryptType: -1,
};
// # decrypt regcode to get the encrypted key
// TODO implements passcode decrypt part
if (passcode) {
// const {regcode, userid} = passcode
// if isinstance(userid, unicode):
// userid = userid.encode('utf8')
// self._encrypted_key = _decrypt_regcode_by_userid(regcode, userid)
}
else if (this.meta.version >= 3.0) {
// uuid = self.header.get(b'UUID')
// if uuid:
// if xxhash is None:
// raise RuntimeError('xxhash module is needed to read MDict 3.0 format')
// mid = (len(uuid) + 1) // 2
// self._encrypted_key = xxhash.xxh64_digest(uuid[:mid]) + xxhash.xxh64_digest(uuid[mid:])
}
// -------------------------
// dict header section
//--------------------------
// read the diction header info
this._headerStartOffset = 0;
this._headerEndOffset = 0;
this.header = {};
// -------------------------
// dict key header section
// --------------------------
this._keyHeaderStartOffset = 0;
this._keyHeaderEndOffset = 0;
this.keyHeader = {
keywordBlocksNum: 0,
keywordNum: 0,
keyInfoUnpackSize: 0,
keyInfoPackedSize: 0,
keywordBlockPackedSize: 0
};
// -------------------------
// dict key info section
// --------------------------
this._keyBlockInfoStartOffset = 0;
this._keyBlockInfoEndOffset = 0;
// key block info list
this.keyInfoList = [];
// -------------------------
// dict key block section
// --------------------------
this._keyBlockStartOffset = 0;
this._keyBlockEndOffset = 0;
this.keywordList = [];
// -------------------------
// dict record header section
// --------------------------
this._recordHeaderStartOffset = 0;
this._recordHeaderEndOffset = 0;
this.recordHeader = {
recordBlocksNum: 0,
entriesNum: 0,
recordInfoCompSize: 0,
recordBlockCompSize: 0,
};
// -------------------------
// dict record info section
// --------------------------
this._recordInfoStartOffset = 0;
this._recordInfoEndOffset = 0;
this.recordInfoList = [];
// -------------------------
// dict record block section
// --------------------------
this._recordBlockStartOffset = 0;
this._recordBlockEndOffset = 0;
this.recordBlockDataList = [];
this.readDict();
}
strip(key) {
if (this._isStripKey()) {
key = key.replace(utils_js_1.default.REGEXP_STRIPKEY[this.meta.ext], '$1');
}
if (!this._isKeyCaseSensitive()) {
key = key.toLowerCase();
}
if (this.meta.ext == 'mdd') {
key = key.replace(utils_js_1.default.REGEXP_STRIPKEY[this.meta.ext], '$1');
key = key.replace(/_/g, '!');
}
return key.toLowerCase().trim();
}
comp(word1, word2) {
return word1.localeCompare(word2);
}
// comp2(word1: string, word2: string): number {
// // if case-sensitive, the uppercase word is smaller than lowercase word
// // for example: `Holanda` is smaller than `abacaxi`
// // so when comparing with the words, we should use the dictionary order,
// // however, if we change the word to lowercase, the binary search algorithm will be confused
// // so, we use the enhanced compare function `common.wordCompare`
//
// const key1 = this.strip(word1);
// const key2 = this.strip(word2);
//
// const collator = new Intl.Collator('en-US');
// const result = collator.compare(key1, key2);
// if (hasLatinies(word1) && hasLatinies(word2)){
// if (word1.length > word2.length) {
// return 1;
// } else if (word1.length < word2.length) {
// const result2 = word1.localeCompare(word2);
// if (result2 >= 0 ){
// return result2;
// } else {
// if (word1.length > word2.length) {
// return 1;
// }
// return -1;
// }
// }
// }
// if (hasLatinies(word1) || hasLatinies(word2)){
// if (word1.length > word2.length) {
// const result2 = word1.localeCompare(word2);
// if (result2 >= 0 ){
// return result2;
// } else {
// if (word1.length > word2.length) {
// return 1;
// }
// return -1;
// }
// } else if (word1.length < word2.length) {
// return 1;
// } else {
// if (hasLatinies(word1) && !hasLatinies(word2)){
// return 1;
// }
// }
// }
// if(result == 0) {
// // prefix
// if (word1.at(0) === '-' && word2.at(0) !== '-') {
// return 1;
// }
// if (word2.at(0) === '-' && word1.at(0) !== '-') {
// return 1;
// }
// //inner space and middle dash
// if (word2.indexOf('-') > 0 && word1.indexOf(' ') >0) {
// return 0;
// }
// if (word1.indexOf('-') > 0 && word2.indexOf(' ') >0) {
// return 0;
// }
//
// }
// if (result < 0) {
// if (this.meta.ext == 'mdd') {
// if (key1.length > key2.length) {
// return this.strip(key1) > this.strip(key2) ? -1 : 1;
// } else if (key2.length > key1.length) {
// return 1;
// }
// }
// return result;
// }
// return result;
// }
_isKeyCaseSensitive() {
return this.options.isCaseSensitive || utils_js_1.default.isTrue(this.header['isCaseSensitive']);
}
_isStripKey() {
return this.options.isStripKey || utils_js_1.default.isTrue(this.header['StripKey']);
}
readDict() {
// STEP1: read header
this._readHeader();
// STEP2: read key header
this._readKeyHeader();
// STEP3: read key block info
this._readKeyInfos();
// STEP4: read key block
// @depreciated
// _readKeyBlock method is very slow, avoid invoke dirctly
// this method will return the whole words list of the dictionaries file, this is very slow
// NOTE: 本方法非常缓慢,也有可能导致内存溢出,请不要直接调用
this._readKeyBlocks();
// STEP5: read record header
this._readRecordHeader();
// STEP6: read record block info
this._readRecordInfos();
// STEP7: read record block
// _readRecordBlock method is very slow, avoid invoke directly
// this._readRecordBlock();
// Finally: resort the keyword list
this.keywordList.sort((ki1, ki2) => {
return ki1.keyText.localeCompare(ki2.keyText);
});
}
/**
* STEP 4.2. split keys from key block
* split key from key block buffer
* @param {Buffer} keyBlock key block buffer
* @param {number} keyBlockIdx
*/
splitKeyBlock(keyBlock, keyBlockIdx) {
const width = this.meta.encoding == 'UTF-16' || this.meta.ext == 'mdd' ? 2 : 1;
const keyList = [];
// because 0-7 is the leading number, we start at keyblock[7]
let keyStartIndex = 0;
while (keyStartIndex < keyBlock.length) {
let meaningOffset = 0;
const meaningOffsetBuff = keyBlock.slice(keyStartIndex, keyStartIndex + this.meta.numWidth);
meaningOffset = utils_js_1.default.b2n(meaningOffsetBuff);
let keyEndIndex = -1;
let i = keyStartIndex + this.meta.numWidth;
while (i < keyBlock.length) {
if ((width === 1 && keyBlock[i] == 0) || (width === 2 && keyBlock[i] == 0 && keyBlock[i + 1] == 0)) {
keyEndIndex = i;
break;
}
i += width;
}
if (keyEndIndex == -1) {
break;
}
const keyTextBuffer = keyBlock.slice(keyStartIndex + this.meta.numWidth, keyEndIndex);
const keyText = this.meta.decoder.decode(keyTextBuffer);
if (keyList.length > 0) {
keyList[keyList.length - 1].recordEndOffset = meaningOffset;
}
keyList.push({
recordStartOffset: meaningOffset,
keyText,
keyBlockIdx: keyBlockIdx,
recordEndOffset: -1
});
keyStartIndex = keyEndIndex + width;
}
return keyList;
}
/**
* STEP 1. read dictionary header
* Get mdx header info (xml content to object)
* [0:4], 4 bytes header length (header_byte_size), big-endian, 4 bytes, 16 bits
* [4:header_byte_size + 4] header_bytes
* [header_bytes_size + 4:header_bytes_size +8] adler32 checksum
* should be:
* assert(zlib.adler32(header_bytes) & 0xffffffff, adler32)
*
*/
_readHeader() {
// [0:4], 4 bytes header length (header_byte_size), big-endian, 4 bytes, 16 bits
const headerByteSizeBuff = this.scanner.readBuffer(0, 4);
const headerByteSize = utils_js_1.default.b2n(headerByteSizeBuff);
// [4:header_byte_size + 4] header_bytes
const headerBuffer = this.scanner.readBuffer(4, headerByteSize);
// TODO: SKIP 4 bytes alder32 checksum
// header_b_cksum should skip for now, because cannot get alder32 sum by js
// const header_b_cksum = readChunk.sync(this.meta.fname, header_byte_size + 4, 4);
// assert(header_b_cksum), "header_bytes checksum failed");
// 4 bytes header size + header_bytes_size + 4bytes alder checksum
this._headerEndOffset = headerByteSize + 4 + 4;
this._keyHeaderStartOffset = headerByteSize + 4 + 4;
// header text in utf-16 encoding ending with `\x00\x00`, so minus 2
// const headerText = common.readUTF16(headerBuffer, 0, headerByteSize - 2);
const headerText = UTF_16LE_DECODER.decode(headerBuffer);
// parse header info
Object.assign(this.header, utils_js_1.default.parseHeader(headerText));
// set header default configuration
this.header.KeyCaseSensitive = this.header.KeyCaseSensitive || 'No';
this.header.StripKey = this.header.StripKey || 'Yes';
// encrypted flag
// 0x00 - no encryption
// 0x01 - encrypt record block
// 0x02 - encrypt key info block
if (!this.header.Encrypted || this.header.Encrypted == '' || this.header.Encrypted == 'No') {
this.meta.encrypt = 0;
}
else if (this.header.Encrypted == 'Yes') {
this.meta.encrypt = 1;
}
else {
this.meta.encrypt = parseInt(this.header['Encrypted'], 10);
}
if (this.options.encryptType && this.options.encryptType != -1) {
this.meta.encrypt = this.options.encryptType;
}
// stylesheet attribute if present takes from of:
// style_number # 1-255
// style_begin # or ''
// style_end # or ''
// TODO: splitstyle info
// header_info['_stylesheet'] = {}
// if header_tag.get('StyleSheet'):
// lines = header_tag['StyleSheet'].splitlines()
// for i in range(0, len(lines), 3):
// header_info['_stylesheet'][lines[i]] = (lines[i + 1], lines[i + 2])
// before version 2.0, number is 4 bytes integer alias, int32
// version 2.0 and above use 8 bytes, alias int64
this.meta.version = parseFloat(this.header['GeneratedByEngineVersion']);
if (this.meta.version >= 2.0) {
this.meta.numWidth = 8;
this.meta.numFmt = utils_js_1.default.NUMFMT_UINT64;
}
else {
this.meta.numWidth = 4;
this.meta.numFmt = utils_js_1.default.NUMFMT_UINT32;
}
if (!this.header.Encoding || this.header.Encoding == '') {
this.meta.encoding = UTF8;
this.meta.decoder = UTF_8_DECODER;
}
else if (this.header.Encoding == 'GBK' || this.header.Encoding == 'GB2312') {
this.meta.encoding = GB18030;
this.meta.decoder = GB18030_DECODER;
}
else if (this.header['Encoding'].toLowerCase() == 'big5') {
this.meta.encoding = BIG5;
this.meta.decoder = BIG5_DECODER;
}
else {
this.meta.encoding =
this.header['Encoding'].toLowerCase() == 'utf16' ||
this.header['Encoding'].toLowerCase() == 'utf-16'
? UTF16
: UTF8;
if (this.meta.encoding == UTF16) {
this.meta.decoder = UTF_16LE_DECODER;
}
else {
this.meta.decoder = UTF_8_DECODER;
}
}
// determine the encoding and decoder, if extension is *.mdd
if (this.meta.ext === 'mdd') {
this.meta.encoding = UTF16;
this.meta.decoder = UTF_16LE_DECODER;
}
}
/**
* STEP 2. read key block header
* read key block header
*/
_readKeyHeader() {
// header info struct:
// [0:8]/[0:4] - number of key blocks
// [8:16]/[4:8] - number of entries
// [16:24]/[8:12] - key block info decompressed size (if version >= 2.0, else not exist)
// [24:32]/null - key block info size
// [32:40]/[12:16] - key block size
// note: if version <2.0, the key info buffer size is 4 * 4
// otherwise, ths key info buffer size is 5 * 8
// <2.0 the order of number is same
// set offset
this._keyHeaderStartOffset = this._headerEndOffset;
// version >= 2.0, key_header bytes number is 5 * 8, otherwise, 4 * 4
const headerMetaSize = this.meta.version >= 2.0 ? 8 * 5 : 4 * 4;
// const keyHeaderBuff = this._readBuffer(this._keyHeaderStartOffset, bytesNum);
const keyHeaderBuff = this.scanner.readBuffer(this._keyHeaderStartOffset, headerMetaSize);
// decrypt
if (this.meta.encrypt & 1) {
if (!this.meta.passcode || this.meta.passcode == '') {
// TODO: encrypted file not support yet
throw Error(' user identification is needed to read encrypted file');
}
// regcode, userid = header_info['_passcode']
if (this.header.RegisterBy == 'Email') {
// encrypted_key = _decrypt_regcode_by_email(regcode, userid);
throw Error('encrypted file not support yet');
}
else {
throw Error('encrypted file not support yet');
}
}
let offset = 0;
// [0:8] - number of key blocks
const keywordBlockNumBuff = keyHeaderBuff.slice(offset, offset + this.meta.numWidth);
this.keyHeader.keywordBlocksNum = utils_js_1.default.b2n(keywordBlockNumBuff);
offset += this.meta.numWidth;
// [8:16] - number of entries
const keywordNumBuff = keyHeaderBuff.slice(offset, offset + this.meta.numWidth);
this.keyHeader.keywordNum = utils_js_1.default.b2n(keywordNumBuff);
offset += this.meta.numWidth;
// [16:24] - number of key block info decompress size
if (this.meta.version >= 2.0) {
// only for version > 2.0
const keyInfoUnpackSizeBuff = keyHeaderBuff.slice(offset, offset + this.meta.numWidth);
const keyInfoUnpackSize = utils_js_1.default.b2n(keyInfoUnpackSizeBuff);
offset += this.meta.numWidth;
this.keyHeader.keyInfoUnpackSize = keyInfoUnpackSize;
}
// [24:32] - number of key block info compress size
const keyInfoPackedSizeBuff = keyHeaderBuff.slice(offset, offset + this.meta.numWidth);
const keyInfoPackedSize = utils_js_1.default.b2n(keyInfoPackedSizeBuff);
offset += this.meta.numWidth;
this.keyHeader.keyInfoPackedSize = keyInfoPackedSize;
// [32:40] - number of key blocks total size, note, key blocks total size, not key block info
const keywordBlockPackedSizeBuff = keyHeaderBuff.slice(offset, offset + this.meta.numWidth);
const keywordBlockPackedSize = utils_js_1.default.b2n(keywordBlockPackedSizeBuff);
offset += this.meta.numWidth;
this.keyHeader.keywordBlockPackedSize = keywordBlockPackedSize;
// 4 bytes alder32 checksum, after key info block (only >= v2.0)
// set end offset
this._keyHeaderEndOffset = this._keyHeaderStartOffset +
headerMetaSize + (this.meta.version >= 2.0 ? 4 : 0); /* 4 bytes adler32 checksum length, only for version >= 2.0 */
}
/**
* STEP 3. read key block info, if you want quick search, read at here already enough
* read key block info
* key block info list
*/
_readKeyInfos() {
this._keyBlockInfoStartOffset = this._keyHeaderEndOffset;
const keyBlockInfoBuff = this.scanner.readBuffer(this._keyBlockInfoStartOffset, this.keyHeader.keyInfoPackedSize);
const keyBlockInfoList = this._decodeKeyInfo(keyBlockInfoBuff);
this._keyBlockInfoEndOffset = this._keyBlockInfoStartOffset + this.keyHeader.keyInfoPackedSize;
(0, assert_1.default)(this.keyHeader.keywordBlocksNum === keyBlockInfoList.length, 'the num_key_info_list should equals to key_block_info_list');
this.keyInfoList = keyBlockInfoList;
// NOTE: must set at here, otherwise, if we haven't invoked the _decodeKeyBlockInfo method,
// var `_recordBlockStartOffset` will not be set.
this._recordBlockStartOffset = this._keyBlockInfoEndOffset + this.keyHeader.keywordBlockPackedSize;
}
/**
* STEP 3.1. decode key block info, this function will invokde in `_readKeyBlockInfo`
* and decode the first key and last key infomation, etc.
* @param {Uint8Array} keyInfoBuff key block info buffer
*/
_decodeKeyInfo(keyInfoBuff) {
const keyBlockNum = this.keyHeader.keywordBlocksNum;
if (this.meta.version == 2.0) {
const packType = keyInfoBuff.subarray(0, 4).join('');
// const _alder32Buff = keyInfoBuff.slice(4, 8)
// const numEntries = this.keyHeader.entriesNum;
if (this.meta.encrypt === 2) {
keyInfoBuff = utils_js_1.default.mdxDecrypt(keyInfoBuff);
}
(0, assert_1.default)(this.keyHeader.keyInfoPackedSize == keyInfoBuff.length, `key_block_info keyInfoPackedSize ${this.keyHeader.keyInfoPackedSize} should equal to key-info buffer length ${keyInfoBuff.length}`);
if (this.meta.version >= 2.0 && packType == '2000') {
// For version 2.0, will compress by zlib, lzo just for 1.0
// key_block_info_compressed[0:8] => compress_type
const keyInfoBuffUnpacked = zlib_1.default.inflateSync(keyInfoBuff.slice(8));
// TODO: check the alder32 checksum
// adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
// assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff)
// this.keyHeader.keyInfoUnpackSize only exist when version >= 2.0
(0, assert_1.default)(this.keyHeader.keyInfoUnpackSize == keyInfoBuffUnpacked.length, `key_block_info keyInfoUnpackSize ${this.keyHeader.keyInfoUnpackSize} should equal to keyInfoBuffUnpacked buffer length ${keyInfoBuffUnpacked.length}`);
keyInfoBuff = keyInfoBuffUnpacked;
}
}
const keyBlockInfoList = [];
// init tmp variables
let entriesCount = 0;
let kbCount = 0;
let indexOffset = 0;
let kbPackSizeAccu = 0;
let kbUnpackSizeAccu = 0;
while (kbCount < keyBlockNum) {
let blockWordCount = 0;
let packSize = 0;
let unpackSize = 0;
let firstWordSize = 0;
let lastWordSize = 0;
let firstKey = '';
let lastKey = '';
blockWordCount = utils_js_1.default.b2n(keyInfoBuff.slice(indexOffset, indexOffset + this.meta.numWidth));
indexOffset += this.meta.numWidth;
firstWordSize = utils_js_1.default.b2n(keyInfoBuff.slice(indexOffset, indexOffset + this.meta.numWidth / 4));
indexOffset += this.meta.numWidth / 4;
if (this.meta.version >= 2.0) {
if (this.meta.encoding === UTF16) {
firstWordSize = (firstWordSize + 1) * 2;
}
else {
firstWordSize += 1;
}
}
else {
if (this.meta.encoding === UTF16) {
firstWordSize = firstWordSize * 2;
}
}
const firstWordBuffer = keyInfoBuff.slice(indexOffset, indexOffset + firstWordSize);
indexOffset += firstWordSize;
lastWordSize = utils_js_1.default.b2n(keyInfoBuff.slice(indexOffset, indexOffset + this.meta.numWidth / 4));
indexOffset += this.meta.numWidth / 4;
if (this.meta.version >= 2.0) {
if (this.meta.encoding === UTF16) {
lastWordSize = (lastWordSize + 1) * 2;
}
else {
lastWordSize += 1;
}
}
else {
if (this.meta.encoding === UTF16) {
lastWordSize = lastWordSize * 2;
}
}
const lastWordBuffer = keyInfoBuff.slice(indexOffset, indexOffset + lastWordSize);
indexOffset += lastWordSize;
packSize = utils_js_1.default.b2n(keyInfoBuff.slice(indexOffset, indexOffset + this.meta.numWidth));
indexOffset += this.meta.numWidth;
unpackSize = utils_js_1.default.b2n(keyInfoBuff.slice(indexOffset, indexOffset + this.meta.numWidth));
indexOffset += this.meta.numWidth;
if (this.meta.encoding === UTF16) {
firstKey = this.meta.decoder.decode(firstWordBuffer);
lastKey = this.meta.decoder.decode(lastWordBuffer);
}
else {
firstKey = this.meta.decoder.decode(firstWordBuffer);
lastKey = this.meta.decoder.decode(lastWordBuffer);
}
keyBlockInfoList.push({
firstKey,
lastKey,
keyBlockPackSize: packSize,
keyBlockPackAccumulator: kbPackSizeAccu,
keyBlockUnpackSize: unpackSize,
keyBlockUnpackAccumulator: kbUnpackSizeAccu,
keyBlockEntriesNum: blockWordCount,
keyBlockEntriesNumAccumulator: entriesCount,
keyBlockInfoIndex: kbCount,
});
kbCount += 1; // key block number
entriesCount += blockWordCount;
kbPackSizeAccu += packSize;
kbUnpackSizeAccu += unpackSize;
}
// assert(
// countEntriesNum === numEntries,
// `the number_entries ${numEntries} should equal the count_num_entries ${countEntriesNum}`
// );
(0, assert_1.default)(kbPackSizeAccu === this.keyHeader.keywordBlockPackedSize);
return keyBlockInfoList;
}
/**
* step 4.1. decode key block
* find the key block by the phrase
* @param kbPackedBuff
* @param unpackSize
*/
unpackKeyBlock(kbPackedBuff, unpackSize) {
// 4 bytes : compression type
const compType = Buffer.from(kbPackedBuff.slice(0, 4));
// TODO 4 bytes adler32 checksum
// 4 bytes : adler checksum of decompressed key block
// adler32 = unpack('>I', key_block_compressed[start + 4:start + 8])[0]
let keyBlock;
if (compType.toString('hex') == '00000000') {
keyBlock = kbPackedBuff.slice(8);
}
else if (compType.toString('hex') == '01000000') {
// TODO: tests for v2.0 dictionary
const decompressedBuff = lzo1x_wrapper_js_1.default.decompress(kbPackedBuff.slice(8), unpackSize, 0);
keyBlock = Buffer.from(decompressedBuff);
}
else if (compType.toString('hex') === '02000000') {
keyBlock = Buffer.from(pako.inflate(kbPackedBuff.slice(8)));
// extract one single key block into a key list
// notice that adler32 returns signed value
// TODO compare with previous word
// assert(adler32 == zlib.adler32(key_block) & 0xffffffff)
}
else {
throw Error(`cannot determine the compress type: ${compType.toString('hex')}`);
}
return keyBlock;
}
/**
* STEP 4. decode key block
* decode key block return the total keys list,
* Note: this method runs very slow, please do not use this unless special target
*/
_readKeyBlocks() {
this._keyBlockStartOffset = this._keyBlockInfoEndOffset;
let keyBlockList = [];
let kbStartOffset = this._keyBlockStartOffset;
for (let idx = 0; idx < this.keyInfoList.length; idx++) {
const packSize = this.keyInfoList[idx].keyBlockPackSize;
const unpackSize = this.keyInfoList[idx].keyBlockUnpackSize;
const start = kbStartOffset;
(0, assert_1.default)(start === this.keyInfoList[idx].keyBlockPackAccumulator + this._keyBlockStartOffset, 'should be equal');
// const end = kbStartOffset + compSize;
const kbCompBuff = this.scanner.readBuffer(start, packSize);
const keyBlock = this.unpackKeyBlock(kbCompBuff, unpackSize);
const splitKeyBlock = this.splitKeyBlock(Buffer.from(keyBlock), idx);
if (keyBlockList.length > 0 && keyBlockList[keyBlockList.length - 1].recordEndOffset == -1) {
keyBlockList[keyBlockList.length - 1].recordEndOffset = splitKeyBlock[0].recordStartOffset;
}
keyBlockList = keyBlockList.concat(splitKeyBlock);
kbStartOffset += packSize;
}
if (keyBlockList[keyBlockList.length - 1].recordEndOffset === -1) {
keyBlockList[keyBlockList.length - 1].recordEndOffset = -1; // the latest one
}
(0, assert_1.default)(keyBlockList.length === this.keyHeader.keywordNum, `key list length: ${keyBlockList.length} should equal to key entries num: ${this.keyHeader.keywordNum}`);
this._keyBlockEndOffset = this._keyBlockStartOffset + this.keyHeader.keywordBlockPackedSize;
// keep keyBlockList in memory
this.keywordList = keyBlockList;
}
/**
* STEP 5.
* decode record header,
* includes:
* [0:8/4] - record block number
* [8:16/4:8] - num entries the key-value entries number
* [16:24/8:12] - record block info size
* [24:32/12:16] - record block size
*/
_readRecordHeader() {
this._recordHeaderStartOffset = this._keyBlockInfoEndOffset + this.keyHeader.keywordBlockPackedSize;
const recordHeaderLen = this.meta.version >= 2.0 ? 4 * 8 : 4 * 4;
this._recordHeaderEndOffset = this._recordHeaderStartOffset + recordHeaderLen;
const recordHeaderBuffer = this.scanner.readBuffer(this._recordHeaderStartOffset, recordHeaderLen);
let ofset = 0;
const recordBlocksNum = utils_js_1.default.b2n(recordHeaderBuffer.slice(ofset, ofset + this.meta.numWidth));
ofset += this.meta.numWidth;
const entriesNum = utils_js_1.default.b2n(recordHeaderBuffer.slice(ofset, ofset + this.meta.numWidth));
(0, assert_1.default)(entriesNum === this.keyHeader.keywordNum);
ofset += this.meta.numWidth;
const recordInfoCompSize = utils_js_1.default.b2n(recordHeaderBuffer.slice(ofset, ofset + this.meta.numWidth));
ofset += this.meta.numWidth;
const recordBlockCompSize = utils_js_1.default.b2n(recordHeaderBuffer.slice(ofset, ofset + this.meta.numWidth));
this.recordHeader = {
recordBlocksNum,
entriesNum,
recordInfoCompSize,
recordBlockCompSize,
};
}
/**
* STEP 6.
* decode record Info,
*/
_readRecordInfos() {
this._recordInfoStartOffset = this._recordHeaderEndOffset;
const recordInfoBuff = this.scanner.readBuffer(this._recordInfoStartOffset, this.recordHeader.recordInfoCompSize);
/**
* record_block_info_list:
* [{
* packSize: number
* packAccumulateOffset: number
* unpackSize: number,
* unpackAccumulatorOffset: number
* }]
* Note: every record block will contain a lot of entries
*/
const recordInfoList = [];
let offset = 0;
let compressedAdder = 0;
let decompressionAdder = 0;
for (let i = 0; i < this.recordHeader.recordBlocksNum; i++) {
const packSize = utils_js_1.default.b2n(recordInfoBuff.slice(offset, offset + this.meta.numWidth));
offset += this.meta.numWidth;
const unpackSize = utils_js_1.default.b2n(recordInfoBuff.slice(offset, offset + this.meta.numWidth));
offset += this.meta.numWidth;
recordInfoList.push({
packSize: packSize,
packAccumulateOffset: compressedAdder,
unpackSize: unpackSize,
unpackAccumulatorOffset: decompressionAdder,
});
compressedAdder += packSize;
decompressionAdder += unpackSize;
}
(0, assert_1.default)(offset === this.recordHeader.recordInfoCompSize);
(0, assert_1.default)(compressedAdder === this.recordHeader.recordBlockCompSize);
this.recordInfoList = recordInfoList;
// assign latest keyword's endoffset
if (this.keywordList.length > 0) {
this.keywordList[this.keywordList.length - 1].recordEndOffset = this.recordInfoList[this.recordInfoList.length - 1].unpackAccumulatorOffset + this.recordInfoList[this.recordInfoList.length - 1].unpackSize;
}
this._recordInfoEndOffset = this._recordInfoStartOffset + this.recordHeader.recordInfoCompSize;
// avoid user not invoke the _decodeRecordBlock method
this._recordBlockStartOffset = this._recordInfoEndOffset;
}
/**
* STEP 7.
* read all records block,
* this is a slow method, do not use!
*/
_readRecordBlocks() {
this._recordBlockStartOffset = this._recordInfoEndOffset;
const keyData = [];
/**
* start reading the record block
*/
// actual record block
let sizeCounter = 0;
let itemCounter = 0;
let recordOffset = this._recordBlockStartOffset;
for (let idx = 0; idx < this.recordInfoList.length; idx++) {
let compressType = 'none';
const packSize = this.recordInfoList[idx].packSize;
const unpackSize = this.recordInfoList[idx].unpackSize;
const rbPackBuff = this.scanner.readBuffer(recordOffset, packSize);
recordOffset += packSize;
// 4 bytes: compression type
const rbCompType = Buffer.from(rbPackBuff.slice(0, 4));
// record_block stores the final record data
let recordBlock = new Uint8Array(rbPackBuff.length);
// TODO: ignore adler32 offset
// Note: here ignore the checksum part
// bytes: adler32 checksum of decompressed record block
// adler32 = unpack('>I', record_block_compressed[4:8])[0]
if (rbCompType.toString('hex') === '00000000') {
recordBlock = rbPackBuff.slice(8, rbPackBuff.length);
}
else {
// decrypt
let blockBufDecrypted = null;
// if encrypt type == 1, the record block was encrypted
if (this.meta.encrypt === 1 /* || (this.meta.ext == "mdd" && this.meta.encrypt === 2 ) */) {
// const passkey = new Uint8Array(8);
// record_block_compressed.copy(passkey, 0, 4, 8);
// passkey.set([0x95, 0x36, 0x00, 0x00], 4); // key part 2: fixed data
blockBufDecrypted = utils_js_1.default.mdxDecrypt(rbPackBuff);
}
else {
blockBufDecrypted = rbPackBuff.slice(8, rbPackBuff.length);
}
// --------------
// decompress
// --------------
if (rbCompType.toString('hex') === '01000000') {
compressType = 'lzo';
// the header was needed by lzo library, should append before real compressed data
// const header = Buffer.from([0xf0, decompSize]);
// Note: if use lzo, here will LZO_E_OUTPUT_RUNOVER, so ,use mini lzo js
// recordBlock = Buffer.from(
// lzo1x.decompress(common.appendBuffer(header, blockBufDecrypted), decompSize, 1308672)
// );
recordBlock = Buffer.from(lzo1x_wrapper_js_1.default.decompress(blockBufDecrypted, unpackSize, 0));
recordBlock = Buffer.from(recordBlock).slice(recordBlock.byteOffset, recordBlock.byteOffset + recordBlock.byteLength);
}
else if (rbCompType.toString('hex') === '02000000') {
compressType = 'zlib';
// zlib decompress
recordBlock = Buffer.from(pako.inflate(blockBufDecrypted));
}
}
// notice that adler32 return signed value
// TODO: ignore the checksum
// assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
(0, assert_1.default)(recordBlock.length === unpackSize);
/**
* 请注意,block 是会有很多个的,而每个block都可能会被压缩
* 而 key_list中的 record_start, key_text是相对每一个block而言的,end是需要每次解析的时候算出来的
* 所有的record_start/length/end都是针对解压后的block而言的
*/
// split record block according to the offset info from key block
let offset = 0;
let i = 0;
while (i < this.keywordList.length) {
const recordStart = this.keywordList[i].recordStartOffset;
const keyText = this.keywordList[i].keyText;
// # reach the end of current record block
if (recordStart - offset >= recordBlock.length) {
break;
}
// # record end index
let recordEnd;
if (i < this.keywordList.length - 1) {
recordEnd = this.keywordList[i + 1].recordStartOffset;
}
else {
recordEnd = recordBlock.length + offset;
}
i += 1;
// const data = record_block.slice(record_start - offset, record_end - offset);
keyData.push({
key: keyText,
idx: itemCounter,
// data,
encoding: this.meta.encoding,
// record_start,
// record_end,
record_idx: idx,
record_comp_start: recordOffset,
record_compressed_size: packSize,
record_decompressed_size: unpackSize,
record_comp_type: compressType,
record_encrypted: this.meta.encrypt === 1,
relative_record_start: recordStart - offset,
relative_record_end: recordEnd - offset,
});
itemCounter++;
}
offset += recordBlock.length;
sizeCounter += packSize;
}
(0, assert_1.default)(sizeCounter === this.recordHeader.recordBlockCompSize);
this.recordBlockDataList = keyData;
this._recordBlockEndOffset = this._recordBlockStartOffset + sizeCounter;
}
}
exports.default = MDictBase;
//# sourceMappingURL=mdict-base.js.map