UNPKG

novel-segment

Version:

Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本

293 lines 10.2 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const tslib_1 = require("tslib"); const worker_threads_1 = require("worker_threads"); const project_config_1 = tslib_1.__importDefault(require("../project.config")); const upath2_1 = tslib_1.__importDefault(require("upath2")); const index_1 = require("@novel-segment/loaders/segment/index"); const uni_string_1 = tslib_1.__importDefault(require("uni-string")); const util_1 = require("@novel-segment/util"); const transliteration_1 = require("transliteration"); const jp_table_convert_1 = require("@lazy-cjk/jp-table-convert"); const fast_glob_1 = require("@bluelovers/fast-glob"); const array_hyper_unique_1 = require("array-hyper-unique"); const loader_line_1 = require("@novel-segment/loader-line"); const debug_color2_1 = require("debug-color2"); const zh_table_greedy_1 = require("@lazy-cjk/zh-table-greedy"); const zh_table_list_1 = require("@lazy-cjk/zh-table-list"); const diff_staged_1 = require("@git-lazy/diff-staged"); const match_1 = require("@git-lazy/util/util/match"); const util_compare_1 = require("@novel-segment/util-compare"); const n_readlines_1 = tslib_1.__importDefault(require("n-readlines")); const fs_extra_1 = tslib_1.__importDefault(require("fs-extra")); let CWD = upath2_1.default.join(project_config_1.default.temp_root); var EnumC1; (function (EnumC1) { EnumC1["char"] = "char"; EnumC1["other"] = "other"; EnumC1["eng"] = "eng"; })(EnumC1 || (EnumC1 = {})); const CWD_SAVETO = upath2_1.default.join(CWD, 'cache'); if (0 && (!fs_extra_1.default.pathExistsSync(upath2_1.default.join(CWD, 'stringify.txt')) || !(0, match_1.matchGlob)((0, diff_staged_1.gitDiffStagedFile)(CWD), [ 'cache.db.info.json' ]).length)) { process.exit(); } if (worker_threads_1.isMainThread) { log("This is the main thread", worker_threads_1.threadId); let workerOptions = { workerData: { time: new Date, //count: 0, //re: / /ig, }, }; let w1 = new worker_threads_1.Worker(__filename, workerOptions); //let w2 = new Worker(__filename, workerOptions); // const subChannel = new MessageChannel(); // // w2.postMessage({ // hereIsYourPort: subChannel.port1 // }, [subChannel.port1]); // w1.postMessage({ // hereIsYourPort: subChannel.port2 // }, [subChannel.port2]); let timeDiff; fs_extra_1.default.removeSync(CWD_SAVETO); w1.on('message', (msg) => { timeDiff = msg.timeDiff; //console.dir(msg); log(msg.index, msg.list.length); let cache = { char: [], other: [], eng: [], }; { let i = 'a'.codePointAt(0); let j = 'z'.codePointAt(0); while (i <= j) { cache[String.fromCodePoint(i)] = []; i++; } } cache = msg.list.reduce(function (cache, cur) { // @ts-ignore let { c1, line } = cur; cache[c1] = cache[c1] || []; cache[c1].push(Buffer.from(line).toString()); return cache; }, cache); Object.entries(cache).forEach(async function ([c1, ls]) { if (!/^[a-z0-9]$/i.test(c1)) { c1 = '0/' + c1; } let file = upath2_1.default.join(CWD_SAVETO, c1 + '.txt'); fs_extra_1.default.ensureFileSync(file); if (!ls.length) { return; } return fs_extra_1.default.appendFileSync(file, ls.join('\n') + '\n'); }); //fs.appendFile() }); w1.on('error', e => debug_color2_1.console.error(debug_color2_1.console)); w1.on('exit', (code) => { let bool = true; try { let i = timeDiff.getTime() - workerOptions.workerData.time.getTime(); log(i, timeDiff); } catch (e) { bool = false; } if (bool) { let ls = (0, fast_glob_1.sync)([ '**/*.txt' ], { cwd: CWD_SAVETO, absolute: true, }).sort(); let file2 = upath2_1.default.join(CWD, 'stringify.sorted.txt'); fs_extra_1.default.ensureFileSync(file2); fs_extra_1.default.truncateSync(file2); let i2 = ls.reduce((a, file) => { log('[start]', upath2_1.default.relative(CWD_SAVETO, file)); const liner = new n_readlines_1.default(file); let line; let index = 0; let list = []; while (line = liner.next()) { let s = line.toString(); let data = (0, index_1.parseLine)(s); let [w, p, f] = data; let cur = { // @ts-ignore data, line: s, index: index++, c1: "other" /* EnumC1.other */, line_type: (0, util_compare_1.chkLineType)(s), cjk_id: (0, util_1.getCjkName)(w), }; list.push(cur); a++; } list = SortList(list); let out_list = list.map(v => v.line); out_list = (0, array_hyper_unique_1.array_unique)(out_list); let out_data = (0, loader_line_1.serialize)(out_list); fs_extra_1.default.outputFileSync(file, out_data + "\n\n"); fs_extra_1.default.appendFileSync(file2, out_data + "\n"); log('[done]', upath2_1.default.relative(CWD_SAVETO, file)); return a; }, 0); log(i2); } if (code != 0) { debug_color2_1.console.error(new Error(`Worker stopped with exit code ${code}`)); } else { log(`Worker stopped`); } }); } else { // parentPort.once('message', (value) => { // value.hereIsYourPort.postMessage('hello'); // value.hereIsYourPort.on('message', msg => { // console.log(`thread ${threadId}: receive ${msg}`); // }); // }); //the worker's code debug_color2_1.console.dir(worker_threads_1.workerData, { colors: true, }); // log(workerData.re.test(' ')); let file = upath2_1.default.join(CWD, 'stringify.txt'); const liner = new n_readlines_1.default(file); let line; let lineNumber = 0; let count = 0; let c1_old; let list = []; while (line = liner.next()) { //console.log('Line ' + lineNumber + ': ' + line.toString('ascii')); let index = lineNumber++; let data = (0, index_1.parseLine)(line.toString()); let cur = { data, line, index, c1: "other" /* EnumC1.other */, }; let [w, p, f] = cur.data; let len = uni_string_1.default.size(w); let c1_now; if (len > 1) { c1_now = getCid(w); if (!c1_now) { debug_color2_1.console.log(c1_now, w); throw new Error(`${w}, ${c1_now}`); } } else if (len === 1) { c1_now = "char" /* EnumC1.char */; } else { c1_now = "other" /* EnumC1.other */; } cur.c1 = c1_now; if (count >= 10000) { worker_threads_1.parentPort.postMessage({ index, list, }); list = []; count = 0; } list.push(cur); c1_old = c1_now; count++; } log('end of line reached', lineNumber); worker_threads_1.workerData.count = lineNumber; worker_threads_1.parentPort.postMessage({ timeDiff: new Date, index: lineNumber, list, }); } function log(...argv) { debug_color2_1.console.log(`[thread:${worker_threads_1.threadId}]`, ...argv); } function getCid(w) { w = uni_string_1.default.slice(w, 0, 1).toLocaleLowerCase(); if (/^[a-z0-9]$/i.test(w)) { return "eng" /* EnumC1.eng */; } let s = (0, util_1.getCjkName)(w); let r = (0, transliteration_1.slugify)(s); if (!r) { r = (0, transliteration_1.slugify)((0, zh_table_greedy_1.greedyTableReplace)(s)); } if (!r) { let arr = (0, zh_table_list_1.auto)(s, { safe: false, greedyTable: 2, }); if (arr.length) { r = (0, transliteration_1.slugify)(arr[1] || arr[0]); } } if (!r) { let arr = (0, zh_table_list_1.auto)(w, { safe: false, greedyTable: 2, }); if (arr.length) { r = (0, transliteration_1.slugify)(arr[1] || arr[0]); } } if (!r) { r = (0, transliteration_1.slugify)((0, jp_table_convert_1.cjk2zhs)(s)); } if (!r) { r = (0, transliteration_1.slugify)((0, jp_table_convert_1.cjk2zht)(s)); } if (!r) { r = (0, transliteration_1.slugify)((0, jp_table_convert_1.cjk2zhs)(w)); } if (!r) { r = (0, transliteration_1.slugify)((0, jp_table_convert_1.cjk2zht)(w)); } if (!r) { r = (0, transliteration_1.slugify)(w); } if (!r) { r = w; } let r2 = uni_string_1.default.slice(r, 0, 1); if (!/^[a-z0-9]$/i.test(r2)) { r2 = "other" /* EnumC1.other */; } return r2.toLocaleLowerCase(); } function SortList(ls) { // @ts-ignore return ls.sort(function (a, b) { if (a.line_type == 2 /* EnumLineType.COMMENT_TAG */ || b.line_type == 2 /* EnumLineType.COMMENT_TAG */) { return (a.index - b.index); } else if (a.line_type == 1 /* EnumLineType.COMMENT */ || b.line_type == 1 /* EnumLineType.COMMENT */) { return (a.index - b.index); } let ret = (0, util_1.zhDictCompare)(a.cjk_id, b.cjk_id) || (a.index - b.index) || 0; return ret; }); } //# sourceMappingURL=sort-stringify-cache.js.map