UNPKG

cecc

Version:

繁簡轉換函式庫 追求正確率 先解析詞性再繁簡轉換 繁體中文↔簡體中文轉換 Chinese converter between Traditional Chinese and Simplified Chinese.

1,221 lines (1,072 loc) 98.5 kB
/*
TODO:
Reduce dictionary complexity: split the dictionaries of individual works into dedicated dictionaries.
Build Map()s keyed by the neighbouring words to avoid overly long condition lists.
This may require deciding how to merge conditions when the PoS tagging is wrong.
+ PoS: put "n*" under "n*:".

https://zhuanlan.zhihu.com/p/95358646
Common keyword-extraction algorithms: TF-IDF, TextRank
https://blog.csdn.net/vivian_ll/article/details/106647666
jieba offers two keyword-extraction interfaces: one based on TF-IDF, one based on TextRank.
https://s.itho.me/techtalk/2017/%E4%B8%AD%E6%96%87%E6%96%B7%E8%A9%9E%EF%BC%9A%E6%96%B7%E5%8F%A5%E4%B8%8D%E8%A6%81%E6%82%B2%E5%8A%87.pdf
A word that is frequent in one article but rare in other articles is a representative keyword.
*/

'use strict';

// modify from wikiapi.js
let CeL;

try {
	// Load CeJS library.
	CeL = require('cejs');
} catch (e) /* istanbul ignore next: Only for debugging locally */ {
	// https://github.com/gotwarlost/istanbul/blob/master/ignoring-code-for-coverage.md
	require('./_CeL.loader.nodejs.js');
	CeL = globalThis.CeL;
}
// assert: typeof CeL === 'function'

// Avoid fatal errors on non-Windows platforms.
CeL.env.ignore_COM_error = true;

// Load modules.
CeL.run(['application.debug',
	// Locale-dependent features, for wiki.work().
	'application.locale',
	// Add color to console messages.
	'interact.console',
	// CeL.data.Convert_Pairs.remove_comments()
	'data.Convert_Pairs',
	// for CeL_CN_to_TW()
	'extension.zh_conversion',
	// for CeL.get_URL()
	'application.net.Ajax',
	// for 'application.platform.nodejs': CeL.env.arg_hash, CeL.wiki.cache(),
	// CeL.fs_mkdir(), CeL.wiki.read_dump()
	'application.storage']);

/** {Number} Index value meaning "not found". Nominal constant tied to the code design; must not be changed. (=== -1) */
const NOT_FOUND = ''.indexOf('_');

const module_base_path = CeL.append_path_separator(module.path);
const test_directory = CeL.append_path_separator(module_base_path + '_test suite');
// Cache default convertors without CeCC.
const CeL_CN_to_TW = CeL.zh_conversion.CN_to_TW, CeL_TW_to_CN = CeL.zh_conversion.TW_to_CN;

// ----------------------------------------------------------------------------

// default
const KEY_word = 'word', KEY_PoS_tag = 'tag', KEY_filter_name = 'filter_name';

const DEFAULT_TEST_FILE_EXTENSION = 'txt';

const dictionary_template = {
	TW: 'CN_to_TW.%name.%type.txt',
	CN: 'TW_to_CN.%name.%type.txt'
};

/**
 * Build the dictionary file names for every language of `dictionary_template`.
 * Must be called with `this` bound to a converter that has `.parser_name`.
 *
 * @param {String}[type] dictionary type; defaults to 'PoS'. 'filters' yields *.js file names.
 * @returns {Object} { TW: file name, CN: file name }.
 *          When `type` is omitted the result is also stored as `this.dictionary_file_paths`.
 * @throws {Error} when `this.parser_name` is not set.
 */
function get_dictionary_file_paths(type) {
	if (!this.parser_name)
		throw new Error('No parser name specified!');

	const dictionary_file_paths = Object.create(null);
	for (const language in dictionary_template) {
		let path = dictionary_template[language]
			.replace('%name', this.parser_name)
			.replace('%type', type || 'PoS');
		if (type === 'filters') {
			// Filters are JavaScript modules: swap the trailing extension.
			path = path.replace(/[^.]+$/, 'js');
		}
		dictionary_file_paths[language] = path;
	}
	if (!type)
		this.dictionary_file_paths = dictionary_file_paths;
	return dictionary_file_paths;
}

// CeCC
class Chinese_converter {
	/**
	 * Choose the PoS tagger (LTP, Stanford CoreNLP or nodejieba) and load its dictionaries.
	 *
	 * @param {Object}[options] recognised here: LTP_URL, using_LTP, CoreNLP_URL.
	 *        The object is only read, never modified.
	 */
	constructor(options) {
		this.convertion_pairs = Object.create(null);
		this.KEY_word = KEY_word;
		this.KEY_PoS_tag = KEY_PoS_tag;

		if (options?.LTP_URL)
			this.LTP_URL = options.LTP_URL;
		// Giving a LTP_URL implies using LTP. Kept local so the caller's options stay untouched.
		const using_LTP = options?.using_LTP || Boolean(options?.LTP_URL);

		if (using_LTP) {
			// Highest accuracy.
			this.KEY_word = 'text';
			this.KEY_PoS_tag = 'pos';
			this.TAG_punctuation = 'wp';
			this.condition_filter = condition_filter_LTP;
			this.parser_name = 'LTP';

			this.filters = get_dictionary_file_paths.call(this, 'filters');
			for (const language in this.filters) {
				const dictionary_file_path = this.dictionaries_directory + this.filters[language];
				this.filters[language] = require(dictionary_file_path);
			}

			this.generate_condition = generate_condition_LTP;
			load_synonym_dictionary.call(this);
			this.tag_paragraph = tag_paragraph_LTP;
			// .batch_get_tag: conditions for querying PoS tags in batch:
			// 1. the tagger accepts a batch {Array}. 2. a single query is too expensive.
			this.batch_get_tag = !this.LTP_URL;

		} else if (options?.CoreNLP_URL) {
			// using Stanford CoreNLP
			this.KEY_PoS_tag = 'pos';
			this.CoreNLP_URL = new URL(options.CoreNLP_URL);
			this.parser_name = 'CoreNLP';
			// https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
			this.CoreNLP_URL_properties = {
				annotators: 'tokenize,ssplit,pos,depparse',
			};
			this.tag_paragraph = tag_paragraph_via_CoreNLP;

		} else {
			// fallback to default: nodejieba
			this.nodejieba_CN = require("nodejieba");
			this.nodejieba_CN.load({ dict: this.dictionaries_directory + 'commons.txt' });
			this.parser_name = 'jieba';
			this.tag_paragraph = tag_paragraph_jieba;
		}

		get_dictionary_file_paths.call(this);
		for (const language in this.dictionary_file_paths) {
			const dictionary_file_path = this.dictionary_file_paths[language]
				= this.dictionaries_directory + this.dictionary_file_paths[language];
			load_dictionary.call(this, dictionary_file_path, { language });
		}

		if (CeL.is_debug()) {
			// These conversions are comparatively expensive, so list them.
			const show_convertion_pairs = (_pairs, tag = 'general') => {
				const size = _pairs.size;
				if (size > 0) {
					CeL.log(`\t${tag || 'general'}\t${size} convertion(s)${size < 9 ? '\t' + Array.from(_pairs.keys()).join('\t') : ''}`);
				}
			};
			for (const [language, convertion_pairs] of Object.entries(this.convertion_pairs)) {
				CeL.info(`convertion pairs for ${language}:`);
				show_convertion_pairs(convertion_pairs.get(KEY_general_pattern_filter));
				for (const [tag, _pairs] of Object.entries(convertion_pairs.get(KEY_tag_pattern_filter))) {
					show_convertion_pairs(_pairs, tag);
				}
			}
		}

		this.load_default_text_to_check();
	}

	/**
	 * convert to TW
	 * @param {Array}paragraphs [{String}, {String}, ...]; may also be a promise of it.
	 * @param {Object}[options]
	 */
	async to_TW(paragraphs, options) {
		return await convert_Chinese.call(this, await paragraphs, { convert_to_language: 'TW', ...options });
	}
	/** Same as to_TW() but returns whatever convert_Chinese() returns, without awaiting. */
	to_TW_sync(paragraphs, options) {
		return convert_Chinese.call(this, paragraphs, { convert_to_language: 'TW', ...options });
	}

	/**
	 * convert to CN
	 * @param {Array}paragraphs [{String}, {String}, ...]; may also be a promise of it.
	 * @param {Object}[options]
	 */
	async to_CN(paragraphs, options) {
		return await convert_Chinese.call(this, await paragraphs, { convert_to_language: 'CN', ...options });
	}
	/** Same as to_CN() but returns whatever convert_Chinese() returns, without awaiting. */
	to_CN_sync(paragraphs, options) {
		return convert_Chinese.call(this, paragraphs, { convert_to_language: 'CN', ...options });
	}

	// Detect the domain/context of sentences and paragraphs automatically
	// (to go with Wikipedia proper-noun conversion).
	detect_domain(paragraphs, options) {
		// TODO
	}

	/**
	 * Test whether a LTP server is reachable.
	 *
	 * @param {String|Object}[options] the LTP URL, or options { LTP_URL, skip_server_test }.
	 *        LTP_URL defaults to 'http://localhost:5000/'.
	 * @returns {Promise} resolves to the LTP URL when usable, `false` when the answer is
	 *          not an Array, and `undefined` when the request failed.
	 */
	static async has_LTP_server(options) {
		if (typeof options === 'string') {
			// treat options as LTP_URL
			options = { LTP_URL: options };
		} else {
			options = { LTP_URL: 'http://localhost:5000/', ...options };
		}

		if (options.skip_server_test) {
			CeL.debug('強制使用 LTP server,跳過對 LTP server 的運作測試。請只在您準備全程使用 cache 的情況下才使用這個選項。', 1, Chinese_converter.has_LTP_server.name);
			return options.LTP_URL;
		}

		try {
			// Note: the probe text must not contain spaces or alphanumeric characters!
			const result = await tag_paragraph_LTP.call(options, '測試繁簡轉換伺服器是否正常運作');
			return Array.isArray(result) && options.LTP_URL;
		} catch (e) {
			// Best effort: a failing probe just means "no server", but leave a trace for debugging.
			CeL.debug(`LTP server test failed: ${e?.message || e}`, 1, Chinese_converter.has_LTP_server.name);
		}
	}
}

// ----------------------------------------------------------------------------

/**
 * @param {String}convert_from_text__file_name e.g. "a.TW.txt"
 * @returns {String} the name with ".converted" inserted before the extension, e.g. "a.TW.converted.txt"
 */
function to_converted_file_path(convert_from_text__file_name) {
	return convert_from_text__file_name.replace(/(\.\w+)$/, '.converted$1');
}
/**
 * Convert a test article and write the result as its ".converted" answer file.
 * Must be called with `this` bound to a converter.
 *
 * @param {String}convert_from_text__file_path file to read.
 * @param {String}convert_to_text__file_status despite its name, callers in this file pass the
 *        PATH of the file to write; it is handed to CeL.write_file() unchanged.
 * @param {Object}options { text_is_TW, convert_options, convert_from_text__file_name }
 */
async function regenerate_converted(convert_from_text__file_path, convert_to_text__file_status, options) {
	CeL.info(`${regenerate_converted.name}: Generate a new answer file for ${options.convert_from_text__file_name || convert_from_text__file_path}...`);

	const source_text = CeL.read_file(convert_from_text__file_path).toString();
	const convert_options = options.convert_options || regenerate_converted.default_convert_options;
	// A TW file is the expected answer, so its "source" is produced by converting it to CN, and vice versa.
	const converted_text = options.text_is_TW
		? await this.to_CN(source_text, convert_options)
		: await this.to_TW(source_text, convert_options);

	CeL.write_file(convert_to_text__file_status, converted_text, { backup: { directory_name: 'backup' } });
}

regenerate_converted.default_convert_options = {
	cache_directory: CeL.append_path_separator(test_directory + 'cache_data'),
	cache_file_for_short_sentences: true,
	// Only texts longer than this get their own cache file; shorter ones go to .cache_file_for_short_sentences.
	min_cache_length: 40,
};

/**
 * Locate a test article and its ".converted" counterpart and decide whether the latter is stale.
 * Must be called with `this` bound to a converter that has `.test_articles_directory`.
 *
 * @param {String}convert_from_text__file_name file name inside this.test_articles_directory.
 * @param {Object}[options] { convert_to_text__file_path, convert_to_text__file_name, regenerate_converted }
 * @returns {Object} both paths, both CeL.fso_status() results and
 *          `need_to_generate_new_convert_to_text__file`.
 */
function get_convert_to_text__file_status(convert_from_text__file_name, options) {
	options = CeL.setup_options(options);

	const convert_from_text__file_path = this.test_articles_directory + convert_from_text__file_name;
	const convert_from_text__file_status = CeL.fso_status(convert_from_text__file_path);

	const convert_to_text__file_path = options.convert_to_text__file_path
		|| (options.convert_to_text__file_name
			? this.test_articles_directory + options.convert_to_text__file_name
			: Chinese_converter.to_converted_file_path(convert_from_text__file_path));
	const convert_to_text__file_status = CeL.fso_status(convert_to_text__file_path);

	// `?.`: a missing source status yields NaN (never "> 0") instead of a TypeError.
	const need_to_generate_new_convert_to_text__file = options.regenerate_converted
		|| !convert_to_text__file_status
		|| convert_from_text__file_status?.mtime - convert_to_text__file_status.mtime > 0;

	return {
		convert_from_text__file_path, convert_from_text__file_status,
		convert_to_text__file_path, convert_to_text__file_status,
		need_to_generate_new_convert_to_text__file
	};
}
/**
 * Tell whether a test article can be skipped because nothing changed since its last clean test.
 * Regenerates the ".converted" answer file first when it is missing or stale.
 * Must be called with `this` bound to a converter.
 *
 * @param {String}convert_from_text__file_name
 * @param {Object}[options] { recheck, latest_test_result, test_name, ... }
 * @returns {Promise} resolves to true when the article needs no new check; otherwise a falsy value.
 */
async function not_new_article_to_check(convert_from_text__file_name, options) {
	options = CeL.setup_options(options);
	const {
		convert_from_text__file_path, convert_from_text__file_status,
		convert_to_text__file_path, convert_to_text__file_status,
		need_to_generate_new_convert_to_text__file
	} = get_convert_to_text__file_status.call(this, convert_from_text__file_name, options);

	if (need_to_generate_new_convert_to_text__file) {
		// Regenerate the .converted.* answer file.
		await this.regenerate_converted(convert_from_text__file_path, convert_to_text__file_path, {
			...options,
			convert_from_text__file_name,
		});
	}

	if (options.recheck) {
		// A recheck was requested: treat the article as changed even if the dictionaries are old.
		return;
	}

	// -----------------------------------
	// Check whether a dictionary file is newer than the last test.

	const latest_test_result = options.latest_test_result && options.latest_test_result[options.test_name];
	const latest_test_result_for_file = latest_test_result && latest_test_result.test_results
		&& latest_test_result.test_results[convert_from_text__file_name];
	const latest_test_result_date = latest_test_result_for_file?.error_count === 0
		? Date.parse(latest_test_result_for_file?.date)
		// Otherwise compare against the newer of the test file and its .converted.* answer file.
		: convert_to_text__file_status
			? Math.max(convert_from_text__file_status.mtime.getTime(), convert_to_text__file_status.mtime.getTime())
			: convert_from_text__file_status.mtime.getTime();

	for (const dictionary_file_path of Object.values(this.dictionary_file_paths)) {
		const dictionary_file_status = CeL.fso_status(dictionary_file_path);
		if (dictionary_file_status.mtime - latest_test_result_date > 0) {
			CeL.info(`${not_new_article_to_check.name}: ${convert_from_text__file_name}: 有新詞典檔 ${dictionary_file_path}`);
			if (latest_test_result) {
				// Per-file results live in .test_results (see the lookup above), so the stale entry
				// must be removed there.
				delete latest_test_result.test_results?.[convert_from_text__file_name];
				// Previous (mis-placed) location, still cleared for backward compatibility.
				delete latest_test_result[convert_from_text__file_name];
			}
			return;
		}
	}

	// Check whether the last test is newer than the test file.
	if (latest_test_result_date - convert_from_text__file_status.mtime > 0) {
		return !convert_to_text__file_status || latest_test_result_date > convert_to_text__file_status.mtime;
	}
}

const KEY_files_loaded = Symbol('files loaded');

/**
 * Load an answer file ("*.TW.*" / "*.CN.*") whose paragraphs will be verified on every conversion.
 * Must be called with `this` bound to a converter.
 *
 * @param {String|Object}should_be_text__file_name file name, `{ all: true }` to load every
 *        "watch_target.*" file of this.test_articles_directory, or `{ work_title, convert_to_language }`.
 * @param {Object}[options] { is_default, reset, export: { work_title }, ... }
 * @returns {Object|Promise|undefined} this.generate_condition_for_language, a promise of it when the
 *          answer file had to be regenerated first, or undefined when nothing was loaded.
 * @throws {Error} when given an object with neither `.all` nor `.work_title`.
 */
function load_text_to_check(should_be_text__file_name, options) {
	if (CeL.is_Object(should_be_text__file_name)) {
		if (should_be_text__file_name.all) {
			CeL.read_directory(this.test_articles_directory).forEach(from_file_name => {
				const matched = from_file_name.match(/watch_target\.(?<work_title>[^.]+)\.(?<to_language>TW|CN)\.\w+$/);
				if (matched) {
					this.load_text_to_check(from_file_name, { export: { work_title: matched.groups.work_title } });
				}
			});
			return;
		}

		if (should_be_text__file_name.work_title) {
			options = CeL.setup_options(options);
			if (!options.export)
				options.export = Object.create(null);
			if (!options.export.work_title)
				options.export.work_title = should_be_text__file_name.work_title;
			// e.g., "watch_target.第一序列.TW.txt"
			should_be_text__file_name = `watch_target.${should_be_text__file_name.work_title}.${should_be_text__file_name.convert_to_language}.${DEFAULT_TEST_FILE_EXTENSION}`;
		} else {
			throw new Error(`${load_text_to_check.name}: Invalid should_be_text__file_name: ${JSON.stringify(should_be_text__file_name)}`);
		}
	}

	let check_language = should_be_text__file_name.match(/\.(TW|CN)\.\w+$/);
	if (!check_language) {
		CeL.error(`無法判別檔案之語言: ${should_be_text__file_name}`);
		return;
	}
	check_language = check_language[1];

	const convert_to_text__data = get_convert_to_text__file_status.call(this, should_be_text__file_name, options);
	const should_be_text__file_path = convert_to_text__data.convert_from_text__file_path;

	if (!this.generate_condition_for_language
		|| options?.reset && !this.generate_condition_for_language.only_default) {
		// Initialise.
		this.generate_condition_for_language = { [KEY_files_loaded]: [], only_default: true };
		if (!options?.is_default)
			this.load_default_text_to_check();
	}

	if (this.generate_condition_for_language[KEY_files_loaded].includes(should_be_text__file_path)) {
		CeL.log(`${load_text_to_check.name}: The file is already loaded, skip ${should_be_text__file_path}`);
		return;
	}
	if (!options?.is_default)
		delete this.generate_condition_for_language.only_default;
	this.generate_condition_for_language[KEY_files_loaded].push(should_be_text__file_path);

	const should_be_texts = get_paragraphs_of_file(should_be_text__file_path, { with_configurations: true });
	if (!should_be_texts)
		return;

	const source_text__file_path = convert_to_text__data.convert_to_text__file_path;

	// Pair every source paragraph with its expected conversion.
	function setup_generate_condition_for() {
		// should_be_text__file_path: the .TW.* file is the conversion answer/target, so what gets
		// checked is the opposite language. The .converted file is the source text!
		const source_texts = get_paragraphs_of_file(source_text__file_path);
		if (!source_texts)
			return;
		if (should_be_texts.length !== source_texts.length) {
			// Name the .converted file too: it is the one that is out of step with the answer file.
			CeL.error(`${should_be_text__file_name} 與 ${source_text__file_path} 含有不同數量之字串!此${CeL.gettext.get_alias(check_language)}之標的檔與欲測試之項目數不符,將不採用解答!若檔案為自動生成,您可以刪除舊檔後,重新生成轉換標的檔案。`);
			return;
		}

		// this.generate_condition_for_language[convert_to_language] = Map { convert_from_text => { should_convert_to_text, ... } }
		const generate_condition_for = this.generate_condition_for_language[check_language]
			|| (this.generate_condition_for_language[check_language] = new Map());
		const generate_condition_for__title = `${options?.export?.work_title ? `《${options.export.work_title}》` : '通用 '
			}${CeL.gettext.get_alias(check_language === 'TW' ? 'CN' : 'TW')}→${CeL.gettext.get_alias(check_language)}`;

		should_be_texts.forEach((should_convert_to_text, index) => {
			const configuration = should_be_texts.configurations[should_convert_to_text];
			let text = source_texts[index];
			if (configuration?.原文) {
				if (configuration.原文 === text) {
					CeL.info(`${setup_generate_condition_for.name}: 轉換前後文字相同,無需設定"原文" ${JSON.stringify(text)}: ${JSON.stringify(configuration)}`);
				} else {
					// The configuration supplies the real source text; keep the generated one for reference.
					configuration.original_text_converted = text;
					text = configuration.原文;
				}
			}
			if (generate_condition_for.has(text)) {
				CeL.log(`${setup_generate_condition_for.name}: ${generate_condition_for__title}: 重複設定 ${JSON.stringify(text)}`);
			}
			generate_condition_for.set(text, { should_convert_to_text, ...options?.export, ...configuration });
		});

		const total_count = generate_condition_for.size;
		CeL.info(`${load_text_to_check.name}: 自動檢核 ${should_be_texts.length}個${generate_condition_for__title
			} 之字串。${total_count === should_be_texts.length ? '' : `總共檢核 ${total_count}個。`} From ${should_be_text__file_path}`);
		return this.generate_condition_for_language;
	}

	if (convert_to_text__data.need_to_generate_new_convert_to_text__file) {
		// Regenerate the .converted.* answer file first.
		return this.regenerate_converted(should_be_text__file_path, source_text__file_path, {
			...options,
			convert_from_text__file_name: should_be_text__file_name,
			text_is_TW: check_language === 'TW',
			convert_options: {
				...regenerate_converted.default_convert_options,
				cache_directory: CeL.append_path_separator(regenerate_converted.default_convert_options.cache_directory + should_be_text__file_name)
			}
		}).then(setup_generate_condition_for.bind(this));
	}
	return setup_generate_condition_for.call(this);
}

// Load the files whose texts are checked on every conversion.
function load_default_text_to_check() {
	this.text_to_check_files.forEach(from_file_name => this.load_text_to_check(from_file_name, { is_default: true }));
}
/**
 * Display helper: report which of the loaded check texts were matched during conversion.
 * Must be called with `this` bound to a converter.
 *
 * @param {Object}options { convert_to_language: 'TW' | 'CN' }
 * @returns {Object|undefined} { lost_texts, OK_count, NG_count }, or undefined when no check
 *          texts are loaded at all or none for the requested language.
 */
function report_text_to_check(options) {
	if (!this.generate_condition_for_language)
		return;

	const generate_condition_for = this.generate_condition_for_language[options.convert_to_language];
	if (!generate_condition_for) {
		// Nothing was loaded for this language; iterating `undefined` would throw.
		return;
	}

	const SGR_style = CeL.interact.console.SGR_style;
	const normal_style = (new SGR_style('fg=green;bg=black')).toString(),
		NG_style = (new SGR_style('fg=red;bg=white')).toString(),
		reset_style = (new SGR_style({ reset: true })).toString();

	// lost_texts: records the texts that have not been handled yet.
	const lost_texts = [], multi_matched = Object.create(null);
	let OK_count = 0, NG_count = 0;
	for (const [convert_from, convert_data] of generate_condition_for.entries()) {
		if (!convert_data.work_title) {
			// e.g., frequently mis-converted words @ this.text_to_check_files
			continue;
		}
		const { check_result } = convert_data;
		if (!check_result) {
			lost_texts.push(convert_data.should_convert_to_text);
			continue;
		}

		if (check_result.NG.length > 0 || check_result.OK.length > 1) {
			multi_matched[convert_from] = check_result.OK.length;
			if (check_result.NG.length > 0)
				multi_matched[convert_from] += ` + ${NG_style}${check_result.NG.length} NG${normal_style}`;
		}
		if (check_result.NG.length > 0)
			NG_count++;
		else
			OK_count++;
	}

	const message = `${report_text_to_check.name}: ${OK_count} OK, ${NG_count
		} NG.${lost_texts.length > 0 ? ` ${lost_texts.length} lost:\n\t${lost_texts.join('\n\t')}` : ''}`;
	if (NG_count > 0) {
		CeL.error(message);
	} else {
		CeL.log(message);
	}

	const multi_matched_keys = Object.keys(multi_matched);
	if (multi_matched_keys.length > 0) {
		// Shows how many times each text was matched.
		CeL.log(`multi matched counts:\n${normal_style
			}${multi_matched_keys.map(convert_from => `\t${convert_from}: \t${multi_matched[convert_from]}`).join('\n')
			}${reset_style}`);
	}

	return { lost_texts, OK_count, NG_count };
}

// ----------------------------------------------------------------------------

const condition_delimiter = '+';
/*
conditions will be split by `condition_delimiter`:
	word
	PoS:word
	PoS:

	// "~": marks this condition as the target text (is_target)
	~PoS:word
	// "!": select what does NOT match this condition (not_match)
	!PoS:word
	~!PoS:word
	// trailing "?": this condition is optional and may be skipped (is_optional)
	~!PoS:word?

	// --------------------------
	word:
	literal text
	/search_pattern/flags
	/search_pattern/replace_to/flags

	// "~/pattern of the wrongly converted text/correct replace_to/flags$" means: convert first,
	// then apply this replacement. Only usable on the target text (is_target).
	~/pattern/replace_to/flags
	literal text~/pattern/replace_to/flags
	/search_pattern/flags~/pattern/replace_to/flags

	literal text<filter_name>filter_target
*/

// [ condition, is target, not match, tag (PoS), word / pattern, is optional / repeat range ]
const PATTERN_condition = /^(?<is_target>~)?(?<not_match>!)?(?:(?<tag>[^:+<>]+):)?(?<word>.*?)(?<is_optional>\?)?$/;
// [ all, word, do_after_converting ]
const PATTERN_do_after_converting = new RegExp('^(?<word>.*?)~(?<do_after_converting>' + CeL.PATTERN_RegExp_replacement.source.slice(1, -1) + ')?$');

/**
 * Escape a text for use inside a condition: JSON escaping (so "\n" stays visible)
 * without the surrounding quotes and with double quotes left unescaped.
 *
 * @param {String}condition_text
 * @returns {String}
 */
function stringify_condition(condition_text) {
	return JSON.stringify(condition_text).slice(1, -1).replace(/\\"/g, '"');
}

/**
 * Render tagged word data as condition text "PoS:word".
 * Must be called with `this` bound to a converter (uses .KEY_PoS_tag and .KEY_word).
 *
 * @param {Object}word_data
 * @param {Object}[options] { including_prefix_spaces }
 * @returns {String}
 */
function word_data_to_condition(word_data, options) {
	const tag = word_data[this.KEY_PoS_tag];
	return (tag ? tag + ':' : '')
		+ (options?.including_prefix_spaces && word_data[KEY_prefix_spaces] ? stringify_condition(word_data[KEY_prefix_spaces]) : '')
		// Non-string words (e.g. patterns) are appended as they are.
		+ (typeof word_data[this.KEY_word] === 'string' && stringify_condition(word_data[this.KEY_word]) || word_data[this.KEY_word] || '');
}
/**
 * Parse a rule: convert {String}full_condition_text into {Object}condition data, or into an
 * {Array} of condition data (with `.target_index`) when the text holds several conditions.
 * Must be called with `this` bound to a converter (uses .KEY_word, .KEY_PoS_tag, .condition_filter).
 *
 * @param {String}full_condition_text conditions joined by `condition_delimiter`.
 * @param {Object}[options] { no_target, matched_condition }. Recursive calls for filter targets
 *        also pass { full_condition_splited, index } and read back `.combined_token_count`.
 * @returns {Object|Array}
 * @throws {Error} when a <filter> is used but the converter has no .condition_filter.
 */
function parse_condition(full_condition_text, options) {
	let target_index;

	function set_as_target(condition_data) {
		condition_data.is_target = true;
		condition_data.full_condition_text = full_condition_text;
		if (options?.matched_condition)
			condition_data.matched_condition = options.matched_condition;
	}

	const condition = [];
	const full_condition_splited = full_condition_text.split(condition_delimiter);
	for (let index = 0, accumulated_target_index_diff = 0; index < full_condition_splited.length; index++) {
		let token = full_condition_splited[index];
		let matched = token.match(PATTERN_condition).groups;

		if (/^\//.test(matched.tag) && /\(\?$/.test(matched.tag)) {
			// The ":" belonged to a pattern, not to a PoS tag. e.g., "/^(?:a)$/"
			matched.word = matched.tag + ':' + matched.word;
			matched.tag = undefined;
		}

		if (/^\/(\\\/|[^\/])+$/.test(matched.word)) {
			// The RegExp pattern itself contains condition_delimiter and was split apart.
			// e.g., ~里+/^许.+河$/ v:卷+m:/^[\\d〇一二三四五六七八九零十]+$/+~裡
			// `options` is optional: only a recursive call supplies the outer token list.
			const outer_splited = options?.full_condition_splited;
			const full_condition_splited_expanded = Array.isArray(outer_splited)
				? full_condition_splited.concat(outer_splited.slice(options.index + 1))
				: full_condition_splited;
			// Keep appending following tokens until the text forms a complete pattern.
			// Stop at the last real token instead of appending `undefined`.
			for (let combined_token = token, next_index = index; next_index < full_condition_splited_expanded.length - 1;) {
				const next_token = full_condition_splited_expanded[++next_index];
				combined_token += condition_delimiter + next_token;
				const _matched = combined_token.match(PATTERN_condition).groups;
				if (CeL.PATTERN_RegExp.test(_matched.word) || CeL.PATTERN_RegExp_replacement.test(_matched.word)) {
					token = combined_token;
					matched = _matched;
					accumulated_target_index_diff += next_index - index;
					index = next_index;
				}
			}
			if (options && index >= full_condition_splited.length) {
				// Tokens of the outer condition were consumed; report how many.
				// e.g., ~干<role.type:A1>/那.+何事$/
				options.combined_token_count = index - full_condition_splited.length + 1;
			}
		}

		const condition_data = Object.create(null);

		if (matched.is_target && !options?.no_target) {
			set_as_target(condition_data);
			if (target_index >= 0)
				CeL.warn(`${parse_condition.name}: Multiple targets: ${full_condition_text}`);
			else
				target_index = index - accumulated_target_index_diff;
		}

		let do_after_converting = matched.word && matched.word.match(PATTERN_do_after_converting);
		if (do_after_converting) {
			do_after_converting = do_after_converting.groups;
			matched.word = do_after_converting.word;
			if (do_after_converting = do_after_converting.do_after_converting?.to_RegExp({ allow_replacement: true }))
				condition_data.do_after_converting = do_after_converting;
		}

		if (matched.word) {
			let filter = matched.word.match(/^(?<word>.*?)<(?<filter_name>[^<>]+)>(?<filter_target>.*?)$/);
			if (filter) {
				if (!this.condition_filter)
					throw new Error('No .condition_filter set but set filter: ' + matched.word);
				filter = filter.groups;
				const _options = { no_target: true, full_condition_splited, index };
				Object.assign(condition_data, {
					[this.KEY_word]: filter.word,
					[KEY_filter_name]: filter.filter_name,
					filter_target: parse_condition.call(this, filter.filter_target, _options)
				});
				if (_options.combined_token_count > 0) {
					// The filter target swallowed following tokens; skip them here.
					token = full_condition_splited.slice(index, index + _options.combined_token_count + 1).join(condition_delimiter);
					accumulated_target_index_diff += _options.combined_token_count;
					index += _options.combined_token_count;
				}
			} else {
				condition_data[this.KEY_word] = CeL.PATTERN_RegExp.test(matched.word) || CeL.PATTERN_RegExp_replacement.test(matched.word)
					? matched.word.to_RegExp({ allow_replacement: true })
					// allow '\n', '\t' in filter.
					: matched.word.replace(/\\\w/g, char => JSON.parse(`"${char}"`));
			}
		}

		condition_data.condition_text = token;
		if (matched.not_match) {
			// !!: store a real boolean; XOR on the raw string could misbehave. ('!'^true)===1
			condition_data.not_match = !!matched.not_match;
		}
		if (matched.tag)
			condition_data[this.KEY_PoS_tag] = matched.tag;
		if (matched.is_optional)
			condition_data.is_optional = true;
		condition.push(condition_data);
	}

	if (!(target_index >= 0) && !options?.no_target) {
		// With no explicit target the first condition is the target.
		set_as_target(condition[0]);
	}

	if (condition.length === 1) {
		return condition[0];
	}

	if (!options?.no_target) {
		// default: set [0] as target.
		condition.target_index = target_index || 0;
	}
	return condition;
}

// ------------------------------------------------------------------

// Key under which word data keeps the condition that matched it.
const KEY_matched_condition = 'matched condition';
/**
 * Display helper: print the candidate correction conditions of one wrongly converted word.
 *
 * @param {Array}correction_condition candidate conditions, carrying .parsed, .target and
 *        .error_converted_to.
 * @param {Object}options { work_title, original_sentence_word_list, tagged_convert_from_text }
 */
function print_correction_condition(correction_condition, options) {
	const to_word_data = correction_condition.parsed[KEY_matched_condition];
	let matched_condition_mark;
	if (to_word_data) {
		matched_condition_mark = ` 匹配的條件式: ${to_word_data.matched_condition ? `${to_word_data.matched_condition} → ` : ''}${to_word_data.full_condition_text}`;
		CeL.warn(`Matched condition${matched_condition_mark}`);
	}

	// Offer candidate conditions that would produce the expected answer.
	CeL.info(`Candidate correction for ${JSON.stringify(correction_condition.parsed.text)}→${JSON.stringify(correction_condition.target)} (錯誤轉換為 ${JSON.stringify(correction_condition.error_converted_to)}):`);
	const {
		work_title, original_sentence_word_list, tagged_convert_from_text,
	} = options;
	if (tagged_convert_from_text) {
		const list = correction_condition.slice(1).filter(correction => !correction.includes('<←'));
		list.push(tagged_convert_from_text.join(condition_delimiter));
		CeL.info(`//${matched_condition_mark ? ' 解析錯誤 @' : ''}${work_title ? ` 《${work_title}》` : ''} ${original_sentence_word_list} (${list.join(' ')})${matched_condition_mark || ''}`);
	}
	CeL.info(correction_condition.join('\t'));
}

/**
 * Display helper: show one problematic item — the tagged words with the mismatching ones
 * highlighted, the source text, a coloured diff of actual vs. expected conversion and the
 * candidate corrections.
 * Must be called with `this` bound to a converter.
 *
 * @param {Object}configuration { tagged_word_list, condition_list, convert_from_text,
 *        convert_to_text, should_convert_to_text, show_tagged_word_list, start_index, end_index,
 *        distance_token_header_to_metched, test_title }
 * @param {Object}options { convert_to_language, message_should_be, work_title }
 */
function print_section_report(configuration, options) {
	const {
		tagged_word_list, condition_list,
		convert_from_text, convert_to_text, should_convert_to_text,
		show_tagged_word_list,
		start_index, end_index, distance_token_header_to_metched
	} = configuration;
	const { index_hash } = condition_list;

	const SGR_style = CeL.interact.console.SGR_style;
	const normal_style_tagged = (new SGR_style('fg=blue;bg=cyan')).toString(),
		marked_style_row = 'fg=red;bg=white',
		marked_style = (new SGR_style(marked_style_row)).toString(),
		reset_style = (new SGR_style({ reset: true })).toString();
	const normal_style_convert_from_text_row = 'fg=green;bg=black';
	const ansi_convert_from_text = new CeL.interact.console.SGR(convert_from_text);

	let backward = 0, forward = 0;
	const is_fragment = start_index >= 0 && should_convert_to_text.chars().length <= 4;
	if (is_fragment) {
		// The extracted piece is too short: expand it to the whole sentence.
		// assert: 0 <= start_index < end_index
		let index = start_index;
		// Search backwards for punctuation.
		while (index > 0) {
			const word_data = tagged_word_list[--index];
			if (word_data[this.KEY_PoS_tag] === this.TAG_punctuation) {
				if (index < start_index && /[、,;:。?!…]$/.test(word_data[this.KEY_word]))
					index++;
				break;
			}
		}
		backward = start_index - index;
		// assert: 0 <= backward <= start_index

		// start from next tagged_word_list[], at least move 1 step.
		// Search forwards for punctuation.
		index = end_index;
		while (index < tagged_word_list.length) {
			const word_data = tagged_word_list[index++];
			if (word_data[this.KEY_PoS_tag] === this.TAG_punctuation) {
				break;
			}
		}
		forward = index - end_index;
	}

	const tagged_word_list_pieces = start_index >= 0
		? tagged_word_list.slice(start_index - backward, end_index + forward)
		: tagged_word_list;

	let offset = convert_from_text.match(/^\s*/)[0].length, original_sentence_word_list = [];
	const tagged_convert_from_text = [];
	const matched_conditions = [];
	CeL.log(`${normal_style_tagged
		}${CeL.gettext.get_alias(options.convert_to_language === 'TW' ? 'CN' : 'TW').slice(0, 1)
		}\t${tagged_word_list_pieces.map((word_data, piece_index) => {
			const prefix_spaces = piece_index > 0 && word_data[KEY_prefix_spaces] || '';
			// The condition filter drops prefix spaces by default, so list them separately.
			// @see match_single_condition()
			const text = stringify_condition(prefix_spaces) + word_data_to_condition.call(this, word_data);
			tagged_convert_from_text.push(text);
			original_sentence_word_list.push(prefix_spaces + word_data[this.KEY_word]);

			const matched_condition_data = word_data[KEY_matched_condition];
			if (matched_condition_data) {
				matched_conditions.push(matched_condition_data.matched_condition + ' → ' + matched_condition_data.condition_text);
			}

			// Index relative to start_index; words of the backward expansion are negative.
			const index = backward ? piece_index - backward : piece_index;
			if (backward && index < 0) {
				return text;
			}

			if (prefix_spaces)
				offset += prefix_spaces.length;
			const start_offset = offset;
			offset += word_data[this.KEY_word].length;
			if (index === 0) {
				// assert: convert_from_text.trimStart().startsWith(word_data_to_condition.call(this, word_data).slice(distance_token_header_to_metched));
				if (distance_token_header_to_metched) {
					// assert: distance_token_header_to_metched >= prefix_spaces.length
					offset -= distance_token_header_to_metched - (word_data[KEY_prefix_spaces] || '').length;
				}
			}

			if (!index_hash[start_index >= 0 ? start_index + index : index]) {
				return text;
			}

			if (ansi_convert_from_text.style_at(start_offset, true)) {
				// assert: consecutive match; .style_at(start_offset, normal_style_convert_from_text_row) was set before.
				// assert: ansi_convert_from_text.style_at(start_offset, true).toString() === normal_style_convert_from_text_row.toString()
				ansi_convert_from_text.style_at(start_offset, null);
			} else {
				ansi_convert_from_text.style_at(start_offset, marked_style_row);
			}
			ansi_convert_from_text.style_at(offset, normal_style_convert_from_text_row);
			return marked_style + text + normal_style_tagged;
		}).join(condition_delimiter)
		}${reset_style}`);

	original_sentence_word_list = original_sentence_word_list.join('');
	if (is_fragment) {
		// Show the whole sentence.
		CeL.log(`\t原文⇒${reset_style}${JSON.stringify(original_sentence_word_list)}`);
	}

	CeL.log(`${(new SGR_style(normal_style_convert_from_text_row)).toString()
		}原文:\t ${ansi_convert_from_text.toString().replace(/\r/g, '\\r').replace(/\n/g, '\\n')}${reset_style}`);

	// Colour the differences between the actual and the expected conversion.
	CeL.coloring_diff(JSON.stringify(convert_to_text), JSON.stringify(should_convert_to_text), {
		headers: [
			`→ ${CeL.gettext.get_alias(options.convert_to_language).slice(0, 1)}\t`,
			` ${options.message_should_be || '應為'}\t`
		],
		header_style: { fg: 'cyan' },
		print: true
	});

	let { work_title } = options;
	if (!work_title && (work_title = configuration.test_title)) {
		work_title = work_title.match(/watch_target\.(.+)\.(TW|CN)/);
		if (work_title)
			work_title = work_title[1];
	}
	condition_list.forEach(condition => print_correction_condition(condition, {
		work_title, original_sentence_word_list, tagged_convert_from_text,
	}));

	if (matched_conditions.length > 0) {
		matched_conditions.unshift('匹配的條件式:');
		CeL.log(matched_conditions.join('\n\t'));
	}

	if (!is_fragment) {
		CeL.log(`單純 zh_conversion 轉換過程:`);
		CeL.log('單純:\t ' + (options.convert_to_language === 'TW' ? CeL_CN_to_TW : CeL_TW_to_CN)(convert_from_text, { show_matched: true }));
	}

	if (show_tagged_word_list) {
		CeL.debug(beautify_tagged_word_list(tagged_word_list_pieces), 1);
	}
}
/**
 * {Boolean} Skip the tests whose converted text and answer differ in length.
 * e.g., conversion of regional vocabulary.
 */
const skip_tests_convert_to_different_length = true;

/**
 * Report a character-count mismatch between a converted text and its answer.
 *
 * When both texts have the same number of characters (counted through the
 * `.chars()` string helper) nothing is printed and `undefined` is returned.
 * Otherwise a colored diff of both texts is always printed; it is preceded by
 * a warning (or an error, depending on the module flag) unless `no_warning`
 * is set.
 *
 * @param {String} converted_text_String  The text to check.
 * @param {String} [should_be_text]  The expected answer.
 * @param {Boolean} [no_warning]  Suppress the warning / error line only.
 * @param {Boolean} [is_convert_from]  Label the first text as "before
 *        conversion" instead of "after conversion" in the diff header.
 * @returns {Boolean|undefined} `undefined` for equal lengths; otherwise the
 *          negation of `skip_tests_convert_to_different_length`, where `true`
 *          means the caller should skip this test.
 */
function check_convert_to_different_length(converted_text_String, should_be_text, no_warning, is_convert_from) {
	const converted_length = converted_text_String.chars().length;
	const answer_length = should_be_text?.chars().length;
	if (converted_length === answer_length)
		return;

	if (!no_warning) {
		const reporter_name = check_convert_to_different_length.name;
		if (skip_tests_convert_to_different_length) {
			// Lengths differ before / after the conversion: only warn.
			CeL.warn(`${reporter_name}: 預設解答與轉換後之文字長度不符!`);
		} else {
			CeL.error(`${reporter_name}: 預設解答與轉換後之文字長度不符,跳過此項!`);
		}
	}

	// Color the differences between the two texts.
	const first_header = is_convert_from ? '轉換前:' : '轉換後:';
	CeL.coloring_diff(converted_text_String, should_be_text, {
		headers: [
			first_header + '\t',
			`解答:\t`,
		],
		header_style: {
			fg: 'yellow'
		},
		print: true
	});

	// return true: Skip this test.
	return !skip_tests_convert_to_different_length;
}

// ------------------------------------------------------------------

// Keys of the special containers stored inside a `convertion_pairs` Map.
const KEY_tag_filter = Symbol('tag filter');
const KEY_tag_pattern_filter = Symbol('tag pattern filter');
const KEY_general_pattern_filter = Symbol('general pattern filter');
// Property name under which a condition list keeps its {RegExp} pattern.
const KEY_pattern = 'pattern';
/**
 * Look up, or with `options.create` register, the list of convert-to
 * conditions that belongs to a word filter.
 *
 * Where the list lives inside `options.convertion_pairs`:
 * - plain word: the Map itself, or the per-tag Map under `KEY_tag_filter`
 *   when `options.try_tag` is set and the word carries a PoS tag;
 * - {RegExp} word or `options.search_pattern`: the Map under
 *   `KEY_general_pattern_filter`, or the per-tag Map under
 *   `KEY_tag_pattern_filter`.
 *
 * With `options.search_pattern` and a string word, every registered list
 * whose `[KEY_pattern]` matches the word is a candidate.
 *
 * @param {Object} options
 * @param {Object} options.word_data  Word / filter; read through
 *        `this.KEY_word` and `this.KEY_PoS_tag`.
 * @param {Map} options.convertion_pairs  Container of one language.
 * @param {Boolean} [options.create]  Create missing containers / lists.
 * @param {Boolean} [options.try_tag]  Prefer the per-tag containers.
 * @param {Boolean} [options.search_pattern]  Treat the word as a text to be
 *        matched against the registered patterns.
 * @param {Boolean} [options.get_all_matched_conditions]  Return an array of
 *        lists instead of the first / only list.
 * @returns {Array|undefined} The condition list (or array of lists), or
 *          `undefined` when nothing is registered and `create` is not set.
 */
function get_convert_to_conditions(options) {
	const { word_data, convertion_pairs }
		// Fall back to an empty object when no options are given, so the
		// destructuring itself never fails.
		= options ?? Object.create(null);
	let convertion_set, key = word_data[this.KEY_word], pattern;
	const KEY_PoS_tag = this.KEY_PoS_tag;

	// Switch `convertion_set` to the Map of the word's PoS tag inside the
	// container stored under KEY. Returns undefined when that Map is missing
	// and must not be created.
	function set_tag_convertion(KEY) {
		convertion_set = convertion_pairs.get(KEY);
		const tag = word_data[KEY_PoS_tag];
		if (!convertion_set[tag]) {
			if (!options?.create)
				return;
			convertion_set[tag] = new Map;
		}
		return convertion_set = convertion_set[tag];
	}

	// A pattern registered with the g (or y) flag keeps its position in
	// .lastIndex between .test() calls, which made every second lookup of the
	// same text fail. Rewind before and after testing so lookups are stateless.
	function test_pattern(registered_pattern, text) {
		registered_pattern.lastIndex = 0;
		const is_matched = registered_pattern.test(text);
		registered_pattern.lastIndex = 0;
		return is_matched;
	}

	const has_tag = options?.try_tag && word_data[KEY_PoS_tag];

	if (CeL.is_RegExp(key) || options?.search_pattern) {
		if (has_tag) {
			if (!set_tag_convertion(KEY_tag_pattern_filter))
				return;
		} else {
			convertion_set = convertion_pairs.get(KEY_general_pattern_filter);
		}

		if (CeL.is_RegExp(key)) {
			pattern = key;
			// Use the pattern source as key, without the "g" flag.
			key = key.toString().replace(/(\w)+$/, flags => flags.replace(/[g]/, ''));
		} else {
			const all_matched_conditions = [];
			for (const convert_to_conditions of convertion_set.values()) {
				// assert {Array}convert_to_conditions
				if (test_pattern(convert_to_conditions[KEY_pattern], key)) {
					if (!options.get_all_matched_conditions)
						return convert_to_conditions;
					all_matched_conditions.push(convert_to_conditions);
				}
			}
			if (all_matched_conditions.length > 0)
				return all_matched_conditions;
			// No pattern matched: fall through to the exact-key lookup below.
		}

	} else if (has_tag) {
		if (!set_tag_convertion(KEY_tag_filter))
			return;
	} else {
		convertion_set = convertion_pairs;
	}

	if (!convertion_set.has(key)) {
		if (!options?.create)
			return;
		// Initialization of a new condition list.
		const new_conditions = [];
		if (pattern)
			new_conditions[KEY_pattern] = pattern;
		convertion_set.set(key, new_conditions);
	}

	const convert_to_conditions = convertion_set.get(key);
	// assert: {Array}convert_to_conditions
	return options.get_all_matched_conditions ? [convert_to_conditions] : convert_to_conditions;
}
// Key of the {Array} of postfix conditions inside a `convertion_pairs` Map.
const KEY_postfix = Symbol('postfix');

/**
 * Read a dictionary file and build `this.convertion_pairs[options.language]`.
 *
 * Every paragraph of the file is a tab-separated record: the first field is
 * the matched condition (word filter), the following fields are the
 * convert-to conditions. Records without conditions and invalid filters are
 * reported through `CeL.error()` and skipped.
 *
 * Inside one condition list, conditions flagged `.do_after_converting` are
 * kept behind all the others; apart from that the file order is preserved.
 *
 * @param {String} file_path  Passed to `get_paragraphs_of_file()`.
 * @param {Object} options
 * @param {String} options.language  Key of `this.convertion_pairs` to fill.
 * @throws {Error} When a matched condition is flagged `.not_match`
 *         (not yet implemented).
 */
function load_dictionary(file_path, options) {
	const paragraphs = get_paragraphs_of_file(file_path);

	// Initialization: a fresh container for this language.
	const convertion_pairs = new Map;
	this.convertion_pairs[options.language] = convertion_pairs;
	convertion_pairs
		.set(KEY_tag_filter, Object.create(null))
		.set(KEY_tag_pattern_filter, Object.create(null))
		.set(KEY_general_pattern_filter, new Map)
		.set(KEY_postfix, []);

	for (const paragraph of paragraphs) {
		const fields = paragraph.split('\t');
		const matched_condition = fields[0].trim();
		if (fields.length < 2 || !matched_condition) {
			CeL.error(`${load_dictionary.name}: 未設定轉換條件: ${fields.join('\t')}`);
			continue;
		}

		const filter = parse_condition.call(this, matched_condition);
		const is_postfix = filter.filter_name === 'postfix';
		if (!is_postfix && !filter[this.KEY_word] && !filter[this.KEY_PoS_tag]) {
			// assert: !!matched_condition === true
			CeL.error(`Invalid word filter: ${matched_condition}`);
			continue;
		}
		if (filter.not_match)
			throw new Error('NYI: not_match');

		const convert_to_conditions = is_postfix
			? convertion_pairs.get(KEY_postfix)
			: get_convert_to_conditions.call(this, {
				word_data: filter,
				convertion_pairs,
				create: true,
				try_tag: true
			});

		for (const [field_index, raw_condition] of fields.entries()) {
			if (field_index === 0) {
				// The first field is the matched condition itself.
				continue;
			}
			if (!raw_condition.trim()) {
				CeL.error(`${load_dictionary.name}: Empty condition[${field_index}] in ${JSON.stringify(fields)}`);
				continue;
			}

			const condition = parse_condition.call(this, raw_condition, { matched_condition });
			const last_condition = convert_to_conditions[convert_to_conditions.length - 1];
			if (condition.do_after_converting || !last_condition?.do_after_converting) {
				// TODO: Turn {Array} patterns into {RegExp} patterns and use
				// .replace(pattern, token => match_condition(token)).
				convert_to_conditions.push(condition);
				continue;
			}

			// The list ends with a run of .do_after_converting conditions:
			// insert this plain condition right in front of that run.
			let insert_at = convert_to_conditions.length;
			do {
				insert_at--;
			} while (insert_at > 0 && convert_to_conditions[insert_at - 1].do_after_converting);
			// assert: !convert_to_conditions[insert_at - 1].do_after_converting && convert_to_conditions[insert_at].do_after_converting
			convert_to_conditions.splice(insert_at, 0, condition);
		}
	}
}
// Property of a synonym Map that holds its {Array} of {RegExp} synonym patterns.
const KEY_synonym_pattern = Symbol('synonym pattern');

/**
 * Load `synonym.<language>.txt` from `this.dictionaries_directory` for every
 * language of `dictionary_template` into `this.synonyms_of_language`.
 *
 * Result shape, e.g.:
 * this.synonyms_of_language['TW'] = {Map} { '台灣' => [ '臺灣' ] }
 *
 * File format, one record per line after comments are removed:
 * - `standard<TAB>synonym<TAB>synonym…`: each synonym maps to `[standard]`;
 * - `<TAB>synonym<TAB>synonym…` (empty first field): each synonym maps to
 *   the whole synonym list;
 * - a single field that looks like a RegExp replacement is compiled with
 *   `.to_RegExp()` and appended to `Map[KEY_synonym_pattern]`;
 * - any other single field is reported as an error.
 *
 * Existing Maps are reused, so repeated calls add to earlier content.
 * Languages without a readable file are left with their (possibly empty) Map.
 */
function load_synonym_dictionary() {
	if (!this.synonyms_of_language)
		this.synonyms_of_language = Object.create(null);

	for (const language in dictionary_template) {
		let synonyms_Map = this.synonyms_of_language[language];
		if (!synonyms_Map) {
			// initialization
			synonyms_Map = new Map;
			synonyms_Map[KEY_synonym_pattern] = [];
			this.synonyms_of_language[language] = synonyms_Map;
		}

		const file_contents = CeL.read_file(this.dictionaries_directory + `synonym.${language}.txt`);
		if (!file_contents)
			continue;

		const synonym_data = CeL.data.Convert_Pairs.remove_comments(file_contents.toString().replace(/\r/g, ''));
		for (const line of synonym_data.split('\n')) {
			if (!line)
				continue;

			const synonyms = line.split('\t');
			// The first field is the standard (orthodox) character / word.
			const standard_form = synonyms.shift();

			if (synonyms.length === 0) {
				if (CeL.PATTERN_RegExp_replacement.test(standard_form)) {
					// {RegExp} pattern of interchangeable characters / synonyms.
					synonyms_Map[KEY_synonym_pattern].push(standard_form.to_RegExp({ allow_replacement: true }));
				} else {
					CeL.error(`${load_synonym_dictionary.name}: No synonym settle: ${standard_form}`);
				}
				continue;
			}

			// When a standard form is given, only converting to the standard
			// form is allowed; converting to a vulgar variant is not.
			const allowed_synonyms = standard_form ? [standard_form] : synonyms;
			for (const synonym of synonyms) {
				if (!synonym)
					continue;
				if (synonyms_Map.has(synonym)) {
					// Reported, but the later definition still wins.
					CeL.error(`${load_synonym_dictionary.name}: 重複設定: ${JSON.stringify(synonym)}`);
				}
				synonyms_Map.set(synonym, allowed_synonyms);
			}
		}
	}
}
// ----------------------------------------------------------------------------

/**
 * Evaluate a named filter (`single_condition.filter_name`) against LTP
 * parsing results.
 *
 * @inner Must be kept in step with generate_condition_LTP().
 *
 * Supported filter names, tried in this order:
 * 1. a name present in `this.filters[options.convert_to_language]`: accepted
 *    here without further checks (presumably evaluated elsewhere; confirm
 *    against the callers);
 * 2. the word's own `.relation`: the filter target is matched against the
 *    word's parent. e.g., ~只<ATT>b:/表/
 * 3. `←relation`: search the words of the same query for one whose parent is
 *    this word, whose relation is `relation` and which matches the target;
 * 4. `role` / `parent`, optionally `role.<property>:<value>`: some entry of
 *    `word_data.roles` / `word_data.parents` matches the target.
 *    e.g., 沖<role.type:A1>/[水浴杯]/
 *
 * @param {Object} single_condition  Reads `.filter_name`, `.filter_target`.
 * @param {Object} word_data  LTP word; reads `.id`, `.relation`, `.parent`,
 *        `.roles` / `.parents`.
 * @param {Object} options  Reads `.convert_to_language`, `.tagged_word_list`,
 *        `.index_of_tagged_word_list`.
 * @returns {*} Truthy when the filter is satisfied; `undefined` when no
 *          branch applies.
 */
function condition_filter_LTP(single_condition, word_data, options) {
	if (single_condition.filter_name in this.filters[options.convert_to_language])
		return true;

	const { tagged_word_list } = options;
	// assert: word_data === tagged_word_list[options.index_of_tagged_word_list]
	// Word ids are relative to one query; this offset turns an id into an
	// index of tagged_word_list.
	const tagged_word_list_index_offset = options.index_of_tagged_word_list - word_data.id;

	if (single_condition.filter_name === word_data.relation) {
		// A specified relation: test the parent word.
		return match_single_condition.call(this, single_condition.filter_target,
			tagged_word_list[tagged_word_list_index_offset + word_data.parent], options);
	}

	const reverse_relation = single_condition.filter_name.match(/^←(.+)$/);
	if (reverse_relation) {
		const relation = reverse_relation[1];
		// Search for the reversed relation.
		for (let index = tagged_word_list_index_offset, latest_id = -1; index < tagged_word_list.length; index++) {
			const word_data_to_test = tagged_word_list[index];
			if (latest_id >= word_data_to_test.id) {
				// tagged_word_list may be stitched together from several
				// queries by recover_original_paragraphs(). Ids starting over
				// means we have left the range of this query.
				// assert: word_data_to_test.id === 0
				return;
			}
			// assert: word_data_to_test.id === latest_id + 1
			latest_id = word_data_to_test.id;
			if (word_data_to_test.parent === word_data.id && word_data_to_test.relation === relation
				&& match_single_condition.call(this, single_condition.filter_target, word_data_to_test, options)) {
				return true;
			}
		}
	}

	const property_filter = single_condition.filter_name.match(/(?<property_name>(?:role|parent))(?:\.(?<sub_property_name>[^:]+):(?<sub_property_value>.+))?/);
	if (property_filter) {
		const { property_name, sub_property_name, sub_property_value } = property_filter.groups;
		const filter_target = single_condition.filter_target;
		// Search roles / parents.
		return word_data[property_name + 's'].some(token => {
			const parent_index = tagged_word_list_index_offset + token.parent;
			// Match against the token merged with the word it points to. The
			// merge goes into a copy: assigning onto the token itself
			// overwrote token.parent with the parent's own parent, so a
			// second evaluation resolved a different word, and it created
			// circular references inside tagged_word_list.
			const candidate = parent_index in tagged_word_list
				? { ...token, ...tagged_word_list[parent_index] }
				: token;
			return (!sub_property_name || candidate[sub_property_name] === sub_property_value)
				&& match_single_condition.call(this, filter_target, candidate, options);
		});
	}
}
/**
 * Test one word against one condition.
 *
 * Conditions carrying a filter name are delegated to `this.condition_filter`.
 * Otherwise the PoS tag filter is tested first, then the word filter.
 *
 * @param {Object} single_condition  Reads `[KEY_filter_name]`,
 *        `[this.KEY_PoS_tag]`, `[this.KEY_word]`, `.not_match`, `.is_target`.
 * @param {Object} word_data  Reads `[this.KEY_PoS_tag]`, `[this.KEY_word]`.
 * @param {Object} [options]  Only forwarded to `this.condition_filter`.
 * @returns {*} Truthy when the word fits. Depending on the branch this is the
 *          filter's result, `single_condition.not_match`, `true`, or the
 *          number 0 / 1 produced by the final XOR.
 */
function match_single_condition(single_condition, word_data, options) {
	if (single_condition[KEY_filter_name]) {
		// Named filter: entirely up to the parser-specific condition filter.
		return this.condition_filter && this.condition_filter(single_condition, word_data, options);
	}

	// Convert according to the best PoS tag.
	// ICTPOS3.0 tag set https://gist.github.com/luw2007/6016931 http://ictclas.nlpir.org/
	// CKIP Chinese word segmentation system, list of PoS tags http://ckipsvr.iis.sinica.edu.tw/cat.htm https://github.com/ckiplab/ckiptagger/wiki/POS-Tags
	// NLPIR PoS categories: ICT Chinese PoS tag set http://103.242.175.216:197/nlpir/html/readme.htm
	const tag_filter = single_condition[this.KEY_PoS_tag];
	if (tag_filter && !CeL.fit_filter(tag_filter, word_data[this.KEY_PoS_tag])) {
		return single_condition.not_match;
	}

	const word_filter = single_condition[this.KEY_word];
	if (!word_filter) {
		return true;
	}
	if (single_condition.is_target && !CeL.is_RegExp(word_filter)) {
		// For .is_target, [this.KEY_word] may be the string to convert to;
		// it is not a filter in that case.
		return true;
	}

	// A condition filter normally excludes prefix spaces, so only the word
	// itself is tested here.
	const is_fit = CeL.fit_filter(word_filter, word_data[this.KEY_word]);
	return single_condition.not_match ^ is_fit;
}
/**
 * Test a word, and for sequence conditions its neighbors, against a
 * convert-to condition.
 *
 * A non-array condition is tested against the word alone. An {Array} is a
 * sequence of single conditions; `conditions.target_index` (default 0) marks
 * the part that belongs to the word itself, the parts after / before it are
 * matched against the following / preceding words. A part flagged
 * `.is_optional` may be skipped when its word does not match.
 *
 * Optional parts are also skipped when the word list has no word left at that
 * position (sentence boundary). Previously running out of words failed the
 * whole sequence even if only optional parts remained.
 *
 * NOTE(review): neighbors are tested with the unmodified `options`, so
 * `options.index_of_tagged_word_list` still points at the target word while
 * `word_data` is a neighbor — confirm that named filters tolerate this.
 *
 * @param {Object} options
 * @param {Object|Array} options.conditions  Condition(s) to test.
 * @param {Object} options.word_data  The word at the target position.
 * @param {Array} options.tagged_word_list  All tagged words.
 * @param {Number} options.index_of_tagged_word_list  Index of `word_data`.
 * @returns {*} The matched condition (for a sequence: the target part), or a
 *          falsy value when nothing matched.
 */
function match_condition(options) {
	const { conditions, word_data, tagged_word_list } = options;
	if (!Array.isArray(conditions)) {
		return match_single_condition.call(this, conditions, word_data, options) && conditions;
	}

	const target_index = conditions.target_index || 0;
	// Check the part of the word itself.
	if (!match_single_condition.call(this, conditions[target_index], word_data, options))
		return;

	// Walk the conditions away from the target (step +1: following words,
	// step -1: preceding words). A word is consumed only by a matching part.
	const match_neighbors = step => {
		let index_of_target = options.index_of_tagged_word_list + step;
		for (let index_of_condition = target_index + step;
			index_of_condition >= 0 && index_of_condition < conditions.length;
			index_of_condition += step) {
			const condition = conditions[index_of_condition];
			const has_word = index_of_target >= 0 && index_of_target < tagged_word_list.length;
			if (has_word && match_single_condition.call(this, condition, tagged_word_list[index_of_target], options)) {
				index_of_target += step;
			} else if (!condition.is_optional) {
				return false;
			}
			// Otherwise: skip the optional condition, try the next one.
		}
		return true;
	};

	// Check the following words first, then the preceding ones.
	if (!match_neighbors(1) || !match_neighbors(-1))
		return;

	return conditions[target_index];
}
condition, tagged_word_list[index_of_target], options)) { index_of_target++; } else { if (!condition.is_optional) return; // Skip the condition, try next condition. } } // 向前檢查。 for (let index_of_condition = target_index - 1, index_of_target = options.index_of_tagged_word_list - 1; index_of_condition >= 0; index_of_condition--) { if (index_of_target < 0) return; const condition = conditions[index_of_condition]; if (match_single_condition.call(this, condition, tagged_word_list[index_of_target], options)) { index_of_target--; } else { if (!condition.is_optional) return; // Skip the condition, try next condition. } } return conditions[target_index]; } function get_matched_condition(options) { let all_convert_to_conditions = get_convert_to_conditions.call(this, { ...options, get_all_matched_conditions: true }); //console.trace([word_data, all_convert_to_conditions]); //console.trace(all_convert_to_conditions); if (!all_convert_to_conditions) { return; } const all_matched_conditions = []; all_convert_to_conditions.forEach(convert_to_conditions => { // assert: convert_to_conditions = [{ [this.KEY_word]: '詞', [this.KEY_PoS_tag]: '詞性' }, { [this.KEY_word]: '詞', [this.KEY_PoS_tag]: '詞性' }, ...] for (let index_of_conditions = 0; index_of_conditions < convert_to_conditions.length; index_of_conditions++) { const conditions = convert_to_conditions[index_of_conditions]; const matched_condition = match_condition.call(this, { ...options, conditions }); if (matched_condition) { //console.trace([matched_condition, convert_to_conditions, convert_to_conditions.pattern]); all_matched_conditions.push(matched_condition); if (!convert_to_conditions.pattern) { // 對於非 pattern,僅取第一個 matched 的。 return; } } } }); if (all_matched_conditions.length > 0) { //console.trace([all_matched_conditions, all_convert_to_conditions]); return { all_matched_conditions, all_convert_to_conditions }; } return { all_conver