UNPKG

idsfind

Version:

A tool that searches for Chinese characters by their components and remaining stroke count.

286 lines (258 loc) 8.29 kB
import { existsSync, readFileSync, writeFileSync, readdirSync } from "fs";
import download from "download";
import { parse } from "csv-parse/sync";
import chalk from "chalk";
import { mergeWith, isArray } from "lodash";

/** Character -> list of IDS parts (one array element per code point). */
interface IDSOBJ {
  [hanzi: string]: string[];
}

/** Character -> total stroke count. */
interface STOKESOBJ {
  [hanzi: string]: number;
}

/** IDS part -> characters whose decomposition contains that part. */
interface INVERTEDIDS {
  [ids: string]: string[];
}

/** Recursion depth -> inverted index built at that depth (0 = direct parts). */
interface ALLINVERTEDIDS {
  [depth: number]: INVERTEDIDS;
}

/** Character -> IDS string as read from the cjkvi-ids files. */
interface CJKVI_IDS {
  [char: string]: string;
}

/** One row of the GlyphWiki dump (three `|`-separated cells). */
interface Glyph {
  name: string;
  related: string;
  data: string;
}

const UNIHAN_URL = "https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip";
const DOWNLOAD_UNIHAN_TO = "data/unihan";

const CHISE_IDS_URL =
  "https://gitlab.chise.org/CHISE/ids/-/archive/master/ids-master.zip";
const DOWNLOAD_CHISEIDS_TO = "data/chise-ids";

const CJKVI_IDS_URL =
  "https://github.com/toyjack/cjkvi-ids/archive/refs/heads/master.zip";
const DOWNLOAD_CJKVIIDS_TO = "data/cjkvi-ids";

const GLYPHWIKI_DUMP_URL = "http://glyphwiki.org/dump.tar.gz";
const DOWNLOAD_GLYPHWIKI_DUMP_TO = "data/glyphwiki";

const DOWNLOAD_OPTIONS = {
  extract: true,
};

const CSV_OPTIONS = {
  comment: "#",
  delimiter: "\t",
  skip_empty_lines: true,
  relax_column_count: true,
};

// Module-level state shared by the build steps below.
const inverted: ALLINVERTEDIDS = {};
// Current recursion depth of genInverted(); selects the bucket in `inverted`.
let depth = 0;
const strokesObj: STOKESOBJ = {};
const idsObj: IDSOBJ = {};
const cjkviObj: CJKVI_IDS = {};
const glyphwiki_ids_index: string[] = [];
const glyphwiki_all: Glyph[] = [];

/**
 * Tells whether the first code point of `part` lies in U+2FF0..U+2FFF.
 *
 * @param part - String whose first code point is inspected.
 * @returns `true` when the first code point is inside that range; `false`
 *   otherwise, including for the empty string.
 */
function isIDC(part: string): boolean {
  const code = part.codePointAt(0);
  if (code === undefined) return false;
  return code >= 0x2ff0 && code <= 0x2fff;
}

/**
 * Serializes `jsonData` with `JSON.stringify` and writes it to `fileName`
 * as UTF-8, replacing any existing file.
 *
 * @param jsonData - Value to serialize.
 * @param fileName - Destination path.
 */
function writeOutJsonFile(jsonData: unknown, fileName: string): void {
  writeFileSync(fileName, JSON.stringify(jsonData), "utf-8");
}

/**
 * Splits an IDS string into one array element per Unicode code point, so
 * characters outside the BMP (stored as surrogate pairs) stay intact.
 *
 * Uses the string iterator rather than manual surrogate pairing: an unpaired
 * high surrogate is emitted on its own instead of being glued to whatever
 * follows it (or to the text "undefined" at the end of the string).
 *
 * @param idsString - IDS string to split.
 * @returns The code points of `idsString`, in order.
 */
function fixSurrogate(idsString: string): string[] {
  return Array.from(idsString);
}

/**
 * Records `hanzi` under every non-IDC part of `ids` in `inverted[depth]`,
 * then recurses into each part that has its own entry in `idsObj`, one depth
 * level deeper, so that sub-components are indexed by nesting level.
 *
 * Relies on the module-level `depth` counter and mutates `inverted`.
 *
 * NOTE(review): only a part equal to `hanzi` is excluded from recursion; a
 * decomposition cycle that does not pass through `hanzi` would recurse
 * without bound. Confirm the input data cannot contain one.
 *
 * @param ids - Parts of the decomposition being walked.
 * @param hanzi - Character that is being indexed.
 */
function genInverted(ids: string[], hanzi: string): void {
  // Skip decompositions whose first part is "&" (presumably the remains of an
  // entity reference - confirm against the data).
  if (ids[0] === "&") {
    return;
  }
  // Skip identity entries such as '一':'一'.
  if (ids.length === 1) {
    return;
  }

  for (const idsPart of ids) {
    if (isIDC(idsPart)) {
      continue;
    }
    if (!inverted[depth]) {
      inverted[depth] = {};
    }
    if (!inverted[depth][idsPart]) {
      inverted[depth][idsPart] = [];
    }
    inverted[depth][idsPart].push(hanzi);

    if (idsObj[idsPart] && idsPart !== hanzi) {
      depth++;
      genInverted(idsObj[idsPart], hanzi);
      depth--;
    }
  }
}

/**
 * Downloads the GlyphWiki dump and, when `dump_newest_only.txt` is present,
 * writes `data/gw_ids.json` (names of rows that look like `uXXXX-uXXXX...`)
 * and `data/gw_all.json` (every three-cell row).
 */
async function buildGlyphWikiData(): Promise<void> {
  console.log(chalk.blue("Downloading GlyphWiki..."));
  await download(
    GLYPHWIKI_DUMP_URL,
    DOWNLOAD_GLYPHWIKI_DUMP_TO,
    DOWNLOAD_OPTIONS
  );

  const dumpPath = DOWNLOAD_GLYPHWIKI_DUMP_TO + "/dump_newest_only.txt";
  if (!existsSync(dumpPath)) {
    return;
  }
  console.log(chalk.green("Done!"));
  console.log(chalk.blue("Making GlyphWiki database..."));

  const lines = readFileSync(dumpPath, "utf8").split("\n");
  // Matched against the raw line: a leading space, then two "u" + 4-5 hex
  // digit groups joined by "-".
  const idsNamePattern = /^ u[\da-f]{4,5}-u[\da-f]{4,5}/;

  for (const line of lines) {
    const cells = line.split("|").map((cell) => cell.trim());
    // Full glyph table: keep every row that has exactly three cells.
    if (cells.length === 3) {
      glyphwiki_all.push({
        name: cells[0],
        related: cells[1],
        data: cells[2],
      });
    }
    // IDS index: keep only the glyph name.
    if (idsNamePattern.test(line)) {
      glyphwiki_ids_index.push(cells[0]);
    }
  }

  writeOutJsonFile(glyphwiki_ids_index, "data/gw_ids.json");
  writeOutJsonFile(glyphwiki_all, "data/gw_all.json");
  console.log(chalk.green("Done!"));
}

/**
 * Downloads Unihan and, when `Unihan_IRGSources.txt` is present, writes
 * `data/Strokes.json` from its `kTotalStrokes` rows.
 */
async function buildStrokesData(): Promise<void> {
  console.log(chalk.blue("Downloading Unihan database..."));
  await download(UNIHAN_URL, DOWNLOAD_UNIHAN_TO, DOWNLOAD_OPTIONS);

  const sourcePath = DOWNLOAD_UNIHAN_TO + "/Unihan_IRGSources.txt";
  if (!existsSync(sourcePath)) {
    return;
  }
  console.log(chalk.green("Done!"));
  console.log(chalk.blue("Making Unihan database..."));

  const records = parse(readFileSync(sourcePath, "utf8"), CSV_OPTIONS);
  for (const record of records) {
    if (record[1] !== "kTotalStrokes") {
      continue;
    }
    const unicodeString = record[0];
    const totalStrokes = record[2];
    // Drop the two-character prefix of the first column and read the rest as
    // a hexadecimal code point.
    const codePoint = parseInt(unicodeString.slice(2), 16);
    // NOTE(review): the value is stored exactly as the parser returns it (no
    // numeric conversion happens here) although STOKESOBJ declares `number`.
    // Left unchanged so the format of Strokes.json stays the same.
    strokesObj[String.fromCodePoint(codePoint)] = totalStrokes; // strokesObj['一']=1
  }

  writeOutJsonFile(strokesObj, "data/Strokes.json");
  console.log(chalk.green("Done!"));
}

/**
 * Downloads cjkvi-ids and, when both `ids.txt` and `ids-ext-cdef.txt` are
 * present, writes `data/cjkvi.json` mapping column 1 to column 2 of each row.
 */
async function buildCjkviData(): Promise<void> {
  console.log(chalk.blue("Downloading cjkvi-ids..."));
  await download(CJKVI_IDS_URL, DOWNLOAD_CJKVIIDS_TO, DOWNLOAD_OPTIONS);

  const basicPath = DOWNLOAD_CJKVIIDS_TO + "/cjkvi-ids-master/ids.txt";
  const extPath = DOWNLOAD_CJKVIIDS_TO + "/cjkvi-ids-master/ids-ext-cdef.txt";
  if (!existsSync(basicPath) || !existsSync(extPath)) {
    return;
  }
  console.log(chalk.green("Converting data..."));

  const basicRecords = parse(readFileSync(basicPath, "utf8"), CSV_OPTIONS);
  const extRecords = parse(readFileSync(extPath, "utf8"), CSV_OPTIONS);

  // Basic file first, extension file second, so extension rows win on clashes.
  for (const record of basicRecords) {
    cjkviObj[record[1]] = record[2];
  }
  for (const record of extRecords) {
    cjkviObj[record[1]] = record[2];
  }

  writeOutJsonFile(cjkviObj, "data/cjkvi.json");
  console.log(chalk.green("Done!"));
}

/**
 * Downloads the CHISE IDS archive, fills `idsObj` from every `IDS-UCS-*`
 * file, builds the depth-indexed inverted index and writes
 * `data/inverted_ids_first_level.json`, `data/inverted_ids_remaining.json`
 * and `data/inverted_ids_all.json`.
 *
 * Throws (via `readdirSync`) when the extracted `ids-master` directory is
 * missing.
 */
async function buildInvertedIdsData(): Promise<void> {
  console.log(chalk.blue("Downloading CHISE..."));
  await download(CHISE_IDS_URL, DOWNLOAD_CHISEIDS_TO, DOWNLOAD_OPTIONS);
  console.log(chalk.green("Done!"));

  const chiseFileList = readdirSync(DOWNLOAD_CHISEIDS_TO + "/ids-master");
  let rawChiseData = "";
  console.log(chalk.blue("Making raw data..."));
  for (const file of chiseFileList) {
    if (!file.match(/^IDS-UCS-.+/)) {
      continue;
    }
    console.log("Found ", file);
    const fileText = readFileSync(
      DOWNLOAD_CHISEIDS_TO + "/ids-master/" + file,
      "utf8"
    );
    // Drop the first line of each file before concatenating.
    rawChiseData += fileText.substring(fileText.indexOf("\n") + 1);
  }
  console.log(chalk.green("Done!"));

  // "&...;" entity references. The non-global copy is used with test() so no
  // lastIndex state carries over from one record to the next.
  const entityRef = /&[^;]+;/;
  const entityRefAll = /&[^;]+;/g;
  const idcAll = /[⿰⿱⿲⿳⿴⿵⿶⿷⿸⿹⿺⿻]/g;

  const chiseRecords = parse(rawChiseData, CSV_OPTIONS);
  for (const record of chiseRecords) {
    const hanzi = record[1];
    const rawIds = record[2];
    // relax_column_count allows short rows; skip those without both columns.
    if (typeof hanzi !== "string" || typeof rawIds !== "string") {
      continue;
    }
    // Entries whose head is itself an entity reference are not indexed.
    if (entityRef.test(hanzi)) {
      continue;
    }
    const ids = rawIds.replace(idcAll, "").replace(entityRefAll, "");
    idsObj[hanzi] = fixSurrogate(ids);
  }

  console.log(chalk.blue("Making inverted IDS data: inverted_ids.json"));
  for (const hanzi in idsObj) {
    genInverted(idsObj[hanzi], hanzi);
  }

  const inverted_ids_first_level = inverted[0];
  const inverted_ids_remaining: Record<string, INVERTEDIDS> = {};
  let inverted_ids_all: INVERTEDIDS = {};

  for (const key in inverted) {
    const level = inverted[key];
    if (key !== "0") {
      inverted_ids_remaining[key] = level;
    }
    // Merge every depth into one index; arrays under the same key are
    // concatenated, anything else falls back to lodash's default merge.
    inverted_ids_all = mergeWith(
      inverted_ids_all,
      level,
      (a: unknown, b: unknown) =>
        isArray(a) && isArray(b) ? a.concat(b) : undefined
    );
  }

  writeOutJsonFile(
    inverted_ids_first_level,
    "data/inverted_ids_first_level.json"
  );
  writeOutJsonFile(inverted_ids_remaining, "data/inverted_ids_remaining.json");
  writeOutJsonFile(inverted_ids_all, "data/inverted_ids_all.json");
  console.log(chalk.green("Done"));
}

/**
 * Runs every build step in sequence. Each step checks for its own input
 * files, so a missing Unihan file no longer suppresses the cjkvi-ids and
 * CHISE steps.
 */
async function main(): Promise<void> {
  await buildGlyphWikiData();
  await buildStrokesData();
  await buildCjkviData();
  await buildInvertedIdsData();
}

main().catch((error: unknown) => {
  // Report the failure and make the process exit with a failing status.
  console.error(chalk.red("Build failed:"), error);
  process.exitCode = 1;
});