idsfind
Version:
A tool that can search for Chinese characters by their components and remaining stroke count
286 lines (258 loc) • 8.29 kB
text/typescript
import { existsSync, readFileSync, writeFileSync, readdirSync } from "fs";
import download from "download";
import { parse } from "csv-parse/sync";
import chalk from "chalk";
import { mergeWith, isArray } from "lodash";
/** Maps a character to its decomposition, stored as one array element per component. */
interface IDSOBJ {
[hanzi: string]: string[];
}
/**
 * Maps a character to its total stroke count (the Unihan `kTotalStrokes` field).
 * NOTE(review): the name looks like a typo for "STROKESOBJ"; it is kept because
 * other declarations in the file refer to it.
 */
interface STOKESOBJ {
[hanzi: string]: number;
}
/** Inverted index: maps a component to the characters recorded as containing it. */
interface INVERTEDIDS {
[ids: string]: string[];
}
/**
 * One inverted index per recursion depth. Depth 0 holds a character's direct
 * components; deeper levels hold components of components.
 */
interface ALLINVERTEDIDS {
[depth: number]: INVERTEDIDS;
}
/**
 * Maps a character to the third column of a cjkvi-ids record
 * (presumably its IDS string — confirm against the data files).
 */
interface CJKVI_IDS {
[char: string]: string;
}
/** One row of the GlyphWiki dump: the three `|`-separated cells of a line, trimmed. */
interface Glyph {
name: string;
related: string;
data: string;
}
// Remote archives fetched by the build script, each paired with the local
// directory it is downloaded (and extracted) into.

// Unihan database archive; the script reads Unihan_IRGSources.txt from it.
const UNIHAN_URL = "https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip";
const DOWNLOAD_UNIHAN_TO = "data/unihan";
// CHISE IDS repository archive; the script reads its IDS-UCS-* files.
const CHISE_IDS_URL =
"https://gitlab.chise.org/CHISE/ids/-/archive/master/ids-master.zip";
const DOWNLOAD_CHISEIDS_TO = "data/chise-ids";
// cjkvi-ids repository archive; the script reads ids.txt and ids-ext-cdef.txt.
const CJKVI_IDS_URL =
"https://github.com/toyjack/cjkvi-ids/archive/refs/heads/master.zip";
const DOWNLOAD_CJKVIIDS_TO = "data/cjkvi-ids";
// GlyphWiki dump archive; the script reads dump_newest_only.txt from it.
// NOTE(review): this is the only URL using plain http — confirm whether https is available.
const GLYPHWIKI_DUMP_URL = "http://glyphwiki.org/dump.tar.gz";
const DOWNLOAD_GLYPHWIKI_DUMP_TO = "data/glyphwiki";
// Options passed to every `download` call: unpack the archive after fetching it.
const DOWNLOAD_OPTIONS = {
extract: true,
};
// Options passed to every csv-parse `parse` call: tab-separated input with
// "#" comments; blank lines are skipped and rows may have differing column counts.
const CSV_OPTIONS = {
comment: "#",
delimiter: "\t",
skip_empty_lines: true,
relax_column_count: true,
};
// Module-level state filled in by the build script below.

// Inverted component index, keyed by recursion depth; written by genInverted.
const inverted: ALLINVERTEDIDS = {};
// Current recursion depth of genInverted; incremented before and decremented
// after each recursive call, so it is 0 whenever no call is in progress.
let depth = 0;
// Character -> total stroke count, collected from Unihan kTotalStrokes records.
const strokesObj: STOKESOBJ = {};
// Character -> list of components, collected from the CHISE IDS files.
const idsObj: IDSOBJ = {};
// Character -> third column of each cjkvi-ids record.
const cjkviObj: CJKVI_IDS = {};
// Names of GlyphWiki glyphs whose dump line matches the `uXXXX-uXXXX` pattern.
const glyphwiki_ids_index: string[] = [];
// Every three-cell row of the GlyphWiki dump.
const glyphwiki_all: Glyph[] = [];
/**
 * Reports whether `part` begins with an Ideographic Description Character.
 *
 * Only the first code point of `part` is inspected; it counts as an IDC when it
 * lies in the inclusive range U+2FF0–U+2FFF.
 *
 * @param part String whose first code point is tested.
 * @returns `true` for an IDC, `false` otherwise (including the empty string).
 */
function isIDC(part: string) {
  const firstCodePoint = part.codePointAt(0);
  return (
    firstCodePoint !== undefined &&
    firstCodePoint >= 0x2ff0 &&
    firstCodePoint <= 0x2fff
  );
}
/**
 * Serialises `jsonData` with `JSON.stringify` (compact, no indentation) and
 * writes the result synchronously to `fileName` as UTF-8.
 *
 * @param jsonData Value to serialise.
 * @param fileName Path of the file to write.
 */
function writeOutJsonFile(jsonData: any, fileName: string) {
  writeFileSync(fileName, JSON.stringify(jsonData), { encoding: "utf-8" });
}
/**
 * Splits an IDS string into an array with one element per Unicode code point,
 * so that characters outside the BMP (stored as surrogate pairs) stay whole.
 *
 * Iteration is by code point rather than by UTF-16 unit. A well-formed
 * surrogate pair becomes a single element; an unpaired surrogate is kept as
 * its own element instead of being glued to whatever follows it (or to the
 * literal text "undefined" when it is the last unit of the string).
 *
 * @param idsString String to split.
 * @returns The code points of `idsString`, in order; `[]` for the empty string.
 */
function fixSurrogate(idsString: string) {
  // Array.from uses the string iterator, which walks code points.
  return Array.from(idsString);
}
/**
 * Records `hanzi` under every component in `ids`, then descends into each
 * component's own decomposition so that indirect components are recorded too.
 *
 * Works on module state: it reads `idsObj`, appends to `inverted[depth]`, and
 * raises the shared `depth` counter around each recursive call, restoring it
 * afterwards. Entries are appended without de-duplication.
 *
 * NOTE(review): recursion is only stopped when a component equals `hanzi`; a
 * cycle among other components in `idsObj` would not terminate — confirm the
 * data cannot contain one.
 *
 * @param ids Components to index (one element per component).
 * @param hanzi Character that the components belong to.
 */
function genInverted(ids: string[], hanzi: string) {
  // Nothing to index for sequences that start with "&" or that consist of a
  // single element (entries such as '一': '一').
  const isSingleElement = ids.length === 1;
  if (isSingleElement || ids[0] == "&") {
    return;
  }

  for (const component of ids) {
    if (!isIDC(component)) {
      // Create the table for this depth and the bucket for this component on demand.
      const level = inverted[depth] || (inverted[depth] = {});
      const owners = level[component] || (level[component] = []);
      owners.push(hanzi);

      // Descend when the component itself has a decomposition, except when it
      // is the character being indexed.
      const decomposition = idsObj[component];
      if (decomposition && component != hanzi) {
        depth += 1;
        genInverted(decomposition, hanzi);
        depth -= 1;
      }
    }
  }
}
/**
 * Build script: downloads the upstream datasets and writes the derived JSON
 * files under `data/`.
 *
 * Steps, in order:
 *  1. GlyphWiki dump -> data/gw_ids.json, data/gw_all.json
 *  2. Unihan         -> data/Strokes.json
 *  3. cjkvi-ids      -> data/cjkvi.json
 *  4. CHISE IDS      -> data/inverted_ids_first_level.json,
 *                       data/inverted_ids_remaining.json,
 *                       data/inverted_ids_all.json
 *
 * Steps 3 and 4 sit inside the "Unihan file exists" branch, so they are skipped
 * when Unihan_IRGSources.txt is missing after the download.
 *
 * A rejected download or any thrown error is reported on stderr and sets a
 * non-zero exit code instead of surfacing as an unhandled rejection.
 */
(async () => {
  // ---- 1. GlyphWiki -------------------------------------------------------
  console.log(chalk.blue("Downloading GlyphWiki..."));
  await download(
    GLYPHWIKI_DUMP_URL,
    DOWNLOAD_GLYPHWIKI_DUMP_TO,
    DOWNLOAD_OPTIONS
  );
  if (existsSync(DOWNLOAD_GLYPHWIKI_DUMP_TO + "/dump_newest_only.txt")) {
    console.log(chalk.green("Done!"));
    console.log(chalk.blue("Making GlyphWiki database..."));
    const content = readFileSync(
      DOWNLOAD_GLYPHWIKI_DUMP_TO + "/dump_newest_only.txt",
      "utf8"
    );
    const lines = content.split("\n");
    // Matches raw (untrimmed) lines that start with a space followed by a
    // name of the form `uXXXX-uXXXX` (4–5 lowercase hex digits each).
    const regexp = /^ u[\da-f]{4,5}-u[\da-f]{4,5}/;
    for (const line of lines) {
      // make glyphwiki all data json
      const cells = line.split("|").map((e) => e.trim());
      if (cells.length === 3) {
        glyphwiki_all.push({
          name: cells[0],
          related: cells[1],
          data: cells[2],
        });
      }
      // make ids index
      if (regexp.test(line)) {
        glyphwiki_ids_index.push(cells[0]);
      }
    }
    writeOutJsonFile(glyphwiki_ids_index, "data/gw_ids.json");
    writeOutJsonFile(glyphwiki_all, "data/gw_all.json");
    console.log(chalk.green("Done!"));
  }

  // ---- 2. Unihan ----------------------------------------------------------
  console.log(chalk.blue("Downloading Unihan database..."));
  await download(UNIHAN_URL, DOWNLOAD_UNIHAN_TO, DOWNLOAD_OPTIONS);
  // NOTE(review): every remaining step is nested inside this branch, so nothing
  // below runs (and nothing is logged) when the Unihan file is missing —
  // confirm that this is intended.
  if (existsSync(DOWNLOAD_UNIHAN_TO + "/Unihan_IRGSources.txt")) {
    console.log(chalk.green("Done!"));
    console.log(chalk.blue("Making Unihan database..."));
    const content = readFileSync(
      DOWNLOAD_UNIHAN_TO + "/Unihan_IRGSources.txt",
      "utf8"
    );
    const records = parse(content, CSV_OPTIONS);
    for (const record of records) {
      if (record[1] === "kTotalStrokes") {
        const unicodeString = record[0];
        const totalStrokes = record[2];
        // Drop the two-character prefix and read the rest as a hexadecimal
        // code point.
        const unicode = parseInt(unicodeString.slice(2), 16);
        // NOTE(review): the value is stored exactly as parsed from the file,
        // with no numeric conversion — confirm consumers expect that.
        strokesObj[String.fromCodePoint(unicode)] = totalStrokes; // strokesObj['一']=1
      }
    }
    writeOutJsonFile(strokesObj, "data/Strokes.json");
    console.log(chalk.green("Done!"));

    // ---- 3. cjkvi-ids -----------------------------------------------------
    console.log(chalk.blue("Downloading cjkvi-ids..."));
    await download(CJKVI_IDS_URL, DOWNLOAD_CJKVIIDS_TO, DOWNLOAD_OPTIONS);
    if (
      existsSync(DOWNLOAD_CJKVIIDS_TO + "/cjkvi-ids-master/ids.txt") &&
      existsSync(DOWNLOAD_CJKVIIDS_TO + "/cjkvi-ids-master/ids-ext-cdef.txt")
    ) {
      console.log(chalk.green("Converting data..."));
      const ids_basic = readFileSync(
        DOWNLOAD_CJKVIIDS_TO + "/cjkvi-ids-master/ids.txt",
        "utf8"
      );
      const ids_cdef = readFileSync(
        DOWNLOAD_CJKVIIDS_TO + "/cjkvi-ids-master/ids-ext-cdef.txt",
        "utf8"
      );
      const ids_basic_records = parse(ids_basic, CSV_OPTIONS);
      const ids_cdef_records = parse(ids_cdef, CSV_OPTIONS);
      // Second column is the key, third column the value; records from the
      // second file overwrite records from the first on key collisions.
      for (const record of ids_basic_records) {
        cjkviObj[record[1]] = record[2];
      }
      for (const record of ids_cdef_records) {
        cjkviObj[record[1]] = record[2];
      }
      writeOutJsonFile(cjkviObj, "data/cjkvi.json");
      console.log(chalk.green("Done!"));
    }

    // ---- 4. CHISE IDS -----------------------------------------------------
    console.log(chalk.blue("Downloading CHISE..."));
    await download(CHISE_IDS_URL, DOWNLOAD_CHISEIDS_TO, DOWNLOAD_OPTIONS);
    console.log(chalk.green("Done!"));
    const chiseFileList = readdirSync(DOWNLOAD_CHISEIDS_TO + "/ids-master");
    let rawChiseData = "";
    console.log(chalk.blue("Making raw data..."));
    for (const file of chiseFileList) {
      if (file.match(/^IDS-UCS-.+/)) {
        console.log("Found ", file);
        let tempData = readFileSync(
          DOWNLOAD_CHISEIDS_TO + "/ids-master/" + file,
          "utf8"
        );
        // Cut the first line of each file before concatenating.
        // ref https://stackoverflow.com/questions/2528076/delete-a-line-of-text-in-javascript
        tempData = tempData.substring(tempData.indexOf("\n") + 1);
        rawChiseData += tempData;
      }
    }
    console.log(chalk.green("Done!"));

    const chiseRecords = parse(rawChiseData, CSV_OPTIONS);
    // Patterns are built once rather than once per record. The copy used with
    // test() is deliberately non-global so that it carries no lastIndex state
    // from one record to the next.
    const hasEntityReference = /&[^;]+;/;
    const allEntityReferences = /&[^;]+;/g;
    const allIdcChars = /[⿰⿱⿲⿳⿴⿵⿶⿷⿸⿹⿺⿻]/g;
    for (const record of chiseRecords) {
      const hanzi = record[1];
      const rawIds = record[2];
      // CSV_OPTIONS sets relax_column_count, so a short row may lack these
      // columns; skip it instead of throwing on `undefined.replace`.
      if (typeof hanzi !== "string" || typeof rawIds !== "string") {
        continue;
      }
      // Skip characters that are themselves written as `&...;` references.
      if (hasEntityReference.test(hanzi)) {
        continue;
      }
      // Strip description characters and `&...;` references, keeping only the
      // component characters.
      const componentsOnly = rawIds
        .replace(allIdcChars, "")
        .replace(allEntityReferences, "");
      idsObj[hanzi] = fixSurrogate(componentsOnly);
    }

    console.log(chalk.blue("Making inverted IDS data: inverted_ids.json"));
    for (const hanzi in idsObj) {
      genInverted(idsObj[hanzi], hanzi);
    }

    const inverted_ids_first_level = inverted[0];
    const inverted_ids_remaining: ALLINVERTEDIDS = {};
    const inverted_ids_all: INVERTEDIDS = {};
    for (const key of Object.keys(inverted)) {
      const level = inverted[Number(key)];
      if (key !== "0") {
        inverted_ids_remaining[Number(key)] = level;
      }
      // Merge every depth into one table, concatenating the character lists of
      // components that appear at several depths. mergeWith mutates and
      // returns its first argument.
      // https://qiita.com/minodisk/items/981c074f12d4d1d7b0d5
      mergeWith(inverted_ids_all, level, (a: unknown, b: unknown) =>
        isArray(a) && isArray(b) ? a.concat(b) : undefined
      );
    }
    writeOutJsonFile(
      inverted_ids_first_level,
      "data/inverted_ids_first_level.json"
    );
    writeOutJsonFile(
      inverted_ids_remaining,
      "data/inverted_ids_remaining.json"
    );
    writeOutJsonFile(inverted_ids_all, "data/inverted_ids_all.json");
    console.log(chalk.green("Done"));
  }
})().catch((err: unknown) => {
  // Report the failure explicitly and make the process exit non-zero.
  console.error(chalk.red("Failed to build data files:"), err);
  process.exitCode = 1;
});