// @willh/s2t
// Version:
// 簡體轉繁體 CLI 工具 (Simplified-to-Traditional Chinese conversion CLI tool)
// 220 lines (189 loc) • 7.81 kB
// JavaScript
const https = require('https');
const fs = require('fs');
const path = require('path');
const os = require('os');
// Remote location of the OpenCC STCharacters.txt dictionary that this script downloads.
const url = 'https://raw.githubusercontent.com/BYVoid/OpenCC/refs/heads/master/data/dictionary/STCharacters.txt';
// Where the downloaded dictionary is stored: next to this script.
const localFilePath = path.join(__dirname, 'STCharacters.txt');
// Where the generated JavaScript file is written: next to this script.
const outputFilePath = path.join(__dirname, 'STCharacters.js');
/**
 * Escapes a string so it can be placed between double quotes in generated
 * JavaScript source.
 *
 * Backslashes, double quotes, line feeds, carriage returns and tabs are
 * replaced by their backslash escape sequences; every other character is
 * passed through unchanged.
 *
 * @param {string} str - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeString(str) {
  const replacements = {
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
  };
  // One pass is enough: each special character maps to its escape
  // independently of the others.
  return str.replace(/[\\"\n\r\t]/g, (special) => replacements[special]);
}
/**
 * Downloads `url` over HTTPS and stores the response body in `dest`.
 *
 * The destination file is opened only after a 200 response has arrived, so a
 * failed request (bad URL, network error, non-200 status) never truncates or
 * deletes a file left behind by an earlier successful run. If the transfer
 * fails part-way through, the partial file is removed before the promise
 * rejects.
 *
 * @param {string} url - HTTPS URL to fetch. Redirects are not followed; any
 *   status other than 200 is treated as a failure.
 * @param {string} dest - Path of the file to write.
 * @returns {Promise<void>} Resolves once the body has been written and the
 *   file closed; rejects with the underlying error otherwise.
 */
function downloadFile(url, dest) {
  return new Promise((resolve, reject) => {
    const request = https.get(url, (response) => {
      if (response.statusCode !== 200) {
        // Drain the unread body so the underlying socket is released.
        response.resume();
        reject(new Error(`Failed to get '${url}' (${response.statusCode})`));
        return;
      }

      const file = fs.createWriteStream(dest);
      let failed = false;

      // Stops the transfer, removes the partial file and rejects exactly once.
      const abort = (err) => {
        if (failed) return;
        failed = true;
        response.unpipe(file);
        response.resume();
        file.destroy();
        // Best-effort cleanup: the original failure is what the caller needs.
        fs.unlink(dest, () => reject(err));
      };

      // Without an 'error' listener a failed write would crash the process.
      file.on('error', abort);
      response.on('error', abort);
      // pipe() does not end the destination when the source dies early, so a
      // dropped connection must be detected here or the promise never settles.
      response.on('close', () => {
        if (!response.complete) {
          abort(new Error(`Connection closed before '${url}' was fully downloaded`));
        }
      });
      file.on('finish', () => {
        file.close((closeErr) => {
          if (closeErr) {
            abort(closeErr);
          } else {
            resolve();
          }
        });
      });

      response.pipe(file);
    });

    request.on('error', reject);
  });
}
/**
 * Splits the raw dictionary text into the character groups used by the
 * generated module.
 *
 * Each non-blank line is expected to be `simplified<TAB>traditional`, where
 * the traditional part holds several space-separated candidates when the
 * mapping is one-to-many.
 *
 * @param {string} fileContent - Full text of the downloaded dictionary.
 * @returns {{sChars: string, tChars: string, mChars: string, rsChars: string, rtChars: string}}
 *   `sChars`/`tChars` are index-aligned one-to-one pairs, `mChars` are
 *   simplified characters with several traditional candidates, `rsChars` /
 *   `rtChars` are simplified / traditional characters outside the BMP.
 */
function parseDictionary(fileContent) {
  const groups = { sChars: '', tChars: '', mChars: '', rsChars: '', rtChars: '' };

  // Accept CRLF as well as LF so a stray '\r' never ends up inside tChars.
  for (const line of fileContent.split(/\r?\n/)) {
    if (line.trim() === '') continue; // Skip empty or whitespace-only lines

    const parts = line.split('\t');
    if (parts.length < 2) continue;

    const sChar = parts[0];
    const tPart = parts[1];

    if (tPart.includes(' ')) {
      groups.mChars += sChar;
      continue;
    }
    if (isDifficultChar(sChar)) {
      groups.rsChars += sChar;
      continue;
    }
    if (isDifficultChar(tPart)) {
      groups.rtChars += tPart;
      continue;
    }
    // The generated converter looks characters up by UTF-16 index, so both
    // sides must be exactly one code unit or every later pair would shift.
    if (sChar.length !== 1 || tPart.length !== 1) {
      console.warn(`Skipping malformed dictionary entry: ${JSON.stringify(line)}`);
      continue;
    }

    groups.sChars += sChar;
    groups.tChars += tPart;
  }

  return groups;
}

/**
 * Builds the source text of the generated module: the five character
 * constants followed by the lookup and conversion helpers.
 *
 * The helper code lives in an unindented template literal because it is
 * written to the output file verbatim.
 *
 * @param {{sChars: string, tChars: string, mChars: string, rsChars: string, rtChars: string}} groups
 *   Character groups as returned by `parseDictionary`.
 * @returns {string} Complete JavaScript source of the generated module.
 */
function renderModuleSource({ sChars, tChars, mChars, rsChars, rtChars }) {
  return `// Generated by script on ${new Date().toISOString()}\n` +
    `// 簡體字元\n` +
    `const sChars = "${escapeString(sChars)}";\n` +
    `// 繁體字元\n` +
    `const tChars = "${escapeString(tChars)}";\n` +
    `// 一簡對多繁的簡體字\n` +
    `const mChars = "${escapeString(mChars)}";\n` +
    `// 簡體字中的罕見字\n` +
    `const rsChars = "${escapeString(rsChars)}";\n` +
    `// 繁體字中的罕見字\n` +
    `const rtChars = "${escapeString(rtChars)}";\n\n` +
    `
/**
* 檢查 text 是否包含任何 mChars 字元
* @param {*} text
* @returns {Boolean}
*/
function isContainsMultipleChars(text) {
return [...text].some(char => mChars.includes(char));
}
/**
* 檢查 text 是否包含任何 mChars 字元
* @param {*} text
* @returns {Boolean}
*/
function isContainsSimplifiedChinese(text) {
return [...text].some(char => sChars.includes(char) || mChars.includes(char));
}
/**
* 將文字轉換為繁體中文
* 此函數依賴於已在外部定義的 sChars 和 tChars 變數。
* @param {String} text 輸入文字
* @returns {String} 轉換後的繁體中文文字
*/
function convertToTraditionalChinese(text) {
if (!text || typeof text !== 'string') {
return text; // Return original if not a string or empty
}
// 檢查 sChars 和 tChars 是否已定義且長度相同
// 這些變數應由使用者在腳本的全域範圍或此函數可訪問的範圍內定義
if (typeof sChars === 'undefined' || typeof tChars === 'undefined') {
const errorMsg = '[convertToTraditionalChinese] 全域變數 sChars 或 tChars 未定義,無法進行轉換。請在腳本中定義它們。';
console.warn(errorMsg);
// 若 DEBUG 模式開啟,也將此警告寫入試算表日誌
if (typeof DEBUG !== 'undefined' && DEBUG && typeof writeLogToSheet === 'function') writeLogToSheet(errorMsg);
return text; // 返回原始文本,不進行轉換
}
if (sChars.length !== tChars.length) {
const errorMsg = \`[convertToTraditionalChinese] 全域變數 sChars (長度 \${sChars.length}) 與 tChars (長度 \${tChars.length}) 長度不同,無法進行轉換。\`;
console.warn(errorMsg);
if (typeof DEBUG !== 'undefined' && DEBUG && typeof writeLogToSheet === 'function') writeLogToSheet(errorMsg);
return text; // 返回原始文本
}
let result = "";
for (let i = 0; i < text.length; i++) {
const char = text[i];
const index = sChars.indexOf(char);
if (index !== -1) {
result += tChars[index];
} else {
result += char; // Keep original character if not found in the map
}
}
return result;
}
module.exports = {
isContainsMultipleChars,
isContainsSimplifiedChinese,
convertToTraditionalChinese
};
`;
}

/**
 * Downloads the dictionary, parses it and writes the generated module to
 * `outputFilePath`.
 *
 * Errors are logged rather than rethrown, and the process exit code is set to
 * 1 so that shells and CI pipelines can detect the failure.
 *
 * @returns {Promise<void>} Resolves when the run has finished, successfully or not.
 */
async function main() {
  try {
    console.log(`Downloading ${url} to ${localFilePath}...`);
    await downloadFile(url, localFilePath);
    console.log(`File downloaded successfully to ${localFilePath}`);

    console.log(`Reading and parsing ${localFilePath}...`);
    const fileContent = fs.readFileSync(localFilePath, 'utf8');
    const groups = parseDictionary(fileContent);
    console.log("Parsing complete.");

    const jsContent = renderModuleSource(groups);
    console.log(`Writing output to ${outputFilePath}...`);
    fs.writeFileSync(outputFilePath, jsContent, 'utf8');
    console.log(`Output successfully generated at ${outputFilePath}`);
  } catch (error) {
    console.error("An error occurred:", error);
    // Report the failure to the caller's shell instead of exiting with 0.
    process.exitCode = 1;
  }
}
/**
 * Tells whether the first character of `char` lies outside the Basic
 * Multilingual Plane.
 *
 * A code point above 0xFFFF needs a surrogate pair (two UTF-16 code units),
 * so such a character has `length` 2 when obtained through `Array.from()` or
 * `for...of`. Checking the code point itself is the more fundamental test.
 *
 * @param {string} char - String whose first code point is inspected.
 * @returns {boolean} `true` when that code point is greater than 0xFFFF.
 */
function isDifficultChar(char) {
  const highestBmpCodePoint = 0xFFFF;
  const firstCodePoint = char.codePointAt(0);
  return firstCodePoint > highestBmpCodePoint;
}
/**
 * Removes every position at which either string holds a "difficult"
 * character (as decided by `isDifficultChar`), keeping the two strings
 * aligned.
 *
 * Both inputs are walked code point by code point. Positions beyond the end
 * of the shorter string are dropped, because pairs are always removed or
 * kept together. Each removed pair is reported on the console.
 *
 * @param {string} sStr - Source characters.
 * @param {string} tStr - Characters corresponding to `sStr` by position.
 * @returns {{s: string, t: string}} The filtered strings.
 */
function processStrings(sStr, tStr) {
  // Array.from splits by code point, so surrogate pairs stay intact.
  const sourcePoints = Array.from(sStr);
  const targetPoints = Array.from(tStr);
  const pairCount = Math.min(sourcePoints.length, targetPoints.length);

  const keptSource = [];
  const keptTarget = [];

  sourcePoints.slice(0, pairCount).forEach((sourceChar, position) => {
    const targetChar = targetPoints[position];
    const sourceIsDifficult = isDifficultChar(sourceChar);
    const targetIsDifficult = isDifficultChar(targetChar);

    if (!sourceIsDifficult && !targetIsDifficult) {
      keptSource.push(sourceChar);
      keptTarget.push(targetChar);
      return;
    }

    // Either side being difficult drops the whole pair.
    console.log(`移除對應字元: '${sourceChar}' (難字: ${sourceIsDifficult}) 和 '${targetChar}' (難字: ${targetIsDifficult})`);
  });

  return {
    s: keptSource.join(''),
    t: keptTarget.join(''),
  };
}
// Start the download-and-generate run as soon as this script is loaded.
main();