@harryjwang/simplewordcloud
Version:
A simple word cloud generator supporting English and Chinese text
81 lines • 3.25 kB
JavaScript
;
/**
 * Re-export helper: defines a property on `target` that mirrors `source[key]`.
 *
 * The property is named `alias`, or `key` when `alias` is undefined. When the
 * source property has no descriptor, is a writable/configurable data property,
 * or is an accessor on a source that is not flagged `__esModule`, an enumerable
 * getter that reads `source[key]` on every access is installed. Otherwise the
 * source's own descriptor is reused unchanged.
 *
 * An already-present `this.__createBinding` takes precedence, and a plain
 * assignment is used when `Object.create` is unavailable.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessor property: only trust it when the source is an ES module.
            needsGetter = !source.__esModule;
        }
        else {
            // Data property: a mutable value must be read lazily to stay current.
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; }
            };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, key, alias) {
        target[alias === undefined ? key : alias] = source[key];
    });
/**
 * Attaches `value` to `target` under the "default" key.
 *
 * Uses an enumerable property definition when `Object.create` exists and a
 * plain assignment otherwise. An already-present `this.__setModuleDefault`
 * takes precedence over both.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        var defaultDescriptor = { enumerable: true, value: value };
        Object.defineProperty(target, "default", defaultDescriptor);
    }
    : function (target, value) {
        target["default"] = value;
    });
/**
 * Namespace-import helper.
 *
 * Returns `mod` untouched when it carries a truthy `__esModule` flag. Otherwise
 * builds a fresh object that re-exports every own property of `mod` except
 * "default" (via `__createBinding`) and stores `mod` itself as the "default"
 * export (via `__setModuleDefault`). A null/undefined `mod` yields an object
 * holding only the "default" entry.
 *
 * An already-present `this.__importStar` takes precedence.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Chosen on first use: the native lister when present, else the for-in fallback.
    var keyLister;
    function listOwnKeysFallback(obj) {
        var names = [];
        for (var name in obj) {
            if (Object.prototype.hasOwnProperty.call(obj, name)) {
                names[names.length] = name;
            }
        }
        return names;
    }
    function ownKeys(obj) {
        if (!keyLister) {
            keyLister = Object.getOwnPropertyNames || listOwnKeysFallback;
        }
        return keyLister(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] !== "default") {
                    __createBinding(namespace, mod, keys[idx]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Mark the exports object with the `__esModule` flag (the same flag
// `__importStar` checks before returning a module unchanged).
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; the class is assigned to it after its definition.
exports.ChineseTokenizer = void 0;
// Load jieba-wasm through the interop helper so it is usable as a namespace object.
const jieba = __importStar(require("jieba-wasm"));
/**
 * Chinese text tokenizer using the jieba-wasm library.
 *
 * Segments text with `jieba.cut` and turns the result into a word-frequency
 * map, dropping stopwords and other tokens that carry no meaning in a word
 * cloud (whitespace, single characters, plain numbers).
 */
class ChineseTokenizer {
    constructor() {
        // jieba-wasm doesn't need explicit initialization in newer versions
        // Common Chinese stopwords. Each word is listed in its Traditional form
        // and, where the spelling differs, in its Simplified form as well, so
        // that text in either script is filtered the same way.
        this.stopwords = new Set([
            // Traditional (and forms shared by both scripts)
            '的', '了', '和', '是', '就', '都', '而', '及', '與', '著',
            '或', '一個', '沒有', '我們', '你們', '他們', '她們', '自己',
            '之', '在', '也', '因', '此', '但', '並', '個', '其',
            '已', '無', '小', '大', '中', '上', '下', '不', '為', '以',
            '於', '對', '她', '他', '你', '我', '們', '可以', '這',
            '那', '到', '由', '這個', '那個', '從', '最', '所', '它',
            // Simplified counterparts
            '与', '着', '一个', '没有', '我们', '你们', '他们', '她们',
            '并', '个', '无', '为', '于', '对', '们', '这', '这个', '那个', '从'
        ]);
    }
    /**
     * Tokenize Chinese text into words and count their frequencies.
     *
     * A token is ignored when it is a stopword, consists only of whitespace,
     * is at most one character long (measured in Unicode code points, so a
     * lone emoji or rare CJK ideograph counts as one character), or is made
     * up solely of ASCII digits.
     *
     * @param text The input Chinese text
     * @returns A map of words to their frequencies, in first-seen order
     */
    tokenize(text) {
        // Cut the text into words
        const tokens = jieba.cut(text);
        const digitsOnly = /^\d+$/;
        const wordCounts = new Map();
        for (const token of tokens) {
            if (this.stopwords.has(token)) {
                continue;
            }
            // Multi-unit whitespace such as "\r\n" would otherwise pass the
            // length check below and show up as a "word".
            if (token.trim() === '') {
                continue;
            }
            // Array.from splits by code point; `token.length` would report 2
            // for a single character outside the Basic Multilingual Plane.
            if (Array.from(token).length <= 1) {
                continue;
            }
            if (digitsOnly.test(token)) {
                continue;
            }
            // Stored counts start at 1, so `|| 0` only applies to unseen words.
            wordCounts.set(token, (wordCounts.get(token) || 0) + 1);
        }
        return wordCounts;
    }
}
// Expose the class on the CommonJS exports object.
exports.ChineseTokenizer = ChineseTokenizer;
//# sourceMappingURL=chinese.js.map