@coffeeandfun/google-profanity-words
Version:
Real profanity words banned by Google, extracted from their hidden API before shutdown. Now available as an easy-to-use Node.js library for content filtering.
469 lines (401 loc) • 17.5 kB
JavaScript
import { ProfanityEngine } from '../index.js';
const language = process.env.LANGUAGE || 'en'; // Default to 'en' if the LANGUAGE environment variable is not set
let profanity;
describe('Chinese (Mandarin) Profanity tests', () => {
beforeAll(async () => {
profanity = new ProfanityEngine({
language: 'zh',
testMode: true,
});
});
afterEach(() => {
profanity.reset();
});
describe('Core Chinese functionality', () => {
it('Should get all the profanity words in an array', async () => {
const allWords = await profanity.all();
expect(Array.isArray(allWords)).toBe(true);
expect(allWords.length).toBeGreaterThan(0);
// Update this number based on your actual Chinese word count
// expect(allWords.length).toEqual(XXX);
});
it('Should return true for Chinese profanity words', async () => {
// Test with a common Chinese profanity word (if it exists in your list)
// Replace 'testword' with an actual word from your Chinese list
const searchWord = await profanity.search('testword');
expect(typeof searchWord).toBe('boolean');
});
it('Should return false for normal Chinese words', async () => {
const normalWords = [
'你好', // Hello (nǐ hǎo)
'谢谢', // Thank you (xiè xiè)
'房子', // House (fáng zi)
'书', // Book (shū)
'水', // Water (shuǐ)
'中文', // Chinese language (zhōng wén)
'电脑', // Computer (diàn nǎo)
'家庭', // Family (jiā tíng)
'朋友', // Friend (péng yǒu)
'学校', // School (xué xiào)
];
for (const word of normalWords) {
expect(await profanity.search(word)).toBe(false);
}
});
it('Should return false for any empty string', async () => {
const searchWord = await profanity.search('');
expect(searchWord).toEqual(false);
});
it('Should return true for a sentence containing a profanity word', async () => {
// Replace with actual Chinese sentence containing profanity from your list
const sentence = '这个句子包含不当词汇。'; // "This sentence contains inappropriate words"
const hasCurseWords = await profanity.hasCurseWords(sentence);
expect(typeof hasCurseWords).toBe('boolean');
});
it('Should return false for a sentence with no profanity word', async () => {
const sentence = '这是一个干净礼貌的句子。'; // "This is a clean and polite sentence"
const hasCurseWords = await profanity.hasCurseWords(sentence);
expect(hasCurseWords).toEqual(false);
});
});
describe('Chinese-specific edge cases', () => {
it('Should handle simplified vs traditional Chinese characters', async () => {
// Test pairs of simplified and traditional characters
const characterPairs = [
['学', '學'], // Study (simplified vs traditional)
['国', '國'], // Country (simplified vs traditional)
['电', '電'], // Electric (simplified vs traditional)
['书', '書'], // Book (simplified vs traditional)
['车', '車'], // Vehicle (simplified vs traditional)
['语', '語'], // Language (simplified vs traditional)
];
for (const [simplified, traditional] of characterPairs) {
const result1 = await profanity.search(simplified);
const result2 = await profanity.search(traditional);
expect(typeof result1).toBe('boolean');
expect(typeof result2).toBe('boolean');
// Both should be clean words
expect(result1).toBe(false);
expect(result2).toBe(false);
}
});
it('Should handle Chinese without word boundaries', async () => {
// Chinese doesn't use spaces between words
const continuousText = '我今天去学校学习中文很开心'; // "I went to school today to study Chinese and was very happy"
const result = await profanity.hasCurseWords(continuousText);
expect(result).toBe(false); // Should be clean text
});
it('Should handle single Chinese characters', async () => {
const singleChars = [
'我', // I/me
'你', // You
'他', // He
'好', // Good
'大', // Big
'小', // Small
'人', // Person
'天', // Day/sky
];
for (const char of singleChars) {
expect(await profanity.search(char)).toBe(false);
}
});
it('Should handle Chinese compound words', async () => {
const compoundWords = [
'电脑', // Computer (electric + brain)
'火车', // Train (fire + vehicle)
'飞机', // Airplane (fly + machine)
'手机', // Cell phone (hand + machine)
'汽车', // Car (steam + vehicle)
'电视', // Television (electric + vision)
];
for (const word of compoundWords) {
expect(await profanity.search(word)).toBe(false);
}
});
it('Should handle Chinese numbers and mixed content', async () => {
const mixedContent = [
'我有3本书', // I have 3 books
'今天是2024年', // Today is 2024
'电话号码123456', // Phone number 123456
'第1章', // Chapter 1
'100元', // 100 yuan
];
for (const text of mixedContent) {
const result = await profanity.hasCurseWords(text);
expect(typeof result).toBe('boolean');
}
});
it('Should handle Chinese punctuation', async () => {
const testSentences = [
'你好!', // Hello!
'你好吗?', // How are you?
'是的,我知道。', // Yes, I know.
'他说:"你好"', // He said: "Hello"
'学习、工作、生活', // Study, work, life
'这是...很好', // This is... very good
];
for (const sentence of testSentences) {
const result = await profanity.hasCurseWords(sentence);
expect(typeof result).toBe('boolean');
}
});
it('Should handle Chinese measure words (classifiers)', async () => {
const measureWords = [
'一本书', // One book (classifier: 本)
'两个人', // Two people (classifier: 个)
'三只猫', // Three cats (classifier: 只)
'四辆车', // Four cars (classifier: 辆)
'五张纸', // Five sheets of paper (classifier: 张)
];
for (const phrase of measureWords) {
expect(await profanity.hasCurseWords(phrase)).toBe(false);
}
});
it('Should handle Chinese tone marks in pinyin (if applicable)', async () => {
// If your system processes pinyin alongside Chinese characters
const pinyinWords = [
'nǐ hǎo', // Hello
'xiè xiè', // Thank you
'duì bù qǐ', // Sorry
'zài jiàn', // Goodbye
];
for (const pinyin of pinyinWords) {
const result = await profanity.search(pinyin);
expect(typeof result).toBe('boolean');
}
});
it('Should handle whitespace around Chinese characters', async () => {
const chineseWord = '你好';
expect(await profanity.search(` ${chineseWord} `)).toBe(false);
expect(await profanity.search(`\t${chineseWord}\n`)).toBe(false);
});
it('Should handle mixed Chinese and English text', async () => {
const mixedSentences = [
'I love 中文', // I love Chinese
'这是English和中文的混合', // This is a mix of English and Chinese
'Hello 世界', // Hello world
'我在学习programming', // I am learning programming
];
for (const sentence of mixedSentences) {
const result = await profanity.hasCurseWords(sentence);
expect(typeof result).toBe('boolean');
}
});
it('Should return unique words only in Chinese text', async () => {
// Test with repeated Chinese words
const sentence = '你好你好你好世界';
const foundWords = await profanity.getCurseWords(sentence);
// Should return unique words only
expect(Array.isArray(foundWords)).toBe(true);
// If '你好' were a profanity word, it should appear only once
});
it('Should handle Chinese regional variations', async () => {
// Different Chinese-speaking regions may have different vocabulary
const regionalWords = [
'出租车', // Taxi (Mainland)
'计程车', // Taxi (Taiwan)
'的士', // Taxi (Hong Kong)
'垃圾', // Garbage (Mainland)
'废物', // Waste (General)
];
for (const word of regionalWords) {
const result = await profanity.search(word);
expect(typeof result).toBe('boolean');
}
});
});
describe('Performance tests for Chinese dataset', () => {
it('Should handle large Chinese text efficiently', async () => {
const largeText = '这是一个测试句子。'.repeat(1000) + '中文文本 ' + '干净的文本。'.repeat(1000);
const startTime = Date.now();
const result = await profanity.hasCurseWords(largeText);
const endTime = Date.now();
expect(typeof result).toBe('boolean');
expect(endTime - startTime).toBeLessThan(100); // Should complete in under 100ms
});
it('Should efficiently search through all Chinese terms', async () => {
const allWords = await profanity.all();
if (allWords.length > 0) {
const startTime = Date.now();
for (let i = 0; i < Math.min(100, allWords.length); i++) {
await profanity.search(allWords[i % allWords.length]);
}
const endTime = Date.now();
expect(endTime - startTime).toBeLessThan(50); // Should be very fast with Set lookup
}
});
it('Should handle concurrent operations on Chinese dataset', async () => {
const promises = [
profanity.search('你好'),
profanity.hasCurseWords('这是中文文本'),
profanity.getCurseWords('中文文本'),
profanity.all(),
profanity.search('谢谢')
];
const results = await Promise.all(promises);
expect(results[0]).toBe(false); // search 你好 (should be clean)
expect(results[1]).toBe(false); // hasCurseWords (should be clean)
expect(Array.isArray(results[2])).toBe(true); // getCurseWords
expect(Array.isArray(results[3])).toBe(true); // all words
expect(results[4]).toBe(false); // search 谢谢 (should be clean)
});
});
describe('Chinese language specificity', () => {
it('Should load Chinese words correctly or fallback to English', async () => {
const allWords = await profanity.all();
expect(allWords.length).toBeGreaterThan(0);
// If Chinese file doesn't exist, should fallback to English (958 words)
// If Chinese file exists, should load Chinese words
});
it('Should handle Chinese character encoding (UTF-8)', async () => {
// Test various Chinese character ranges
const chineseChars = [
'一', '二', '三', '四', '五', // Numbers
'人', '大', '小', '中', '国', // Common characters
'學', '國', '語', '電', '車', // Traditional characters
'龍', '鳳', '麒', '麟', '龜', // Complex characters
];
for (const char of chineseChars) {
const result = await profanity.search(char);
expect(typeof result).toBe('boolean');
}
});
it('Should handle Chinese internet slang and abbreviated forms', async () => {
// Common Chinese internet abbreviations and slang
const internetSlang = [
'886', // Bye bye (sounds like "bā bā liù")
'520', // I love you (sounds like "wǒ ài nǐ")
'88', // Bye bye
'233', // LOL (from emoticon)
'666', // Awesome/cool
];
for (const slang of internetSlang) {
const result = await profanity.search(slang);
expect(typeof result).toBe('boolean');
}
});
it('Should handle Chinese variant characters and fonts', async () => {
// Some characters have multiple valid forms
const variants = [
['关', '關'], // Close (simplified vs traditional)
['门', '門'], // Door (simplified vs traditional)
['时', '時'], // Time (simplified vs traditional)
['长', '長'], // Long (simplified vs traditional)
];
for (const [simplified, traditional] of variants) {
expect(await profanity.search(simplified)).toBe(false);
expect(await profanity.search(traditional)).toBe(false);
}
});
it('Should handle Chinese homophone considerations', async () => {
// Chinese has many homophones (same pronunciation, different characters)
const homophones = [
['时', '石', '是'], // shí - time, stone, is
['他', '她', '它'], // tā - he, she, it
['在', '再'], // zài - at/in, again
];
for (const group of homophones) {
for (const char of group) {
const result = await profanity.search(char);
expect(typeof result).toBe('boolean');
}
}
});
});
describe('Data integrity for Chinese', () => {
it('Should not allow modification of Chinese word list', async () => {
const terms1 = await profanity.all();
const originalLength = terms1.length;
// Try to modify the returned array
terms1.push('假词');
terms1.pop();
if (terms1.length > 0) {
terms1[0] = '修改';
}
// Get terms again - should be unchanged
const terms2 = await profanity.all();
expect(terms2.length).toBe(originalLength);
expect(terms2).not.toContain('假词');
if (terms2.length > 0) {
expect(terms2[0]).not.toBe('修改');
}
});
it('Should provide consistent results for Chinese detection', async () => {
const sentence = '这个句子是中文的';
const result1 = await profanity.getCurseWords(sentence);
const result2 = await profanity.getCurseWords(sentence);
const result3 = await profanity.hasCurseWords(sentence);
expect(result1).toEqual(result2);
expect(typeof result3).toBe('boolean');
});
});
describe('Configuration and fallback for Chinese', () => {
it('Should handle missing Chinese language file gracefully', async () => {
// If zh.txt doesn't exist, should fallback to English
const chineseProfanity = new ProfanityEngine({
language: 'zh',
testMode: true,
});
const terms = await chineseProfanity.all();
expect(terms.length).toBeGreaterThan(0);
});
it('Should suppress warnings in test mode for Chinese', async () => {
// Store original console.warn
const originalWarn = console.warn;
let warnCalled = false;
// Mock console.warn
console.warn = () => {
warnCalled = true;
};
const chineseProfanity = new ProfanityEngine({
language: 'zh',
testMode: true,
});
warnCalled = false;
await chineseProfanity.all();
expect(warnCalled).toBe(false);
// Restore original console.warn
console.warn = originalWarn;
});
});
describe('Chinese text processing specifics', () => {
it('Should handle Chinese word segmentation challenges', async () => {
// Chinese word boundaries are ambiguous
const ambiguousTexts = [
'研究生命科学', // Could be "研究生|命科学" or "研究|生命科学"
'北京大学生活', // Could be "北京大学|生活" or "北京|大学生|活"
'中国人民银行', // "中国人民银行" as one entity
];
for (const text of ambiguousTexts) {
const result = await profanity.hasCurseWords(text);
expect(typeof result).toBe('boolean');
}
});
it('Should handle Chinese proper nouns and names', async () => {
const properNouns = [
'北京', // Beijing
'上海', // Shanghai
'中国', // China
'长江', // Yangtze River
'故宫', // Forbidden City
];
for (const noun of properNouns) {
expect(await profanity.search(noun)).toBe(false);
}
});
it('Should handle Chinese grammar particles', async () => {
const particles = [
'的', // Possessive particle
'了', // Completion particle
'着', // Progressive particle
'过', // Experience particle
'吗', // Question particle
'呢', // Question particle
];
for (const particle of particles) {
expect(await profanity.search(particle)).toBe(false);
}
});
});
});