UNPKG

text-moderate

Version:

A comprehensive JavaScript library for content moderation, including profanity filtering, sentiment analysis, and toxicity detection. Leveraging advanced algorithms and external APIs, TextModerate provides developers with tools to create safer and more positive online environments.

28 lines (25 loc) 1.11 kB
/**
 * Tokenize module - Converts text into an array of lowercase word tokens
 * for analysis.
 * @module tokenize
 */

/**
 * Splits a string into lowercase tokens (words).
 *
 * The input is lowercased, every character in a fixed punctuation set
 * (. , / # ! ? $ % ^ & * ; : { } = _ ` " ~ ( ) [ ]) is replaced by a space,
 * and the result is split on runs of whitespace. Characters outside that
 * set, such as apostrophes and hyphens, stay inside their token
 * ("don't" and "well-known" each remain a single token).
 *
 * Any whitespace character separates tokens, including a single tab or
 * carriage return, not just spaces and newlines.
 *
 * @param {string} input - The input string to tokenize
 * @returns {string[]} Array of tokens; empty when the input is empty, is
 *   not a string, or contains only punctuation and whitespace
 */
function tokenize(input) {
  // Empty strings and non-string values (null, undefined, numbers, ...)
  // produce no tokens rather than throwing.
  if (!input || typeof input !== 'string') {
    return [];
  }

  return input
    .toLowerCase()
    // Punctuation becomes a separator instead of being deleted, so
    // "foo,bar" yields two tokens rather than "foobar".
    .replace(/[.,\/#!?$%\^&\*;:{}=_`\"~()\[\]]/g, ' ')
    // Split on any whitespace run so tabs, carriage returns and newlines
    // delimit tokens exactly like spaces do.
    .split(/\s+/)
    // Leading/trailing separators leave empty strings behind; drop them.
    .filter(Boolean);
}

// Guarded so that evaluating this file where no CommonJS `module` object
// exists does not throw; under CommonJS the export is unchanged.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = tokenize;
}