@raven-js/cortex
Version:
Zero-dependency machine learning, AI, and data processing library for modern JavaScript
85 lines (75 loc) • 3.61 kB
JavaScript
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*/
/**
* @file Language-agnostic em-dash epidemic detector.
*
* Provides a neutral fallback implementation that doesn't detect any specific
* punctuation overuse patterns. Returns baseline metrics assuming no AI
* punctuation patterns are present. Used when language-specific detection
* is not available or desired.
*/
import { tokenizeWords } from "../../segmentation/index.js";
/**
 * Analyzes text for punctuation overuse patterns (language-agnostic fallback).
 *
 * This general implementation doesn't search for any specific punctuation patterns.
 * It validates its inputs, counts the words in `text`, and returns a fixed neutral
 * baseline that assumes no AI punctuation overuse is present. Only `wordCount`
 * depends on the input; every other field of the result is a constant.
 *
 * **Algorithm**: Validate inputs → tokenize text → return neutral baseline metrics
 * (no punctuation analysis is performed)
 *
 * **Use cases**: Fallback for unsupported languages, baseline comparisons,
 * or when punctuation-based detection is not desired.
 *
 * **Performance**: Beyond input validation, the only work done is the
 * `tokenizeWords` call. No punctuation pattern tables are stored.
 *
 * @param {string} text - Input text to analyze
 * @param {Object|null} [options={}] - Analysis options; `null` is treated like an omitted argument
 * @param {number} [options.minWordCount=20] - Minimum word count required (must be a positive integer)
 * @param {boolean} [options.includeDetails=false] - Not read by this fallback implementation; `detectedOveruse` is always empty
 * @returns {{aiLikelihood: number, overallScore: number, punctuationDensity: number, totalPunctuation: number, wordCount: number, detectedOveruse: Array<object>}} Analysis results with neutral baseline metrics. aiLikelihood: Always 0.5 (neutral). overallScore: Always 0.5 (neutral). punctuationDensity: Always 50 (neutral). totalPunctuation: Always 0. wordCount: Actual word count. detectedOveruse: Always empty array.
 *
 * @throws {TypeError} When text parameter is not a string
 * @throws {Error} When text is empty or contains only whitespace
 * @throws {Error} When options.minWordCount is not a positive integer
 * @throws {Error} When text contains fewer than options.minWordCount words
 *
 * @example
 * // Neutral baseline for unsupported languages (sample exceeds the default 20-word minimum)
 * const unknownText = "Some text written in an unsupported language, with various punctuation marks and symbols, that is long enough to pass the default minimum word count.";
 * const analysis = detectEmDashEpidemic(unknownText);
 * console.log(analysis.aiLikelihood); // 0.5 (neutral baseline)
 * console.log(analysis.totalPunctuation); // 0 (no overuse detected)
 */
export function detectEmDashEpidemic(text, options = {}) {
  if (typeof text !== "string") {
    throw new TypeError("Expected text to be a string");
  }
  if (text.trim().length === 0) {
    throw new Error("Cannot analyze empty text");
  }

  // The default parameter only kicks in for `undefined`. An explicit `null`
  // would make the destructuring throw an unrelated TypeError, so treat it
  // the same as an omitted options argument.
  const { minWordCount = 20 } = options === null ? {} : options;
  if (!Number.isInteger(minWordCount) || minWordCount < 1) {
    throw new Error("Parameter minWordCount must be a positive integer");
  }

  // Word counting is delegated to the shared tokenizer from the segmentation module.
  const words = tokenizeWords(text);
  const wordCount = words.length;
  if (wordCount < minWordCount) {
    throw new Error(`Text must contain at least ${minWordCount} words for reliable analysis`);
  }

  // Language-agnostic: no punctuation pattern baselines to compare against, return neutral baseline
  return {
    aiLikelihood: 0.5, // Neutral baseline
    overallScore: 0.5, // Neutral baseline
    punctuationDensity: 50, // Neutral baseline (marks per 1000 words)
    totalPunctuation: 0, // No overuse detected
    wordCount,
    detectedOveruse: [], // No overuse to report
  };
}