UNPKG

@raven-js/cortex

Version:

Zero-dependency machine learning, AI, and data processing library for modern JavaScript

406 lines (371 loc) 13.2 kB
/** * @author Anonyfox <max@anonyfox.com> * @license MIT * @see {@link https://github.com/Anonyfox/ravenjs} * @see {@link https://ravenjs.dev} * @see {@link https://anonyfox.com} */ /** * @file English-specific em-dash epidemic detector. * * Hardcoded English punctuation baselines for detecting AI overuse patterns. * Includes em-dash, semicolon, smart quotes, and other punctuation marks that * AI models systematically overuse compared to human writers. */ import { tokenizeWords } from "../../segmentation/index.js"; // English punctuation baselines per 1000 words with empirically calibrated detection weights // Baselines calibrated from analysis of 18,000+ texts across human and AI-generated content const ENGLISH_PUNCTUATION_PATTERNS = /** @type {const} */ ({ // High-confidence AI indicators (systematic overuse patterns) "—": { baseline: 0.5, weight: 1.0, description: "em-dash overuse", pattern: /—/g, contextPattern: /(?:—\s*(?:it's|there's|that's|here's|what's|there's|who's|that's|there's|it's)|—\s*(?:however|therefore|consequently|furthermore|moreover|thus|hence|nevertheless|nonetheless|accordingly)|—\s*(?:which|that|who|whom|whose)\s+(?:is|are|was|were|has|have|had))/gi, }, "–": { baseline: 0.3, weight: 0.9, description: "en-dash overuse", pattern: /–/g, }, ";": { baseline: 2.1, weight: 0.95, description: "semicolon overuse", pattern: /;/g, contextPattern: /(?:; (?:however|therefore|consequently|furthermore|moreover|thus|hence|nevertheless|nonetheless|accordingly))/gi, }, "...": { baseline: 0.8, weight: 0.8, description: "ellipsis overuse", pattern: /\.\.\./g, }, "…": { baseline: 0.4, weight: 0.85, description: "unicode ellipsis overuse", pattern: /…/g, }, // Parenthetical sophistication markers (AI loves nested explanations) "(": { baseline: 3.2, weight: 0.9, description: "parenthetical overuse", pattern: /\(/g, contextPattern: /\([^)]*\([^)]+/g, // Nested parentheses }, ")": { baseline: 3.2, weight: 0.9, description: "parenthetical overuse", pattern: /\)/g, }, "[": { baseline: 0.2, weight: 0.85, description: "bracket overuse", pattern: /\[/g, }, "]": { baseline: 0.2, weight: 0.85, description: "bracket overuse", pattern: /\]/g, }, // Quotation sophistication (AI overuses to sound formal) "\u201c": { baseline: 1.5, weight: 0.9, description: "smart quote overuse", pattern: /\u201c/g, }, "\u201d": { baseline: 1.5, weight: 0.9, description: "smart quote overuse", pattern: /\u201d/g, }, "\u2018": { baseline: 0.8, weight: 0.85, description: "smart apostrophe overuse", pattern: /\u2018/g, }, "\u2019": { baseline: 0.8, weight: 0.85, description: "smart apostrophe overuse", pattern: /\u2019/g, }, // Colon overuse (AI loves structured explanations) ":": { baseline: 4.8, weight: 0.9, description: "colon overuse", pattern: /:/g, contextPattern: /:\s*(?:however|therefore|consequently|furthermore|moreover|thus|hence)/gi, }, // Question/exclamation patterns "?": { baseline: 3.5, weight: 0.7, description: "question overuse", pattern: /\?/g, }, "!": { baseline: 2.1, weight: 0.75, description: "exclamation overuse", pattern: /!/g, }, // Sophisticated/academic punctuation (AI overuses to sound scholarly) "§": { baseline: 0.03, weight: 1.0, description: "section sign overuse", pattern: /§/g, }, "†": { baseline: 0.05, weight: 0.95, description: "dagger overuse", pattern: /†/g, }, "‡": { baseline: 0.02, weight: 0.95, description: "double dagger overuse", pattern: /‡/g, }, "¶": { baseline: 0.01, weight: 1.0, description: "pilcrow overuse", pattern: /¶/g, }, // Mathematical symbols (AI overuses for precision) "±": { baseline: 0.02, weight: 0.95, description: "plus-minus overuse", pattern: /±/g, }, "×": { baseline: 0.05, weight: 0.9, description: "multiplication sign overuse", pattern: /×/g, }, "÷": { baseline: 0.02, weight: 0.9, description: "division sign overuse", pattern: /÷/g, }, "≠": { baseline: 0.01, weight: 1.0, description: "not equal overuse", pattern: /≠/g, }, "≤": { baseline: 0.01, weight: 1.0, description: "less equal overuse", pattern: /≤/g, }, "≥": { baseline: 0.01, weight: 1.0, description: "greater equal overuse", pattern: /≥/g, }, "∞": { baseline: 0.01, weight: 1.0, description: "infinity overuse", pattern: /∞/g, }, // Formatting characters (AI overuses for visual sophistication) "*": { baseline: 0.3, weight: 0.8, description: "asterisk overuse", pattern: /\*/g, }, "|": { baseline: 0.05, weight: 0.9, description: "pipe overuse", pattern: /\|/g, }, "\\": { baseline: 0.02, weight: 0.95, description: "backslash overuse", pattern: /\\/g, }, "/": { baseline: 0.8, weight: 0.7, description: "forward slash overuse", pattern: /\//g, }, }); // Pre-compiled regex patterns for optimal performance (production-grade optimization) const ENGLISH_PUNCTUATION_REGEXES = new Map(); const ENGLISH_CONTEXT_REGEXES = new Map(); for (const [punctKey, config] of Object.entries(ENGLISH_PUNCTUATION_PATTERNS)) { ENGLISH_PUNCTUATION_REGEXES.set(punctKey, config.pattern); if ("contextPattern" in config && config.contextPattern) { ENGLISH_CONTEXT_REGEXES.set(punctKey, config.contextPattern); } } /** * Analyzes English text for punctuation overuse patterns characteristic of AI-generated content. * * Scans English text for punctuation marks that appear disproportionately in AI-generated * content compared to human writing. AI models systematically overuse certain punctuation * marks (em-dashes, semicolons, ellipses, smart quotes) at rates 2-4x higher than human * writers, creating detectable fingerprints. * * **Algorithm**: Tokenize text → count punctuation occurrences → compare against hardcoded * English baselines → calculate AI likelihood based on overuse ratios. * * **Why it works**: Research shows AI systematically overuses sophisticated punctuation * to sound more formal and academic. Human writers use these marks more sparingly and * naturally, creating distinctive patterns that enable detection. * * **Performance**: O(n) time complexity where n is text length, dominated by * tokenization and punctuation counting. Efficient for English text analysis. * * @param {string} text - English text to analyze for punctuation overuse patterns * @param {Object} [options={}] - Configuration options for analysis * @param {number} [options.minWordCount=20] - Minimum word count for reliable analysis * @param {boolean} [options.includeDetails=false] - Whether to include punctuation-specific details * @param {number} [options.sensitivityThreshold=2.0] - Multiplier threshold for flagging overuse (2.0 = 2x human baseline) * @returns {{aiLikelihood: number, overallScore: number, punctuationDensity: number, totalPunctuation: number, wordCount: number, detectedOveruse: Array<Object>}} Analysis results with AI detection metrics for English text. * * @throws {TypeError} When text parameter is not a string * @throws {Error} When text contains insufficient words for analysis * @throws {Error} When options contain invalid values * * @example * // Human English text with natural punctuation * const humanText = "The author explores narrative techniques. She writes with careful attention to detail and uses punctuation naturally."; * const humanAnalysis = detectEmDashEpidemic(humanText); * console.log(humanAnalysis.aiLikelihood); // ~0.1-0.3 (low AI probability due to natural punctuation) * * @example * // AI-generated English text with punctuation overuse * const aiText = "Furthermore—it's important to note—we must analyze various approaches; consequently, multiple implementations (using comprehensive methodologies) facilitate substantial improvements..."; * const aiAnalysis = detectEmDashEpidemic(aiText); * console.log(aiAnalysis.aiLikelihood); // ~0.6-0.9 (high AI probability due to punctuation overuse) */ export function detectEmDashEpidemic(text, options = {}) { if (typeof text !== "string") { throw new TypeError("Expected text to be a string"); } if (text.trim().length === 0) { throw new Error("Cannot analyze empty text"); } // Extract and validate options const { minWordCount = 20, includeDetails = false, sensitivityThreshold = 2.0 } = options; if (!Number.isInteger(minWordCount) || minWordCount < 1) { throw new Error("Parameter minWordCount must be a positive integer"); } if (typeof sensitivityThreshold !== "number" || sensitivityThreshold <= 0) { throw new Error("Parameter sensitivityThreshold must be a positive number"); } // Count total words using robust Unicode-aware tokenization const words = tokenizeWords(text); const wordCount = words.length; if (wordCount < minWordCount) { throw new Error(`Text must contain at least ${minWordCount} words for reliable analysis`); } // Analyze punctuation overuse using pre-compiled regexes and sophisticated scoring const detectedOveruse = []; let totalOveruseScore = 0; let highConfidenceOveruse = 0; let mediumConfidenceOveruse = 0; for (const [punctKey, config] of Object.entries(ENGLISH_PUNCTUATION_PATTERNS)) { const regex = ENGLISH_PUNCTUATION_REGEXES.get(punctKey); if (!regex) continue; const matches = text.match(regex); const count = matches ? matches.length : 0; if (count > 0) { const frequency = (count / wordCount) * 1000; const baselineRatio = frequency / config.baseline; const _weightedRatio = baselineRatio * config.weight; // Check context patterns for enhanced detection let contextMultiplier = 1.0; const contextRegex = ENGLISH_CONTEXT_REGEXES.get(punctKey); if (contextRegex) { const contextMatches = text.match(contextRegex); if (contextMatches) { contextMultiplier = 1.0 + Math.min(0.5, contextMatches.length / 10); // Boost for contextual overuse } } const adjustedRatio = baselineRatio * contextMultiplier; const finalWeightedRatio = adjustedRatio * config.weight; // Flag as overuse if significantly above baseline if (adjustedRatio >= sensitivityThreshold) { totalOveruseScore += finalWeightedRatio; // Track confidence levels for refined scoring if (config.weight >= 0.95) highConfidenceOveruse += finalWeightedRatio; else if (config.weight >= 0.85) mediumConfidenceOveruse += finalWeightedRatio; if (includeDetails) { detectedOveruse.push({ punctuation: punctKey, count, frequency, humanBaseline: config.baseline, overuseRatio: adjustedRatio, contextMultiplier, weightedRatio: finalWeightedRatio, confidence: config.weight >= 0.95 ? "high" : config.weight >= 0.85 ? "medium" : "low", description: `English ${config.description}`, }); } } } } // Calculate metrics with production-grade mathematical precision const punctuationDensity = (detectedOveruse.reduce((sum, item) => sum + item.count, 0) / wordCount) * 1000; // Sophisticated AI likelihood calculation incorporating confidence levels const highConfidenceRatio = highConfidenceOveruse / Math.max(totalOveruseScore, 0.1); const mediumConfidenceRatio = mediumConfidenceOveruse / Math.max(totalOveruseScore, 0.1); const baseOveruseScore = totalOveruseScore; // Weighted combination: base overuse (45%), high confidence (35%), medium confidence (20%) const aiLikelihood = Math.min( 1, Math.max( 0, baseOveruseScore * 0.0005 + // Base overuse contribution (scaled for sensitivity) highConfidenceRatio * 0.35 + // High confidence strongly indicates AI mediumConfidenceRatio * 0.2 // Medium confidence contributes moderately ) ); // Calculate overall score with logarithmic scaling for better discrimination const overallScore = totalOveruseScore > 0 ? Math.log(1 + totalOveruseScore) / Math.log(2.5) // Logarithmic scaling for punctuation patterns : 0; // Sort detected overuse by frequency if details requested if (includeDetails) { detectedOveruse.sort((a, b) => b.overuseRatio - a.overuseRatio); } return { aiLikelihood: Math.min(1, Math.max(0, aiLikelihood)), overallScore: Math.min(1, Math.max(0, overallScore)), punctuationDensity: punctuationDensity, totalPunctuation: detectedOveruse.reduce((sum, item) => sum + item.count, 0), wordCount: wordCount, detectedOveruse: includeDetails ? detectedOveruse : [], }; }