// @raven-js/cortex — zero-dependency machine learning, AI, and data processing
// library for modern JavaScript.
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*/
/**
* @file German-specific em-dash epidemic detector.
*
* Hardcoded German punctuation baselines for detecting AI overuse patterns.
* Includes guillemets, semicolons, and other punctuation marks that AI models
* systematically overuse in German text compared to human writers.
*/
import { tokenizeWords } from "../../segmentation/index.js";
// German punctuation baselines per 1000 words with empirically calibrated detection weights
// Baselines calibrated from analysis of 15,000+ German texts across human and AI-generated content
const GERMAN_PUNCTUATION_PATTERNS = /** @type {const} */ ({
	// High-confidence AI indicators (systematic overuse patterns in German)
	"—": {
		baseline: 0.3,
		weight: 0.95,
		description: "em-dash overuse",
		pattern: /—/g,
		contextPattern:
			/(?:—\s*(?:es\s+ist|das\s+ist|hier\s+ist|dort\s+ist|was\s+ist)|—\s*(?:jedoch|daher|folglich|außerdem|somit|dennoch|trotzdem|entsprechend)|—\s*(?:welche|welcher|welches|welchem|welchen)\s+(?:ist|sind|war|waren|hat|haben|hatte|hatten))/gi,
	},
	"–": {
		baseline: 0.2,
		weight: 0.9,
		description: "en-dash overuse",
		pattern: /–/g,
	},
	";": {
		baseline: 1.8,
		weight: 0.95,
		description: "semicolon overuse",
		pattern: /;/g,
		contextPattern: /(?:; (?:jedoch|daher|folglich|außerdem|somit|dennoch|trotzdem|entsprechend))/gi,
	},
	"...": {
		baseline: 0.6,
		weight: 0.8,
		description: "ellipsis overuse",
		pattern: /\.\.\./g,
	},
	"…": {
		baseline: 0.3,
		weight: 0.85,
		description: "unicode ellipsis overuse",
		pattern: /…/g,
	},
	// Parenthetical sophistication markers (AI loves nested explanations)
	"(": {
		baseline: 2.8,
		weight: 0.9,
		description: "parenthetical overuse",
		pattern: /\(/g,
		contextPattern: /\([^)]*\([^)]+/g, // Nested parentheses
	},
	")": {
		baseline: 2.8,
		weight: 0.9,
		description: "parenthetical overuse",
		pattern: /\)/g,
	},
	"[": {
		baseline: 0.3,
		weight: 0.85,
		description: "bracket overuse",
		pattern: /\[/g,
	},
	"]": {
		baseline: 0.3,
		weight: 0.85,
		description: "bracket overuse",
		pattern: /\]/g,
	},
	// German quotation marks (guillemets are standard, AI overuses them)
	"«": {
		baseline: 2.2,
		weight: 0.9,
		description: "guillemets overuse",
		pattern: /«/g,
	},
	"»": {
		baseline: 2.2,
		weight: 0.9,
		description: "guillemets overuse",
		pattern: /»/g,
	},
	"\u201c": {
		baseline: 0.2,
		weight: 0.9,
		description: "smart quote overuse",
		pattern: /\u201c/g,
	},
	"\u201d": {
		baseline: 0.2,
		weight: 0.9,
		description: "smart quote overuse",
		pattern: /\u201d/g,
	},
	// Colon overuse (AI loves structured explanations)
	":": {
		baseline: 4.2,
		weight: 0.9,
		description: "colon overuse",
		pattern: /:/g,
		contextPattern: /:\s*(?:jedoch|daher|folglich|außerdem|somit|dennoch)/gi,
	},
	// Question/exclamation patterns
	"?": {
		baseline: 3.8,
		weight: 0.7,
		description: "question overuse",
		pattern: /\?/g,
	},
	"!": {
		baseline: 2.4,
		weight: 0.75,
		description: "exclamation overuse",
		pattern: /!/g,
	},
	// Sophisticated/academic punctuation (AI overuses to sound scholarly)
	"§": {
		baseline: 0.04,
		weight: 1.0,
		description: "section sign overuse",
		pattern: /§/g,
	},
	// Mathematical symbols (AI overuses for precision)
	"±": {
		baseline: 0.03,
		weight: 0.95,
		description: "plus-minus overuse",
		pattern: /±/g,
	},
	"×": {
		baseline: 0.04,
		weight: 0.9,
		description: "multiplication sign overuse",
		pattern: /×/g,
	},
	"÷": {
		baseline: 0.03,
		weight: 0.9,
		description: "division sign overuse",
		pattern: /÷/g,
	},
	// Formatting characters (AI overuses for visual sophistication)
	"*": {
		baseline: 0.2,
		weight: 0.8,
		description: "asterisk overuse",
		pattern: /\*/g,
	},
});
// Enforce the immutability that the `@type {const}` annotation only promises at the
// type level: freeze the table and each config entry so shared baselines cannot be
// mutated at runtime. The RegExp objects themselves are intentionally NOT frozen,
// because `lastIndex` must remain writable for the regex machinery.
for (const config of Object.values(GERMAN_PUNCTUATION_PATTERNS)) {
	Object.freeze(config);
}
Object.freeze(GERMAN_PUNCTUATION_PATTERNS);
// Pre-compiled regex lookup tables keyed by punctuation mark, built once at module
// load so repeated analyses avoid re-reading the config objects.
const GERMAN_PUNCTUATION_REGEXES = new Map(
	Object.entries(GERMAN_PUNCTUATION_PATTERNS).map(([mark, cfg]) => [mark, cfg.pattern])
);
// Context patterns exist only for a subset of marks; keep the same membership test
// ("contextPattern" present AND truthy) as the original loop.
const GERMAN_CONTEXT_REGEXES = new Map(
	Object.entries(GERMAN_PUNCTUATION_PATTERNS)
		.filter(([, cfg]) => "contextPattern" in cfg && cfg.contextPattern)
		.map(([mark, cfg]) => [mark, cfg.contextPattern])
);
/**
 * Analyzes German text for punctuation overuse patterns characteristic of AI-generated content.
 *
 * Scans German text for punctuation marks that appear disproportionately in AI-generated
 * content compared to human writing. AI models systematically overuse certain punctuation
 * marks (em-dashes, semicolons, guillemets, ellipses) at rates 2-4x higher than German human
 * writers, creating detectable fingerprints.
 *
 * **Algorithm**: Per-mark frequency vs. hardcoded human baseline → context-pattern boost →
 * weighted evidence accumulation split by confidence tier → likelihood calculation.
 *
 * **Why it works**: AI models are trained on formal German texts and systematically overuse
 * sophisticated punctuation to sound more academic and formal. German human writers use
 * these marks more sparingly and contextually, creating distinctive patterns.
 *
 * **Performance**: O(n) time complexity where n is text length, dominated by regex matching.
 * Pre-compiled regexes ensure optimal performance for repeated analysis.
 *
 * @param {string} text - German text to analyze for punctuation overuse patterns
 * @param {Object} [options={}] - Configuration options for analysis
 * @param {number} [options.minWordCount=20] - Minimum word count for reliable analysis
 * @param {boolean} [options.includeDetails=false] - Whether to include punctuation-specific details
 * @param {number} [options.sensitivityThreshold=2.0] - Multiplier threshold for flagging overuse (2.0 = 2x human baseline)
 * @returns {{aiLikelihood: number, overallScore: number, punctuationDensity: number, totalPunctuation: number, wordCount: number, detectedOveruse: Array<Object>}} Analysis results with AI detection metrics for German text.
 *
 * @throws {TypeError} When text parameter is not a string
 * @throws {Error} When text contains insufficient words for analysis
 * @throws {Error} When options contain invalid values
 *
 * @example
 * // Human German text with natural punctuation
 * const humanText = "Der Autor untersucht narrative Techniken. Er schreibt mit sorgfältiger Aufmerksamkeit für Details und verwendet Interpunktion natürlich.";
 * const humanAnalysis = detectEmDashEpidemic(humanText);
 * console.log(humanAnalysis.aiLikelihood); // ~0.1-0.3 (low AI probability due to natural punctuation)
 *
 * @example
 * // AI-generated German text with punctuation overuse
 * const aiText = "Außerdem—es ist wichtig zu beachten—wir müssen verschiedene Ansätze analysieren; folglich, zahlreiche Implementierungen (unter Verwendung umfassender Methodologien) ermöglichen wesentliche Verbesserungen...";
 * const aiAnalysis = detectEmDashEpidemic(aiText);
 * console.log(aiAnalysis.aiLikelihood); // ~0.6-0.9 (high AI probability due to punctuation overuse)
 */
export function detectEmDashEpidemic(text, options = {}) {
	if (typeof text !== "string") {
		throw new TypeError("Input 'text' must be a string.");
	}
	if (text.trim().length === 0) {
		throw new Error("Cannot analyze empty text");
	}
	const { minWordCount = 20, includeDetails = false, sensitivityThreshold = 2.0 } = options;
	if (!Number.isInteger(minWordCount) || minWordCount < 1) {
		throw new Error("Parameter minWordCount must be a positive integer");
	}
	if (typeof sensitivityThreshold !== "number" || sensitivityThreshold <= 0) {
		throw new Error("Parameter sensitivityThreshold must be a positive number");
	}
	// Count total words using robust Unicode-aware tokenization
	const words = tokenizeWords(text);
	const wordCount = words.length;
	if (wordCount < minWordCount) {
		throw new Error(`Text must contain at least ${minWordCount} words for reliable analysis`);
	}
	// Analyze punctuation overuse using pre-compiled regexes and weighted scoring
	const detectedOveruse = [];
	let totalOveruseScore = 0;
	let highConfidenceOveruse = 0;
	let mediumConfidenceOveruse = 0;
	// BUG FIX: totals were previously derived from `detectedOveruse`, which is only
	// populated when includeDetails is true — so punctuationDensity and
	// totalPunctuation were always 0 in the default configuration. Track the
	// flagged-mark count directly, independent of the details flag.
	let flaggedPunctuationCount = 0;
	for (const [punctKey, config] of Object.entries(GERMAN_PUNCTUATION_PATTERNS)) {
		const regex = GERMAN_PUNCTUATION_REGEXES.get(punctKey);
		if (!regex) continue;
		const matches = text.match(regex);
		const count = matches ? matches.length : 0;
		if (count === 0) continue;
		// Occurrences per 1000 words, comparable against the per-1000-word baselines
		const frequency = (count / wordCount) * 1000;
		const baselineRatio = frequency / config.baseline;
		// Context patterns (e.g. "— jedoch") indicate AI-typical usage; each context
		// hit boosts the ratio, capped at +50% after 5+ hits.
		let contextMultiplier = 1.0;
		const contextRegex = GERMAN_CONTEXT_REGEXES.get(punctKey);
		if (contextRegex) {
			const contextMatches = text.match(contextRegex);
			if (contextMatches) {
				contextMultiplier = 1.0 + Math.min(0.5, contextMatches.length / 10);
			}
		}
		const adjustedRatio = baselineRatio * contextMultiplier;
		const finalWeightedRatio = adjustedRatio * config.weight;
		// Flag as overuse only if significantly above the human baseline
		if (adjustedRatio >= sensitivityThreshold) {
			totalOveruseScore += finalWeightedRatio;
			flaggedPunctuationCount += count;
			// Track confidence tiers (by detection weight) for refined scoring
			if (config.weight >= 0.95) highConfidenceOveruse += finalWeightedRatio;
			else if (config.weight >= 0.85) mediumConfidenceOveruse += finalWeightedRatio;
			if (includeDetails) {
				detectedOveruse.push({
					punctuation: punctKey,
					count,
					frequency,
					humanBaseline: config.baseline,
					overuseRatio: adjustedRatio,
					contextMultiplier,
					weightedRatio: finalWeightedRatio,
					confidence: config.weight >= 0.95 ? "high" : config.weight >= 0.85 ? "medium" : "low",
					description: `German ${config.description}`,
				});
			}
		}
	}
	// Density of flagged punctuation per 1000 words (independent of includeDetails)
	const punctuationDensity = (flaggedPunctuationCount / wordCount) * 1000;
	// Confidence-tier ratios; the 0.1 floor avoids division by zero when nothing is flagged
	const highConfidenceRatio = highConfidenceOveruse / Math.max(totalOveruseScore, 0.1);
	const mediumConfidenceRatio = mediumConfidenceOveruse / Math.max(totalOveruseScore, 0.1);
	// Combination: a small linear contribution from the raw overuse score (the 0.0004
	// coefficient keeps typical German scores in range) plus up to 0.4 from the
	// high-confidence share and up to 0.2 from the medium-confidence share, clamped to [0, 1].
	const aiLikelihood = Math.min(
		1,
		Math.max(
			0,
			totalOveruseScore * 0.0004 + highConfidenceRatio * 0.4 + mediumConfidenceRatio * 0.2
		)
	);
	// Overall score with logarithmic scaling (log base 2.8, calibrated for German patterns)
	const overallScore =
		totalOveruseScore > 0 ? Math.min(1, Math.log(1 + totalOveruseScore) / Math.log(2.8)) : 0;
	// Sort detected overuse by severity if details were requested
	if (includeDetails) {
		detectedOveruse.sort((a, b) => b.overuseRatio - a.overuseRatio);
	}
	return {
		aiLikelihood,
		overallScore,
		punctuationDensity,
		totalPunctuation: flaggedPunctuationCount,
		wordCount,
		detectedOveruse: includeDetails ? detectedOveruse : [],
	};
}