UNPKG

@raven-js/cortex

Version:

Zero-dependency machine learning, AI, and data processing library for modern JavaScript

448 lines (383 loc) 19 kB
/**
 * @author Anonyfox <max@anonyfox.com>
 * @license MIT
 * @see {@link https://github.com/Anonyfox/ravenjs}
 * @see {@link https://ravenjs.dev}
 * @see {@link https://anonyfox.com}
 */

/**
 * @file German-specific rule-of-three obsession detector.
 *
 * Counts hardcoded German triadic constructions (three-item lists,
 * "erstens ... zweitens ... drittens" sequences, "drei Vorteile"-style
 * markers, ...) with regular expressions and turns the counts into a score.
 */

import { tokenizeSentences, tokenizeWords } from "../../segmentation/index.js";

// German triadic patterns with refined human baselines and detection weights
// Baselines calibrated from analysis of 16,000+ German human and AI-generated texts
//
// NOTE(review): the scoring loop in detectRuleOfThreeObsession looks this table
// up by counter name (e.g. "drei_vorteile", "threeItemLists"). None of those
// names equals a key below (e.g. "drei vorteile"), so every lookup currently
// falls back to { baseline: 1, weight: 1 } and these values are not applied.
// Left unchanged because wiring them in would shift every score.
const GERMAN_TRIADIC_PATTERNS = /** @type {const} */ ({
	// High-confidence AI indicators (rare/uniform in German human writing)
	"erstens, zweitens, drittens": { baseline: 0.01, weight: 3.4 },
	"erstes, zweites, drittes": { baseline: 0.014, weight: 3.1 },
	"anfang, mitte, ende": { baseline: 0.005, weight: 3.8 },
	"eins, zwei, drei": { baseline: 0.016, weight: 2.9 },
	"zuerst, dann, schließlich": { baseline: 0.007, weight: 3.6 },

	// Medium-confidence AI indicators
	"drei vorteile": { baseline: 0.09, weight: 2.1 },
	"drei möglichkeiten": { baseline: 0.07, weight: 2.3 },
	"drei schritte": { baseline: 0.05, weight: 2.5 },
	"drei faktoren": { baseline: 0.04, weight: 2.6 },
	"drei aspekte": { baseline: 0.03, weight: 2.7 },
	"drei arten": { baseline: 0.08, weight: 2.2 },
	"drei nachteile": { baseline: 0.02, weight: 3.0 },

	// Structured list patterns
	"drei hauptgründe": { baseline: 0.04, weight: 2.7 },
	"drei wichtige punkte": { baseline: 0.05, weight: 2.4 },
	"drei wichtige dinge": { baseline: 0.04, weight: 2.6 },
	"drei wesentliche elemente": { baseline: 0.02, weight: 3.1 },
	"drei primäre ziele": { baseline: 0.03, weight: 2.8 },

	// Sequential triadic transitions
	"zunächst einmal": { baseline: 0.22, weight: 1.6 },
	zweitens: { baseline: 0.07, weight: 2.3 },
	drittens: { baseline: 0.05, weight: 2.4 },
	"zu guter letzt": { baseline: 0.1, weight: 1.9 },
	schließlich: { baseline: 0.32, weight: 1.3 },

	// Example enumeration patterns
	"zum beispiel X, Y und Z": { baseline: 0.07, weight: 2.2 },
	"wie A, B und C": { baseline: 0.11, weight: 1.9 },
	"einschließlich X, Y und Z": { baseline: 0.14, weight: 1.8 },
	"namentlich A, B und C": { baseline: 0.03, weight: 2.6 },
	"speziell X, Y und Z": { baseline: 0.05, weight: 2.4 },

	// Adjective/adverb triplets (AI formality patterns)
	"effizient, effektiv und zuverlässig": { baseline: 0.02, weight: 3.2 },
	"schnell, effizient und effektiv": { baseline: 0.01, weight: 3.5 },
	"umfassend, detailliert und gründlich": { baseline: 0.03, weight: 2.9 },
	"einfach, leicht und intuitiv": { baseline: 0.04, weight: 2.6 },
	"schnell, zuverlässig und sicher": { baseline: 0.05, weight: 2.4 },

	// Process triadic sequences
	"planung, ausführung und evaluation": { baseline: 0.02, weight: 3.1 },
	"analyse, design und implementierung": { baseline: 0.03, weight: 2.9 },
	"forschung, entwicklung und test": { baseline: 0.04, weight: 2.7 },
	"eingabe, verarbeitung und ausgabe": { baseline: 0.06, weight: 2.3 },
	"theorie, praxis und anwendung": { baseline: 0.05, weight: 2.4 },
});

// Pre-compiled, escaped, case-insensitive regex per phrase of the table above.
// NOTE(review): nothing in the code visible in this module reads this map;
// kept in case another part of the file depends on it.
const GERMAN_TRIADIC_REGEXES = new Map();
for (const phrase of Object.keys(GERMAN_TRIADIC_PATTERNS)) {
	GERMAN_TRIADIC_REGEXES.set(phrase, new RegExp(`\\b${phrase.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "gi"));
}

// Word list used by the "likely adjective" heuristic for comma-separated triplets.
const GERMAN_COMMON_ADJECTIVES = [
	"groß", "klein", "schnell", "langsam", "gut", "schlecht", "neu", "alt",
	"hoch", "niedrig", "leicht", "schwer", "heiß", "kalt", "hell", "dunkel",
	"stark", "schwach", "froh", "traurig", "reich", "arm", "sauber", "schmutzig",
	"jung", "lang", "kurz", "breit", "schmal", "dick", "dünn", "weich",
	"hart", "nass", "trocken", "voll", "leer", "offen", "geschlossen", "nah",
	"fern", "richtig", "falsch", "wahr", "echt", "gleich", "verschieden", "wichtig",
	"unwichtig", "notwendig", "unnötig", "möglich", "unmöglich", "sicher", "unsicher", "bereit",
	"unbereit", "beschäftigt", "frei", "gefährlich", "gesund", "krank", "lebendig", "tot",
	"verheiratet", "ledig", "wach", "schlafend", "schön", "hässlich", "teuer", "billig",
	"einfach", "komplex", "effizient", "ineffizient", "effektiv", "ineffektiv", "zuverlässig", "unzuverlässig",
	"genau", "ungenau", "konsistent", "inkonsistent", "stabil", "unstabil", "flexibel", "starr",
	"transparent", "undurchsichtig", "sichtbar", "unsichtbar", "klar", "unklar", "offensichtlich", "subtil",
	"normal", "abnormal", "natürlich", "künstlich", "organisch", "synthetisch", "traditionell", "modern",
	"klassisch", "zeitgenössisch", "lokal", "global", "intern", "extern", "öffentlich", "privat",
	"persönlich", "professionell", "akademisch", "praktisch", "theoretisch", "wissenschaftlich", "technisch", "kommerziell",
	"industriell", "landwirtschaftlich", "bildungs", "medizinisch", "rechtlich", "politisch", "ökonomisch", "sozial",
	"kulturell", "religiös", "spirituell", "emotional", "intellektuell", "physisch", "mental", "psychologisch",
	"biologisch", "chemisch", "elektrisch", "mechanisch", "elektronisch", "digital", "analog", "automatisch",
	"manuell", "statisch", "dynamisch", "linear", "zirkulär", "vertikal", "horizontal", "positiv",
	"negativ", "aktiv", "passiv", "direkt", "indirekt", "absolut", "relativ", "vollständig",
	"unvollständig", "perfekt", "unperfekt", "rein", "unrein", "einzeln", "mehrfach", "individuell",
	"kollektiv", "allgemein", "spezifisch", "grundlegend", "fortgeschritten", "primär", "sekundär", "haupt",
	"neben", "major", "minor", "zentrale", "periphere", "interne", "externe",
];

// Built once at module load instead of once per filter callback. No `g` flag,
// so `.test()` carries no `lastIndex` state between calls.
const LIKELY_ADJECTIVE_REGEX = new RegExp(`\\b(${GERMAN_COMMON_ADJECTIVES.join("|")})\\b`, "i");

// NOTE(review), applies to every regex in this module: none uses the `u` flag,
// so `\w` and `\b` only treat ASCII letters, digits and `_` as word characters.
// ä/ö/ü/ß therefore behave like word breaks (e.g. `\w+,` cannot match "groß,"),
// and `.` does not match line breaks, so `.*?` never spans multiple lines.

/**
 * Counts the matches of a global regex in a string.
 *
 * @param {string} input - Text to search
 * @param {RegExp} pattern - Regex carrying the `g` flag
 * @returns {number} Number of matches, 0 when there are none
 */
function countMatches(input, pattern) {
	const found = input.match(pattern);
	return found ? found.length : 0;
}

/**
 * Counts list-like and clause-like triadic structures sentence by sentence.
 *
 * @param {Iterable<string>} sentences - Sentences as produced by tokenizeSentences
 * @returns {{threeItemLists: number, numberedThreeLists: number, bulletThreeLists: number, threeClauseSentences: number, threePhraseSentences: number}} Per-structure counters
 */
function analyzeSentences(sentences) {
	let threeItemLists = 0;
	let numberedThreeLists = 0;
	let bulletThreeLists = 0;
	let threeClauseSentences = 0;
	let threePhraseSentences = 0;

	for (const sentence of sentences) {
		// Three-item list "a, b, und c" - counted at most once per sentence.
		// NOTE(review): the pattern requires a comma directly before und/oder,
		// so "a, b und c" (the form used by the keys of GERMAN_TRIADIC_PATTERNS)
		// does not match.
		if (/\b\w+,\s+\w+,\s+(und|oder)\s+\w+\b/i.test(sentence)) {
			threeItemLists++;
		}

		// Numbered lists: 1. X 2. Y 3. Z
		numberedThreeLists += countMatches(sentence, /\b1\.\s*\w+.*?\b2\.\s*\w+.*?\b3\.\s*\w+/gi);

		// Bullet lists: • X • Y • Z or - X - Y - Z
		bulletThreeLists += countMatches(sentence, /[-•]\s*\w+.*?\s*[-•]\s*\w+.*?\s*[-•]\s*\w+/gi);

		// Exactly three parts when split on commas/semicolons
		if (sentence.split(/[;,]/).length === 3) {
			threeClauseSentences++;
		}

		// Three or more parts when split on commas/semicolons/colons
		if (sentence.split(/[,;:]/).length >= 3) {
			threePhraseSentences++;
		}
	}

	return { threeItemLists, numberedThreeLists, bulletThreeLists, threeClauseSentences, threePhraseSentences };
}

/**
 * Counts ordered three-step sequences in the whole text.
 *
 * @param {string} text - Text to scan
 * @returns {{erst_zweit_dritt: number, zuerst_dann_schließlich: number, abcPatterns: number}} Sequence counters
 */
function analyzeSequentialPatterns(text) {
	return {
		// NOTE(review): the trailing `\bdritt\b` only matches the standalone token
		// "dritt"; "drittens"/"dritte" are not matched by this pattern.
		erst_zweit_dritt: countMatches(text, /\berst.*?\bzweit.*?\bdritt\b/gi),
		zuerst_dann_schließlich: countMatches(text, /\bzuerst.*?\bdann.*?\bschließlich\b/gi),
		// a) ... b) ... c) enumerations (case-insensitive)
		abcPatterns: countMatches(text, /\ba\)\s*\w+.*?\bb\)\s*\w+.*?\bc\)\s*\w+/gi),
	};
}

/**
 * Counts example introductions that are followed by a three-item list.
 *
 * @param {string} text - Text to scan
 * @returns {{zum_beispiel_drei: number, wie_drei: number, einschließlich_drei: number}} Example-list counters
 */
function analyzeExamplePatterns(text) {
	return {
		// "zum Beispiel, X, Y, und Z"
		zum_beispiel_drei: countMatches(text, /\bzum beispiel,.*?\b\w+,\s*\w+,\s*(und|oder)\s+\w+\b/gi),
		// "wie A, B, und C"
		wie_drei: countMatches(text, /\bwie.*?\b\w+,\s*\w+,\s*(und|oder)\s+\w+\b/gi),
		// "einschließlich X, Y, und Z"
		einschließlich_drei: countMatches(text, /\beinschließlich.*?\b\w+,\s*\w+,\s*(und|oder)\s+\w+\b/gi),
	};
}

/**
 * Classifies comma-separated triplets as adjective, adverb or noun triplets.
 *
 * All three counters are derived from the same triplet matches, so a single
 * triplet can contribute to more than one counter.
 *
 * @param {string} text - Text to scan
 * @returns {{threeAdjectives: number, threeAdverbs: number, threeNouns: number}} Triplet counters
 */
function analyzeDescriptorPatterns(text) {
	// One scan of the text; the three counters below are different views of it.
	const triplets = text.match(/\b\w+,\s+\w+,\s+(und|oder)\s+\w+\b/gi) || [];

	// Triplets containing at least one word from the adjective list (heuristic)
	const threeAdjectives = triplets.filter((triplet) => LIKELY_ADJECTIVE_REGEX.test(triplet)).length;

	// No adverb-specific filter: every triplet is counted
	const threeAdverbs = triplets.length;

	// Triplets containing three tokens that fit the noun pattern.
	// NOTE(review): because of the `i` flag, `[A-ZÄÖÜ]` also matches lower-case
	// letters, so this does not actually require capitalized words. Kept as is
	// to leave scores unchanged.
	const threeNouns = triplets.filter((triplet) =>
		/\b([A-ZÄÖÜ][a-zäöüß]*|[A-ZÄÖÜ][a-zäöüß]+)\b.*?\b([A-ZÄÖÜ][a-zäöüß]*|[A-ZÄÖÜ][a-zäöüß]+)\b.*?\b([A-ZÄÖÜ][a-zäöüß]*|[A-ZÄÖÜ][a-zäöüß]+)\b/i.test(
			triplet
		)
	).length;

	return { threeAdjectives, threeAdverbs, threeNouns };
}

/**
 * Counts explicit "drei <noun>" announcements, grouped by noun family.
 *
 * @param {string} text - Text to scan
 * @returns {{drei_vorteile: number, drei_möglichkeiten: number, drei_typen: number, drei_schritte: number, drei_faktoren: number, drei_aspekte: number}} Marker counters
 */
function analyzeTriadicMarkers(text) {
	return {
		// "drei Vorteile", "drei Merkmale", "drei Fähigkeiten"
		drei_vorteile: countMatches(text, /\bdrei\s+(vorteile|merkmale|fähigkeiten)/gi),
		// "drei Möglichkeiten", "drei Methoden", ...
		drei_möglichkeiten: countMatches(text, /\bdrei\s+(möglichkeiten|methoden|ansätze|strategien|techniken)/gi),
		// "drei Typen", "drei Arten", ...
		drei_typen: countMatches(text, /\bdrei\s+(typen|arten|kategorien|klassen|gruppen)/gi),
		// "drei Schritte", "drei Phasen", ...
		drei_schritte: countMatches(text, /\bdrei\s+(schritte|phasen|ebenen|stufen)/gi),
		// "drei Faktoren", "drei Elemente", ...
		drei_faktoren: countMatches(text, /\bdrei\s+(faktoren|elemente|komponenten|teile|stücke)/gi),
		// "drei Aspekte", "drei Komponenten", ...
		// ("komponenten" is also part of drei_faktoren, so it is counted in both)
		drei_aspekte: countMatches(text, /\bdrei\s+(aspekte|komponenten|dimensionen|perspektiven)/gi),
	};
}

/**
 * Counts formulaic three-part phrases.
 *
 * @param {string} text - Text to scan
 * @returns {{erstens_zweitens_drittens: number, eins_zwei_drei: number, anfang_mitte_ende: number}} Phrase counters
 */
function analyzeMechanicalPhrases(text) {
	return {
		erstens_zweitens_drittens: countMatches(text, /\berstens.*?\bzweitens.*?\bdrittens\b/gi),
		eins_zwei_drei: countMatches(text, /\beins.*?\bzwei.*?\bdrei\b/gi),
		anfang_mitte_ende: countMatches(text, /\banfang.*?\bmitte.*?\bende\b/gi),
	};
}

/**
 * Analyzes German text for rule-of-three obsession patterns.
 *
 * **Algorithm**: count words with `tokenizeWords` → count triadic constructions
 * with regular expressions (per sentence and over the whole text) → convert each
 * non-zero counter into a frequency per 1,000 words, divide by its baseline,
 * multiply by its weight and cap the result at 5 → combine the pattern density
 * and the count-weighted average ratio into `aiLikelihood`.
 *
 * `aiLikelihood = 0.35 * min(triadicDensity / 14, 1) +
 * 0.65 * min(overallScore / (sensitivityThreshold * 1.6), 1)`, so it lies in
 * the range 0..1 and a larger `sensitivityThreshold` lowers the pattern part.
 *
 * @param {string} text - German text to analyze for rule-of-three patterns
 * @param {Object} [options={}] - Analysis options
 * @param {number} [options.minWordCount=30] - Minimum word count required
 * @param {boolean} [options.includeDetails=false] - Whether to include pattern details
 * @param {number} [options.sensitivityThreshold=2.0] - Overuse threshold multiplier
 * @returns {{aiLikelihood: number, overallScore: number, triadicDensity: number, totalPatterns: number, wordCount: number, detectedPatterns: Array<Object>}} Analysis results. `detectedPatterns` is empty unless `includeDetails` is true; otherwise it is sorted by `weightedRatio`, highest first.
 *
 * @throws {TypeError} When text parameter is not a string
 * @throws {Error} When text is empty or whitespace-only
 * @throws {Error} When minWordCount is not a positive integer or
 *   sensitivityThreshold is not a positive number (NaN included)
 * @throws {Error} When text contains fewer than minWordCount words
 *
 * @example
 * // The sample texts are short, so the examples lower minWordCount; with the
 * // default of 30 texts of this length are expected to be rejected.
 * const humanText = "Die Analyse zeigt interessante Ergebnisse. Einige Forscher bevorzugen chronologische Strukturen, während andere mit nicht-linearen Ansätzen experimentieren.";
 * const humanAnalysis = detectRuleOfThreeObsession(humanText, { minWordCount: 10 });
 * console.log(humanAnalysis.aiLikelihood); // number between 0 and 1
 *
 * @example
 * // Text with an explicit "Erstens ... Zweitens ... Drittens" structure
 * const aiText = "Es gibt drei Hauptvorteile dieses Ansatzes: Effizienz, Skalierbarkeit und Zuverlässigkeit. Erstens verbessert das System die Leistung. Zweitens reduziert es die Kosten. Drittens verbessert es die Benutzererfahrung.";
 * const aiAnalysis = detectRuleOfThreeObsession(aiText, { minWordCount: 10, includeDetails: true });
 * console.log(aiAnalysis.detectedPatterns.map((entry) => entry.pattern));
 */
export function detectRuleOfThreeObsession(text, options = {}) {
	if (typeof text !== "string") {
		throw new TypeError("Expected text to be a string");
	}

	if (text.trim().length === 0) {
		throw new Error("Cannot analyze empty text");
	}

	// Extract and validate options
	const { minWordCount = 30, includeDetails = false, sensitivityThreshold = 2.0 } = options;

	if (!Number.isInteger(minWordCount) || minWordCount < 1) {
		throw new Error("Parameter minWordCount must be a positive integer");
	}

	// `!(x > 0)` instead of `x <= 0` so that NaN is rejected as well; NaN would
	// otherwise propagate into patternScore and make aiLikelihood NaN.
	if (typeof sensitivityThreshold !== "number" || !(sensitivityThreshold > 0)) {
		throw new Error("Parameter sensitivityThreshold must be a positive number");
	}

	const wordCount = tokenizeWords(text).length;

	if (wordCount < minWordCount) {
		throw new Error(`Text must contain at least ${minWordCount} words for reliable analysis`);
	}

	// Sentence-level counters come from a single pass over the sentences.
	const sentenceCounts = analyzeSentences(tokenizeSentences(text));

	// Key order determines the order of equally ranked entries in
	// detectedPatterns, so it is kept stable: lists, sequences, examples,
	// descriptors, sentence structures, markers, mechanical phrases.
	const patternCounts = {
		threeItemLists: sentenceCounts.threeItemLists,
		numberedThreeLists: sentenceCounts.numberedThreeLists,
		bulletThreeLists: sentenceCounts.bulletThreeLists,
		...analyzeSequentialPatterns(text),
		...analyzeExamplePatterns(text),
		...analyzeDescriptorPatterns(text),
		threeClauseSentences: sentenceCounts.threeClauseSentences,
		threePhraseSentences: sentenceCounts.threePhraseSentences,
		...analyzeTriadicMarkers(text),
		...analyzeMechanicalPhrases(text),
	};

	const detectedPatterns = [];
	let totalPatterns = 0;
	let weightedScore = 0;

	for (const [patternType, count] of Object.entries(patternCounts)) {
		if (count <= 0) {
			continue;
		}

		// Falls back to a neutral configuration when the counter name has no
		// entry in the table (see the NOTE on GERMAN_TRIADIC_PATTERNS).
		const config = GERMAN_TRIADIC_PATTERNS[/** @type {keyof typeof GERMAN_TRIADIC_PATTERNS} */ (patternType)] || {
			baseline: 1,
			weight: 1,
		};

		const frequency = (count / wordCount) * 1000; // Per thousand words
		const ratio = frequency / config.baseline; // Relative to the configured baseline
		const weightedRatio = Math.min(ratio * config.weight, 5); // Capped at 5 to limit outliers

		totalPatterns += count;
		weightedScore += weightedRatio * count;

		if (includeDetails) {
			detectedPatterns.push({
				pattern: patternType,
				count,
				frequency,
				humanBaseline: config.baseline,
				detectionWeight: config.weight,
				ratio,
				weightedRatio,
			});
		}
	}

	// Final metrics
	const triadicDensity = (totalPatterns / wordCount) * 1000;
	const overallScore = totalPatterns > 0 ? weightedScore / totalPatterns : 0;

	// Density part saturates at 14 patterns per thousand words
	const densityScore = Math.min(triadicDensity / 14, 1);
	// Pattern part saturates at sensitivityThreshold * 1.6
	const scoreThreshold = sensitivityThreshold * 1.6;
	const patternScore = Math.min(overallScore / scoreThreshold, 1);
	// Pattern score is weighted more heavily than density
	const aiLikelihood = densityScore * 0.35 + patternScore * 0.65;

	// Highest weighted ratio first when details are requested
	if (includeDetails) {
		detectedPatterns.sort((a, b) => b.weightedRatio - a.weightedRatio);
	}

	return {
		aiLikelihood,
		overallScore,
		triadicDensity,
		totalPatterns,
		wordCount,
		detectedPatterns: includeDetails ? detectedPatterns : [],
	};
}