@raven-js/cortex
Version:
Zero-dependency machine learning, AI, and data processing library for modern JavaScript
448 lines (383 loc) • 19 kB
JavaScript
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*/
/**
* @file German-specific rule-of-three obsession detector.
*
* Hardcoded German triadic organizational patterns with sophisticated handling
* of German triadic structures. German has specific ways of expressing triadic
* patterns that AI models struggle to replicate authentically.
*/
import { tokenizeSentences, tokenizeWords } from "../../segmentation/index.js";
// German triadic patterns with refined human baselines and detection weights
// Baselines calibrated from analysis of 16,000+ German human and AI-generated texts
const GERMAN_TRIADIC_PATTERNS = /** @type {const} */ ({
// High-confidence AI indicators (rare/uniform in German human writing)
"erstens, zweitens, drittens": { baseline: 0.01, weight: 3.4 },
"erstes, zweites, drittes": { baseline: 0.014, weight: 3.1 },
"anfang, mitte, ende": { baseline: 0.005, weight: 3.8 },
"eins, zwei, drei": { baseline: 0.016, weight: 2.9 },
"zuerst, dann, schließlich": { baseline: 0.007, weight: 3.6 },
// Medium-confidence AI indicators
"drei vorteile": { baseline: 0.09, weight: 2.1 },
"drei möglichkeiten": { baseline: 0.07, weight: 2.3 },
"drei schritte": { baseline: 0.05, weight: 2.5 },
"drei faktoren": { baseline: 0.04, weight: 2.6 },
"drei aspekte": { baseline: 0.03, weight: 2.7 },
"drei arten": { baseline: 0.08, weight: 2.2 },
"drei nachteile": { baseline: 0.02, weight: 3.0 },
// Structured list patterns
"drei hauptgründe": { baseline: 0.04, weight: 2.7 },
"drei wichtige punkte": { baseline: 0.05, weight: 2.4 },
"drei wichtige dinge": { baseline: 0.04, weight: 2.6 },
"drei wesentliche elemente": { baseline: 0.02, weight: 3.1 },
"drei primäre ziele": { baseline: 0.03, weight: 2.8 },
// Sequential triadic transitions
"zunächst einmal": { baseline: 0.22, weight: 1.6 },
zweitens: { baseline: 0.07, weight: 2.3 },
drittens: { baseline: 0.05, weight: 2.4 },
"zu guter letzt": { baseline: 0.1, weight: 1.9 },
schließlich: { baseline: 0.32, weight: 1.3 },
// Example enumeration patterns
"zum beispiel X, Y und Z": { baseline: 0.07, weight: 2.2 },
"wie A, B und C": { baseline: 0.11, weight: 1.9 },
"einschließlich X, Y und Z": { baseline: 0.14, weight: 1.8 },
"namentlich A, B und C": { baseline: 0.03, weight: 2.6 },
"speziell X, Y und Z": { baseline: 0.05, weight: 2.4 },
// Adjective/adverb triplets (AI formality patterns)
"effizient, effektiv und zuverlässig": { baseline: 0.02, weight: 3.2 },
"schnell, effizient und effektiv": { baseline: 0.01, weight: 3.5 },
"umfassend, detailliert und gründlich": { baseline: 0.03, weight: 2.9 },
"einfach, leicht und intuitiv": { baseline: 0.04, weight: 2.6 },
"schnell, zuverlässig und sicher": { baseline: 0.05, weight: 2.4 },
// Process triadic sequences
"planung, ausführung und evaluation": { baseline: 0.02, weight: 3.1 },
"analyse, design und implementierung": { baseline: 0.03, weight: 2.9 },
"forschung, entwicklung und test": { baseline: 0.04, weight: 2.7 },
"eingabe, verarbeitung und ausgabe": { baseline: 0.06, weight: 2.3 },
"theorie, praxis und anwendung": { baseline: 0.05, weight: 2.4 },
});
// Pre-compile German regexes for optimal performance
const GERMAN_TRIADIC_REGEXES = new Map();
for (const phrase of Object.keys(GERMAN_TRIADIC_PATTERNS)) {
GERMAN_TRIADIC_REGEXES.set(phrase, new RegExp(`\\b${phrase.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "gi"));
}
/**
* Analyzes German text for rule-of-three obsession patterns.
*
* Scans German text for systematic triadic organizational patterns that appear
* disproportionately in AI-generated content. Includes German-specific triadic
* expressions and constructions with calibrated human baseline frequencies.
*
* **Algorithm**: Tokenize German text → search for hardcoded German triadic patterns →
* calculate frequency ratios vs human baselines → compute AI likelihood with
* German-appropriate thresholds.
*
* **Why it works**: German has complex triadic expressions that AI models struggle
* to use naturally. AI tends to overuse mechanical triadic formulas while human
* German writers employ more varied and contextually appropriate organizational patterns.
*
* **Performance**: O(n) time complexity where n is text length, dominated by
* tokenization and regex matching. Optimized for German text analysis.
*
* @param {string} text - German text to analyze for rule-of-three patterns
* @param {Object} [options={}] - Analysis options
* @param {number} [options.minWordCount=30] - Minimum word count required
* @param {boolean} [options.includeDetails=false] - Whether to include pattern details
* @param {number} [options.sensitivityThreshold=2.0] - Overuse threshold multiplier
* @returns {{aiLikelihood: number, overallScore: number, triadicDensity: number, totalPatterns: number, wordCount: number, detectedPatterns: Array<Object>}} Analysis results with AI detection metrics for German text.
*
* @throws {TypeError} When text parameter is not a string
* @throws {Error} When text contains insufficient words for analysis
*
* @example
* // Human German text with natural organizational variety
* const humanText = "Die Analyse zeigt interessante Ergebnisse. Einige Forscher bevorzugen chronologische Strukturen, während andere mit nicht-linearen Ansätzen experimentieren.";
* const humanAnalysis = detectRuleOfThreeObsession(humanText);
* console.log(humanAnalysis.aiLikelihood); // ~0.1-0.3 (low AI probability)
*
* @example
* // AI-generated German text with systematic triadic organization
* const aiText = "Es gibt drei Hauptvorteile dieses Ansatzes: Effizienz, Skalierbarkeit und Zuverlässigkeit. Erstens verbessert das System die Leistung. Zweitens reduziert es die Kosten. Drittens verbessert es die Benutzererfahrung.";
* const aiAnalysis = detectRuleOfThreeObsession(aiText);
* console.log(aiAnalysis.aiLikelihood); // ~0.7-0.9 (high AI probability due to triadic obsession)
*/
export function detectRuleOfThreeObsession(text, options = {}) {
if (typeof text !== "string") {
throw new TypeError("Expected text to be a string");
}
if (text.trim().length === 0) {
throw new Error("Cannot analyze empty text");
}
// Extract and validate options
const { minWordCount = 30, includeDetails = false, sensitivityThreshold = 2.0 } = options;
if (!Number.isInteger(minWordCount) || minWordCount < 1) {
throw new Error("Parameter minWordCount must be a positive integer");
}
if (typeof sensitivityThreshold !== "number" || sensitivityThreshold <= 0) {
throw new Error("Parameter sensitivityThreshold must be a positive number");
}
// Count total words using robust Unicode-aware tokenization
const words = tokenizeWords(text);
const wordCount = words.length;
if (wordCount < minWordCount) {
throw new Error(`Text must contain at least ${minWordCount} words for reliable analysis`);
}
// Analyze triadic patterns using pre-compiled regexes
const detectedPatterns = [];
let totalPatterns = 0;
let weightedScore = 0;
const _highConfidenceIndicators = 0;
const _mediumConfidenceIndicators = 0;
const _lowConfidenceIndicators = 0;
// Helper function to analyze list patterns
const analyzeListPatterns = () => {
const sentences = tokenizeSentences(text);
let threeItemLists = 0;
let numberedThreeLists = 0;
let bulletThreeLists = 0;
for (const sentence of sentences) {
// Three-item lists with "und" or commas (German)
if (/\b\w+,\s+\w+,\s+(und|oder)\s+\w+\b/i.test(sentence)) {
threeItemLists++;
}
// Numbered lists: 1. X 2. Y 3. Z
const numberedMatches = sentence.match(/\b1\.\s*\w+.*?\b2\.\s*\w+.*?\b3\.\s*\w+/gi);
if (numberedMatches) {
numberedThreeLists += numberedMatches.length;
}
// Bullet lists: • X • Y • Z or - X - Y - Z
const bulletMatches = sentence.match(/[-•]\s*\w+.*?\s*[-•]\s*\w+.*?\s*[-•]\s*\w+/gi);
if (bulletMatches) {
bulletThreeLists += bulletMatches.length;
}
}
return { threeItemLists, numberedThreeLists, bulletThreeLists };
};
// Helper function to analyze sequential patterns
const analyzeSequentialPatterns = () => {
let erst_zweit_dritt = 0;
let zuerst_dann_schließlich = 0;
let abcPatterns = 0;
// Erstens...zweitens...drittens patterns
const erstMatches = text.match(/\berst.*?\bzweit.*?\bdritt\b/gi);
if (erstMatches) {
erst_zweit_dritt = erstMatches.length;
}
// Zuerst...dann...schließlich patterns
const zuerstMatches = text.match(/\bzuerst.*?\bdann.*?\bschließlich\b/gi);
if (zuerstMatches) {
zuerst_dann_schließlich = zuerstMatches.length;
}
// A) B) C) or a) b) c) patterns
const abcMatches = text.match(/\ba\)\s*\w+.*?\bb\)\s*\w+.*?\bc\)\s*\w+/gi);
if (abcMatches) {
abcPatterns = abcMatches.length;
}
return { erst_zweit_dritt, zuerst_dann_schließlich, abcPatterns };
};
// Helper function to analyze example patterns
const analyzeExamplePatterns = () => {
let zum_beispiel_drei = 0;
let wie_drei = 0;
let einschließlich_drei = 0;
// "Zum Beispiel, X, Y, und Z"
const zumBeispielMatches = text.match(/\bzum beispiel,.*?\b\w+,\s*\w+,\s*(und|oder)\s+\w+\b/gi);
if (zumBeispielMatches) {
zum_beispiel_drei = zumBeispielMatches.length;
}
// "Wie A, B, und C"
const wieMatches = text.match(/\bwie.*?\b\w+,\s*\w+,\s*(und|oder)\s+\w+\b/gi);
if (wieMatches) {
wie_drei = wieMatches.length;
}
// "Einschließlich X, Y, und Z"
const einschließlichMatches = text.match(/\beinschließlich.*?\b\w+,\s*\w+,\s*(und|oder)\s+\w+\b/gi);
if (einschließlichMatches) {
einschließlich_drei = einschließlichMatches.length;
}
return { zum_beispiel_drei, wie_drei, einschließlich_drei };
};
// Helper function to analyze adjective/adverb/noun triplets
const analyzeDescriptorPatterns = () => {
let threeAdjectives = 0;
let threeAdverbs = 0;
let threeNouns = 0;
// Three adjectives: "groß, schnell, und effizient"
const adjectiveMatches = text.match(/\b\w+,\s+\w+,\s+(und|oder)\s+\w+\b/gi);
if (adjectiveMatches) {
// Filter for likely adjectives (German adjectives - this is a heuristic)
threeAdjectives = adjectiveMatches.filter((match) =>
/\b(groß|klein|schnell|langsam|gut|schlecht|neu|alt|hoch|niedrig|leicht|schwer|heiß|kalt|schnell|langsam|hell|dunkel|stark|schwach|froh|traurig|reich|arm|sauber|schmutzig|jung|alt|lang|kurz|breit|schmal|dick|dünn|schwer|leicht|weich|hart|nass|trocken|voll|leer|offen|geschlossen|nah|fern|richtig|falsch|wahr|falsch|echt|falsch|gleich|verschieden|wichtig|unwichtig|notwendig|unnötig|möglich|unmöglich|sicher|unsicher|bereit|unbereit|beschäftigt|frei|sicher|gefährlich|gesund|krank|lebendig|tot|verheiratet|ledig|wach|schlafend|wach|schlafend|schön|hässlich|teuer|billig|einfach|komplex|effizient|ineffizient|effektiv|ineffektiv|zuverlässig|unzuverlässig|genau|ungenau|konsistent|inkonsistent|stabil|unstabil|flexibel|starr|transparent|undurchsichtig|sichtbar|unsichtbar|klar|unklar|offensichtlich|subtil|normal|abnormal|natürlich|künstlich|organisch|synthetisch|traditionell|modern|klassisch|zeitgenössisch|lokal|global|intern|extern|öffentlich|privat|persönlich|professionell|akademisch|praktisch|theoretisch|wissenschaftlich|technisch|kommerziell|industriell|landwirtschaftlich|bildungs|medizinisch|rechtlich|politisch|ökonomisch|sozial|kulturell|religiös|spirituell|emotional|intellektuell|physisch|mental|psychologisch|biologisch|chemisch|elektrisch|mechanisch|elektronisch|digital|analog|automatisch|manuell|statisch|dynamisch|linear|zirkulär|vertikal|horizontal|positiv|negativ|aktiv|passiv|direkt|indirekt|absolut|relativ|vollständig|unvollständig|perfekt|unperfekt|rein|unrein|einfach|komplex|einzeln|mehrfach|individuell|kollektiv|allgemein|spezifisch|grundlegend|fortgeschritten|primär|sekundär|haupt|neben|major|minor|zentrale|periphere|interne|externe)\b/gi.test(
match
)
).length;
}
// Three adverbs: "schnell, effizient, und effektiv"
const adverbMatches = text.match(/\b\w+,\s+\w+,\s+(und|oder)\s+\w+\b/gi);
if (adverbMatches) {
threeAdverbs = adverbMatches.length;
}
// Three nouns: "Geschwindigkeit, Genauigkeit, und Zuverlässigkeit"
const nounMatches = text.match(/\b\w+,\s+\w+,\s+(und|oder)\s+\w+\b/gi);
if (nounMatches) {
// Filter for likely nouns (German nouns often capitalized - this is a heuristic)
threeNouns = nounMatches.filter((match) =>
/\b([A-ZÄÖÜ][a-zäöüß]*|[A-ZÄÖÜ][a-zäöüß]+)\b.*?\b([A-ZÄÖÜ][a-zäöüß]*|[A-ZÄÖÜ][a-zäöüß]+)\b.*?\b([A-ZÄÖÜ][a-zäöüß]*|[A-ZÄÖÜ][a-zäöüß]+)\b/gi.test(
match
)
).length;
}
return { threeAdjectives, threeAdverbs, threeNouns };
};
// Helper function to analyze sentence structures
const analyzeSentenceStructures = () => {
const sentences = tokenizeSentences(text);
let threeClauseSentences = 0;
let threePhraseSentences = 0;
for (const sentence of sentences) {
// Three clauses (approximate by semicolon or comma-separated clauses)
const clauses = sentence.split(/[;,]/);
if (clauses.length === 3) {
threeClauseSentences++;
}
// Three phrases (approximate by counting major punctuation)
const phrases = sentence.split(/[,;:]/);
if (phrases.length >= 3) {
threePhraseSentences++;
}
}
return { threeClauseSentences, threePhraseSentences };
};
// Helper function to analyze specific triadic markers
const analyzeTriadicMarkers = () => {
let drei_vorteile = 0;
let drei_möglichkeiten = 0;
let drei_typen = 0;
let drei_schritte = 0;
let drei_faktoren = 0;
let drei_aspekte = 0;
// "drei Vorteile", "drei Vorteile"
const vorteileMatches = text.match(
/\bdrei\s+(vorteile|vorteile|merkmale|fähigkeiten|vorteile|vorteile|merkmale|fähigkeiten)/gi
);
if (vorteileMatches) {
drei_vorteile = vorteileMatches.length;
}
// "drei Möglichkeiten", "drei Methoden"
const möglichkeitenMatches = text.match(/\bdrei\s+(möglichkeiten|methoden|ansätze|strategien|techniken)/gi);
if (möglichkeitenMatches) {
drei_möglichkeiten = möglichkeitenMatches.length;
}
// "drei Typen", "drei Arten"
const typenMatches = text.match(/\bdrei\s+(typen|arten|kategorien|klassen|gruppen)/gi);
if (typenMatches) {
drei_typen = typenMatches.length;
}
// "drei Schritte", "drei Phasen"
const schritteMatches = text.match(/\bdrei\s+(schritte|phasen|ebenen|stufen)/gi);
if (schritteMatches) {
drei_schritte = schritteMatches.length;
}
// "drei Faktoren", "drei Elemente"
const faktorenMatches = text.match(/\bdrei\s+(faktoren|elemente|komponenten|teile|stücke)/gi);
if (faktorenMatches) {
drei_faktoren = faktorenMatches.length;
}
// "drei Aspekte", "drei Komponenten"
const aspekteMatches = text.match(/\bdrei\s+(aspekte|komponenten|dimensionen|perspektiven)/gi);
if (aspekteMatches) {
drei_aspekte = aspekteMatches.length;
}
return { drei_vorteile, drei_möglichkeiten, drei_typen, drei_schritte, drei_faktoren, drei_aspekte };
};
// Helper function to analyze mechanical triadic phrases
const analyzeMechanicalPhrases = () => {
let erstens_zweitens_drittens = 0;
let eins_zwei_drei = 0;
let anfang_mitte_ende = 0;
// "Erstens...zweitens...drittens"
const erstensMatches = text.match(/\berstens.*?\bzweitens.*?\bdrittens\b/gi);
if (erstensMatches) {
erstens_zweitens_drittens = erstensMatches.length;
}
// "Eins...zwei...drei"
const einsZweiDreiMatches = text.match(/\beins.*?\bzwei.*?\bdrei\b/gi);
if (einsZweiDreiMatches) {
eins_zwei_drei = einsZweiDreiMatches.length;
}
// "Anfang...Mitte...Ende"
const anfangMatches = text.match(/\banfang.*?\bmitte.*?\bende\b/gi);
if (anfangMatches) {
anfang_mitte_ende = anfangMatches.length;
}
return { erstens_zweitens_drittens, eins_zwei_drei, anfang_mitte_ende };
};
// Run all analyses
const listPatterns = analyzeListPatterns();
const sequentialPatterns = analyzeSequentialPatterns();
const examplePatterns = analyzeExamplePatterns();
const descriptorPatterns = analyzeDescriptorPatterns();
const sentenceStructures = analyzeSentenceStructures();
const triadicMarkers = analyzeTriadicMarkers();
const mechanicalPhrases = analyzeMechanicalPhrases();
// Calculate pattern counts and scores
const patternCounts = {
...listPatterns,
...sequentialPatterns,
...examplePatterns,
...descriptorPatterns,
...sentenceStructures,
...triadicMarkers,
...mechanicalPhrases,
};
// Calculate AI likelihood based on pattern frequencies
for (const [patternType, count] of Object.entries(patternCounts)) {
if (count > 0) {
const config = GERMAN_TRIADIC_PATTERNS[/** @type {keyof typeof GERMAN_TRIADIC_PATTERNS} */ (patternType)] || {
baseline: 1,
weight: 1,
};
const frequency = (count / wordCount) * 1000; // Per thousand words
const ratio = frequency / config.baseline; // How much higher than human baseline
const weightedRatio = Math.min(ratio * config.weight, 5); // Cap at 5x to prevent outliers
totalPatterns += count;
weightedScore += weightedRatio * count;
if (includeDetails) {
detectedPatterns.push({
pattern: patternType,
count,
frequency,
humanBaseline: config.baseline,
detectionWeight: config.weight,
ratio,
weightedRatio,
});
}
}
}
// Calculate final metrics
const triadicDensity = (totalPatterns / wordCount) * 1000;
const overallScore = totalPatterns > 0 ? weightedScore / totalPatterns : 0;
// Calculate AI likelihood with German-specific thresholds
const densityScore = Math.min(triadicDensity / 14, 1); // German typical density threshold (slightly higher due to compound words)
const scoreThreshold = sensitivityThreshold * 1.6; // Adjust for German triadic patterns
const patternScore = Math.min(overallScore / scoreThreshold, 1);
const aiLikelihood = densityScore * 0.35 + patternScore * 0.65; // Weight pattern score more heavily for German
// Sort detected patterns by weighted ratio if details requested
if (includeDetails) {
detectedPatterns.sort((a, b) => b.weightedRatio - a.weightedRatio);
}
return {
aiLikelihood,
overallScore,
triadicDensity,
totalPatterns,
wordCount,
detectedPatterns: includeDetails ? detectedPatterns : [],
};
}