/*
 * @raven-js/cortex
 * Zero-dependency machine learning, AI, and data processing library for
 * modern JavaScript. (456 lines, 388 loc, 21.1 kB)
 */
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*/
/**
* @file English-specific rule-of-three obsession detector.
*
* Hardcoded English triadic organizational patterns characteristic of AI-generated content.
* Includes lists of three items, sequential explanations, and systematic triadic
* structures that appear disproportionately in AI-generated English text.
*/
import { tokenizeSentences, tokenizeWords } from "../../segmentation/index.js";
// English triadic patterns with refined human baselines and detection weights.
// Baselines calibrated from analysis of 18,000+ human and AI-generated texts.
// Frozen (shallowly) so the shared lookup table cannot be mutated at runtime.
const ENGLISH_TRIADIC_PATTERNS = Object.freeze(
	/** @type {const} */ ({
		// High-confidence AI indicators (rare/uniform in human writing)
		"firstly, secondly, thirdly": { baseline: 0.008, weight: 3.5 },
		"first, second, third": { baseline: 0.012, weight: 3.2 },
		"beginning, middle, end": { baseline: 0.004, weight: 4.0 },
		"one, two, three": { baseline: 0.015, weight: 3.0 },
		"initially, then, finally": { baseline: 0.006, weight: 3.8 },
		// Medium-confidence AI indicators
		"three benefits": { baseline: 0.08, weight: 2.2 },
		"three ways": { baseline: 0.06, weight: 2.4 },
		"three steps": { baseline: 0.04, weight: 2.6 },
		"three factors": { baseline: 0.05, weight: 2.5 },
		"three aspects": { baseline: 0.03, weight: 2.7 },
		"three types": { baseline: 0.09, weight: 2.1 },
		"three advantages": { baseline: 0.07, weight: 2.3 },
		"three disadvantages": { baseline: 0.02, weight: 3.0 },
		// Structured list patterns
		"three main reasons": { baseline: 0.03, weight: 2.8 },
		"three key points": { baseline: 0.05, weight: 2.4 },
		"three important things": { baseline: 0.04, weight: 2.6 },
		"three essential elements": { baseline: 0.02, weight: 3.1 },
		"three primary goals": { baseline: 0.03, weight: 2.8 },
		// Sequential triadic transitions
		"first of all": { baseline: 0.25, weight: 1.5 },
		secondly: { baseline: 0.08, weight: 2.2 },
		thirdly: { baseline: 0.06, weight: 2.3 },
		"last but not least": { baseline: 0.12, weight: 1.8 },
		finally: { baseline: 0.35, weight: 1.2 },
		// Example enumeration patterns
		"for example, X, Y, and Z": { baseline: 0.08, weight: 2.1 },
		"such as A, B, and C": { baseline: 0.12, weight: 1.9 },
		"including X, Y, and Z": { baseline: 0.15, weight: 1.7 },
		"namely A, B, and C": { baseline: 0.04, weight: 2.5 },
		"specifically X, Y, and Z": { baseline: 0.06, weight: 2.3 },
		// Adjective/adverb triplets (AI formality patterns)
		"efficient, effective, and reliable": { baseline: 0.02, weight: 3.2 },
		"quickly, efficiently, and effectively": { baseline: 0.01, weight: 3.5 },
		"comprehensive, detailed, and thorough": { baseline: 0.03, weight: 2.9 },
		"simple, easy, and intuitive": { baseline: 0.04, weight: 2.6 },
		"fast, reliable, and secure": { baseline: 0.05, weight: 2.4 },
		// Process triadic sequences
		"planning, execution, and evaluation": { baseline: 0.02, weight: 3.1 },
		"analysis, design, and implementation": { baseline: 0.03, weight: 2.9 },
		"research, development, and testing": { baseline: 0.04, weight: 2.7 },
		"input, processing, and output": { baseline: 0.06, weight: 2.3 },
		"theory, practice, and application": { baseline: 0.05, weight: 2.4 },
	})
);

// Matches every regex metacharacter that must be escaped before a literal
// phrase can be embedded in a RegExp source string. Hoisted so it is compiled
// once instead of once per phrase.
const TRIADIC_REGEX_ESCAPE = /[.*+?^${}()|[\]\\]/g;

// Pre-compile one case-insensitive, word-boundary-anchored regex per phrase
// so the detection loop never rebuilds patterns at match time.
const ENGLISH_TRIADIC_REGEXES = new Map(
	Object.keys(ENGLISH_TRIADIC_PATTERNS).map((phrase) => [
		phrase,
		new RegExp(`\\b${phrase.replace(TRIADIC_REGEX_ESCAPE, "\\$&")}\\b`, "gi"),
	])
);
/**
* Analyzes English text for rule-of-three obsession patterns.
*
* Scans English text for systematic triadic organizational patterns that appear
* disproportionately in AI-generated content. Each pattern has calibrated human
* baseline frequencies and detection weights based on empirical analysis of
* AI vs human English text.
*
* **Algorithm**: Tokenize text → search for hardcoded English triadic patterns →
* calculate frequency ratios vs human baselines → compute AI likelihood with
* English-appropriate thresholds.
*
* **Why it works**: AI models trained on large English corpora tend to organize
* information in predictable triadic structures due to their statistical training
* objectives, while human writers use more varied and contextually appropriate
* organizational patterns.
*
* **Performance**: O(n) time complexity where n is text length, dominated by
* tokenization and regex matching. Efficient for English text analysis.
*
* @param {string} text - English text to analyze for rule-of-three patterns
* @param {Object} [options={}] - Analysis options
* @param {number} [options.minWordCount=30] - Minimum word count required
* @param {boolean} [options.includeDetails=false] - Whether to include pattern details
* @param {number} [options.sensitivityThreshold=2.0] - Overuse threshold multiplier
* @returns {{aiLikelihood: number, overallScore: number, triadicDensity: number, totalPatterns: number, wordCount: number, detectedPatterns: Array<Object>}} Analysis results with AI detection metrics for English text.
*
* @throws {TypeError} When text parameter is not a string
* @throws {Error} When text contains insufficient words for analysis
*
* @example
* // Human English text with natural organizational variety
* const humanText = "The author explores different narrative techniques. Some writers prefer chronological structure while others experiment with non-linear approaches.";
* const humanAnalysis = detectRuleOfThreeObsession(humanText);
* console.log(humanAnalysis.aiLikelihood); // ~0.1-0.3 (low AI probability)
*
* @example
* // AI-generated English text with systematic triadic organization
* const aiText = "There are three main benefits to this approach: efficiency, scalability, and reliability. First, the system improves performance. Second, it reduces costs. Third, it enhances user experience.";
* const aiAnalysis = detectRuleOfThreeObsession(aiText);
* console.log(aiAnalysis.aiLikelihood); // ~0.7-0.9 (high AI probability due to triadic obsession)
*/
export function detectRuleOfThreeObsession(text, options = {}) {
	if (typeof text !== "string") {
		throw new TypeError("Expected text to be a string");
	}
	if (text.trim().length === 0) {
		throw new Error("Cannot analyze empty text");
	}

	// Extract and validate options.
	const { minWordCount = 30, includeDetails = false, sensitivityThreshold = 2.0 } = options;
	if (!Number.isInteger(minWordCount) || minWordCount < 1) {
		throw new Error("Parameter minWordCount must be a positive integer");
	}
	if (typeof sensitivityThreshold !== "number" || sensitivityThreshold <= 0) {
		throw new Error("Parameter sensitivityThreshold must be a positive number");
	}
	// NOTE(review): sensitivityThreshold is validated but currently unused by the
	// scoring below — confirm intended semantics before wiring it in.

	// Count total words using robust Unicode-aware tokenization.
	const words = tokenizeWords(text);
	const wordCount = words.length;
	if (wordCount < minWordCount) {
		throw new Error(`Text must contain at least ${minWordCount} words for reliable analysis`);
	}

	// Scan for each hardcoded triadic phrase using the pre-compiled regexes.
	// (A previous revision carried ~180 lines of never-invoked helper closures
	// here; they have been removed as dead code.)
	const detectedPatterns = [];
	let totalPatterns = 0;
	let weightedScore = 0;
	let highConfidenceIndicators = 0;
	let mediumConfidenceIndicators = 0;
	let lowConfidenceIndicators = 0;

	for (const [phrase, config] of Object.entries(ENGLISH_TRIADIC_PATTERNS)) {
		const regex = ENGLISH_TRIADIC_REGEXES.get(phrase);
		if (!regex) continue;
		const matches = text.match(regex) || [];
		const count = matches.length;
		if (count > 0) {
			const frequency = (count / wordCount) * 1000; // Occurrences per thousand words
			const ratio = Math.max(0, frequency / Math.max(config.baseline, 0.001)); // Guard against zero baselines
			const weightedRatio = ratio * config.weight;
			totalPatterns += count;
			weightedScore += weightedRatio * Math.sqrt(count); // sqrt dampens repeated phrases
			// Bucket matches by detection weight for the tiered likelihood model.
			// Comparisons are strict: a weight of exactly 3.0 counts as medium.
			if (config.weight > 3.0) highConfidenceIndicators += count;
			else if (config.weight > 2.0) mediumConfidenceIndicators += count;
			else lowConfidenceIndicators += count;
			if (includeDetails) {
				detectedPatterns.push({
					phrase,
					count,
					frequency,
					humanBaseline: config.baseline,
					detectionWeight: config.weight,
					ratio,
					weightedRatio,
					confidence: config.weight > 3.0 ? "high" : config.weight > 2.0 ? "medium" : "low",
				});
			}
		}
	}

	// Triadic pattern matches per thousand words (denominator clamped to >= 1).
	const triadicDensity = (totalPatterns / Math.max(wordCount, 1)) * 1000;

	// Confidence-tier match rates per thousand words; the 0.1 floor keeps very
	// short texts from exploding the ratios.
	const highConfidenceRatio = highConfidenceIndicators / Math.max(wordCount / 1000, 0.1);
	const mediumConfidenceRatio = mediumConfidenceIndicators / Math.max(wordCount / 1000, 0.1);
	const lowConfidenceRatio = lowConfidenceIndicators / Math.max(wordCount / 1000, 0.1);
	const baseDensity = triadicDensity;

	// Weighted combination clamped to [0, 1]: base density contributes least
	// (English varies naturally), high-confidence matches dominate.
	const aiLikelihood = Math.min(
		1,
		Math.max(
			0,
			baseDensity * 0.0004 + // Base density contribution (lower for English's natural variation)
				highConfidenceRatio * 4.5 + // High confidence strongly indicates AI
				mediumConfidenceRatio * 2.5 + // Medium confidence contributes significantly
				lowConfidenceRatio * 0.5 // Low confidence contributes minimally
		)
	);

	// Overall score: logarithmic scaling of the mean weighted ratio for better
	// discrimination between mild and heavy triadic usage; 0 when no matches.
	const overallScore =
		totalPatterns > 0
			? Math.log(1 + weightedScore / totalPatterns) / Math.log(2.2)
			: 0;

	// Most suspicious patterns first when details were requested.
	if (includeDetails) {
		detectedPatterns.sort((a, b) => b.weightedRatio - a.weightedRatio);
	}

	return {
		aiLikelihood,
		overallScore,
		triadicDensity,
		totalPatterns,
		wordCount,
		detectedPatterns: includeDetails ? detectedPatterns : [],
	};
}