/*
 * @gaiaverse/semantic-turning-point-detector
 * Version: (not specified)
 * Detects key semantic turning points in conversations using recursive semantic distance analysis. Ideal for conversation analysis, dialogue segmentation, insight detection, and AI-assisted reasoning tasks.
 * 1,204 lines (1,038 loc) • 81.9 kB
 * text/typescript
 */
// file: semanticTurningPointDetector.ts
import fs from 'fs-extra';
import winston from 'winston';
// Logging setup (winston).
// Create the 'results' directory up front; the File transport below writes into it.
fs.ensureDirSync('results'); // Ensure the results directory exists
// Module-wide logger at level 'info' with two transports: console and a log file.
const logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.json()
),
transports: [
// Console transport: colorized "<timestamp> <level>: <message>" lines.
// The printf template only uses `timestamp`, `level` and `message`, so any extra
// arguments or metadata passed to a log call are not shown on the console.
new winston.transports.Console({
format: winston.format.combine(
winston.format.colorize(),
winston.format.timestamp(
{ format: 'YYYY-MM-DD HH:mm:ss' }
),
winston.format.printf(({ timestamp, level, message }) => {
return `${timestamp} ${level}: ${message}`;
})
)
}),
// File transport: JSON entries written to results/semanticTurningPointDetector.log.
new winston.transports.File({
filename: 'results/semanticTurningPointDetector.log',
format: winston.format.json()
})
]
});
/*****************************************************************************************
* SEMANTIC TURNING POINT DETECTOR
*
* A TypeScript implementation of the Adaptive Recursive Convergence (ARC) with
* Cascading Re-Dimensional Attention (CRA) framework for conversation analysis.
*
* This detector identifies semantic "Turning Points" in conversations as a concrete
* application of the ARC/CRA theoretical framework for multi-step reasoning
* and dynamic dimensional expansion.
*
* Framework implementation:
* 1. Analyze semantic relationships between messages using embeddings (dimension n)
* 2. Calculate semantic distances that correspond to the contraction mapping
* 3. Apply the complexity function χ to determine dimensional saturation
* 4. Use the transition operator Ψ to determine whether to stay in dimension n or escalate
* 5. Employ meta-messages and recursive analysis for dimensional expansion (n → n+1)
* 6. Merge and prune results to demonstrate formal convergence
*****************************************************************************************/
import async from 'async';
import { OpenAI } from 'openai';
import { LRUCache } from 'lru-cache';
import crypto from 'crypto';
import { countTokens } from './tokensUtil';
import { conversation } from './conversation';
import { ResponseFormatJSONSchema } from 'openai/resources/shared';
import { MetaMessage, type Message } from './Message';
import { returnFormattedMessageContent } from './stripContent';
import { formResponseFormatSchema, formSystemMessage, formSystemPromptEnding, formUserMessage } from './prompt';
// Cache for token counts to avoid recalculating - implements atomic memory concept.
// String keys map to a numeric token count; the cache holds at most 10,000 entries
// and each entry expires after 24 hours (ttl is in milliseconds).
// NOTE(review): what the key is derived from (raw text vs. a hash) is decided by the caller - confirm there.
const tokenCountCache = new LRUCache<string, number>({
max: 10000,
ttl: 1000 * 60 * 60 * 24
});
// -----------------------------------------------------------------------------
// Embedding Generation
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------
// Core Interfaces
// -----------------------------------------------------------------------------
/**
 * Message span identifies a range of messages.
 * Used for tracking dimensional representations across recursion levels.
 */
interface MessageSpan {
/** Start message ID */
startId: string;
/** End message ID */
endId: string;
/** Start index in the original message array */
startIndex: number;
/** End index in the original message array */
endIndex: number;
/**
 * Set when the span was produced from meta-messages (dimension > 0): holds the ids of the
 * two meta-messages that were compared, plus the positional indices supplied by the caller,
 * while the outer fields refer to the original conversation messages.
 */
originalSpan?: MessageSpan;
}
/**
 * Represents a semantic turning point in a conversation.
 * This corresponds to a significant semantic shift detected by the system.
 */
export interface TurningPoint {
/** Unique identifier for this turning point (built as `tp-<dimension>-<startIndex>-<endIndex>`) */
id: string;
/** Human-readable short description of what this turning point represents */
label: string;
/** The type of semantic shift this turning point represents */
category: TurningPointCategory;
/** The span of messages covered by this turning point */
span: MessageSpan;
/** Legacy span format no longer utilized due to new class instantiations for MetaMessages */
deprecatedSpan?: {
startIndex: number;
endIndex: number;
startMessageId: string;
endMessageId: string;
};
/** The semantic distance/shift that triggered this turning point */
semanticShiftMagnitude: number;
/** Key terms that characterize this turning point */
keywords?: string[];
/** Notable quotes from the messages in this turning point's span */
quotes?: string[];
/** The emotionality of this turning point if applicable */
emotionalTone?: string;
/**
 * The dimension at which this turning point was detected.
 * If detectionLevel > 0, it indicates that this turning point was analyzed based on a span of turning points, rather than a span of messages.
 */
detectionLevel: number;
/** Significance score (higher = more significant); the classifier output is clamped to [0, 1] */
significance: number;
// NOTE(review): a doc comment for "an assessed best point representing the turning point"
// used to sit here, but no such property is declared on this interface - confirm whether a field is missing.
/** The complexity score (1-5) representing saturation in current dimension */
complexityScore: number;
/**
 * A tentative label assigned by the LLM. The classifier output is validated to one of
 * 'positive', 'negative' or 'neutral' (anything else becomes 'neutral').
 * This label is not definitive and may be improved using a zero-shot model,
 * based on the classification provided by the LLM.
 */
sentiment?: string;
}
/**
 * Categories of turning points.
 *
 * NOTE(review): the classifier casts any string returned by the LLM to this type without
 * checking membership, so values outside this union can occur at runtime.
 */
export type TurningPointCategory =
| 'Topic' // A shift to a new subject
| 'Insight' // A realization or discovery
| 'Emotion' // An emotional shift or response
| 'Meta-Reflection' // Thinking about the conversation itself
| 'Decision' // A choice or commitment being made
| 'Question' // A significant question being posed
| 'Problem' // Identification of an issue or challenge
| 'Action' // A commitment to do something
| 'Clarification' // Clearing up a misunderstanding
| 'Objection' // Disagreement or pushback
| 'Other'; // Any other type of shift
/**
 * Configuration options for the turning point detector.
 * The detector's constructor accepts a partial config; the "Default" values noted below
 * are the fallbacks it applies.
 */
export interface TurningPointDetectorConfig {
/** OpenAI API key. Default: the OPENAI_API_KEY environment variable, else an empty string. */
apiKey: string;
/** Model for turning point classification. Default: 'gpt-4o-mini'. */
classificationModel: string;
/** Model for generating embeddings, e.g. 'text-embedding-3-small', or a custom model served by a configurable OpenAI-compatible v1/embeddings endpoint. Default: 'text-embedding-3-small'. */
embeddingModel: string;
/** Settable OpenAI-compatible embedding endpoint */
embeddingEndpoint?: string;
/** Semantic shift threshold for detecting potential turning points. Default: 0.22. */
semanticShiftThreshold: number;
/** Minimum tokens per chunk when processing conversation. Default: 250. */
minTokensPerChunk: number;
/** Maximum tokens per chunk. Default: 2000. */
maxTokensPerChunk: number;
/** Maximum recursive depth (dimensional expansion limit). Default: 3. */
maxRecursionDepth: number;
/** Whether to filter by significance. Default: true. */
onlySignificantTurningPoints: boolean;
/** Significance threshold for filtering. Default: 0.5. */
significanceThreshold: number;
/** Minimum messages per chunk. Default: 3. */
minMessagesPerChunk: number;
/** Maximum turning points in final results. Default: 5. */
maxTurningPoints: number;
/** Enable verbose logging. Default: false. */
debug: boolean;
/**
 * Custom OpenAI API endpoint (optional); used as the client's base URL.
 * When set, chunks are processed one at a time instead of five at a time, and the
 * vendor-specific `repeat_penalty` / `top_k` parameters are sent with classification requests.
 */
endpoint?: string;
/** Complexity saturation threshold (dimension escalation trigger). Default: 4.5. */
complexitySaturationThreshold: number;
/** Enable convergence measurement across iterations. Default: true. */
measureConvergence: boolean;
/**
 * Custom JSON-schema response format for the classification request.
 * NOTE(review): the classification call visible in this file passes
 * `formResponseFormatSchema(dimension)` as `response_format`; confirm where this override is applied.
 */
customResponseFormatJsonSchema?: ResponseFormatJSONSchema;
/**
 * Inject a custom system instruction into the LLM prompt; not recommended unless you know what you are doing.
 * - When set, it is used in place of the built-in system prompt, ahead of the contextual aid text.
 * - It is then repeated after the contextual aid text, in place of the built-in system prompt ending.
 * - This enforces and reminds the LLM of the task, which may become blurred by the contextual aid text.
 */
customSystemInstruction?: string;
/**
 * Inject a custom user instruction into the LLM prompt, in which the analysis content is added as context.
 * - Not recommended unless you know what you are doing.
 * - When set, it is placed both before and after the generated user message.
 */
customUserInstruction?: string;
/**
 * The maximum number of characters to use when adding a message content as context to analyze.
 * NOTE(review): presumably consumed by the prompt builders that receive this config - confirm.
 */
max_character_length?: number;
/**
 * This option determines whether to fail and halt the process if an analysis
 * encounters an error during a potential turning point.
 *
 * - Instead of returning a placeholder for an empty analysis that will be ignored,
 * the process will stop on failure.
 * - This option is provided because, although an analysis may fail,
 * it is important to consider that the analysis spans multiple message intervals.
 * A single failure in one interval is treated the same as an analysis
 * that does not indicate a significant turning point.
 * - This is useful while debugging: one can discover the appropriate settings for the LLM request
 * and, once discovered, set this to true or incorporate a retry mechanism.
 */
throwOnError?: boolean;
}
/**
 * Chunking result with message segments and metrics.
 * NOTE(review): presumably the value resolved by `chunkConversation`, of which the
 * detector reads `chunks` - confirm against that method.
 */
interface ChunkingResult {
/** Array of message chunks (each chunk is an array of messages) */
chunks: Message[][];
/** Total number of chunks created */
numChunks: number;
/** Average tokens per chunk */
avgTokensPerChunk: number;
}
/**
 * Embedding with associated message data.
 */
interface MessageEmbedding {
/** The message ID */
id: string;
/**
 * The message index in original array.
 * NOTE(review): chunk detection uses this value to index both the embeddings array and the
 * chunk's own message array, so it is presumably chunk-local - confirm against the producer.
 */
index: number;
/** The embedding vector */
embedding: Float32Array;
}
/**
 * Tracks state changes across iteration for convergence measurement.
 * One entry is recorded per dimension when `measureConvergence` is enabled.
 */
interface ConvergenceState {
/** Previous state turning points (empty when processing finalizes without escalation) */
previousTurningPoints: TurningPoint[];
/** Current state turning points */
currentTurningPoints: TurningPoint[];
/** Current dimension */
dimension: number;
/** Convergence measure between states in [0, 1] (lower = more converged) */
distanceMeasure: number;
/** Whether the state has converged */
hasConverged: boolean;
/** Whether dimension escalation occurred */
didEscalate: boolean;
}
// -----------------------------------------------------------------------------
// Main Detector Class
// -----------------------------------------------------------------------------
/**
 * Detects semantic turning points in a conversation.
 * Call `detectTurningPoints` with the conversation's messages to run an analysis.
 */
export class SemanticTurningPointDetector {
/** Effective configuration (caller overrides merged with defaults in the constructor) */
private config: TurningPointDetectorConfig;
/** OpenAI client built from the configured API key and optional endpoint */
private openai: OpenAI;
/** Copy of the messages passed to the most recent `detectTurningPoints` call */
private originalMessages: Message[] = [];
/** Convergence states recorded during the most recent run; reset at the start of each run */
private convergenceHistory: ConvergenceState[] = [];
/**
 * Creates a new instance of the semantic turning point detector.
 *
 * Every option is optional; anything omitted falls back to the defaults below.
 *
 * @param config Partial configuration. Notes on how values are resolved:
 *  - String options (`apiKey`, model names) treat an empty string as "not provided".
 *  - Thresholds, `maxRecursionDepth` and booleans use `??`, so an explicit `0` / `false`
 *    supplied by the caller is respected instead of being replaced by the default.
 *  - Chunk sizes and `maxTurningPoints` keep `||`, so `0` still falls back to the default
 *    (a zero-sized chunk or zero results is not a usable setting).
 *  - The optional prompt / error-handling options (`customResponseFormatJsonSchema`,
 *    `customSystemInstruction`, `customUserInstruction`, `max_character_length`,
 *    `throwOnError`) are copied through as given; previously they were dropped, so
 *    code reading them from `this.config` never saw the caller's values.
 */
constructor(config: Partial<TurningPointDetectorConfig> = {}) {
    this.config = {
        apiKey: config.apiKey || process.env.OPENAI_API_KEY || '',
        classificationModel: config.classificationModel || 'gpt-4o-mini',
        embeddingModel: config.embeddingModel || 'text-embedding-3-small',
        embeddingEndpoint: config.embeddingEndpoint,
        semanticShiftThreshold: config.semanticShiftThreshold ?? 0.22,
        minTokensPerChunk: config.minTokensPerChunk || 250,
        maxTokensPerChunk: config.maxTokensPerChunk || 2000,
        maxRecursionDepth: config.maxRecursionDepth ?? 3,
        onlySignificantTurningPoints: config.onlySignificantTurningPoints ?? true,
        significanceThreshold: config.significanceThreshold ?? 0.5,
        minMessagesPerChunk: config.minMessagesPerChunk || 3,
        maxTurningPoints: config.maxTurningPoints || 5,
        debug: config.debug ?? false,
        endpoint: config.endpoint,
        complexitySaturationThreshold: config.complexitySaturationThreshold ?? 4.5,
        measureConvergence: config.measureConvergence ?? true,
        // Optional settings: forwarded unchanged (undefined when not supplied).
        customResponseFormatJsonSchema: config.customResponseFormatJsonSchema,
        customSystemInstruction: config.customSystemInstruction,
        customUserInstruction: config.customUserInstruction,
        max_character_length: config.max_character_length,
        throwOnError: config.throwOnError
    };

    // Initialize OpenAI client; a custom endpoint (if any) becomes the base URL.
    this.openai = new OpenAI({
        apiKey: this.config.apiKey,
        baseURL: this.config.endpoint
    });

    if (this.config.debug) {
        // Never log the real API key.
        logger.info('[TurningPointDetector] Initialized with config:', {
            ...this.config,
            apiKey: '[REDACTED]'
        });
    }
}
/**
 * Main entry point: detect turning points in a conversation.
 * Implements the full ARC/CRA framework by starting the multi-layer analysis at dimension 0.
 *
 * Side effects: resets the convergence history and replaces the stored copy of the
 * original messages with a shallow copy of `messages`.
 *
 * @param messages The conversation messages to analyze, in order.
 * @returns The turning points produced by the multi-layer detection.
 */
public async detectTurningPoints(messages: Message[]): Promise<TurningPoint[]> {
    // Build a single message string: the logger does not interpolate extra positional
    // arguments, so passing the count separately left it out of the log line.
    logger.info(`Starting turning point detection using ARC/CRA framework for conversation with ${messages.length} messages`);
    this.convergenceHistory = [];

    const totalTokens = await this.getMessageArrayTokenCount(messages);
    logger.info(`Total conversation tokens: ${totalTokens}`);

    // Keep a fresh shallow copy for reference, in case `messages` is mutated elsewhere.
    this.originalMessages = messages.map(m => ({ ...m }));

    // Begin dimensional analysis at level 0
    return this.multiLayerDetection(messages, 0);
}
/**
 * Multi-layer detection implementing the ARC/CRA dimensional processing.
 * This is the primary implementation of the transition operator Ψ.
 *
 * Steps, in order:
 *  1. At or beyond `maxRecursionDepth`, analyze `messages` as one chunk and return the result.
 *  2. Otherwise collect local turning points: small dimension-0 conversations are analyzed
 *     directly with a temporarily raised shift threshold; everything else is chunked and the
 *     chunks are analyzed with bounded concurrency.
 *  3. Zero or one turning point (before or after merging) is returned, filtered by
 *     significance only when `onlySignificantTurningPoints` is set.
 *  4. Otherwise the maximum complexity score decides whether to escalate: when it is below
 *     the saturation threshold, too few points remain, or the depth limit is next, the merged
 *     points are filtered and returned.
 *  5. On escalation the merged points become meta-messages, this method recurses at
 *     `dimension + 1`, and the two levels are combined.
 *
 * When `measureConvergence` is enabled a `ConvergenceState` is appended to the history at
 * each finalize/escalate decision.
 *
 * @param messages Messages (dimension 0) or meta-messages (higher dimensions) to analyze.
 * @param dimension Current dimension n, starting at 0.
 * @returns Turning points for this dimension, merged with higher-dimension results if escalated.
 */
private async multiLayerDetection(
    messages: Message[],
    dimension: number
): Promise<TurningPoint[]> {
    logger.info(`Starting dimensional analysis at n=${dimension}`);

    // Check recursion depth - hard limit on dimensional expansion
    if (dimension >= this.config.maxRecursionDepth) {
        logger.info(`Maximum dimension (n=${dimension}) reached, processing directly without further expansion`);
        return await this.detectTurningPointsInChunk(messages, dimension, 0, this.originalMessages);
    }

    let localTurningPoints: TurningPoint[] = [];

    if (messages.length < this.config.minMessagesPerChunk * 2 && dimension === 0) {
        logger.info(`Dimension ${dimension}: Small conversation (${messages.length} msgs), processing directly`);
        // Use a slightly higher threshold for small conversations. The override lives on the
        // shared config, so restore it in `finally`: otherwise a failed analysis would leave
        // the raised threshold in place for every later call on this instance.
        const originalThreshold = this.config.semanticShiftThreshold;
        this.config.semanticShiftThreshold = Math.max(0.3, originalThreshold * 1.1);
        try {
            localTurningPoints = await this.detectTurningPointsInChunk(messages, dimension, 0, this.originalMessages);
        } finally {
            this.config.semanticShiftThreshold = originalThreshold;
        }
    } else {
        // Chunk the conversation
        const { chunks } = await this.chunkConversation(messages, dimension);
        logger.info(`Dimension ${dimension}: Split into ${chunks.length} chunks`);
        if (chunks.length === 0) {
            logger.info(`Dimension ${dimension}: No valid chunks created, returning empty.`);
            return [];
        }

        // Results are stored by chunk position so the flattened output keeps chunk order
        // even though chunks may finish out of order.
        const chunkTurningPoints: TurningPoint[][] = new Array(chunks.length);
        // -1 marks "not processed yet".
        const durationsSeconds: number[] = new Array(chunks.length).fill(-1);
        // A custom endpoint is processed one chunk at a time; otherwise five at a time.
        const limit = this.config.endpoint ? 1 : 5;

        await async.eachOfLimit(
            chunks,
            limit,
            async (chunk, indexStr) => {
                const index = Number(indexStr);
                const startTime = Date.now();
                // Progress is logged for every 10th chunk, or for every chunk when running
                // sequentially or in debug mode.
                const shouldLogProgress = index % 10 === 0 || limit === 1 || this.config.debug;

                if (shouldLogProgress) {
                    logger.info(` - Dimension ${dimension}: Processing chunk ${index + 1}/${chunks.length} (${chunk.length} messages)`);
                }

                chunkTurningPoints[index] = await this.detectTurningPointsInChunk(chunk, dimension, index, this.originalMessages);

                const durationSecs = (Date.now() - startTime) / 1000;
                durationsSeconds[index] = durationSecs;

                if (shouldLogProgress) {
                    // `>= 0` so a chunk that finished within the same millisecond still counts
                    // as processed (unprocessed entries are -1).
                    const finishedDurations = durationsSeconds.filter(d => d >= 0);
                    const processedCount = finishedDurations.length;
                    if (processedCount > 0) {
                        const averageDuration = finishedDurations.reduce((a, b) => a + b, 0) / processedCount;
                        const remainingChunks = durationsSeconds.length - processedCount;
                        const remainingTime = (averageDuration * remainingChunks).toFixed(1);
                        const percentageComplete = (processedCount / durationsSeconds.length * 100);
                        logger.info(` - Chunk ${index + 1} processed in ${durationSecs.toFixed(1)}s. Est. remaining: ${remainingTime}s (${percentageComplete.toFixed(1)}% complete)`);
                    } else {
                        logger.info(` - Chunk ${index + 1} processed in ${durationSecs.toFixed(1)}s.`);
                    }
                }
            }
        );

        // Flatten all turning points from all chunks
        localTurningPoints = chunkTurningPoints.flat();
    }

    logger.info(`Dimension ${dimension}: Found ${localTurningPoints.length} raw turning points`);

    // Zero or one turning point: nothing to merge or escalate.
    if (localTurningPoints.length <= 1) {
        return this.config.onlySignificantTurningPoints
            ? this.filterSignificantTurningPoints(localTurningPoints)
            : localTurningPoints;
    }

    // First merge any similar turning points at this level
    const mergedLocalTurningPoints = this.mergeSimilarTurningPoints(localTurningPoints);
    logger.info(`Dimension ${dimension}: Merged similar TPs to ${mergedLocalTurningPoints.length}`);

    if (mergedLocalTurningPoints.length <= 1) {
        return this.config.onlySignificantTurningPoints
            ? this.filterSignificantTurningPoints(mergedLocalTurningPoints)
            : mergedLocalTurningPoints;
    }

    // ------------------- CRITICAL ARC/CRA IMPLEMENTATION -------------------
    // Maximum complexity in this dimension (the leading 0 keeps the result non-negative).
    const maxComplexity = Math.max(0, ...mergedLocalTurningPoints.map(tp => tp.complexityScore));

    // Transition Operator Ψ: escalate only when complexity has saturated.
    const needsDimensionalEscalation = maxComplexity >= this.config.complexitySaturationThreshold;
    logger.info(`Dimension ${dimension}: Max complexity = ${maxComplexity.toFixed(2)}, Saturation threshold = ${this.config.complexitySaturationThreshold}`);
    logger.info(`Dimension ${dimension}: Needs Escalation (Ψ)? ${needsDimensionalEscalation}`);

    // Conditions to STOP escalation and finalize at this dimension:
    // 1. The next dimension would hit the recursion limit
    // 2. Too few turning points to warrant higher-level analysis
    // 3. Complexity hasn't saturated (no need to escalate)
    if (dimension >= this.config.maxRecursionDepth - 1 ||
        mergedLocalTurningPoints.length <= 2 ||
        !needsDimensionalEscalation) {
        logger.info(`Dimension ${dimension}: Finalizing at this level. Applying final filtering.`);
        if (this.config.measureConvergence) {
            this.convergenceHistory.push({
                previousTurningPoints: [], // No previous state at the final level of processing
                currentTurningPoints: mergedLocalTurningPoints, // TPs before final filtering
                dimension,
                distanceMeasure: 0, // No comparison needed at final step
                hasConverged: true, // Considered converged as processing stops here
                didEscalate: false
            });
        }
        return this.filterSignificantTurningPoints(mergedLocalTurningPoints);
    }

    // ----- DIMENSIONAL ESCALATION (n → n+1) -----
    logger.info(`Dimension ${dimension}: Escalating to dimension ${dimension + 1}`);

    // Create meta-messages from the merged turning points at this level
    const metaMessages = this.createMetaMessagesFromTurningPoints(mergedLocalTurningPoints, this.originalMessages);
    logger.info(`Dimension ${dimension}: Created ${metaMessages.length} meta-messages for dimension ${dimension + 1}`);

    if (metaMessages.length < 2) {
        logger.info(`Dimension ${dimension}: Not enough meta-messages (${metaMessages.length}) to perform higher-level analysis. Finalizing with current TPs.`);
        if (this.config.measureConvergence) {
            this.convergenceHistory.push({
                previousTurningPoints: mergedLocalTurningPoints, // State before attempted escalation
                currentTurningPoints: mergedLocalTurningPoints, // State after failed escalation
                dimension: dimension + 1, // Represents the attempted next dimension
                distanceMeasure: 0, // No change
                hasConverged: true, // Converged because escalation failed
                didEscalate: false // Escalation attempted but yielded no processable result
            });
        }
        return this.filterSignificantTurningPoints(mergedLocalTurningPoints);
    }

    // Recursively process the meta-messages to find higher-dimensional turning points
    const higherDimensionTurningPoints = await this.multiLayerDetection(metaMessages, dimension + 1);
    logger.info(`Dimension ${dimension + 1}: Found ${higherDimensionTurningPoints.length} higher-dimension TPs.`);

    // Track convergence and dimension escalation
    if (this.config.measureConvergence) {
        const convergenceState: ConvergenceState = {
            previousTurningPoints: mergedLocalTurningPoints, // TPs from dim n
            currentTurningPoints: higherDimensionTurningPoints, // TPs found in dim n+1
            dimension: dimension + 1,
            distanceMeasure: this.calculateStateDifference(mergedLocalTurningPoints, higherDimensionTurningPoints),
            hasConverged: higherDimensionTurningPoints.length > 0, // Converged if TPs were found at higher level
            didEscalate: true
        };
        this.convergenceHistory.push(convergenceState);
        logger.info(`Dimension ${dimension} → ${dimension + 1}: Convergence distance: ${convergenceState.distanceMeasure.toFixed(3)}. Converged: ${convergenceState.hasConverged}`);
    }

    // Combine turning points from local (n) and higher (n+1) dimensions
    return this.combineTurningPoints(mergedLocalTurningPoints, higherDimensionTurningPoints);
}
/**
 * Measure how far apart two sets of turning points are, for convergence tracking.
 *
 * The result is the equal-weight blend of two components:
 *  - the absolute difference between the mean significance of each set, and
 *  - the Jaccard distance between the sets of "startIndex-endIndex" span keys.
 *
 * @param state1 First set of turning points.
 * @param state2 Second set of turning points.
 * @returns 0 when both sets are empty, 1 when exactly one is empty, otherwise the
 *          blended distance clamped to [0, 1].
 */
private calculateStateDifference(
    state1: TurningPoint[],
    state2: TurningPoint[]
): number {
    const firstIsEmpty = state1.length === 0;
    const secondIsEmpty = state2.length === 0;
    if (firstIsEmpty && secondIsEmpty) return 0.0; // identical (both empty)
    if (firstIsEmpty || secondIsEmpty) return 1.0; // maximally different

    // Mean significance of a non-empty set.
    const meanSignificance = (points: TurningPoint[]): number => {
        let total = 0;
        for (const point of points) {
            total += point.significance;
        }
        return total / points.length;
    };

    // Distinct span keys of a set.
    const spanKeysOf = (points: TurningPoint[]): Set<string> =>
        new Set(points.map(point => `${point.span.startIndex}-${point.span.endIndex}`));

    // Component 1: gap between average significances.
    const significanceGap = Math.abs(meanSignificance(state1) - meanSignificance(state2));

    // Component 2: Jaccard distance over span keys (|A ∪ B| = |A| + |B| - |A ∩ B|).
    const keysA = spanKeysOf(state1);
    const keysB = spanKeysOf(state2);
    let sharedCount = 0;
    for (const key of keysA) {
        if (keysB.has(key)) {
            sharedCount++;
        }
    }
    const unionCount = keysA.size + keysB.size - sharedCount;
    const structuralGap = unionCount > 0 ? 1.0 - (sharedCount / unionCount) : 0.0;

    // Equal-weight blend, clamped to [0, 1].
    const blended = (significanceGap * 0.5) + (structuralGap * 0.5);
    return Math.min(1.0, Math.max(0.0, blended));
}
/**
 * Complexity function χ from the ARC/CRA framework.
 *
 * Maps significance linearly onto 1..5 (`1 + significance * 4`), shifts the result by how
 * far the semantic distance sits from a 0.3 baseline, and clamps to the range [1, 5].
 *
 * @param significance Significance of the turning point (expected in [0, 1]).
 * @param semanticShiftMagnitude Semantic distance that triggered the turning point.
 * @returns Complexity score between 1 and 5 inclusive.
 */
private calculateComplexityScore(significance: number, semanticShiftMagnitude: number): number {
    const baselineDistance = 0.3; // Assumes the detection threshold is around here
    const significanceComponent = 1 + significance * 4;
    const shiftAdjustment = (semanticShiftMagnitude - baselineDistance) * 1.0; // sensitivity factor
    const unclamped = significanceComponent + shiftAdjustment;

    if (unclamped < 1) {
        return 1;
    }
    if (unclamped > 5) {
        return 5;
    }
    return unclamped;
}
/**
 * Detect turning points within a single chunk of the conversation.
 * This represents the local refinement process in the current dimension.
 *
 * Steps:
 *  1. Embed every message in the chunk.
 *  2. Measure the semantic distance between each adjacent pair; pairs at or above the
 *     dimension-adjusted threshold become candidates.
 *  3. Classify every candidate with the LLM, one after another, and collect the results.
 *
 * The threshold shrinks with the dimension: the base threshold is multiplied by
 * 0.25^dimension (base > 0.7), 0.35^dimension (base > 0.5) or 0.5^dimension (otherwise).
 *
 * @param messages Messages (or meta-messages at dimension > 0) of this chunk.
 * @param dimension Current dimension n.
 * @param chunkIndex Chunk position, used only for log output.
 * @param originalMessages The full original conversation, forwarded to the classifier.
 * @returns One turning point per candidate pair; empty when the chunk has fewer than two
 *          messages or no pair reaches the threshold.
 */
private async detectTurningPointsInChunk(
    messages: MetaMessage[] | Message[],
    dimension: number,
    chunkIndex: number, // Optional index for logging purposes
    originalMessages: Message[],
): Promise<TurningPoint[]> {
    if (messages.length < 2) return [];

    // Generate embeddings for all messages in the chunk
    const embeddings = await this.generateMessageEmbeddings(messages, dimension);

    // The threshold depends only on the config and the dimension, so compute it once.
    const baseThreshold = this.config.semanticShiftThreshold;
    let thresholdScaleFactor: number;
    if (baseThreshold > 0.7) {
        // For high initial thresholds (like 0.75), scale down more aggressively
        thresholdScaleFactor = Math.pow(0.25, dimension);
    } else if (baseThreshold > 0.5) {
        // For medium thresholds
        thresholdScaleFactor = Math.pow(0.35, dimension);
    } else {
        // For already low thresholds
        thresholdScaleFactor = Math.pow(0.5, dimension);
    }
    const dimensionAdjustedThreshold = baseThreshold * thresholdScaleFactor;

    // `distances` holds the candidate pairs; `allDistances` holds every pair (for logging).
    const distances: {
        current: number;
        next: number;
        distance: number;
    }[] = [];
    const allDistances: {
        current: number;
        next: number;
        distance: number;
    }[] = [];

    for (let i = 0; i < embeddings.length - 1; i++) {
        const current = embeddings[i];
        const next = embeddings[i + 1];
        // Calculate semantic distance between current and next message
        const distance = this.calculateSemanticDistance(
            current.embedding,
            next.embedding,
        );
        const pair = {
            current: current.index,
            next: next.index,
            distance: distance,
        };
        allDistances.push(pair);
        if (dimensionAdjustedThreshold <= distance) {
            distances.push(pair);
        }
    }

    // Sort a copy of ALL distances before taking the top three; slicing first would only
    // rank the first three pairs of the chunk.
    const topDistances = [...allDistances]
        .sort((a, b) => b.distance - a.distance)
        .slice(0, 3)
        .map(d => d.distance.toFixed(3))
        .join(', ');
    // Report the threshold that was actually applied (dimension-adjusted), not just the base.
    logger.info(
        [
            `For a total number of points: ${embeddings.length}, there were ${distances.length} distances found as being at or above the dimension-adjusted threshold of ${dimensionAdjustedThreshold.toFixed(3)} (base threshold: ${baseThreshold}).`,
            `- The top 3 greatest distances are: ${topDistances}`,
            `This means there were ${distances.length} potential turning points detected ${dimension === 0 ? "with valid user-assistant turn pairs" : "with valid meta-messages"}`,
        ].join('\n'),
    );

    if (distances.length === 0) {
        logger.info(
            `No significant semantic shifts detected in chunk ${chunkIndex}`,
        );
        return [];
    }

    const turningPoints: TurningPoint[] = [];
    // Visit every candidate. The previous bound of `distances.length - 1` skipped the last
    // candidate, so a chunk with exactly one detected shift produced no turning points.
    for (let d = 0; d < distances.length; d++) {
        const distanceObj = distances[d];
        // NOTE(review): the embedding's `index` is used as a position into both `embeddings`
        // and this chunk's `messages`, which assumes it is chunk-local - confirm against
        // generateMessageEmbeddings.
        const i = distanceObj.current;
        const current = embeddings[i];
        const next = embeddings[distanceObj.next];
        const distance = distanceObj.distance;
        const beforeMessage = messages[i];
        const afterMessage = messages[i + 1];
        if (beforeMessage == undefined || afterMessage == undefined) {
            logger.info(
                `detectTurningPointsInChunk: warning beforeMessage or afterMessage is undefined, beforeMessage: ${beforeMessage}, afterMessage: ${afterMessage}`,
            );
            continue;
        }

        // Classify the turning point using LLM (sequentially, one request at a time)
        const turningPoint = await this.classifyTurningPoint(
            beforeMessage,
            afterMessage,
            distance,
            dimension,
            originalMessages,
            d,
        );

        // The chunk label is omitted for chunk 0 (falsy index), as before.
        const chunkLabel = chunkIndex ? `[Chunk ${chunkIndex}] ` : "";
        logger.info(
            ` ...${chunkLabel}Potential turning point detected between messages ${current.id} and ${next.id} (distance: ${distance.toFixed(3)}, complexity: ${turningPoint.complexityScore.toFixed(1)}), signif: ${turningPoint.significance.toFixed(2)} category: ${turningPoint.category}`,
        );

        // Bring significance reported on a 0-10 or 0-100 scale back into 0-1.
        if (turningPoint.significance > 1) {
            if (turningPoint.significance > 10) {
                turningPoint.significance = turningPoint.significance / 100;
            } else {
                turningPoint.significance = turningPoint.significance / 10;
            }
        }
        turningPoints.push(turningPoint);
    }
    return turningPoints;
}
/**
* Use LLM to classify a turning point and generate metadata.
* *** MODIFIED to prioritize message.spanData over regex ***
*/
private async classifyTurningPoint(
beforeMessage: Message,
afterMessage: Message,
distance: number,
dimension: number,
originalMessages: Message[],
index: number = 0
): Promise<TurningPoint> {
let span: MessageSpan;
if (dimension > 0) {
if (beforeMessage instanceof MetaMessage === false || afterMessage instanceof MetaMessage === false) {
throw new Error("Before or after message is not a MetaMessage");
}
const beforeMessageMeta = beforeMessage as MetaMessage;
const afterMessageMeta = afterMessage as MetaMessage;
// For higher dimensions, use meta-message and inner methods to get the the span ids for the start and end
span = {
startId: beforeMessageMeta.getMessagesInTurningPointSpanToMessagesArray()[0].id,
endId: afterMessageMeta.getMessagesInTurningPointSpanToMessagesArray()[0].id,
startIndex: this.originalMessages.findIndex((candidateM) => {
return beforeMessageMeta.getMessagesInTurningPointSpanToMessagesArray()[0].id === candidateM.id;
}),
endIndex: this.originalMessages.findIndex((candidateM) => {
return afterMessageMeta.getMessagesInTurningPointSpanToMessagesArray()[0].id === candidateM.id;
}),
originalSpan: {
startId: beforeMessage.id,
endId: afterMessage.id,
startIndex: index,
endIndex: index + 1,
}
};
} else {
// For dimension 0, use original message IDs and find indices
span = {
startId: beforeMessage.id,
endId: afterMessage.id,
startIndex: MetaMessage.findIndexOfMessageFromId({
id: beforeMessage.id,
beforeMessage,
afterMessage,
messages: originalMessages,
}),
endIndex: MetaMessage.findIndexOfMessageFromId({
id: afterMessage.id,
beforeMessage,
afterMessage,
messages: originalMessages,
}),
};
}
// --- REMOVED Regex block for extracting originalSpan from meta-message content ---
// const originalSpan = { startIndex: 0, endIndex: 0, startMessageId: '', endMessageId: '' };
// if (beforeMessage.author === 'meta' || afterMessage.author === 'meta') {
// ... regex matching logic ...
// }
// --- End Removal ---
// --- LLM Prompt Setup (using original prompt structure) ---
const systemPrompt = formSystemMessage({
dimension,
distance
})
const userMessage = formUserMessage({
config: this.config,
afterMessage,
beforeMessage,
dimension,
addUserInstructions: this.config.customUserInstruction && this.config.customUserInstruction.length > 0 ? true : false,
})
const contextualAidText = this.prepareContextualInfoMeta(
beforeMessage,
afterMessage,
span,
originalMessages,
dimension,
2,
dimension > 0);
try {
// --- Call LLM (using original parameters and schema) ---
const response = await this.openai.chat.completions.create({
model: this.config.classificationModel,
messages: [
{
role: 'system', content:
`${this.config.customSystemInstruction ? this.config.customSystemInstruction : systemPrompt
}\n\n${contextualAidText}\n------- end of contextual background info see below as reminder of instructions -------\n\n${this.config.customSystemInstruction ? this.config.customSystemInstruction : formSystemPromptEnding(dimension)
}`,
},
{ role: 'user', content: this.config.customUserInstruction ? `${this.config.customUserInstruction}\n\n${userMessage}\n\n${this.config.customUserInstruction}` : userMessage },
],
temperature: 0.6,
//@ts-ignore - Allow vendor-specific params if needed
repeat_penalty: this.config.endpoint ? 1.005 : undefined,
top_k: this.config.endpoint ? 20 : undefined,
stop: ['<|im_end|>'],
response_format: formResponseFormatSchema(dimension),
top_p: 0.9,
});
const content = response.choices[0]?.message?.content || '{}';
let classification: any = {};
try {
classification = JSON.parse(content);
console.info(` got classification: ${JSON.stringify(classification, null, 2)}`);
} catch (err: any) {
logger.info('Error parsing LLM response as JSON:', err.message);
// Attempt to extract JSON from markdown code block if necessary
const jsonMatch = content.match(/```json\s*([\s\S]*?)\s*```/);
if (jsonMatch && jsonMatch[1]) {
try {
classification = JSON.parse(jsonMatch[1]);
logger.info('Successfully extracted JSON from markdown block.');
} catch (parseErr: any) {
logger.info('Failed to parse extracted JSON:', parseErr.message);
classification = {}; // Reset on secondary failure
}
} else {
const plainJsonMatch = content.match(/\{[\s\S]*\}/); // Fallback to find any JSON structure
if (plainJsonMatch) {
try {
classification = JSON.parse(plainJsonMatch[0]);
logger.info('Successfully extracted JSON using simple match.');
} catch (parseErr: any) {
logger.info('Failed to parse simple JSON match:', parseErr.message);
classification = {};
}
} else {
logger.info('Could not extract JSON from response:', content);
classification = {};
}
}
// Provide default values if parsing failed completely
if (Object.keys(classification).length === 0) {
classification = {
label: 'Parsing Error - Unclassified', category: 'Other', keywords: [],
emotionalTone: 'neutral', sentiment: 'neutral', significance: 0.1,
quotes: [], best_id: span.startId
};
}
}
// --- Validate and Sanitize LLM Output ---
const validatedClassification = {
label: typeof classification.label === 'string' ? classification.label.substring(0, 50) : 'Unknown Turning Point',
category: typeof classification.category === 'string' ? classification.category as TurningPointCategory : 'Other',
keywords: Array.isArray(classification.keywords) ? classification.keywords.map(String).slice(0, 4) : [], // Limit count
emotionalTone: typeof classification.emotionalTone === 'string' ? classification.emotionalTone : 'neutral',
sentiment: ['positive', 'negative', 'neutral'].includes(classification.sentiment) ? classification.sentiment : 'neutral',
significance: typeof classification.significance === 'number' ? Math.max(0, Math.min(1, classification.significance)) : 0.5,
quotes: Array.isArray(classification.quotes) ? classification.quotes.map(String).slice(0, 3) : [], // Limit count
best_id: typeof classification.best_id === 'string' ? classification.best_id : span.startId, // Default to start of span
};
// Calculate complexity score
const complexityScore = this.calculateComplexityScore(
validatedClassification.significance,
distance // Use the raw distance (0-1)
);
// --- Construct TurningPoint Object ---
return {
id: `tp-${dimension}-${span.startIndex}-${span.endIndex}`,
label: validatedClassification.label,
category: validatedClassification.category,
span: span, // Use the span derived at the beginning
// deprecatedSpan is no longer populated from regex results
semanticShiftMagnitude: distance,
keywords: validatedClassification.keywords,
quotes: validatedClassification.quotes,
emotionalTone: validatedClassification.emotionalTone,
sentiment: validatedClassification.sentiment,
detectionLevel: dimension,
significance: validatedClassification.significance,
complexityScore: complexityScore
};
} catch (err: any) {
logger.info(`Error during LLM call for turning point classification: ${err.message}`);
// Fallback classification on API error
if (this.config.throwOnError) {
} else {
return {
id: `tp-err-${dimension}-${span.startId}`,
label: 'LLM Error - Unclassified',
category: 'Other',
span: span,
semanticShiftMagnitude: distance,
keywords: [],
quotes: [],
emotionalTone: 'neutral',
sentiment: 'neutral',
detectionLevel: dimension,
significance: 0.1,
complexityScore: 1.0 // Minimum complexity
};
}
}
}
/**
 * Turns the turning points found at the current dimension into the
 * meta-messages that feed the next, higher dimension.
 *
 * Two families of meta-messages are produced, in this order:
 *  1. one per distinct `category` (built by `MetaMessage.createCategoryMetaMessage`);
 *  2. up to four chronological sections of the turning points, ordered by
 *     `span.startIndex` (built by `MetaMessage.createSectionMetaMessage`).
 *
 * NOTE(review): the category factory receives the `originalMessages` parameter
 * while the section factory receives `this.originalMessages` — confirm that
 * this difference is intentional.
 *
 * @param turningPoints Turning points to abstract over; not mutated.
 * @param originalMessages Messages handed to the category meta-message factory.
 * @returns The category meta-messages followed by the section meta-messages,
 *          or an empty array when there are no turning points.
 */
private createMetaMessagesFromTurningPoints(
  turningPoints: TurningPoint[],
  originalMessages: Message[],
): Message[] {
  if (turningPoints.length === 0) {
    return [];
  }

  // Bucket the turning points by category (first-level abstraction).
  const groupedByCategory: Record<string, TurningPoint[]> = {};
  for (const point of turningPoints) {
    const bucket = groupedByCategory[point.category];
    if (bucket) {
      bucket.push(point);
    } else {
      groupedByCategory[point.category] = [point];
    }
  }
  logger.info(`Grouped categories:\n${JSON.stringify(groupedByCategory, null, 2)}`);

  // One meta-message per category: the dimension n -> n+1 transformation.
  const metaMessages: Message[] = [];
  let categoryIndex = 0;
  for (const [category, points] of Object.entries(groupedByCategory)) {
    metaMessages.push(
      MetaMessage.createCategoryMetaMessage(
        category,
        points,
        categoryIndex,
        originalMessages,
      ),
    );
    categoryIndex += 1;
  }

  // Chronological sections: at most four, each holding roughly two or more points.
  const chronological = [...turningPoints].sort(
    (left, right) => left.span.startIndex - right.span.startIndex,
  );
  const total = chronological.length;
  const sectionCount = Math.min(4, Math.ceil(total / 2));
  const sectionSize = Math.ceil(total / sectionCount);

  for (let section = 0; section < sectionCount; section++) {
    const offset = section * sectionSize;
    // slice() clamps the end bound to the array length on its own.
    const sectionPoints = chronological.slice(offset, offset + sectionSize);
    if (sectionPoints.length > 0) {
      const sectionMetaMessage = MetaMessage.createSectionMetaMessage(
        sectionPoints,
        section,
        this.originalMessages,
      );
      console.info('created sectionMetageMessage')
      metaMessages.push(sectionMetaMessage);
    }
  }

  const createdIds = metaMessages.map((meta) => meta.id).join(", ");
  logger.info(
    `Created ${metaMessages.length} meta-messages for dimensional expansion: ${createdIds}`,
  );
  return metaMessages;
}
// --- Turning point post-processing: filtering, combining, and merging ---
/**
 * Selects the most significant, mutually non-overlapping turning points.
 *
 * When `config.onlySignificantTurningPoints` is falsy, or the input is empty,
 * every turning point is returned, ordered by `span.startIndex`.
 *
 * Otherwise candidates are ranked by significance, then complexity score,
 * then semantic shift magnitude, and accepted greedily subject to:
 *  - points below `config.significanceThreshold` are only considered while
 *    fewer than `ceil(maxTurningPoints / 2)` points have been accepted;
 *  - a candidate is rejected when more than 40% of its message indices are
 *    already covered by accepted points;
 *  - at most `config.maxTurningPoints` points are accepted by this pass.
 *
 * Two fallbacks run afterwards: if nothing was accepted, the top-ranked
 * candidate is returned; if exactly one was accepted, the best-ranked
 * candidate that starts more than three messages away and does not intersect
 * it is added for diversity.
 * NOTE(review): neither fallback consults `maxTurningPoints`, so the result
 * can exceed the cap when the cap is 0 or 1 — confirm this is intended.
 *
 * The input array is never mutated; a new array is always returned.
 *
 * @param turningPoints Candidate turning points.
 * @returns The retained turning points ordered by `span.startIndex`.
 */
private filterSignificantTurningPoints(turningPoints: TurningPoint[]): TurningPoint[] {
  const byPosition = (a: TurningPoint, b: TurningPoint): number =>
    a.span.startIndex - b.span.startIndex;

  if (!this.config.onlySignificantTurningPoints || turningPoints.length === 0) {
    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // silently reorder the caller's array.
    return [...turningPoints].sort(byPosition);
  }

  logger.info(`Filtering ${turningPoints.length} TPs based on significance >= ${this.config.significanceThreshold} and maxPoints = ${this.config.maxTurningPoints}`);

  // Rank by significance, then complexity, then magnitude (all descending).
  const sorted = [...turningPoints].sort((a, b) => {
    if (b.significance !== a.significance) return b.significance - a.significance;
    if (b.complexityScore !== a.complexityScore) return b.complexityScore - a.complexityScore;
    return b.semanticShiftMagnitude - a.semanticShiftMagnitude;
  });

  const result: TurningPoint[] = [];
  const coveredIndices: Set<number> = new Set(); // Message indices claimed by accepted TPs
  const maxPoints = this.config.maxTurningPoints;
  // A candidate overlaps "significantly" when more than this share of its span is already covered.
  const overlapThreshold = 0.4;

  for (const tp of sorted) {
    // Below-threshold points are only tolerated until half of the quota is filled.
    if (
      tp.significance < this.config.significanceThreshold &&
      result.length >= Math.ceil(maxPoints / 2)
    ) {
      continue;
    }

    // Share of this TP's span that is already covered by accepted TPs.
    let overlapRatio = 0;
    const tpSpanSize = tp.span.endIndex - tp.span.startIndex + 1;
    if (tpSpanSize > 0) {
      let overlapCount = 0;
      for (let i = tp.span.startIndex; i <= tp.span.endIndex; i++) {
        if (coveredIndices.has(i)) {
          overlapCount++;
        }
      }
      overlapRatio = overlapCount / tpSpanSize;
    }
    const isOverlapping = overlapRatio > overlapThreshold;

    if (!isOverlapping && result.length < maxPoints) {
      result.push(tp);
      // Mark indices covered by this TP
      for (let i = tp.span.startIndex; i <= tp.span.endIndex; i++) {
        coveredIndices.add(i);
      }
    } else if (isOverlapping) {
      logger.info(` TP ${tp.id} (Sig: ${tp.significance.toFixed(2)}) overlaps significantly (${(overlapRatio * 100).toFixed(0)}%) with existing TPs. Skipping.`);
    } else if (result.length >= maxPoints) {
      logger.info(` Reached max turning points (${maxPoints}). Skipping TP ${tp.id}.`);
    }
  }

  if (result.length === 0 && sorted.length > 0) {
    // Ensure at least one TP is returned if any were found initially.
    logger.info("No TPs met significance/overlap criteria, returning the single most significant one.");
    result.push(sorted[0]);
  } else if (result.length === 1 && sorted.length > 1) {
    // Try to add a second TP that lies elsewhere in the conversation.
    for (let i = 1; i < sorted.length; i++) {
      const nextTp = sorted[i];
      // Must start more than three messages away from the kept TP...
      if (Math.abs(nextTp.span.startIndex - result[0].span.startIndex) > 3) {
        // ...and must not share any message index with it.
        let overlapsFirst = false;
        for (let j = nextTp.span.startIndex; j <= nextTp.span.endIndex; j++) {
          if (j >= result[0].span.startIndex && j <= result[0].span.endIndex) {
            overlapsFirst = true;
            break;
          }
        }
        if (!overlapsFirst) {
          logger.info("Adding a second, non-overlapping TP for diversity.");
          result.push(nextTp);
          break;
        }
      }
    }
  }

  logger.info(`Filtered down to ${result.length} significant turning points.`);
  // `result` is local to this call, so sorting it in place is safe.
  return result.sort(byPosition);
}
/**
 * Combines the turning points of one dimension with those detected one
 * dimension higher into a single, position-ordered list.
 *
 * Higher-dimension points get their significance multiplied by 1.2 (capped
 * at 1.0). The combined list is then passed through `mergeAcrossLevels` and
 * `filterSignificantTurningPoints`.
 *
 * @param localTurningPoints Turning points from the current dimension.
 * @param higherDimensionTurningPoints Turning points from the higher dimension;
 *        boosted copies are used, the originals are left untouched.
 * @returns The merged and filtered turning points ordered by `span.startIndex`.
 */
private combineTurningPoints(
  localTurningPoints: TurningPoint[],
  higherDimensionTurningPoints: TurningPoint[]
): TurningPoint[] {
  const localDim = localTurningPoints[0]?.detectionLevel ?? 'N/A';
  const higherDim = higherDimensionTurningPoints[0]?.detectionLevel ?? 'N/A';
  logger.info(`Combining ${localTurningPoints.length} local (dim ${localDim}) and ${higherDimensionTurningPoints.length} higher (dim ${higherDim}) TPs.`);

  // Favour higher-dimensional findings with a capped significance boost;
  // detectionLevel is carried over unchanged for the cross-level merge.
  const boostedHigher = higherDimensionTurningPoints.map((point) => {
    const boostedSignificance = Math.min(1.0, point.significance * 1.2);
    return { ...point, significance: boostedSignificance };
  });

  const candidates = [...localTurningPoints, ...boostedHigher];
  logger.info(`Total TPs before cross-level merge: ${candidates.length}`);

  // Collapse turning points that overlap across dimensions.
  const merged = this.mergeAcrossLevels(candidates);
  logger.info(`Merged across levels to ${merged.length} TPs.`);

  // Keep only the most significant of the merged set.
  const kept = this.filterSignificantTurningPoints(merged);
  logger.info(`Final combined and filtered TPs: ${kept.length}`);

  // Order by position in the conversation before handing back.
  kept.sort((left, right) => left.span.startIndex - right.span.startIndex);
  return kept;
}
/**
* Merge similar or overlapping turning points *within* the same dimension
* (Using original logic from the second code block)
*/
private mergeSimilarTurningPoints(turningPoints: TurningPoint[]): TurningPoint[] {
if (turningPoints.length <= 1) return turningPoints;
// Sort turning points by start index
const sorted = [...turningPoints].sort((a, b) => a.span.startIndex - b.span.startIndex);
const merged: TurningPoint[] = [];
let currentTp = sorted[0]; // Use a more descriptive name
for (let i = 1; i < sorted.length; i++) {
const nextTp = sorted[i];
// Check conditions for merging (original logic)
const isOverlapping = (nextTp.span.startIndex <= currentTp.span.endIndex + 2); // Allow small gap
const isSimilarCategory = (nextTp.category === currentTp.category);
// Added closeness check from original code
const hasCloseIndices = (nextTp.span.startIndex - currentTp.span.endIndex) <= 3;
// Merge if overlapping OR close, AND same category
if ((isOverlapping || hasCloseIndices) && isSimilarCategory) {
logger.info(` Merging similar TPs (Dim ${currentTp.detectionLevel}): ${currentTp.id} and ${nextTp.id}`);
// Merge the turning points
const newLabel = this.createMergedLabel(currentTp.label, nextTp.label);
// Create merged span (min start, max end)
const mergedSpan = this.ensureChronologicalSpan({
startId: currentTp.span.startIndex <= nextTp.span.startIndex ? currentTp.span.startId : nextTp.span.startId,
endId: currentTp.span.endIndex >= nextTp.span.endIndex ? currentTp.span.endId : nextTp.span.endId,
startIndex: Math.min(currentTp.span.startIndex, nextTp.span.startIndex),
endIndex: Math.max(currentTp.span.endIndex, nextTp.span.endIndex)
});
// Update the deprecated span too (original logic, though less relevant now)
// Note: deprecatedSpan might not exist if TPs came from meta-messages
const mergedDeprecatedSpan = (currentTp.deprecatedSpan && nextTp.deprecatedSpan) ? {
startIndex: Math.min(currentTp.deprecatedSpan.startIndex, nextTp.deprecatedSpan.startIndex),
endIndex: Math.max(currentTp.deprecatedSpan.endIndex, nextTp.deprecatedSpan.endIndex),
startMessageId: mergedSpan.startIndex === currentTp.deprecatedSpan.startIndex ?
currentTp.deprecatedSpan.startMessageId : nextTp.deprecatedSpan.startMessageId,
endMe