/*
 * convokit
 * Version: (not recorded)
 * A flexible TypeScript framework for ingesting, processing, and exporting chat/conversation data for LLM training and analysis.
 * 263 lines • 13.9 kB
 * JavaScript
 */
import { ConvoKitLogging as ckl, PluginRegistry } from "../../../index.js";
// --- Default Constants ---
// These will be used if not provided in options
// Each constant is the fallback for the same-named (camelCase) option that
// ParseToCKContext destructures from its `options` argument.

// Gap, in minutes, between two consecutive included messages above which a
// new-conversation marker line is inserted inside a conversation.
const DEFAULT_MAX_TIME_GAP_MINUTES = 120;
// Marker line emitted between conversations and between time-separated segments.
const DEFAULT_NEW_CONVERSATION_MARKER = '<NC>';
// Replacement for line breaks (\n or \r\n) found inside a message's text.
const DEFAULT_LINE_DELIMITER = '<NL>';
// Largest gap, in minutes, at which consecutive messages from the same speaker
// are still merged into a single output line.
const DEFAULT_GROUP_TIME_GAP_MINUTES = 5;
// Multiplier for the message's relative position in time within the conversation.
const DEFAULT_TIME_WEIGHT = 1;
// Added to every score as a flat term (it is not multiplied by any message count).
const DEFAULT_TOTAL_MESSAGES_WEIGHT = 0.5;
// Multiplier for the conversation's messages-per-minute rate.
const DEFAULT_FREQUENCY_WEIGHT = 0.5;
// Multiplier for the author's message count divided by everyone else's.
const DEFAULT_RATIO_WEIGHT = 0.5;
// Multiplier for the message length relative to the longest message in the conversation.
const DEFAULT_LENGTH_WEIGHT = 0.5;
// Importance scores are the weighted sum multiplied by 100 and rounded.
// Messages scoring below this value are dropped.
const DEFAULT_MINIMUM_ALLOWED_IMPORTANCE_PER_MESSAGE = 100;
// Conversations whose average included-message score is below this value are skipped.
const DEFAULT_MINIMUM_ALLOWED_IMPORTANCE_CHAT = 120;
/**
 * Processes an array of ConvoKitConversation objects to generate a formatted training string.
 * Applies filtering, message grouping, importance scoring, and speaker labeling.
 *
 * Every output line has the shape `score|speaker:text|timestamp`, where `speaker` is `A`
 * for the conversation's target user and `U` for anyone else. Consecutive messages from
 * the same speaker that are at most `groupTimeGapMinutes` apart are merged into one line
 * (texts joined with a space, scored by the last message, stamped with the first).
 * `newConversationMarker` lines separate conversations and, inside a conversation,
 * segments separated by more than `maxTimeGapMinutes`.
 *
 * Messages that lack non-blank string text, an author id, or a parseable timestamp are
 * skipped without being counted as filtered; they still count towards the conversation's
 * raw message total used for scoring.
 *
 * @param conversations Array of conversation objects to process. Each must provide
 *   `metadata.conversationId`, `metadata.providerId` and a `messages` array.
 * @param options Configuration options. `targetUsers` (array of `{ providerId, id }`) is
 *   required; every other option falls back to the matching `DEFAULT_*` constant.
 * @returns An object `{ processedData, stats }`: the output lines joined with `\n`, and
 *   counters describing how many conversations/messages were processed or skipped.
 */
export function ParseToCKContext(conversations, options) {
    ckl.time("CKContext", "Generating CKContext");
    const {
        targetUsers,
        maxTimeGapMinutes = DEFAULT_MAX_TIME_GAP_MINUTES,
        newConversationMarker = DEFAULT_NEW_CONVERSATION_MARKER,
        lineDelimiter = DEFAULT_LINE_DELIMITER,
        groupTimeGapMinutes = DEFAULT_GROUP_TIME_GAP_MINUTES,
        timeWeight = DEFAULT_TIME_WEIGHT,
        totalMessagesWeight = DEFAULT_TOTAL_MESSAGES_WEIGHT,
        frequencyWeight = DEFAULT_FREQUENCY_WEIGHT,
        ratioWeight = DEFAULT_RATIO_WEIGHT,
        lengthWeight = DEFAULT_LENGTH_WEIGHT,
        minimumAllowedImportancePerMessage = DEFAULT_MINIMUM_ALLOWED_IMPORTANCE_PER_MESSAGE,
        minimumAllowedImportanceChat = DEFAULT_MINIMUM_ALLOWED_IMPORTANCE_CHAT,
    } = options;
    const MS_PER_MINUTE = 60 * 1000;

    // Instantiate each registered filter plugin once; the Map keeps one instance per id.
    const filterPluginCache = new Map();
    for (const id of PluginRegistry.listFilters()) {
        const Ctor = PluginRegistry.getFilter(id);
        if (Ctor) {
            filterPluginCache.set(id, new Ctor());
        }
    }
    const filters = Array.from(filterPluginCache.values());

    // A text passes when every MUST filter matches it and no MUST_NOT filter does.
    const passesFilters = (content) => {
        for (const filter of filters) {
            const result = filter.apply(content);
            if (filter.filterType === 'MUST' && !result) {
                return false;
            }
            if (filter.filterType === 'MUST_NOT' && result) {
                return false;
            }
        }
        return true;
    };

    // Epoch milliseconds of a message's timestamp; NaN when the message or its
    // timestamp is missing or cannot be parsed.
    const toMillis = (msg) => new Date(msg ? msg.timestamp : undefined).getTime();

    // Chronological comparator that parks messages without a usable timestamp at the
    // end, so a NaN difference can never scramble the order of the valid ones.
    const byTimestamp = (a, b) => {
        const timeA = toMillis(a);
        const timeB = toMillis(b);
        const missingA = Number.isNaN(timeA);
        const missingB = Number.isNaN(timeB);
        if (missingA || missingB) {
            if (missingA === missingB) {
                return 0;
            }
            return missingA ? 1 : -1;
        }
        return timeA - timeB;
    };

    const importanceWeights = {
        time: timeWeight,
        length: lengthWeight,
        ratio: ratioWeight,
        frequency: frequencyWeight,
        total: totalMessagesWeight,
    };

    // Importance of one (already validated) message, using the per-conversation
    // aggregates in `cached`. Returns the weighted sum times 100, rounded.
    const calculateImportance = (msg, cached) => {
        if (cached.rawMessageCount === 0) {
            return 0;
        }
        const authorCount = cached.authorMessageCounts.get(msg.author.id) || 0;
        const otherCount = cached.rawMessageCount - authorCount;
        const msgLen = cached.messageLenCache.get(msg) || msg.message.trim().length;
        const timeScore = cached.endTimestamp === cached.startTimestamp
            ? 0
            : ((toMillis(msg) - cached.startTimestamp) / (cached.endTimestamp - cached.startTimestamp)) * importanceWeights.time;
        const lengthScore = (msgLen / cached.maxMessageLength) * importanceWeights.length;
        const ratioScore = (authorCount / (otherCount || 1)) * importanceWeights.ratio;
        const frequencyScore = (cached.rawMessageCount / cached.conversationDurationMinutes) * importanceWeights.frequency;
        // The "total messages" weight is a flat additive term.
        const totalScore = timeScore + lengthScore + ratioScore + frequencyScore + importanceWeights.total;
        return Math.round(totalScore * 100);
    };

    const allProcessedLines = [];
    const stats = {
        conversationsProcessed: 0,
        conversationsSkipped_NoTargetUser: 0,
        conversationsSkipped_LowImportance: 0,
        conversationsSkipped_NoMessages: 0,
        totalMessagesConsidered: 0,
        totalMessagesIncluded: 0,
        totalMessagesFilteredOut: 0,
    };

    for (let index = 0; index < conversations.length; ++index) {
        const convo = conversations[index];
        const conversationId = convo.metadata.conversationId;
        ckl.info("CKContext", `Processing conversation: ${conversationId} - ${index + 1} of ${conversations.length}...`);

        const conversationMessages = Array.isArray(convo.messages) ? convo.messages : [];
        stats.totalMessagesConsidered += conversationMessages.length;

        // The target user is configured per provider.
        const provider = convo.metadata.providerId;
        const targetEntry = targetUsers.find((entry) => entry.providerId === provider);
        if (!targetEntry) {
            ckl.warn("CKContext", `Skipping conversation ${conversationId}: no targetUsers entry for provider ${provider}.`);
            stats.conversationsSkipped_NoTargetUser++;
            continue;
        }
        const TARGET_USER_ID = targetEntry.id;

        // Checked before the target-user test so empty conversations are reported as
        // "no messages" rather than "target not found".
        if (conversationMessages.length === 0) {
            ckl.warn("CKContext", `Skipping conversation ${conversationId}: No messages.`);
            stats.conversationsSkipped_NoMessages++;
            continue;
        }

        const hasTarget = conversationMessages.some((msg) => msg && msg.author && msg.author.id === TARGET_USER_ID);
        if (!hasTarget) {
            ckl.warn("CKContext", `Skipping conversation ${conversationId}: target ID ${TARGET_USER_ID} not found.`);
            stats.conversationsSkipped_NoTargetUser++;
            continue;
        }

        // Sort a copy chronologically (processing order and gap detection depend on it).
        const sortedMessages = conversationMessages.slice().sort(byTimestamp);

        // Per-conversation aggregates used by the importance score. Malformed entries
        // contribute only the fields they actually have.
        let maxMessageLength = 1;
        let startTimestamp = NaN;
        let endTimestamp = NaN;
        const authorMessageCounts = new Map();
        const messageLenCache = new Map();
        for (const msg of sortedMessages) {
            if (!msg) {
                continue;
            }
            const millis = toMillis(msg);
            if (!Number.isNaN(millis)) {
                // Ascending order: the first valid timestamp is the start, the last the end.
                if (Number.isNaN(startTimestamp)) {
                    startTimestamp = millis;
                }
                endTimestamp = millis;
            }
            if (typeof msg.message === 'string') {
                const len = msg.message.trim().length;
                messageLenCache.set(msg, len);
                if (len > maxMessageLength) {
                    maxMessageLength = len;
                }
            }
            if (msg.author) {
                const authorId = msg.author.id;
                authorMessageCounts.set(authorId, (authorMessageCounts.get(authorId) || 0) + 1);
            }
        }
        const rawMessageCount = sortedMessages.length;
        // Clamped to one minute so very short conversations do not inflate the rate.
        const conversationDurationMinutes = Math.max(1, (endTimestamp - startTimestamp) / MS_PER_MINUTE);
        const cached = { startTimestamp, endTimestamp, rawMessageCount, maxMessageLength, conversationDurationMinutes, authorMessageCounts, messageLenCache };

        const processedMessagesForThisConvo = [];
        let includedMessagesCount = 0;
        let filteredMessagesCount = 0;
        let sumImportance = 0;
        let lastMessageTimestamp = null;
        let newConversationSegments = 0;
        let currentGroupSpeaker = null;
        let currentGroupMessages = [];
        let currentGroupLastTimestamp = null;

        // Emits the pending same-speaker group (if any) as one output line and resets it.
        const flushGroup = () => {
            if (currentGroupSpeaker && currentGroupMessages.length > 0 && currentGroupLastTimestamp) {
                const groupContents = currentGroupMessages
                    .map((grouped) => grouped.message.replace(/\r?\n/g, lineDelimiter).trim())
                    .join(' ');
                const firstTs = formatTimestamp(new Date(currentGroupMessages[0].timestamp));
                const score = calculateImportance(currentGroupMessages[currentGroupMessages.length - 1], cached);
                processedMessagesForThisConvo.push(`${score}|${currentGroupSpeaker}:${groupContents}|${firstTs}`);
            }
            currentGroupSpeaker = null;
            currentGroupMessages = [];
            currentGroupLastTimestamp = null;
        };

        for (const message of sortedMessages) {
            if (!message || typeof message.message !== 'string' || message.message.trim() === '' || !message.author || !message.author.id || !message.timestamp) {
                continue;
            }
            const currentMessageTimestamp = new Date(message.timestamp);
            if (Number.isNaN(currentMessageTimestamp.getTime())) {
                // Present but unparseable timestamp: cannot be ordered, scored or formatted.
                continue;
            }

            // A long silence since the last included message starts a new segment.
            if (lastMessageTimestamp) {
                const timeDifferenceMinutes = (currentMessageTimestamp.getTime() - lastMessageTimestamp.getTime()) / MS_PER_MINUTE;
                if (timeDifferenceMinutes > maxTimeGapMinutes) {
                    flushGroup();
                    const lastLine = processedMessagesForThisConvo[processedMessagesForThisConvo.length - 1];
                    if (processedMessagesForThisConvo.length > 0 && lastLine !== newConversationMarker) {
                        processedMessagesForThisConvo.push(newConversationMarker);
                        newConversationSegments++;
                    }
                }
            }

            if (!passesFilters(message.message)) {
                filteredMessagesCount++;
                continue;
            }
            const importance = calculateImportance(message, cached);
            if (importance < minimumAllowedImportancePerMessage) {
                filteredMessagesCount++;
                continue;
            }
            sumImportance += importance;

            const speakerLabel = message.author.id === TARGET_USER_ID ? 'A' : 'U';
            const continuesGroup = currentGroupSpeaker === speakerLabel
                && currentGroupLastTimestamp
                && (currentMessageTimestamp.getTime() - currentGroupLastTimestamp.getTime()) / MS_PER_MINUTE <= groupTimeGapMinutes;
            if (continuesGroup) {
                currentGroupMessages.push(message);
            }
            else {
                flushGroup();
                currentGroupSpeaker = speakerLabel;
                currentGroupMessages = [message];
            }
            currentGroupLastTimestamp = currentMessageTimestamp;
            lastMessageTimestamp = currentMessageTimestamp;
            includedMessagesCount++;
        }
        flushGroup();

        // Filtered messages are counted whether or not the conversation is kept.
        stats.totalMessagesFilteredOut += filteredMessagesCount;

        if (includedMessagesCount === 0) {
            ckl.warn("CKContext", `Skipping conversation ${conversationId}: No messages included after filtering/scoring.`);
            stats.conversationsSkipped_NoMessages++;
            continue;
        }
        const averageConversationImportance = sumImportance / includedMessagesCount;
        if (averageConversationImportance < minimumAllowedImportanceChat) {
            ckl.warn("CKContext", ` Skipping conversation ${conversationId}: Average importance ${averageConversationImportance.toFixed(2)} below threshold ${minimumAllowedImportanceChat}.`);
            stats.conversationsSkipped_LowImportance++;
            continue;
        }

        // Separate this conversation from the previous one, avoiding doubled markers.
        if (allProcessedLines.length > 0 && allProcessedLines[allProcessedLines.length - 1] !== newConversationMarker) {
            allProcessedLines.push(newConversationMarker);
        }
        // Append in place; re-creating the accumulated array per conversation is quadratic.
        for (const line of processedMessagesForThisConvo) {
            allProcessedLines.push(line);
        }
        stats.conversationsProcessed++;
        stats.totalMessagesIncluded += includedMessagesCount;
        ckl.info("CKContext", `Processed conversation ${conversationId}: Included ${includedMessagesCount} messages (${newConversationSegments} internal segments).`);
    }

    const combinedOutputString = allProcessedLines.join('\n');
    ckl.timeEnd("CKContext", "Generating CKContext");
    return {
        processedData: combinedOutputString,
        stats: stats
    };
}
// Renders a Date as "YYYY-MM-DD HH:MM:SS" using the Date's local-time getters.
// Month, day, hour, minute and second are zero-padded to two digits; the year is not padded.
function formatTimestamp(date) {
    const twoDigits = (value) => String(value).padStart(2, '0');
    const datePart = [
        date.getFullYear(),
        twoDigits(date.getMonth() + 1), // getMonth() is zero-based
        twoDigits(date.getDate()),
    ].join('-');
    const timePart = [date.getHours(), date.getMinutes(), date.getSeconds()]
        .map(twoDigits)
        .join(':');
    return `${datePart} ${timePart}`;
}
//# sourceMappingURL=ConvoKitContext.js.map