UNPKG

convokit

Version:

A flexible TypeScript framework for ingesting, processing, and exporting chat/conversation data for LLM training and analysis.

415 lines 18.8 kB
import fs from 'fs/promises';
import Path from 'path';
import { ConvoKitLogging as ckl } from "./ck/ConvoKitLogging.js";
import { config } from "dotenv";
import { ProviderRegistry } from './ck/ProviderRegistry.js';
import { PluginRegistry } from './ck/PluginRegistry.js';
import { loadConfig, getConfig } from './ck/ConvoKitConfig.js';
import { fileURLToPath, pathToFileURL } from 'url';
import { CKContextToCKTurnList } from './ck/internal_plugins/formatters/CKTurnList.js';
import { CKWeightedSample } from './ck/internal_plugins/formatters/CKWeightedSample.js';
import crypto from 'crypto';

const __filename = fileURLToPath(import.meta.url);
const __dirname = Path.dirname(__filename);
config(); // Load environment variables

export * from './ck/types/ConvoKitTypes.js';
export * from './ck/types/ConvoKitProvider.js';
export * from './ck/types/PluginTypes.js';
export * from './ck/ConvoKitConfig.js';
export * from './ck/ConvoKitLogging.js';
export * from './ck/ProviderRegistry.js';
export * from './ck/PluginRegistry.js';

/** Sub-directory names scanned (in this order) under every plugin base directory. */
const PLUGIN_TYPES = ['formatters', 'converters', 'filters'];

/**
 * Decides whether a directory entry should be imported as a module:
 * `.js` files and `.ts` sources, but not `.d.ts` declaration files.
 * @param {string} fileName Bare file name (no directory part).
 * @returns {boolean} True when the file should be imported.
 */
function isLoadableModuleFile(fileName) {
    const isTypeScriptSource = fileName.endsWith('.ts') && !fileName.endsWith('.d.ts');
    return isTypeScriptSource || fileName.endsWith('.js');
}

/**
 * Imports a module given a filesystem path.
 * The path is converted to a `file:` URL first: a raw relative path such as
 * `providers/foo.js` would otherwise be treated by `import()` as a bare package
 * specifier, and absolute Windows paths are not valid specifiers at all.
 * Relative paths are resolved against the current working directory.
 * @param {string} filePath Absolute or cwd-relative path of the module.
 * @returns {Promise<object>} The imported module namespace.
 */
function importFromPath(filePath) {
    return import(pathToFileURL(filePath).href);
}

/**
 * Imports every loadable module found in the `formatters`, `converters` and
 * `filters` sub-directories of `pluginsBase`. A missing sub-directory is only
 * logged at debug level; a module that fails to import is logged as an error
 * and does not stop the remaining modules from loading.
 * @param {string} pluginsBase Directory containing the per-type sub-directories.
 * @param {string} qualifier Text inserted before "plugin" in log messages
 *   (`''` for bundled plugins, `'local '` for user plugins).
 * @returns {Promise<void>}
 */
async function importPluginsFrom(pluginsBase, qualifier) {
    for (const type of PLUGIN_TYPES) {
        const dir = Path.join(pluginsBase, type);
        let files;
        try {
            files = await fs.readdir(dir);
        }
        catch (err) {
            ckl.debug('PluginLoader', `No ${qualifier}plugin directory for type ${type} (${err.message}).`);
            continue;
        }
        for (const file of files) {
            if (!isLoadableModuleFile(file)) {
                continue;
            }
            const filePath = Path.join(dir, file);
            ckl.debug('PluginLoader', `Importing ${qualifier}plugin module: ${filePath}`);
            try {
                await importFromPath(filePath);
            }
            catch (importErr) {
                ckl.error('PluginLoader', `Failed to load ${qualifier}plugin module ${filePath}: ${importErr}`);
            }
        }
    }
}

/**
 * Imports every loadable provider module found directly inside `providersDir`.
 * A module that fails to import is logged and skipped so that one broken
 * provider does not prevent the others from registering.
 * @param {string} providersDir Directory to scan.
 * @param {string} qualifier Text inserted before "provider" in log messages
 *   (`''` for bundled providers, `'local '` for user providers).
 * @returns {Promise<void>}
 */
async function importProvidersFrom(providersDir, qualifier) {
    let providerFiles;
    try {
        providerFiles = await fs.readdir(providersDir);
    }
    catch (err) {
        ckl.error('ConvoKit', `Error loading ${qualifier}provider modules: ${err}`);
        return;
    }
    for (const file of providerFiles) {
        if (!isLoadableModuleFile(file)) {
            continue;
        }
        const modulePath = Path.join(providersDir, file);
        ckl.debug('ConvoKit', `Importing ${qualifier}provider module: ${modulePath}`);
        try {
            await importFromPath(modulePath);
        }
        catch (err) {
            ckl.error('ConvoKit', `Failed to load ${qualifier}provider module ${modulePath}: ${err}`);
        }
    }
}

/**
 * Produces a random 16-character lowercase hexadecimal string used as an
 * anonymized file name (without extension).
 * @returns {string} 16 hex characters.
 */
function randomFileStem() {
    return crypto.randomBytes(8).toString('hex');
}

/**
 * Orchestrates the ConvoKit pipeline: provider/plugin discovery, conversion of
 * provider input files, context formatting, turn-list conversion, weighted
 * sampling and export through converter plugins. Each stage stores its result
 * on the instance so the next stage can pick it up.
 */
export class ConvoKit {
    /** Providers that passed the input-data checks in loadProviders(): `{ Provider, ProviderInfo }`. */
    loadedProviderModules = [];
    /** Output of processDataFromProviders(). */
    convoKitFormattedData = [];
    /** Output of parseToContext(), or null before it has run. */
    ckContextResult = null;
    /** Output of convertToCKTurnList(). */
    ckTurnListConversations = [];
    /** Output of getWeightedSample(). */
    sampledConversations = [];

    constructor() { }

    /**
     * Load a provider module from a file.
     * @param {string} filePath Path to the provider module file.
     * @returns {Promise<void>}
     * @throws Re-throws any import error after logging it, so callers (and tests) can react.
     */
    async addProviderFromFile(filePath) {
        try {
            const modulePath = Path.resolve(filePath);
            ckl.debug('ConvoKit', `Importing provider module: ${modulePath}`);
            // Convert the path to a URL to ensure import() works correctly
            await importFromPath(modulePath);
        }
        catch (err) {
            ckl.error('ConvoKit', `Error loading provider module from ${filePath}: ${err}`);
            throw err; // Re-throw to allow tests to catch errors
        }
    }

    /**
     * Dynamically loads all plugin modules so they self-register.
     * Bundled plugins (next to this file, under `plugins/`) are loaded first,
     * then plugins from the configured `localPluginsDir`, if any.
     * @returns {Promise<void>}
     */
    async loadPlugins() {
        await importPluginsFrom(Path.join(__dirname, 'plugins'), '');
        // Load local plugins
        const { localPluginsDir } = getConfig();
        if (localPluginsDir) {
            await importPluginsFrom(`./${localPluginsDir}`, 'local ');
        }
    }

    /**
     * Anonymizes provider data by renaming files in the input data directory.
     * Every entry inside each provider sub-directory is renamed to a random
     * 16-character hex name, keeping its original extension. Failures are
     * logged and never thrown.
     * @returns {Promise<void>}
     */
    async anonymizeProviderData() {
        ckl.time("ConvoKit", "Anonymizing provider data");
        try {
            const { inputDataDirName } = getConfig();
            const baseDataDir = Path.join(`./${inputDataDirName}`);
            // Check if directory exists before trying to read it
            try {
                await fs.access(baseDataDir);
            }
            catch (err) {
                ckl.error("ConvoKit", `Input data directory ${baseDataDir} does not exist or is not accessible: ${err.message}`);
                return;
            }
            let providerDirs;
            try {
                providerDirs = await fs.readdir(baseDataDir);
            }
            catch (err) {
                ckl.error("ConvoKit", `Error reading base data directory ${baseDataDir}: ${err.message}`);
                return;
            }
            for (const folder of providerDirs) {
                const providerDir = Path.join(baseDataDir, folder);
                let stat;
                try {
                    stat = await fs.stat(providerDir);
                }
                catch (err) {
                    ckl.error("ConvoKit", `Error accessing ${providerDir}: ${err.message}`);
                    continue;
                }
                if (!stat.isDirectory()) {
                    continue;
                }
                let files;
                try {
                    files = await fs.readdir(providerDir);
                }
                catch (err) {
                    ckl.error("ConvoKit", `Error reading directory ${providerDir}: ${err.message}`);
                    continue;
                }
                for (const file of files) {
                    const oldFilePath = Path.join(providerDir, file);
                    const newFilePath = Path.join(providerDir, randomFileStem() + Path.extname(file));
                    try {
                        await fs.rename(oldFilePath, newFilePath);
                        ckl.debug("ConvoKit", `Renamed ${oldFilePath} to ${newFilePath}`);
                    }
                    catch (err) {
                        ckl.error("ConvoKit", `Failed to rename ${oldFilePath}: ${err.message}`);
                    }
                }
            }
        }
        catch (err) {
            ckl.error("ConvoKit", `Error during anonymization: ${err.message}`);
        }
        finally {
            // Always close the timer, including on the early-return paths above.
            ckl.timeEnd("ConvoKit", "Anonymizing provider data");
        }
    }

    /**
     * Load provider modules dynamically.
     * Loads the configuration and plugins, imports bundled and local provider
     * modules so they self-register, then keeps every registered provider whose
     * input directory exists and contains at least one file with the provider's
     * declared extension. The result is stored in `loadedProviderModules`.
     * @returns {Promise<void>}
     */
    async loadProviders() {
        await loadConfig();
        // Load plugin classes
        await this.loadPlugins();
        // Dynamically import all included provider modules so they self-register
        await importProvidersFrom(Path.join(__dirname, 'providers'), '');
        // Dynamically load all local provider modules so they self-register
        const { localProvidersDir } = getConfig();
        if (localProvidersDir) {
            await importProvidersFrom(`./${localProvidersDir}`, 'local ');
        }
        const { inputDataDirName } = getConfig();
        this.loadedProviderModules = [];
        const inputDataDir = `./${inputDataDirName}`;
        // For each registered provider, check data folder and instantiate
        let dirsInInputDirectory = [];
        try {
            dirsInInputDirectory = await fs.readdir(inputDataDir);
        }
        catch (err) {
            ckl.error('ConvoKit', `Error reading input data directory ${inputDataDir}: ${err}`);
        }
        for (const entry of ProviderRegistry.list()) {
            const { id, ctor: ProviderClass, info: ProviderInfo } = entry;
            ckl.info('ConvoKit', `Loading provider [${id}]: ${ProviderInfo.name} v${ProviderInfo.version}`);
            const providerDirName = ProviderInfo.InputDataInfo.directoryName;
            if (!dirsInInputDirectory.includes(providerDirName)) {
                ckl.warn('ConvoKit', `Input directory missing for provider ${id}: ${providerDirName}. Skipping.`);
                continue;
            }
            const providerDataDir = Path.join(inputDataDir, providerDirName);
            let providerFiles = [];
            try {
                providerFiles = await fs.readdir(providerDataDir);
            }
            catch (err) {
                ckl.warn('ConvoKit', `Cannot read data directory for provider ${id}: ${err}. Skipping.`);
                continue;
            }
            const matchingFiles = providerFiles.filter(f => f.endsWith(ProviderInfo.InputDataInfo.fileExtension));
            if (matchingFiles.length === 0) {
                ckl.warn('ConvoKit', `No matching files for provider ${id} in ${providerDataDir}. Skipping.`);
                continue;
            }
            this.loadedProviderModules.push({ Provider: ProviderClass, ProviderInfo });
        }
        ckl.info('ConvoKit', `Loaded ${this.loadedProviderModules.length} providers via registry.`);
    }

    /**
     * Process data using loaded providers.
     * For each loaded provider, reads every matching input file as JSON,
     * instantiates the provider with it, and, when `Test()` reports the data as
     * compatible, appends the result of `Convert()` to `convoKitFormattedData`.
     * Providers are processed concurrently; per-file and per-directory errors
     * are logged and do not abort the run.
     * @returns {Promise<Array>} The accumulated converted data (empty when the
     *   input directory name is not configured).
     */
    async processDataFromProviders() {
        this.convoKitFormattedData = []; // Clear previous results
        const { inputDataDirName } = getConfig();
        if (!inputDataDirName) {
            ckl.error("ConvoKit", "INPUT_DATA_DIR_NAME environment variable is not set. Cannot process data.");
            return [];
        }
        const inputDataDir = `./${inputDataDirName}`;
        const processingPromises = this.loadedProviderModules.map(async (providerModule) => {
            const providerInfo = providerModule.ProviderInfo;
            const providerDataDir = Path.join(inputDataDir, providerInfo.InputDataInfo.directoryName);
            ckl.info(`Provider: ${providerInfo.name}`, `Loading data from ${providerDataDir}`);
            try {
                const inputDataFiles = (await fs.readdir(providerDataDir))
                    .filter(file => file.endsWith(providerInfo.InputDataInfo.fileExtension));
                for (const file of inputDataFiles) {
                    const filePath = Path.join(providerDataDir, file);
                    try {
                        const fileContent = await fs.readFile(filePath, 'utf8');
                        const chat_data = JSON.parse(fileContent);
                        const providerInstance = new providerModule.Provider(chat_data);
                        const isCompatible = providerInstance.Test();
                        if (isCompatible) {
                            const ConvoKitFormat = providerInstance.Convert();
                            this.convoKitFormattedData.push(ConvoKitFormat);
                            ckl.info(`Provider: ${providerInfo.name}`, `Converted data from ${file} to ConvoKit format`);
                        }
                        else {
                            ckl.error(`Provider: ${providerInfo.name}`, `Data in ${file} is NOT compatible with the provider.`);
                        }
                    }
                    catch (err) {
                        ckl.error(`Provider: ${providerInfo.name}`, `Error processing file ${file}: ${err}`);
                    }
                }
            }
            catch (err) {
                ckl.error(`Provider: ${providerInfo.name}`, `Error reading directory ${providerDataDir}: ${err}`);
            }
        });
        await Promise.all(processingPromises);
        ckl.info("ConvoKit", `Provider processing complete. Total conversations formatted: ${this.convoKitFormattedData.length}`);
        return this.convoKitFormattedData;
    }

    /**
     * Parse raw ConvoKit data into CKContext format by running the `context`
     * formatter plugin. The result is cached in `ckContextResult`.
     * @param options Optional context options forwarded to the formatter.
     * @returns {Promise<object>} The formatter result, or an empty result with
     *   zeroed stats when there is no formatted data to parse.
     */
    async parseToContext(options) {
        if (!this.convoKitFormattedData.length) {
            ckl.warn("ConvoKit", "No ConvoKit formatted data available to parse. Run processDataFromProviders() first.");
            // Return a default empty result rather than throwing
            return {
                processedData: '',
                stats: {
                    conversationsProcessed: 0,
                    conversationsSkipped_NoTargetUser: 0,
                    conversationsSkipped_LowImportance: 0,
                    conversationsSkipped_NoMessages: 0,
                    totalMessagesConsidered: 0,
                    totalMessagesIncluded: 0,
                    totalMessagesFilteredOut: 0
                }
            };
        }
        this.ckContextResult = await this.runFormatter('context', options);
        return this.ckContextResult;
    }

    /**
     * Convert CKContext string to CKIntermediate format (Turn List).
     * The result is cached in `ckTurnListConversations`.
     * @returns {Promise<Array>} The turn-list conversations, or `[]` when
     *   parseToContext() has not produced any data yet.
     */
    async convertToCKTurnList() {
        if (!this.ckContextResult || !this.ckContextResult.processedData) {
            ckl.warn("ConvoKit", "No CKContext data available to convert. Run parseToContext() first.");
            return [];
        }
        this.ckTurnListConversations = await CKContextToCKTurnList(this.ckContextResult.processedData);
        return this.ckTurnListConversations;
    }

    /**
     * Get weighted sample from CKIntermediate conversations.
     * The result is cached in `sampledConversations`.
     * @param samples Passed through unchanged to CKWeightedSample.
     * @returns {Promise<Array>} The sampled conversations, or `[]` when there
     *   are no turn-list conversations to sample from.
     */
    async getWeightedSample(samples) {
        if (this.ckTurnListConversations.length === 0) {
            ckl.warn("ConvoKit", "No intermediate conversations available for sampling. Run convertToCKTurnList() first.");
            return [];
        }
        this.sampledConversations = await CKWeightedSample(this.ckTurnListConversations, samples);
        return this.sampledConversations;
    }

    /**
     * Convert sampled conversations to ChatML format via the `chatml` converter plugin.
     * @param systemPrompt System prompt forwarded to the converter.
     * @returns {Promise<*>} The converter output, or `[]` when nothing has been sampled.
     */
    async exportToChatML(systemPrompt) {
        if (!this.sampledConversations.length) {
            ckl.warn("ConvoKit", "No sampled conversations available for ChatML export. Run getWeightedSample() first.");
            return [];
        }
        return await this.runConverter('chatml', systemPrompt);
    }

    /**
     * Convert sampled conversations to Gemini format via the `gemini` converter plugin.
     * @param systemPrompt System prompt forwarded to the converter.
     * @returns {Promise<*>} The converter output, or `[]` when nothing has been sampled.
     */
    async exportToGemini(systemPrompt) {
        if (!this.sampledConversations.length) {
            ckl.warn("ConvoKit", "No sampled conversations available for Gemini export. Run getWeightedSample() first.");
            return [];
        }
        return await this.runConverter('gemini', systemPrompt);
    }

    /**
     * List registered provider IDs.
     * @returns {Array} The `id` of every entry in the provider registry.
     */
    listProviders() {
        return ProviderRegistry.list().map(entry => entry.id);
    }

    /**
     * List all registered formatter plugin IDs.
     */
    listFormatters() {
        return PluginRegistry.listFormatters();
    }

    /**
     * List all registered converter plugin IDs.
     */
    listConverters() {
        return PluginRegistry.listConverters();
    }

    /**
     * List all registered filter plugin IDs.
     */
    listFilters() {
        return PluginRegistry.listFilters();
    }

    /**
     * Runs a registered formatter plugin by ID on current formatted data.
     * @param id Formatter plugin ID.
     * @param options Optional context options.
     * @returns {Promise<*>} Whatever the plugin's `apply` resolves to.
     * @throws {Error} When no formatter is registered under `id`.
     */
    async runFormatter(id, options) {
        const pluginCtor = PluginRegistry.getFormatter(id);
        if (!pluginCtor) {
            throw new Error(`Formatter plugin "${id}" not found.`);
        }
        const plugin = new pluginCtor();
        return await plugin.apply(this.convoKitFormattedData, options);
    }

    /**
     * Runs a registered converter plugin by ID on current sampled conversations.
     * @param id Converter plugin ID.
     * @param systemPrompt System prompt string for converter.
     * @returns {Promise<*>} Whatever the plugin's `apply` resolves to.
     * @throws {Error} When no converter is registered under `id`.
     */
    async runConverter(id, systemPrompt) {
        const pluginCtor = PluginRegistry.getConverter(id);
        if (!pluginCtor) {
            throw new Error(`Converter plugin "${id}" not found.`);
        }
        const plugin = new pluginCtor();
        return await plugin.apply(this.sampledConversations, systemPrompt);
    }

    /**
     * Tests a content string against a registered filter plugin.
     * A plugin whose `filterType` is 'MUST' passes content when `apply` is
     * truthy; for any other `filterType` the result of `apply` is negated.
     * @param id Filter plugin ID.
     * @param content Message content to test.
     * @returns True if content passes the filter, false otherwise.
     * @throws {Error} When no filter is registered under `id`.
     */
    runFilter(id, content) {
        const pluginCtor = PluginRegistry.getFilter(id);
        if (!pluginCtor) {
            throw new Error(`Filter plugin "${id}" not found.`);
        }
        const plugin = new pluginCtor();
        const result = plugin.apply(content);
        return plugin.filterType === 'MUST' ? result : !result;
    }

    // --- Getters for internal state ---

    /** @returns The data produced by processDataFromProviders(). */
    getFormattedData() {
        return this.convoKitFormattedData;
    }

    /** @returns The cached parseToContext() result, or null if it has not run. */
    getContextResult() {
        return this.ckContextResult;
    }

    /** @returns The conversations produced by convertToCKTurnList(). */
    getIntermediateConversations() {
        return this.ckTurnListConversations;
    }

    /** @returns The conversations produced by getWeightedSample(). */
    getSampledConversations() {
        return this.sampledConversations;
    }
}
//# sourceMappingURL=index.js.map