convokit
Version:
A flexible TypeScript framework for ingesting, processing, and exporting chat/conversation data for LLM training and analysis.
288 lines • 13.1 kB
JavaScript
import { Command, Option } from 'commander';
import { ConvoKit } from './index.js';
import { ConvoKitLogging as ckl } from './ck/ConvoKitLogging.js';
import fs from 'fs/promises';
import Path from 'path';
import { loadConfig } from './ck/ConvoKitConfig.js';
import { ProviderRegistry } from './ck/ProviderRegistry.js';
import { PluginRegistry } from './ck/PluginRegistry.js';
// Root commander instance: the CLI metadata and every subcommand in this file
// are registered on it, and it is what finally parses process.argv.
const program = new Command();
// --- Helper Functions ---
/**
 * Load the ConvoKit configuration and prepare a ConvoKit instance for a run.
 *
 * Any failure (configuration cannot be loaded, unknown provider IDs, no
 * matching provider modules) is logged through `ckl` and terminates the
 * process with exit code 1.
 *
 * @param {string[]} [providerIds] - Provider IDs selected on the command line.
 *   When omitted or empty, every provider module loaded by ConvoKit is kept.
 * @returns {Promise<{ck: ConvoKit, config: object}>} The prepared ConvoKit
 *   instance together with the loaded configuration.
 */
async function initializeConvoKit(providerIds) {
  let config;
  try {
    config = await loadConfig();
  }
  catch (error) {
    // Report through the shared CLI logger, like every other failure path in
    // this file, instead of bare console calls.
    ckl.error('CLI', `Failed to load configuration: ${error.message}`);
    ckl.info('CLI', 'Ensure convokit.config.json exists or environment variables are set.');
    ckl.info('CLI', 'You can create an example config using: npx convokit create-config');
    process.exit(1);
  }
  const ck = new ConvoKit();
  try {
    // Load all providers and plugins first to populate registries
    await ck.loadProviders(); // This loads config, plugins, and providers
    // Filter which providers to actually *use* based on the command option
    if (providerIds && providerIds.length > 0) {
      const availableProviders = ProviderRegistry.list().map((p) => p.id);
      const availableSet = new Set(availableProviders);
      const invalidProviders = providerIds.filter((id) => !availableSet.has(id));
      if (invalidProviders.length > 0) {
        throw new Error(`Unknown provider(s) specified: ${invalidProviders.join(', ')}. Available: ${availableProviders.join(', ')}`);
      }
      // Modify the internal list of providers to use for processing:
      // keep only modules whose registered ID was explicitly requested.
      const requestedIds = new Set(providerIds);
      ck.loadedProviderModules = ck.loadedProviderModules.filter((mod) => {
        const registeredId = ProviderRegistry.findIdByConstructor(mod.Provider);
        return registeredId ? requestedIds.has(registeredId) : false;
      });
      if (ck.loadedProviderModules.length === 0) {
        throw new Error(`None of the specified providers (${providerIds.join(', ')}) have corresponding data folders or files in '${config.inputDataDirName}'.`);
      }
      ckl.info('CLI', `Using specified providers: ${providerIds.join(', ')}`);
    }
    else {
      // No explicit selection: report whichever loaded modules resolve to a registered ID.
      const loadedProviderIds = ck.loadedProviderModules
        .map((mod) => ProviderRegistry.findIdByConstructor(mod.Provider))
        .filter((id) => id);
      ckl.info('CLI', `Using all available providers with data: ${loadedProviderIds.join(', ')}`);
    }
  }
  catch (error) {
    ckl.error('CLI', `Initialization failed: ${error.message}`);
    process.exit(1);
  }
  return { ck, config };
}
/**
 * Run the provider-processing stage and then parse the result to CKContext.
 *
 * @param {object} ck - ConvoKit instance; `processDataFromProviders()` and
 *   `parseToContext(options)` are called on it, in that order.
 * @param {object} config - Loaded configuration; reads `targetUsers`,
 *   `minImportanceChat` and `minImportanceMessage`.
 * @returns {Promise<object>} Whatever `ck.parseToContext` resolves to (its
 *   `stats` property is logged).
 * @throws {Error} When processing yields no data or `targetUsers` is missing/empty.
 */
async function runPipelineToContext(ck, config) {
  const isMissingOrEmpty = (list) => !list || list.length === 0;

  ckl.info('CLI', 'Processing provider data...');
  const conversations = await ck.processDataFromProviders();
  if (isMissingOrEmpty(conversations)) {
    throw new Error("No data generated from providers. Check input data directory and provider configurations.");
  }
  ckl.info('CLI', `Processing complete. ${conversations.length} conversations processed.`);

  // Target users are only validated once processing has succeeded.
  const { targetUsers } = config;
  if (isMissingOrEmpty(targetUsers)) {
    throw new Error("Target Users are required for context generation. Set 'targetUsers' in config.");
  }

  ckl.info('CLI', 'Parsing data to CKContext format...');
  const parsed = await ck.parseToContext({
    targetUsers,
    minimumAllowedImportanceChat: config.minImportanceChat,
    minimumAllowedImportancePerMessage: config.minImportanceMessage,
  });
  ckl.info('CLI', 'Parsing to CKContext complete.');
  ckl.info('CLI', `Context Stats: ${JSON.stringify(parsed.stats)}`);
  return parsed;
}
/**
 * Run every stage of the export pipeline: context generation, turn-list
 * conversion, weighted sampling and finally the chosen converter.
 *
 * @param {object} ck - ConvoKit instance driving each stage.
 * @param {object} config - Loaded configuration; reads `sampleSize` and `systemPrompt`.
 * @param {string} converterId - ID of the converter to run; must be one of
 *   `ck.listConverters()`.
 * @returns {Promise<Array>} The value resolved by `ck.runConverter`.
 * @throws {Error} When a stage yields no data or the converter ID is not listed.
 */
async function runFullExportPipeline(ck, config, converterId) {
  // Returns `items` unchanged, or throws when it is falsy or has length 0.
  const ensureHasItems = (items, failureMessage) => {
    if (!items || items.length === 0) {
      throw new Error(failureMessage);
    }
    return items;
  };

  await runPipelineToContext(ck, config);

  ckl.info('CLI', 'Converting to CKTurnList format...');
  const turns = ensureHasItems(await ck.convertToCKTurnList(), "Conversion to TurnList resulted in no data.");
  ckl.info('CLI', `Conversion to CKTurnList complete. ${turns.length} conversations in list.`);

  const { sampleSize } = config;
  ckl.info('CLI', `Generating ${sampleSize} weighted samples...`);
  const samples = ensureHasItems(await ck.getWeightedSample(sampleSize), "Sampling resulted in no data.");
  ckl.info('CLI', `Sampling complete. ${samples.length} conversations sampled.`);

  ckl.info('CLI', `Exporting data to ${converterId} format...`);
  const isKnownConverter = ck.listConverters().includes(converterId);
  if (!isKnownConverter) {
    throw new Error(`Unsupported export format: ${converterId}. Available converters: ${ck.listConverters().join(', ')}`);
  }
  const exported = await ck.runConverter(converterId, config.systemPrompt);
  ckl.info('CLI', `Export complete. ${exported.length} items exported.`);
  return exported;
}
/**
 * Write pipeline output to disk, logging the outcome through `ckl`.
 *
 * @param {string} outputPath - Destination path; resolved to an absolute path first.
 * @param {*} data - Value to persist.
 * @param {boolean} [asJson=true] - When true, `data` is written as 2-space
 *   indented JSON; otherwise it is converted with `String(data)`.
 * @returns {Promise<void>} Resolves after a successful write. On failure the
 *   error is logged and the process exits with code 1.
 */
async function saveOutput(outputPath, data, asJson = true) {
  const destination = Path.resolve(outputPath);
  try {
    // Serialisation stays inside the try so that its failures are reported
    // the same way as write failures.
    let serialized;
    if (asJson) {
      serialized = JSON.stringify(data, null, 2);
    }
    else {
      serialized = String(data);
    }
    await fs.writeFile(destination, serialized);
    ckl.success('CLI', `Output saved successfully to ${destination}`);
  }
  catch (writeError) {
    ckl.error('CLI', `Failed to write output to ${destination}: ${writeError.message}`);
    process.exit(1);
  }
}
// --- CLI Commands ---
// Top-level CLI metadata: program name, description and version string.
program.name('convokit');
program.description('CLI tool for the ConvoKit conversation processing framework.');
program.version('0.1.0'); // TODO: Link to package.json version
// `create-config` (alias `cfg`): copy the example configuration into the
// current working directory as convokit.config.json, refusing to overwrite an
// existing file. Exits with code 1 on any failure.
program
  .command('create-config')
  .alias('cfg')
  .description('Create an example convokit.config.json file in the current directory.')
  .action(async () => {
    // This file is an ES module, where `__dirname` is not defined. Resolve the
    // example file (one directory above this module) from the module's own
    // URL instead; fs.copyFile accepts a file: URL as its source.
    const exampleConfigUrl = new URL('../convokit.config.json.example', import.meta.url); // Adjust path as needed
    const targetConfigPath = Path.resolve(process.cwd(), 'convokit.config.json');
    try {
      // COPYFILE_EXCL makes the copy fail with EEXIST instead of overwriting.
      await fs.copyFile(exampleConfigUrl, targetConfigPath, fs.constants.COPYFILE_EXCL);
      ckl.success('CLI', `Example configuration file created at ${targetConfigPath}`);
      ckl.info('CLI', 'Please edit this file with your specific settings.');
    }
    catch (error) {
      if (error.code === 'EEXIST') {
        ckl.error('CLI', `Configuration file already exists at ${targetConfigPath}.`);
      }
      else {
        ckl.error('CLI', `Failed to create configuration file: ${error.message}`);
      }
      process.exit(1);
    }
  });
// `providers`: load providers through ConvoKit and print the details of every
// entry returned by ProviderRegistry.list().
program
  .command('providers')
  .description('List available built-in and local providers.')
  .action(async () => {
    const RULE = '------------------------------------';
    ckl.info('CLI', 'Listing available providers...');

    // The config is only consulted for the optional "loaded from" footer.
    let localDir;
    try {
      ({ localProvidersDir: localDir } = await loadConfig());
    }
    catch { /* Ignore config loading errors for listing */ }

    const ck = new ConvoKit();
    try {
      await ck.loadProviders();
      const registered = ProviderRegistry.list();
      if (registered.length === 0) {
        ckl.warn('CLI', 'No providers found or registered.');
        return;
      }
      ckl.info('CLI', RULE);
      for (const { id, info } of registered) {
        ckl.info('CLI', `ID: ${id}`);
        ckl.info('CLI', ` Name: ${info.name}`);
        ckl.info('CLI', ` Version: ${info.version}`);
        ckl.info('CLI', ` Author: ${info.author}`);
        ckl.info('CLI', ` Input Dir: ${info.InputDataInfo.directoryName}`);
        ckl.info('CLI', ` Input Ext: ${info.InputDataInfo.fileExtension}`);
        ckl.info('CLI', RULE);
      }
      if (localDir) {
        ckl.info('CLI', `(Local providers loaded from: ${localDir})`);
      }
    }
    catch (listError) {
      ckl.error('CLI', `Failed to list providers: ${listError.message}`);
      process.exit(1);
    }
  });
// `plugins`: load everything through ConvoKit and print the formatter,
// converter and filter plugins known to the PluginRegistry.
program
  .command('plugins')
  .description('List available built-in and local plugins (formatters, converters, filters).')
  .action(async () => {
    ckl.info('CLI', 'Listing available plugins...');
    // The config is only consulted for the optional "loaded from" footer.
    let localDir;
    try {
      const config = await loadConfig();
      localDir = config.localPluginsDir;
    }
    catch { /* Ignore config loading errors for listing */ }
    const ck = new ConvoKit();
    try {
      await ck.loadProviders();
      // Print one section for a plugin category; `lister` returns that category's plugin IDs.
      const listPlugins = (type, lister) => {
        const plugins = lister();
        if (plugins.length > 0) {
          ckl.info('CLI', `--- ${type.toUpperCase()} ---`);
          plugins.forEach((id) => {
            const pluginInfo = PluginRegistry.getPluginInfo(id);
            if (pluginInfo) {
              ckl.info('CLI', ` ID: ${pluginInfo.id}`);
              ckl.info('CLI', ` Name: ${pluginInfo.name}`);
              ckl.info('CLI', ` Version: ${pluginInfo.version}`);
            }
            else {
              ckl.warn('CLI', ` ID: ${id} (Info not found - registration issue?)`);
            }
          });
        }
        else {
          ckl.info('CLI', `--- No ${type.toUpperCase()} registered ---`);
        }
      };
      // Call the registry methods through arrow wrappers rather than passing
      // them as bare references, so they keep PluginRegistry as `this`.
      listPlugins('Formatters', () => PluginRegistry.listFormatters());
      listPlugins('Converters', () => PluginRegistry.listConverters());
      listPlugins('Filters', () => PluginRegistry.listFilters());
      if (localDir) {
        ckl.info('CLI', `(Local plugins loaded from: ${localDir})`);
      }
    }
    catch (error) {
      ckl.error('CLI', `Failed to list plugins: ${error.message}`);
      process.exit(1);
    }
  });
// `-p/--providers` option used by the `context` and `export` commands. The raw
// value is split on commas, each piece trimmed, and empty pieces dropped.
const providersOption = new Option(
  '-p, --providers <ids>',
  'Comma-separated list of provider IDs to use (e.g., discord,telegram). Uses all available if omitted.',
).argParser((value) => {
  const ids = [];
  for (const piece of value.split(',')) {
    const id = piece.trim();
    if (id) {
      ids.push(id);
    }
  }
  return ids;
});
// `context`: run the pipeline up to CKContext generation. With --output the
// result is saved (full JSON when --stats is given, otherwise only the
// processedData text); without it only a summary is logged.
program
  .command('context')
  .description('Process data and generate CKContext output. Outputs plain text context by default.')
  .addOption(providersOption)
  .addOption(new Option('-o, --output <file>', 'Output file path for the CKContext result.'))
  .addOption(new Option('--stats', 'Output the full JSON object including stats, instead of just the context text.'))
  .action(async (options) => {
    ckl.info('CLI', 'Starting CKContext generation...');
    try {
      const { ck, config } = await initializeConvoKit(options.providers);
      const contextResult = await runPipelineToContext(ck, config);
      const { output, stats } = options;
      if (output) {
        // --stats: whole result object as JSON; otherwise the processedData string as plain text.
        const payload = stats ? contextResult : contextResult.processedData;
        await saveOutput(output, payload, Boolean(stats));
      }
      else {
        ckl.warn('CLI', 'CKContext generated in memory. Use --output to save.');
        ckl.info('CLI', `Generated context string length: ${contextResult.processedData.length}`);
        if (stats) {
          ckl.info('CLI', `Stats: ${JSON.stringify(contextResult.stats)}`);
        }
      }
      ckl.success('CLI', 'CKContext generation finished.');
    }
    catch (failure) {
      ckl.error('CLI', `CKContext generation failed: ${failure.message}`);
      process.exit(1);
    }
  });
// `export <converter_id>`: run the full pipeline and either save the exported
// items to --output or just report how many were produced.
program
  .command('export')
  .description('Run the full pipeline (process, context, turnlist, sample) and export to a specified format.')
  .argument('<converter_id>', 'The ID of the converter plugin to use (e.g., chatml, gemini).')
  .addOption(providersOption)
  .addOption(new Option('-o, --output <file>', 'Output file path for the exported data (JSON array of strings).'))
  .action(async (converterId, options) => {
    ckl.info('CLI', `Starting export process for format: ${converterId}...`);
    try {
      const { ck, config } = await initializeConvoKit(options.providers);
      const exportedItems = await runFullExportPipeline(ck, config, converterId);
      const { output } = options;
      if (!output) {
        ckl.warn('CLI', 'Exported data generated in memory. Use --output to save.');
        ckl.info('CLI', `Generated ${exportedItems.length} export items.`);
      }
      else {
        await saveOutput(output, exportedItems);
      }
      ckl.success('CLI', 'Export process finished.');
    }
    catch (failure) {
      ckl.error('CLI', `Export failed: ${failure.message}`);
      process.exit(1);
    }
  });
// Entry point: parse argv and run the matching command. Any rejection that
// reaches this point is logged with its stack and ends the process with code 1.
program.parseAsync(process.argv).catch(function reportUnhandled(err) {
  ckl.error('CLI', `Unhandled error: ${err.message}`);
  console.error(err.stack);
  process.exit(1);
});
//# sourceMappingURL=cli.js.map