UNPKG

@lewist9x/distil

Version:

An opinionated library for managing LLM pipelines. Define, track, rate, and curate prompt–completion pairs for fine-tuning.

266 lines (265 loc) • 9.65 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const express_1 = require("express"); const elasticsearch_1 = require("@elastic/elasticsearch"); const config_1 = require("../config"); const openai_1 = __importDefault(require("openai")); const fs_1 = __importDefault(require("fs")); const path_1 = __importDefault(require("path")); const level_1 = require("level"); const router = (0, express_1.Router)(); const esClient = new elasticsearch_1.Client({ node: config_1.config.elastic.host }); const openai = new openai_1.default({ apiKey: config_1.config.openai.apiKey }); // Initialize LevelDB const db = new level_1.Level('./db', { valueEncoding: 'json' }); function isValidGenerationDocument(doc) { return (typeof doc === 'object' && doc !== null && typeof doc.metadata === 'object' && doc.metadata !== null && typeof doc.metadata.input === 'object' && doc.metadata.input !== null && typeof doc.metadata.input.preprocessed === 'object' && doc.metadata.input.preprocessed !== null && typeof doc.metadata.input.preprocessed.systemPrompt === 'string' && typeof doc.metadata.input.preprocessed.userPrompt === 'string' && typeof doc.metadata.pipelineName === 'string' && typeof doc.processedOutput === 'string'); } // Prepare data for OpenAI finetuning format function formatForFinetuning(generations) { return generations.map(gen => ({ messages: [ { role: "system", content: gen.metadata.input.preprocessed.systemPrompt }, { role: "user", content: gen.metadata.input.preprocessed.userPrompt }, { role: "assistant", content: gen.processedOutput } ] })); } function formatForOpenAI() { return { role: "system", content: "You are a helpful assistant." }; } function formatMessages(generation) { var _a, _b, _c; const messages = [ formatForOpenAI(), { role: "user", content: (_b = (_a = generation.metadata) === null || _a === void 0 ? void 0 : _a.input.parameters) !== null && _b !== void 0 ? _b : "" }, { role: "assistant", content: (_c = generation.metadata) === null || _c === void 0 ? void 0 : _c.output } ]; return messages; } function formatForOpenAIGeneration(generation) { return { messages: [ { role: "user", content: generation.metadata.input.parameters || JSON.stringify(generation.metadata.input) }, { role: "assistant", content: JSON.stringify(generation.metadata.output) } ] }; } // Ensure temp directory exists const tempDir = path_1.default.join(__dirname, '../../temp'); if (!fs_1.default.existsSync(tempDir)) { fs_1.default.mkdirSync(tempDir, { recursive: true }); } router.post('/prepare', async (req, res) => { try { const { pipelineHash, generationIds, minRating, maxRating, model } = req.body; let generations = []; if (pipelineHash) { // Get all generations for a pipeline version const must = [{ term: { pipelineHash } }]; if (minRating !== undefined) { must.push({ range: { rating: { gte: minRating } } }); } if (maxRating !== undefined) { must.push({ range: { rating: { lte: maxRating } } }); } const response = await esClient.search({ index: '*', query: { bool: { must } }, size: 1000 }); generations = response.hits.hits .filter((hit) => hit._source !== undefined && isValidGenerationDocument(hit._source) && typeof hit._id === 'string') .map(hit => ({ ...hit._source, id: hit._id })); } else if (generationIds) { // Get specific generations by IDs const response = await esClient.mget({ index: '*', ids: generationIds }); generations = response.docs .filter((doc) => 'found' in doc && doc.found === true && '_source' in doc && typeof doc._id === 'string' && isValidGenerationDocument(doc._source)) .map(doc => ({ ...doc._source, id: doc._id })); } if (generations.length === 0) { return res.status(400).json({ error: 'No generations found matching criteria' }); } // Format data for finetuning const finetuningData = formatForFinetuning(generations); // Create a JSONL file const filePath = path_1.default.join(tempDir, `finetune_${Date.now()}.jsonl`); const jsonl = finetuningData.map(item => JSON.stringify(item)).join('\n'); fs_1.default.writeFileSync(filePath, jsonl); // Create file for fine-tuning const file = await openai.files.create({ file: fs_1.default.createReadStream(filePath), purpose: 'fine-tune' }); // Clean up the temp file fs_1.default.unlinkSync(filePath); // Start finetuning job const fineTune = await openai.fineTuning.jobs.create({ training_file: file.id, model }); // Update generations as finetuned await Promise.all(generations.map(gen => esClient.update({ index: gen.metadata.pipelineName.toLowerCase(), id: gen.id, doc: { isFinetuned: true, finetuneJobId: fineTune.id } }))); res.json({ success: true, jobId: fineTune.id, numGenerations: generations.length }); } catch (error) { console.error('Finetune preparation error:', error); res.status(500).json({ error: error.message || 'Unknown error occurred' }); } }); router.post('/prepare-openai', async (req, res) => { let filePath = null; try { const { pipelineName, generationIds, model } = req.body; if (!pipelineName || !Array.isArray(generationIds)) { throw new Error('Pipeline name and generation IDs are required'); } // Get generations from LevelDB const generations = []; for (const id of generationIds) { try { const gen = await db.get(`generation:${pipelineName}:${id}`); if (gen) { generations.push(JSON.parse(gen)); } } catch (err) { console.warn(`Generation ${id} not found`); } } if (generations.length === 0) { throw new Error('No generations found for the given IDs'); } // Format for OpenAI const trainingData = generations.map(formatMessages); const jsonl = trainingData.map(data => JSON.stringify({ messages: data })).join('\n'); // Create a JSONL file filePath = path_1.default.join(tempDir, `training_data_${Date.now()}.jsonl`); fs_1.default.writeFileSync(filePath, jsonl); // Create file for fine-tuning const response = await openai.files.create({ file: fs_1.default.createReadStream(filePath), purpose: 'fine-tune' }); // Clean up the temp file fs_1.default.unlinkSync(filePath); filePath = null; res.json({ success: true, file: response.id, message: `Created file ${response.id} with ${generations.length} examples from pipeline ${pipelineName}` }); } catch (error) { console.error('Error preparing fine-tune data:', error); // Clean up temp file if it exists if (filePath && fs_1.default.existsSync(filePath)) { fs_1.default.unlinkSync(filePath); } res.status(500).json({ success: false, error: error.message || 'Unknown error occurred' }); } }); router.post('/start', async (req, res) => { try { const { fileId, model = "gpt-3.5-turbo" } = req.body; if (!fileId) { throw new Error('File ID is required'); } const fineTune = await openai.fineTuning.jobs.create({ training_file: fileId, model }); res.json({ success: true, jobId: fineTune.id, message: `Started fine-tuning job ${fineTune.id}` }); } catch (error) { console.error('Error starting fine-tune:', error); res.status(500).json({ success: false, error: error.message || 'Unknown error occurred' }); } }); router.get('/status/:jobId', async (req, res) => { try { const { jobId } = req.params; const status = await openai.fineTuning.jobs.retrieve(jobId); res.json({ success: true, status }); } catch (error) { console.error('Error checking fine-tune status:', error); res.status(500).json({ success: false, error: error.message || 'Unknown error occurred' }); } }); exports.default = router;