@lewist9x/distil
An opinionated library for managing LLM pipelines. Define, track, rate, and curate prompt–completion pairs for fine-tuning.
266 lines (265 loc) • 9.65 kB
JavaScript
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const elasticsearch_1 = require("@elastic/elasticsearch");
const config_1 = require("../config");
const openai_1 = __importDefault(require("openai"));
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const level_1 = require("level");
const router = (0, express_1.Router)();
const esClient = new elasticsearch_1.Client({ node: config_1.config.elastic.host });
const openai = new openai_1.default({ apiKey: config_1.config.openai.apiKey });
// Initialize LevelDB
const db = new level_1.Level('./db', { valueEncoding: 'json' });
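// Storage used by the routes below: Elasticsearch holds one index per pipeline
// (the lowercased pipelineName) and backs /prepare; LevelDB stores generations
// under keys of the form `generation:<pipelineName>:<id>` and backs /prepare-openai.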
function isValidGenerationDocument(doc) {
    return (typeof doc === 'object' &&
        doc !== null &&
        typeof doc.metadata === 'object' &&
        doc.metadata !== null &&
        typeof doc.metadata.input === 'object' &&
        doc.metadata.input !== null &&
        typeof doc.metadata.input.preprocessed === 'object' &&
        doc.metadata.input.preprocessed !== null &&
        typeof doc.metadata.input.preprocessed.systemPrompt === 'string' &&
        typeof doc.metadata.input.preprocessed.userPrompt === 'string' &&
        typeof doc.metadata.pipelineName === 'string' &&
        typeof doc.processedOutput === 'string');
}
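// Shape accepted by isValidGenerationDocument (values below are illustrative):
// {
//   metadata: {
//     pipelineName: "summarizer",
//     input: { preprocessed: { systemPrompt: "...", userPrompt: "..." } }
//   },
//   processedOutput: "..."
// }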
// Prepare data in OpenAI's chat fine-tuning format
function formatForFinetuning(generations) {
    return generations.map(gen => ({
        messages: [
            { role: "system", content: gen.metadata.input.preprocessed.systemPrompt },
            { role: "user", content: gen.metadata.input.preprocessed.userPrompt },
            { role: "assistant", content: gen.processedOutput }
        ]
    }));
}
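// One JSONL line produced from a generation looks roughly like this
// (field values are made up):
// {"messages":[{"role":"system","content":"You are a sentiment classifier."},
//              {"role":"user","content":"Review: great battery life"},
//              {"role":"assistant","content":"positive"}]}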
// Default system message prepended to every LevelDB-backed training example
function formatForOpenAI() {
    return {
        role: "system",
        content: "You are a helpful assistant."
    };
}
// Build a chat-format training example from a generation stored in LevelDB.
// Missing fields fall back to empty strings so the JSONL stays well-formed.
function formatMessages(generation) {
    return [
        formatForOpenAI(),
        {
            role: "user",
            content: generation.metadata?.input?.parameters ?? ""
        },
        {
            role: "assistant",
            content: generation.metadata?.output ?? ""
        }
    ];
}
// Alternative formatter that stringifies structured inputs and outputs.
// Note: not currently referenced by any route in this file.
function formatForOpenAIGeneration(generation) {
    return {
        messages: [
            {
                role: "user",
                content: generation.metadata.input.parameters || JSON.stringify(generation.metadata.input)
            },
            {
                role: "assistant",
                content: JSON.stringify(generation.metadata.output)
            }
        ]
    };
}
// Ensure temp directory exists
const tempDir = path_1.default.join(__dirname, '../../temp');
if (!fs_1.default.existsSync(tempDir)) {
    fs_1.default.mkdirSync(tempDir, { recursive: true });
}
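// POST /prepare
// Selects generations from Elasticsearch (by pipelineHash with optional rating
// bounds, or by explicit generationIds), writes them to a temporary JSONL file,
// uploads the file to OpenAI, and starts a fine-tuning job.
// Illustrative request body (values are made up):
// { "pipelineHash": "abc123", "minRating": 4, "model": "gpt-4o-mini" }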
router.post('/prepare', async (req, res) => {
    try {
        const { pipelineHash, generationIds, minRating, maxRating, model } = req.body;
        let generations = [];
        if (pipelineHash) {
            // Get all generations for a pipeline version
            const must = [{ term: { pipelineHash } }];
            if (minRating !== undefined) {
                must.push({ range: { rating: { gte: minRating } } });
            }
            if (maxRating !== undefined) {
                must.push({ range: { rating: { lte: maxRating } } });
            }
            const response = await esClient.search({
                index: '*',
                query: {
                    bool: { must }
                },
                size: 1000
            });
            generations = response.hits.hits
                .filter((hit) => hit._source !== undefined &&
                    isValidGenerationDocument(hit._source) &&
                    typeof hit._id === 'string')
                .map(hit => ({
                    ...hit._source,
                    id: hit._id
                }));
        }
        else if (generationIds) {
            // Get specific generations by IDs
            const response = await esClient.mget({
                index: '*',
                ids: generationIds
            });
            generations = response.docs
                .filter((doc) => 'found' in doc &&
                    doc.found === true &&
                    '_source' in doc &&
                    typeof doc._id === 'string' &&
                    isValidGenerationDocument(doc._source))
                .map(doc => ({
                    ...doc._source,
                    id: doc._id
                }));
        }
        if (generations.length === 0) {
            return res.status(400).json({ error: 'No generations found matching criteria' });
        }
        // Format data for finetuning
        const finetuningData = formatForFinetuning(generations);
        // Create a JSONL file
        const filePath = path_1.default.join(tempDir, `finetune_${Date.now()}.jsonl`);
        const jsonl = finetuningData.map(item => JSON.stringify(item)).join('\n');
        fs_1.default.writeFileSync(filePath, jsonl);
        // Upload the file to OpenAI for fine-tuning
        const file = await openai.files.create({
            file: fs_1.default.createReadStream(filePath),
            purpose: 'fine-tune'
        });
        // Clean up the temp file
        fs_1.default.unlinkSync(filePath);
        // Start the fine-tuning job
        const fineTune = await openai.fineTuning.jobs.create({
            training_file: file.id,
            model
        });
        // Mark the source generations as fine-tuned
        await Promise.all(generations.map(gen => esClient.update({
            index: gen.metadata.pipelineName.toLowerCase(),
            id: gen.id,
            doc: {
                isFinetuned: true,
                finetuneJobId: fineTune.id
            }
        })));
        res.json({
            success: true,
            jobId: fineTune.id,
            numGenerations: generations.length
        });
    }
    catch (error) {
        console.error('Finetune preparation error:', error);
        res.status(500).json({ error: error.message || 'Unknown error occurred' });
    }
});
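// POST /prepare-openai
// Reads the listed generations for one pipeline from LevelDB, formats them as chat
// messages, and uploads a JSONL training file to OpenAI; no fine-tuning job is
// started here. The `model` field is accepted in the body but not used by this route.
// Illustrative request body:
// { "pipelineName": "summarizer", "generationIds": ["gen-1", "gen-2"] }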
router.post('/prepare-openai', async (req, res) => {
    let filePath = null;
    try {
        const { pipelineName, generationIds, model } = req.body;
        if (!pipelineName || !Array.isArray(generationIds)) {
            throw new Error('Pipeline name and generation IDs are required');
        }
        // Get generations from LevelDB
        const generations = [];
        for (const id of generationIds) {
            try {
                // valueEncoding: 'json' means the stored value is already decoded
                const gen = await db.get(`generation:${pipelineName}:${id}`);
                if (gen) {
                    generations.push(gen);
                }
            }
            catch (err) {
                console.warn(`Generation ${id} not found`);
            }
        }
        if (generations.length === 0) {
            throw new Error('No generations found for the given IDs');
        }
        // Format for OpenAI
        const trainingData = generations.map(formatMessages);
        const jsonl = trainingData.map(data => JSON.stringify({ messages: data })).join('\n');
        // Create a JSONL file
        filePath = path_1.default.join(tempDir, `training_data_${Date.now()}.jsonl`);
        fs_1.default.writeFileSync(filePath, jsonl);
        // Upload the file to OpenAI for fine-tuning
        const response = await openai.files.create({
            file: fs_1.default.createReadStream(filePath),
            purpose: 'fine-tune'
        });
        // Clean up the temp file
        fs_1.default.unlinkSync(filePath);
        filePath = null;
        res.json({
            success: true,
            file: response.id,
            message: `Created file ${response.id} with ${generations.length} examples from pipeline ${pipelineName}`
        });
    }
    catch (error) {
        console.error('Error preparing fine-tune data:', error);
        // Clean up temp file if it exists
        if (filePath && fs_1.default.existsSync(filePath)) {
            fs_1.default.unlinkSync(filePath);
        }
        res.status(500).json({
            success: false,
            error: error.message || 'Unknown error occurred'
        });
    }
});
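// POST /start
// Starts a fine-tuning job for a previously uploaded training file.
// Illustrative request body: { "fileId": "file-abc123", "model": "gpt-3.5-turbo" }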
router.post('/start', async (req, res) => {
    try {
        const { fileId, model = "gpt-3.5-turbo" } = req.body;
        if (!fileId) {
            throw new Error('File ID is required');
        }
        const fineTune = await openai.fineTuning.jobs.create({
            training_file: fileId,
            model
        });
        res.json({
            success: true,
            jobId: fineTune.id,
            message: `Started fine-tuning job ${fineTune.id}`
        });
    }
    catch (error) {
        console.error('Error starting fine-tune:', error);
        res.status(500).json({
            success: false,
            error: error.message || 'Unknown error occurred'
        });
    }
});
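// GET /status/:jobId
// Returns the OpenAI fine-tuning job object for the given job ID.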
router.get('/status/:jobId', async (req, res) => {
    try {
        const { jobId } = req.params;
        const status = await openai.fineTuning.jobs.retrieve(jobId);
        res.json({
            success: true,
            status
        });
    }
    catch (error) {
        console.error('Error checking fine-tune status:', error);
        res.status(500).json({
            success: false,
            error: error.message || 'Unknown error occurred'
        });
    }
});
exports.default = router;