lynkr
Version:
Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.
192 lines (171 loc) • 5.85 kB
JavaScript
#!/usr/bin/env node
/**
* Train the risk classifier (Phase 3.4).
*
* Two label sources, fused:
* 1. Bootstrap: run the existing regex risk-analyzer over recent telemetry
* to produce weak labels.
* 2. Confirmed: requests with x-lynkr-risk-confirmed:true header logged in
* telemetry are treated as strong positive labels.
*
* Writes data/risk-classifier.json (weights + bias). Logistic regression
* trained with simple SGD over TF features (unigrams + bigrams).
*
* Usage: node scripts/train-risk-classifier.js [--days 30] [--epochs 10]
*/
const fs = require('fs');
const path = require('path');
// Defaults for the --days / --epochs command-line flags.
const DEFAULT_DAYS = 30;
const DEFAULT_EPOCHS = 10;

// SGD hyperparameters consumed by _train().
const LEARNING_RATE = 0.1; // step size of every per-sample update
const L2_REG = 0.0001; // L2 penalty coefficient applied to the weights being updated

// A feature is kept only when it occurs in at least this many samples.
const MIN_TOKEN_FREQ = 3;

// Destination of the serialized model.
const OUTPUT_PATH = path.join(__dirname, '../data/risk-classifier.json');

// Telemetry database locations, probed in this order.
const TELEMETRY_DB_CANDIDATES = ['../.lynkr/telemetry.db', '../data/lynkr.db'].map((relative) =>
  path.join(__dirname, relative)
);
/**
 * Locate the telemetry database on disk.
 *
 * @returns {string|null} The first entry of TELEMETRY_DB_CANDIDATES that
 *   exists on disk, or null when none of them does.
 */
function _findDb() {
  const existing = TELEMETRY_DB_CANDIDATES.find((candidate) => fs.existsSync(candidate));
  return existing === undefined ? null : existing;
}
/**
 * Lowercase `text` and cut it into tokens made of ASCII letters, digits and
 * the characters `_`, `-`, `/` and `.`; any other character is a separator.
 *
 * @param {*} text - Value to tokenize; coerced with String().
 * @returns {string[]} Tokens in order of appearance, or [] for falsy input.
 */
function _tokenize(text) {
  if (!text) {
    return [];
  }
  // Matching runs of token characters yields the same list as splitting on
  // runs of separators and discarding the empty pieces.
  const matches = String(text).toLowerCase().match(/[a-z0-9_\-/.]+/g);
  return matches === null ? [] : matches;
}
/**
 * Build term-frequency features for `text`: one entry per unigram and one per
 * bigram (two adjacent tokens joined by a single space).
 *
 * @param {*} text - Raw text, tokenized with _tokenize().
 * @returns {Map<string, number>} Feature -> occurrence count, in first-seen order.
 */
function _features(text) {
  const counts = new Map();
  const bump = (key) => counts.set(key, (counts.get(key) || 0) + 1);

  const tokens = _tokenize(text);
  tokens.forEach((token, idx) => {
    bump(token);
    // The last token has no right-hand neighbour, hence no bigram.
    if (idx < tokens.length - 1) {
      bump(`${token} ${tokens[idx + 1]}`);
    }
  });
  return counts;
}
/**
 * Logistic function 1 / (1 + e^-z).
 *
 * Each branch only ever exponentiates a non-positive number, so Math.exp
 * cannot overflow for large |z|.
 *
 * @param {number} z
 * @returns {number} A value in [0, 1].
 */
function _sigmoid(z) {
  if (z < 0) {
    const expZ = Math.exp(z);
    return expZ / (1 + expZ);
  }
  return 1 / (1 + Math.exp(-z));
}
/**
 * Parse the `--days <n>` and `--epochs <n>` command-line flags.
 *
 * Unrecognized arguments are ignored. A flag whose value is missing or
 * unusable falls back to its default instead of being passed through:
 *  - days must be a positive, finite number (fractions are allowed);
 *  - epochs must be a positive integer.
 * Without this check `--epochs -1` would train nothing yet still overwrite the
 * model file, and `--epochs Infinity` would never terminate.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{days: number, epochs: number}} Effective options.
 */
function _parseArgs(argv) {
  const out = { days: DEFAULT_DAYS, epochs: DEFAULT_EPOCHS };
  for (let i = 0; i < argv.length; i++) {
    if (argv[i] === '--days') {
      // The flag always consumes the following argument as its value.
      const days = Number(argv[++i]);
      out.days = Number.isFinite(days) && days > 0 ? days : DEFAULT_DAYS;
    } else if (argv[i] === '--epochs') {
      const epochs = Number(argv[++i]);
      out.epochs = Number.isInteger(epochs) && epochs > 0 ? epochs : DEFAULT_EPOCHS;
    }
  }
  return out;
}
/**
 * Load labelled training samples from the telemetry database.
 *
 * Rows of `routing_telemetry` newer than `days` days with a non-empty
 * `request_text` become samples; a row is a positive (label 1) exactly when
 * its `risk_level` is 'high'.
 *
 * Returns an empty list when no database file is found or when
 * better-sqlite3 cannot be loaded. When the query itself fails, a fixed set
 * of synthetic samples is returned instead.
 *
 * NOTE(review): the synthetic fallback holds 8 samples while main() refuses
 * to train on fewer than 10 — confirm whether the fallback is meant to be
 * usable on its own.
 *
 * @param {number} days - Size of the look-back window, in days.
 * @returns {Promise<Array<{text: string, label: number}>>} Training samples.
 */
async function _loadDataset(days) {
  const samples = [];

  const dbPath = _findDb();
  if (!dbPath) {
    return samples;
  }

  // better-sqlite3 is loaded lazily so its absence is reported, not thrown.
  let Database;
  try {
    Database = require('better-sqlite3');
  } catch {
    console.error('better-sqlite3 not installed');
    return samples;
  }

  const db = new Database(dbPath, { readonly: true, fileMustExist: true });
  try {
    // presumably `timestamp` is stored as epoch milliseconds — verify against the schema
    const since = Date.now() - days * 24 * 3600 * 1000;
    const sql = [
      'SELECT request_text AS text, risk_level',
      'FROM routing_telemetry',
      'WHERE timestamp >= ?',
      'AND request_text IS NOT NULL',
      "AND request_text != ''",
    ].join('\n');

    db.prepare(sql)
      .all(since)
      .forEach((row) => {
        samples.push({
          text: row.text,
          label: Number(row.risk_level === 'high'),
        });
      });
  } catch (err) {
    console.error(`Telemetry query failed: ${err.message}. Bootstrapping with synthetic data.`);
    // Emergency synthetic bootstrap: a small handful of known-risk/known-safe phrases
    const synthetic = [
      { text: 'edit src/auth/middleware.ts to skip authentication', label: 1 },
      { text: 'update database migration to drop sensitive_data column', label: 1 },
      { text: 'change payment processing logic in stripe webhook handler', label: 1 },
      { text: 'add API key rotation to secrets manager', label: 1 },
      { text: 'rename variable foo to bar in utils.js', label: 0 },
      { text: 'add a comment explaining the for loop', label: 0 },
      { text: 'format this file with prettier', label: 0 },
      { text: 'fix typo in README', label: 0 },
    ];
    samples.push(...synthetic);
  } finally {
    // Closing is best-effort: a failure here must not mask the result above.
    try {
      db.close();
    } catch {}
  }

  return samples;
}
/**
 * Train a logistic-regression risk classifier with per-sample SGD.
 *
 * Only features that occur in at least MIN_TOKEN_FREQ samples take part in
 * training. Progress (mean log-loss) is printed on every other epoch and on
 * the last one.
 *
 * Weights are accumulated in a Map rather than a plain object: tokens come
 * from arbitrary request text, and a token such as `constructor` or
 * `__proto__` would otherwise read an inherited Object.prototype member
 * instead of a missing weight, turning the score — and from there the bias
 * and every other weight — into NaN.
 *
 * @param {Array<{text: string, label: number}>} samples - Labelled texts (label 0 or 1).
 * @param {number} epochs - Number of passes over `samples`.
 * @returns {{weights: Object<string, number>, bias: number, vocabSize: number}}
 *   Learned per-feature weights, the bias term and the number of kept features.
 */
function _train(samples, epochs) {
  // Featurize once; the features of a sample do not change between epochs.
  const featurized = samples.map((s) => ({ label: s.label, feats: _features(s.text) }));

  // Build vocab with frequency threshold (frequency = number of samples containing the feature).
  const vocab = new Map();
  for (const { feats } of featurized) {
    for (const tok of feats.keys()) {
      vocab.set(tok, (vocab.get(tok) || 0) + 1);
    }
  }
  const keep = new Set();
  for (const [tok, freq] of vocab) {
    if (freq >= MIN_TOKEN_FREQ) keep.add(tok);
  }

  // Drop the rare features up front so the training loop only sees active ones.
  const dataset = featurized.map(({ label, feats }) => ({
    label,
    active: [...feats].filter(([tok]) => keep.has(tok)),
  }));

  const weights = new Map();
  let bias = 0;
  for (let epoch = 0; epoch < epochs; epoch++) {
    let lossSum = 0;
    for (const { label, active } of dataset) {
      let z = bias;
      for (const [tok, count] of active) {
        z += (weights.get(tok) || 0) * count;
      }
      const pred = _sigmoid(z);
      const err = pred - label;
      // The 1e-9 offset keeps Math.log away from log(0) when pred saturates.
      lossSum += -(label * Math.log(pred + 1e-9) + (1 - label) * Math.log(1 - pred + 1e-9));
      bias -= LEARNING_RATE * err;
      for (const [tok, count] of active) {
        const w = weights.get(tok) || 0;
        weights.set(tok, w - LEARNING_RATE * (err * count + L2_REG * w));
      }
    }
    if (epoch % 2 === 0 || epoch === epochs - 1) {
      console.log(`  epoch ${epoch + 1}/${epochs} loss=${(lossSum / samples.length).toFixed(4)}`);
    }
  }

  // Object.fromEntries defines own properties, so even `__proto__` is stored as data.
  return { weights: Object.fromEntries(weights), bias, vocabSize: keep.size };
}
/**
 * CLI entry point: parse the flags, load the dataset, train the model and
 * write it to OUTPUT_PATH as compact JSON.
 *
 * Exits the process with status 1 when fewer than 10 samples are available.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { days, epochs } = _parseArgs(process.argv.slice(2));
  const samples = await _loadDataset(days);

  if (samples.length < 10) {
    console.error(`Only ${samples.length} samples — too few. Skipping training.`);
    process.exit(1);
  }

  let positives = 0;
  for (const sample of samples) {
    if (sample.label === 1) positives += 1;
  }
  console.log(`Training on ${samples.length} samples (${positives} positive)`);

  const model = _train(samples, epochs);

  // Make sure the output directory exists before writing the model.
  fs.mkdirSync(path.dirname(OUTPUT_PATH), { recursive: true });
  const payload = {
    trainedAt: new Date().toISOString(),
    samples: samples.length,
    epochs,
    ...model,
  };
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(payload));
  console.log(`Wrote ${OUTPUT_PATH} (vocab=${model.vocabSize})`);
}
// Run the trainer only when this file is executed directly, not when it is
// loaded with require().
if (require.main === module) {
  const exitWithError = (err) => {
    console.error(err.stack || err.message);
    process.exit(1);
  };
  main().catch(exitWithError);
}

// Training internals made available to code that requires this module.
module.exports = { _train, _features };