@guyycodes/plugin-sdk
Version:
AI-powered plugin scaffolding tool - Create full-stack applications with 7+ AI models, 50+ business integrations, and production-ready infrastructure
1,295 lines (1,075 loc) • 262 kB
JavaScript
// Creates the models/ directory with a subdirectory for each supported model
// (DeepHermes3, Flux, FluxKontext, Phi4_multimodal, Qwen25Math, Qwen25VL, Qwen25Code),
// then writes the related Python scripts. Used inside createBackend.js.
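// Example call (illustrative; projectPath is a placeholder for the scaffold target):
//   await createModelFiles(path.join(projectPath, 'src', 'server'));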
const fs = require('fs-extra');
const path = require('path');
const chalk = require('chalk');
async function createModelFiles(serverPath) {
console.log(chalk.blue('📦 Creating model management files...'));
// Create models directory structure
const modelsPath = path.join(serverPath, 'models');
fs.ensureDirSync(modelsPath);
// Create subdirectories for each model
const modelDirs = ['DeepHermes3', 'Flux', 'Phi4_multimodal', 'Qwen25Math', 'Qwen25VL', 'Qwen25Code', 'FluxKontext'];
modelDirs.forEach(dir => {
const modelDirPath = path.join(modelsPath, dir);
fs.ensureDirSync(modelDirPath);
// Create empty 'model' subdirectory inside each model directory
fs.ensureDirSync(path.join(modelDirPath, 'model'));
// Create images_output directory for Flux model
if (dir === 'Flux') {
fs.ensureDirSync(path.join(modelDirPath, 'images_output'));
}
});
// Create main model manager
await createModelManager(modelsPath);
await createModelLoader(modelsPath);
await createToolAdapter(modelsPath);
await createB2Downloader(modelsPath);
await createDownloadTracker(modelsPath);
await createMemoryOptimizer(modelsPath);
await createAddModelReadme(modelsPath);
// Create download scripts for each model
await createQwen25MathDownloader(path.join(modelsPath, 'Qwen25Math'));
await createQwen25MathEntrypoint(path.join(modelsPath, 'Qwen25Math'));
await createQwen25MathReadme(path.join(modelsPath, 'Qwen25Math'));
await createQwen25VLDownloader(path.join(modelsPath, 'Qwen25VL'));
await createQwen25VLEntrypoint(path.join(modelsPath, 'Qwen25VL'));
await createQwen25VLReadme(path.join(modelsPath, 'Qwen25VL'));
await createFluxKontextDownloader(path.join(modelsPath, 'FluxKontext'));
await createFluxKontextEntrypoint(path.join(modelsPath, 'FluxKontext'));
await createFluxKontextReadme(path.join(modelsPath, 'FluxKontext'));
await createFluxDownloader(path.join(modelsPath, 'Flux'));
await createFluxEntrypoint(path.join(modelsPath, 'Flux'));
await createFluxReadme(path.join(modelsPath, 'Flux'));
await createQwen25CodeDownloader(path.join(modelsPath, 'Qwen25Code'));
await createQwen25CodeEntrypoint(path.join(modelsPath, 'Qwen25Code'));
// no README for the code model
await createDeepHermes3Downloader(path.join(modelsPath, 'DeepHermes3'));
await createDeepHermes3Entrypoint(path.join(modelsPath, 'DeepHermes3'));
await createDeepHermes3Readme(path.join(modelsPath, 'DeepHermes3'));
await createPhi4MultimodalDownloader(path.join(modelsPath, 'Phi4_multimodal'));
await createPhi4MultimodalEntrypoint(path.join(modelsPath, 'Phi4_multimodal'));
await createPhi4MultimodalReadme(path.join(modelsPath, 'Phi4_multimodal'));
console.log(chalk.green('✅ Model management files created successfully'));
}
async function createAddModelReadme(modelsPath) {
const readmePy = `# Adding a New Model to the System
This guide explains how to add a new model to the plugin system. Follow these steps in order:
## 1. Create Model Directory
- Create a new directory under \`/models/\` with your model name (e.g., \`YourModelName/\`)
- Inside this directory, create:
- \`download_model.py\` - Handles model downloading from HuggingFace and B2 buckets
- \`entrypoint.py\` - Implements the LangChain chat interface for your model
- \`model/\` - Empty directory where the model files will be downloaded
## 2. Frontend Configuration
Update \`src/client/src/pages/Settings.jsx\`:
- Add your model to the \`models\` object with a key, label, size, and description
- The key you use here (e.g., \`YourModelName\`) must match the directory name
## 3. Backend Configuration
### 3.1 App Configuration
Update \`app.config.json\`:
- Add your model to the \`models\` array with:
- \`name\`: Must match the frontend key
- \`entrypoint\`: Path to your model's entrypoint.py
### 3.2 Model Manager
Update \`src/server/models/model_manager.py\`:
- Add your model to the \`SUPPORTED_MODELS\` dictionary (example entry after this list) with:
- HuggingFace model ID
- Download script name (usually \`download_model.py\`)
- If your directory name differs from the model key, add special handling in \`is_model_available()\` and \`download_model()\` methods
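A hypothetical \`SUPPORTED_MODELS\` entry (the HuggingFace ID below is a placeholder):
\`\`\`python
SUPPORTED_MODELS = {
    # ... existing entries ...
    "YourModelName": {
        "hf_id": "your-org/your-model-id",  # placeholder HuggingFace repo ID
        "download_script": "download_model.py"
    },
}
\`\`\`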
### 3.3 Model Loader
Update \`src/server/models/model_loader.py\`:
- Add your model's factory function (e.g., \`create_your_model_chat\`) to the \`_load_local_model()\` method
- Also add the class name as a fallback (e.g., \`YourModelChat\`)
## 4. Implementation Requirements
### download_model.py
Your download script should (see the sketch after this list):
- Support both HuggingFace and B2 bucket downloads
- Use environment detection to choose the appropriate source
- Accept \`--force-download\` flag
- Handle the model size appropriately
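A minimal sketch of \`download_model.py\`, assuming the shared \`b2_downloader.py\` helper and \`config.utils.get_environment\` shown elsewhere in this package, plus the \`huggingface_hub\` client; the model ID and size are placeholders:
\`\`\`python
import argparse
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent.parent))         # /models (b2_downloader)
sys.path.append(str(Path(__file__).parent.parent.parent))  # /server (config.utils)
from b2_downloader import download_from_b2
from config.utils import get_environment

MODEL_DIR = Path(__file__).parent / "model"
HF_MODEL_ID = "your-org/your-model-id"  # placeholder
MODEL_SIZE_GB = 15.0                    # rough size, used for progress tracking

def main():
    parser = argparse.ArgumentParser(description="Download YourModelName")
    parser.add_argument("--force-download", action="store_true")
    args = parser.parse_args()

    environment = get_environment()
    if environment == "localhost":
        # Development: pull directly from HuggingFace
        from huggingface_hub import snapshot_download
        snapshot_download(HF_MODEL_ID, local_dir=str(MODEL_DIR),
                          force_download=args.force_download)
    else:
        # Deployed environments: pull from the B2 bucket
        download_from_b2(MODEL_DIR, "YourModelName",
                         force_download=args.force_download,
                         environment=environment,
                         model_size_gb=MODEL_SIZE_GB)

if __name__ == "__main__":
    main()
\`\`\`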
### entrypoint.py
Your entrypoint should (see the sketch after this list):
- Implement \`BaseChatModel\` from LangChain
- Include a factory function (e.g., \`create_your_model_chat()\`)
- Handle device selection (MPS, CUDA, CPU)
- Implement required methods: \`_generate\`, \`_agenerate\`, \`_stream\`, \`_astream\`
- Set appropriate model properties and parameters
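A skeletal \`entrypoint.py\` (names such as \`YourModelChat\` are placeholders; weight loading, streaming, and prompt formatting are omitted for brevity):
\`\`\`python
from typing import Any, List, Optional
import torch
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import AIMessage, BaseMessage
from langchain_core.outputs import ChatGeneration, ChatResult
from langchain_core.callbacks.manager import CallbackManagerForLLMRun

def _pick_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

class YourModelChat(BaseChatModel):
    """Placeholder LangChain chat wrapper for YourModelName."""
    device: str = _pick_device()
    temperature: float = 0.7
    max_new_tokens: int = 512

    @property
    def _llm_type(self) -> str:
        return "your_model_chat"

    def _generate(self, messages: List[BaseMessage],
                  stop: Optional[List[str]] = None,
                  run_manager: Optional[CallbackManagerForLLMRun] = None,
                  **kwargs: Any) -> ChatResult:
        # Real implementation: format messages, run the local model from ./model, decode.
        text = "..."  # generated text goes here
        return ChatResult(generations=[ChatGeneration(message=AIMessage(content=text))])

    # _agenerate / _stream / _astream should also be implemented (see the checklist above).

def create_your_model_chat(**kwargs) -> YourModelChat:
    """Factory function referenced by model_loader.py."""
    return YourModelChat(**kwargs)
\`\`\`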
## 5. Testing
After adding all configurations:
1. Check that the model appears in the Settings page
2. Test the download functionality
3. Verify the model loads correctly when selected
4. Test chat functionality with your model
## Directory Structure Example
\`\`\`
models/
├── YourModelName/
│   ├── download_model.py
│   ├── entrypoint.py
│   └── model/
│       └── (downloaded model files)
├── Qwen25Math/
├── Qwen25VL/
├── Phi4_multimodal/
└── README.md
\`\`\`
## Important Notes
- Model names must be consistent across all configuration files
- The frontend key, app.config.json name, and model_manager key must all match
- Special directory mappings (like phi4 → Phi4_multimodal) require additional handling
- Always test the full flow from download to usage
`;
fs.writeFileSync(path.join(modelsPath, 'README.md'), readmePy);
}
async function createMemoryOptimizer(modelsPath) {
const memoryOptimizerPy = `# /server/models/memory_optimizer.py
# Memory optimizer for models
import argparse
import logging
import psutil
import os
import gc
import sys
import torch
from typing import Dict, Any, Optional, Tuple
from transformers import AutoModelForCausalLM, AutoTokenizer
# ----------------------------------------------------------------------
# Logger configuration
# ----------------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------
# Core Utilities
# ----------------------------------------------------------------------
def get_device_info() -> Dict[str, Any]:
"""
Get comprehensive device and memory information.
Returns:
device_info (dict): Dictionary containing RAM/VRAM availability,
CUDA/MPS detection, and recommended device.
"""
device_info = {
"ram_total_gb": 0.0,
"ram_available_gb": 0.0,
"has_cuda": False,
"has_mps": False,
"cuda_devices": {},
"mps_info": {},
"recommended_device": "cpu"
}
# Get RAM info
ram = psutil.virtual_memory()
device_info["ram_total_gb"] = ram.total / (1024**3)
device_info["ram_available_gb"] = ram.available / (1024**3)
# Check CUDA availability and get VRAM info
if torch.cuda.is_available():
device_info["has_cuda"] = True
device_info["recommended_device"] = "cuda"
logger.info(f"CUDA detected: {torch.cuda.device_count()} device(s)")
for i in range(torch.cuda.device_count()):
props = torch.cuda.get_device_properties(i)
torch.cuda.set_device(i)
total_memory = props.total_memory / (1024**3)
allocated_memory = torch.cuda.memory_allocated(i) / (1024**3)
available_memory = total_memory - allocated_memory
device_info["cuda_devices"][f"cuda:{i}"] = {
"name": props.name,
"compute_capability": f"{props.major}.{props.minor}",
"total_memory_gb": total_memory,
"allocated_memory_gb": allocated_memory,
"available_memory_gb": available_memory,
"multiprocessor_count": props.multi_processor_count,
"supports_bf16": torch.cuda.is_bf16_supported(i)
}
logger.info(f" GPU {i}: {props.name} | {available_memory:.1f}GB available / {total_memory:.1f}GB total")
# Check MPS availability (Apple Silicon)
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
device_info["has_mps"] = True
if not device_info["has_cuda"]: # Only use MPS if no CUDA
device_info["recommended_device"] = "mps"
device_info["mps_info"] = {
"is_built": torch.backends.mps.is_built(),
"is_available": torch.backends.mps.is_available()
}
logger.info(f"MPS detected: Built={torch.backends.mps.is_built()}, Available={torch.backends.mps.is_available()}")
# Log summary
logger.info(f"System Memory: {device_info['ram_available_gb']:.1f}GB available / {device_info['ram_total_gb']:.1f}GB total")
logger.info(f"Recommended device: {device_info['recommended_device']}")
return device_info
def get_optimal_model_config(model_size_gb: float = 8.0) -> Dict[str, Any]:
"""
Get optimal model loading config for GPU-only deployment.
Maximizes GPU memory usage for provisioned GPU workloads. If CPU is the only fallback, exit with an error message.
Args:
model_size_gb (float): Estimated model size in GB.
Returns:
config (dict): Dictionary containing recommended loading parameters
(device_map, torch_dtype, quantization, etc.) optimized for GPU-only usage.
"""
device_info = get_device_info()
ram_available = device_info["ram_available_gb"]
has_cuda = device_info["has_cuda"]
has_mps = device_info["has_mps"]
# If neither CUDA nor MPS is available, exit cleanly
if not has_cuda and not has_mps:
logger.error("❌ CPU only - GPU-intensive operations are not supported. Exiting.")
sys.exit(0)
logger.info(f"Optimizing for a {model_size_gb:.1f}GB model")
# Base configuration
config = {
"low_cpu_mem_usage": True,
"trust_remote_code": True,
}
# === CUDA OPTIMIZATION SCENARIOS ===
if has_cuda:
logger.info("CUDA detected - applying CUDA optimizations")
# Summed VRAM across all GPUs
total_vram = sum(gpu["available_memory_gb"] for gpu in device_info["cuda_devices"].values())
primary_gpu = list(device_info["cuda_devices"].values())[0] # Inspect first GPU for BF16
logger.info(f"Total VRAM available: {total_vram:.1f}GB across {len(device_info['cuda_devices'])} GPU(s)")
# Choose optimal dtype based on GPU capability: bfloat16 if supported, else float16
if primary_gpu["supports_bf16"]:
config["torch_dtype"] = torch.bfloat16
logger.info("✅ Using bfloat16")
else:
config["torch_dtype"] = torch.float16
logger.info("✅ Using float16")
# Scenario selection based on available RAM and VRAM
# Scenario 1: Very low RAM and limited VRAM - use 4-bit quantization
if ram_available < 16 and total_vram <= model_size_gb * 1.0:
logger.info("Very low RAM + limited VRAM - using 4-bit quantization.")
config.update({
"load_in_4bit": True,
"bnb_4bit_compute_dtype": config["torch_dtype"],
"bnb_4bit_use_double_quant": True,
"bnb_4bit_quant_type": "nf4",
"device_map": "auto",
"max_memory": _get_cuda_max_memory_config(device_info, ram_available),
})
# Scenario 2: Low RAM and limited VRAM - use 8-bit quantization
elif ram_available < 32 and total_vram <= model_size_gb * 1.0:
logger.info("Low RAM + limited VRAM - using 8-bit quantization.")
config.update({
"load_in_8bit": True,
"llm_int8_enable_fp32_cpu_offload": True,
"device_map": "auto",
"max_memory": _get_cuda_max_memory_config(device_info, ram_available),
})
# Scenario 3: Sufficient RAM - GPU-only loading
else:
logger.info("✅ GPU-only mode - maximizing GPU memory usage.")
config.update({
"device_map": "auto",
"max_memory": _get_cuda_max_memory_config(device_info, ram_available),
})
# === MPS FALLBACK (less optimized) ===
elif has_mps:
logger.info("MPS detected - minimal MPS config.")
config.update({
"torch_dtype": torch.float16,
"device_map": None,
})
logger.info(f"Final config: {config}")
return config
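# Example usage (illustrative; "your/model-id" is a placeholder HF repo):
#     config = get_optimal_model_config(model_size_gb=8.0)
#     model = AutoModelForCausalLM.from_pretrained("your/model-id", **config)
#     tokenizer = AutoTokenizer.from_pretrained("your/model-id")
#     model, tokenizer = optimize_for_generation(model, tokenizer)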
def _get_cuda_max_memory_config(device_info: Dict[str, Any], ram_available: float) -> Dict[int, str]:
"""
Generate max_memory configuration dictionary for CUDA device mapping.
This function calculates optimal memory allocation for GPUs only,
maximizing GPU usage for provisioned GPU workloads.
Args:
device_info (Dict[str, Any]): Device information from get_device_info()
ram_available (float): Available system RAM in GB
Returns:
Dict[int, str]: Memory configuration dictionary where:
- Keys: Integer GPU device indices
- Values: Memory allocation strings (e.g., "22GB", "18GB")
- Example: {0: "22GB", 1: "18GB"}
"""
max_memory = {}
# GPU-only configuration - no CPU allocation for offloading
logger.info("GPU-only configuration - maximizing GPU memory usage.")
# Maximize GPU memory allocation for each device
for device_id, gpu in device_info["cuda_devices"].items():
# Extract integer device ID from string like "cuda:0" -> 0
gpu_id = int(device_id.split(":")[-1])
available_vram = gpu["available_memory_gb"]
# More aggressive GPU memory allocation since we're GPU-only
if available_vram > 16:
# Reserve 1GB for CUDA overhead on larger GPUs
vram_for_model = available_vram - 1
else:
# Reserve minimal overhead on smaller GPUs
vram_for_model = max(0.5, available_vram - 0.5)
logger.info(f"GPU {device_id}: reserving {vram_for_model:.1f}GB for model (GPU-only mode).")
# Use integer key for GPU devices
max_memory[gpu_id] = f"{vram_for_model:.1f}GB"
logger.info(f"max_memory config: {max_memory}")
return max_memory
def optimize_for_generation(model, tokenizer) -> Tuple[Any, Any]:
"""
Optimize loaded model and tokenizer for inference.
"""
device_info = get_device_info()
# If CPU only, exit
if not device_info["has_cuda"] and not device_info["has_mps"]:
logger.error("❌ CPU only - GPU-intensive operations are not supported. Exiting.")
sys.exit(0)
if device_info["has_cuda"]:
# Enable xformers memory efficient attention if available
if hasattr(model, 'enable_xformers_memory_efficient_attention'):
try:
model.enable_xformers_memory_efficient_attention()
logger.info("✅ Enabled xformers memory efficient attention.")
except Exception as e:
logger.info(f"xformers not available: {e}")
# Check if Flash Attention 2 is active (some models/hubs may have it)
if hasattr(model.config, '_attn_implementation') and model.config._attn_implementation == 'flash_attention_2':
logger.info("✅ Flash Attention 2 is active.")
# Enable attention slicing if available
if hasattr(model, 'enable_attention_slicing'):
model.enable_attention_slicing("max")
logger.info("✅ Enabled attention slicing for CUDA.")
# Enable gradient checkpointing for memory efficiency (during inference this is minor)
if hasattr(model, 'gradient_checkpointing_enable'):
model.gradient_checkpointing_enable()
logger.info("✅ Enabled gradient checkpointing.")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
logger.info("✅ Enabled TF32 for CUDA.")
elif device_info["has_mps"]:
logger.info("Applying MPS optimizations.")
if hasattr(model, 'enable_attention_slicing'):
model.enable_attention_slicing("max")
logger.info("✅ Enabled attention slicing for MPS.")
# === UNIVERSAL OPTIMIZATIONS ===
model.eval()
for param in model.parameters():
param.requires_grad = False
logger.info("✅ Model set to eval mode; gradients disabled.")
return model, tokenizer
def cleanup_memory():
"""
Free GPU caches and run garbage collection.
"""
device_info = get_device_info()
# If CPU only, there is no GPU cache to clear; log and exit
if not device_info["has_cuda"] and not device_info["has_mps"]:
logger.error("❌ CPU only - skipping GPU memory cleanup. Exiting.")
sys.exit(0)
if device_info["has_cuda"]:
try:
# Clear CUDA cache on all devices
for i in range(torch.cuda.device_count()):
torch.cuda.set_device(i)
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# Reset memory stats
torch.cuda.reset_peak_memory_stats()
total_memory = sum(
torch.cuda.get_device_properties(i).total_memory / (1024**3)
for i in range(torch.cuda.device_count())
)
allocated_memory = sum(
torch.cuda.memory_allocated(i) / (1024**3)
for i in range(torch.cuda.device_count())
)
logger.info(f"✅ CUDA cleanup complete: {allocated_memory:.1f}GB / {total_memory:.1f}GB allocated.")
except Exception as e:
logger.warning(f"❌ CUDA cleanup failed: {e}")
elif device_info["has_mps"]:
# MPS has limited cleanup capabilities
try:
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
logger.info("✅ MPS cache cleared.")
else:
logger.info("MPS cleanup not available in this PyTorch version.")
except Exception as e:
logger.warning(f"❌ MPS cleanup failed: {e}")
collected = gc.collect()
logger.info(f"✅ Python garbage collection: {collected} objects collected.")
ram = psutil.virtual_memory()
logger.info(f"✅ System RAM after cleanup: {ram.available / (1024**3):.1f}GB available / {ram.total / (1024**3):.1f}GB total.")
# ----------------------------------------------------------------------
# Convenience Functions
# ----------------------------------------------------------------------
def get_memory_info() -> Dict[str, Any]:
"""Backward compatibility function - returns device_info."""
return get_device_info()
def log_memory_summary():
"""
Log a comprehensive memory summary.
"""
device_info = get_device_info()
logger.info("=== MEMORY SUMMARY ===")
logger.info(f"RAM: {device_info['ram_available_gb']:.1f}GB / {device_info['ram_total_gb']:.1f}GB")
logger.info(f"Recommended device: {device_info['recommended_device']}")
if device_info["has_cuda"]:
logger.info("CUDA GPUs:")
for device_id, gpu in device_info["cuda_devices"].items():
logger.info(f" {device_id}: {gpu['name']}")
logger.info(f" VRAM: {gpu['available_memory_gb']:.1f}GB / {gpu['total_memory_gb']:.1f}GB")
logger.info(f" Compute: {gpu['compute_capability']}, BF16: {gpu['supports_bf16']}")
if device_info["has_mps"]:
logger.info(f"MPS: Built={device_info['mps_info']['is_built']}, "
f"Available={device_info['mps_info']['is_available']}")
logger.info("======================")
def get_recommended_batch_size(model_size_gb: float = 8.0) -> int:
"""
Heuristic for recommended batch size based on VRAM.
"""
device_info = get_device_info()
if device_info["has_cuda"]:
max_vram = max(g["available_memory_gb"] for g in device_info["cuda_devices"].values())
available_for_batching = max_vram - model_size_gb
if available_for_batching > 16:
return 4
elif available_for_batching > 8:
return 2
else:
return 1
# Default for CPU/MPS
return 1
# ----------------------------------------------------------------------
# Entry Point for Testing
# ----------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(description="Memory optimizer test script.")
parser.add_argument("--model_size_gb", type=float, default=8.0, help="Estimated model size in GB.")
args = parser.parse_args()
logger.info("Starting memory optimizer test...")
device_info = get_device_info()
config = get_optimal_model_config(args.model_size_gb)
log_memory_summary()
recommended_bs = get_recommended_batch_size(args.model_size_gb)
logger.info(f"Recommended batch size: {recommended_bs}")
logger.info("Loading a small test model (gpt2) to demonstrate optimization...")
if not device_info["has_cuda"] and not device_info["has_mps"]:
logger.error("❌ CPU only - GPU-intensive operations are not supported. Exiting.")
sys.exit(0)
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, tokenizer = optimize_for_generation(model, tokenizer)
cleanup_memory()
logger.info("Memory optimizer test complete.")
# ----------------------------------------------------------------------
if __name__ == "__main__":
main()
`;
fs.writeFileSync(path.join(modelsPath, 'memory_optimizer.py'), memoryOptimizerPy);
}
async function createDownloadTracker(modelsPath) {
const downloadTrackerPy = `"""
Download progress tracker for model downloads
"""
import time
from pathlib import Path
def track_download_progress(model_dir, expected_size_gb, stop_event):
"""Track download progress by monitoring directory size.
Args:
model_dir: Path to the model directory
expected_size_gb: Expected total size in GB
stop_event: Threading event to stop the tracker
"""
expected_bytes = expected_size_gb * 1024 * 1024 * 1024
model_dir = Path(model_dir)
while not stop_event.is_set():
try:
# Calculate current directory size
current_bytes = sum(f.stat().st_size for f in model_dir.rglob('*') if f.is_file())
current_gb = current_bytes / (1024 * 1024 * 1024)
percentage = min(100, (current_bytes / expected_bytes) * 100)
print(f"\\r📦 Progress: {percentage:.1f}% ({current_gb:.1f}GB / {expected_size_gb}GB)", end='', flush=True)
time.sleep(2) # Update every 2 seconds
except Exception:
pass # Ignore errors during directory traversal
print() # New line after progress tracking ends
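# Typical usage (mirrors b2_downloader.py): run the tracker in a separate thread and
# signal it to stop once the download finishes, e.g.:
#     import threading
#     stop_event = threading.Event()
#     t = threading.Thread(target=track_download_progress, args=(model_dir, 10.0, stop_event))
#     t.start()
#     ...run the download...
#     stop_event.set()
#     t.join(timeout=1.0)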
`;
fs.writeFileSync(path.join(modelsPath, 'download_tracker.py'), downloadTrackerPy);
}
async function createB2Downloader(modelsPath) {
const b2DownloaderPy = `"""
Backblaze B2 model downloader
"""
import os
import sys
import subprocess
from pathlib import Path
import threading
import time
import logging
# Import the download tracker
sys.path.append(str(Path(__file__).parent))
from download_tracker import track_download_progress
logger = logging.getLogger(__name__)
# Environment variables - optional, checked at runtime
B2_MODEL_BUCKET_ID_API_KEY = os.getenv("B2_MODEL_BUCKET_ID_API_KEY")
B2_MODEL_BUCKET_API_KEY = os.getenv("B2_MODEL_BUCKET_API_KEY")
def download_from_b2(model_dir: Path, model_name: str, force_download: bool = False,
environment: str = "localhost", bucket_name: str = "bucket-models",
model_size_gb: float = 10.0, use_custom_dirname: bool = False, custom_dirname: str = ""):
"""Download model from B2 bucket.
Args:
model_dir: Directory to download model to
model_name: Name of the model (used as folder name in B2)
force_download: Force re-download even if model exists
environment: Environment name (affects progress tracking)
bucket_name: B2 bucket name
model_size_gb: Approximate model size in GB for progress tracking
use_custom_dirname: If True, sync from a custom subfolder instead of 'model/'
custom_dirname: Name of the custom subfolder in the bucket
"""
# Skip re-download if the directory is non-empty
if model_dir.exists() and any(model_dir.iterdir()) and not force_download:
print(f"Model already exists in {model_dir}. Skipping download (use --force-download to overwrite).")
return
print(f"Downloading {model_name} from B2 bucket...")
print("This may take time depending on model size...")
try:
# Create model directory
model_dir.mkdir(parents=True, exist_ok=True)
# Make sure the B2 credential environment variables are set
b2_key_id = B2_MODEL_BUCKET_ID_API_KEY
b2_key_value = B2_MODEL_BUCKET_API_KEY
if not b2_key_id or not b2_key_value:
print("❌ Error: B2 credentials not found in environment.")
print("B2 model downloading requires B2_MODEL_BUCKET_ID_API_KEY and B2_MODEL_BUCKET_API_KEY environment variables.")
print("These are optional for basic app functionality but required for B2 model downloads.")
raise ValueError("B2 credentials not available")
# Authorize B2 (headless, no interactive prompt)
auth_result = subprocess.run(
["b2", "account", "authorize", b2_key_id, b2_key_value],
capture_output=True, text=True
)
if auth_result.returncode != 0:
print("❌ Error: B2 authentication failed")
print(auth_result.stderr)
raise RuntimeError(f"B2 authentication failed: {auth_result.stderr}")
# Decide how many threads to use in a container
import multiprocessing
cpu_count = multiprocessing.cpu_count()
optimal_threads = min(4, max(1, cpu_count // 2))
print(f"Optimal threads: {optimal_threads} (Detected {cpu_count} CPUs)")
# b2 sync command
if use_custom_dirname:
b2_path = f"b2://{bucket_name}/{model_name}/{custom_dirname}/"
else:
b2_path = f"b2://{bucket_name}/{model_name}/model/"
print(f"Downloading from: {b2_path}")
print(f"Downloading to: {model_dir}")
sync_cmd = [
"b2", "sync",
"--threads", str(optimal_threads),
"--write-buffer-size", "100000000", # Adjust if you have enough memory
b2_path,
f"{model_dir}/" # trailing slash
]
stop_event = None
progress_thread = None
try:
if environment != "localhost":
stop_event = threading.Event()
progress_thread = threading.Thread(
target=track_download_progress,
args=(model_dir, model_size_gb, stop_event),
daemon=False
)
progress_thread.start()
# Run download
result = subprocess.run(sync_cmd, capture_output=True, text=True)
finally:
# Always clean up the thread, even if an exception occurred
if progress_thread and stop_event:
stop_event.set()
progress_thread.join(timeout=1.0) # Properly wait for thread to finish
if result.returncode != 0:
print(f"❌ Error downloading from B2: {result.stderr}")
raise RuntimeError(f"B2 download failed: {result.stderr}")
print(f"✅ Successfully downloaded {model_name} from B2")
print(f"Model saved to: {model_dir}")
except Exception as e:
print(f"Error downloading from B2: {e}")
raise
class B2ModelDownloader:
"""Download models from Backblaze B2 bucket"""
`;
fs.writeFileSync(path.join(modelsPath, 'b2_downloader.py'), b2DownloaderPy);
}
async function createModelManager(modelsPath) {
const modelManagerPy = `"""
/server/models/model_manager.py
Simple model manager for checking and downloading models
"""
import os
import subprocess
from pathlib import Path
from typing import Dict, Any, Optional
import logging
import json
import sys
sys.path.append(str(Path(__file__).parent.parent))
from config.utils import get_environment
logger = logging.getLogger(__name__)
class ModelManager:
"""Simple model manager for handling model downloads"""
# Map of model names to their HuggingFace IDs and download scripts
SUPPORTED_MODELS = {
"Qwen25Math": {
"hf_id": "Qwen/Qwen2.5-Math-7B",
"download_script": "download_model.py"
},
"Qwen25VL": {
"hf_id": "Qwen/Qwen2.5-VL-7B-Instruct",
"download_script": "download_model.py"
},
"DeepHermes3": {
"hf_id": "NousResearch/DeepHermes-3-Llama-3-8B-Preview",
"download_script": "download_model.py"
},
"Flux": {
"hf_id": "black-forest-labs/FLUX.1-schnell",
"download_script": "download_model.py"
},
"FluxKontext": {
"hf_id": "black-forest-labs/FLUX.1-Kontext-Dev",
"download_script": "download_model.py"
},
"phi4": {
"hf_id": "microsoft/Phi-4-multimodal-instruct",
"download_script": "download_model.py"
},
"Qwen25Code": {
"hf_id": "Qwen/Qwen2.5-Coder-32B-Instruct",
"download_script": "download_model.py"
},
# Add more models here as needed
}
def __init__(self):
self.models_dir = Path(__file__).parent
def is_model_available(self, model_name: str) -> bool:
"""Check if a model is downloaded and ready to use"""
if model_name.startswith("gpt-"):
# OpenAI models are always available (API-based)
return True
if model_name not in self.SUPPORTED_MODELS:
return False
# Check if model directory exists and has files
# Handle special directory names
model_dir_name = "Phi4_multimodal" if model_name == "phi4" else model_name
model_dir = self.models_dir / model_dir_name / "model"
if model_dir.exists() and any(model_dir.iterdir()):
return True
return False
def download_model(self, model_name: str, progress_callback=None) -> Dict[str, Any]:
"""Download a model if it's supported
Args:
model_name: Name of the model to download
progress_callback: Optional callback function to receive progress updates
"""
if model_name not in self.SUPPORTED_MODELS:
return {
"success": False,
"error": f"Model '{model_name}' is not supported. Supported models: {list(self.SUPPORTED_MODELS.keys())}"
}
model_info = self.SUPPORTED_MODELS[model_name]
# Handle special directory names
model_dir_name = "Phi4_multimodal" if model_name == "phi4" else model_name
download_script = self.models_dir / model_dir_name / model_info["download_script"]
if not download_script.exists():
return {
"success": False,
"error": f"Download script not found for {model_name}"
}
try:
logger.info(f"Starting download for {model_name}...")
environment = get_environment()
if environment == "localhost":
# Run download with output directly to console (not captured)
# This allows HuggingFace download progress to be visible in development
result = subprocess.run(
["python", str(download_script)],
cwd=download_script.parent,
# Don't capture output - let it display directly
capture_output=False,
text=True
)
if result.returncode == 0:
logger.info(f"Successfully downloaded {model_name}")
return {
"success": True,
"message": f"Model '{model_name}' downloaded successfully and is ready to use"
}
else:
logger.error(f"Download failed with return code: {result.returncode}")
return {
"success": False,
"error": f"Download failed with return code: {result.returncode}"
}
else:
# Stream output with progress callbacks for deployed (non-localhost) environments
if progress_callback:
progress_callback(f"Starting download for {model_name}...")
process = subprocess.Popen(
["python", str(download_script)],
cwd=download_script.parent,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1
)
# Stream output line by line
for line in process.stdout:
line = line.strip()
if line:
logger.info(line)
if progress_callback:
# Send progress updates
if "Progress:" in line or "Downloading" in line or "%" in line:
progress_callback(line)
process.wait()
if process.returncode == 0:
logger.info(f"Successfully downloaded {model_name}")
return {
"success": True,
"message": f"Model '{model_name}' downloaded successfully and is ready to use"
}
else:
logger.error(f"Download failed with return code: {process.returncode}")
return {
"success": False,
"error": f"Download failed with return code: {process.returncode}"
}
except Exception as e:
logger.error(f"Error downloading model: {e}")
return {
"success": False,
"error": f"Error downloading model: {str(e)}"
}
def get_available_models(self) -> Dict[str, bool]:
"""Get a list of all supported models and their availability status"""
models = {"gpt-4o-mini": True} # Always available
for model_name in self.SUPPORTED_MODELS:
models[model_name] = self.is_model_available(model_name)
return models
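# Typical usage (illustrative; adjust the import path to your server layout):
#     from models.model_manager import model_manager
#     if not model_manager.is_model_available("Qwen25Math"):
#         model_manager.download_model("Qwen25Math")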
# Global instance
model_manager = ModelManager()`;
fs.writeFileSync(path.join(modelsPath, 'model_manager.py'), modelManagerPy);
}
async function createToolAdapter(modelsPath) {
const toolAdapterPy = `"""
/server/models/tool_adapter.py
Tool adapter for models that don't support native tool binding
"""
import json
import re
import logging
from typing import List, Dict, Any, Optional, Union
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import AIMessage, BaseMessage, SystemMessage
from langchain_core.tools import BaseTool
from langchain_core.outputs import ChatResult, ChatGeneration
from langchain_core.callbacks.manager import CallbackManagerForLLMRun, AsyncCallbackManagerForLLMRun
logger = logging.getLogger(__name__)
class ToolSupportWrapper(BaseChatModel):
"""
Wrapper that adds tool support to models that don't have native tool binding.
Uses prompt engineering to instruct the model how to call tools.
"""
model: BaseChatModel
tools: List[BaseTool] = []
def __init__(self, model: BaseChatModel, **kwargs):
# Include model in kwargs for Pydantic validation
kwargs['model'] = model
super().__init__(**kwargs)
def bind_tools(self, tools: List[BaseTool]) -> "ToolSupportWrapper":
"""Bind tools to this wrapper"""
self.tools = tools
return self
def _format_tool_instructions(self, tool: BaseTool) -> str:
"""
Format instructions for a specific tool with clear JSON format and examples.
"""
tool_name = tool.name
tool_desc = tool.description
# Tool-specific formatting with examples
if tool_name == "web_search":
return f"""
You have access to a 'web_search' tool that can find current information.
To use it, respond with ONLY this JSON format:
{{
"tool_use": {{
"name": "web_search",
"arguments": {{
"query": "search query"
}}
}}
}}
Example usage:
{{
"tool_use": {{
"name": "web_search",
"arguments": {{
"query": "latest AI research breakthroughs 2025"
}}
}}
}}"""
elif tool_name == "calculator":
return f"""
You have access to a 'calculator' tool for solving mathematical expressions.
To use it, respond with ONLY this JSON format:
{{
"tool_use": {{
"name": "calculator",
"arguments": {{
"expression": "mathematical expression"
}}
}}
}}
Example usage:
{{
"tool_use": {{
"name": "calculator",
"arguments": {{
"expression": "sqrt(144) + 25 * 2"
}}
}}
}}"""
else:
# Generic tool format
return f"""
You have access to the '{tool_name}' tool: {tool_desc}
To use it, respond with ONLY this JSON format:
{{
"tool_use": {{
"name": "{tool_name}",
"arguments": {{
// Add required arguments here
}}
}}
}}"""
def _format_tools_prompt(self) -> str:
"""
Format all available tools into a clear, structured prompt.
"""
if not self.tools:
return ""
# Build instructions for each tool
tool_instructions = []
for tool in self.tools:
tool_instructions.append(self._format_tool_instructions(tool))
# Note: the tool adapter only orchestrates tool calls; it does not generate final responses.
prompt = f"""
IMPORTANT: You have been provided with specialized tools to help answer the user's query.
IMPORTANT: The system has determined you NEED to use these tools for this query.
IMPORTANT: DO NOT Generate a response until told otherwise.
Available tools:
{''.join(tool_instructions)}
"""
return prompt
def _extract_user_query(self, messages: List[BaseMessage]) -> Optional[str]:
"""Extract the most recent user query from messages."""
for msg in reversed(messages):
if hasattr(msg, 'content') and msg.type == "human":
return msg.content
return None
def _parse_tool_calls(self, content: str) -> Optional[List[Dict[str, Any]]]:
"""
Parse tool calls from model output with multiple strategies.
Returns a list of tool call dicts, or None if none found.
"""
logger.debug(f"Parsing tool calls from content: {content[:200]}...")
# Clean the content
content = content.strip()
# Strategy 1: Direct JSON parsing
if content.startswith('{'):
try:
data = json.loads(content)
if "tool_use" in data:
tool_use = data["tool_use"]
tool_call = {
"id": "call_0",
"name": tool_use["name"],
"args": tool_use.get("arguments", {})
}
logger.info(f"Successfully parsed tool call (direct): {tool_call}")
return [tool_call]
except json.JSONDecodeError as e:
logger.debug(f"Direct JSON parsing failed: {e}")
# Strategy 2: Extract JSON blocks with regex
try:
# More robust regex pattern that handles nested objects
json_pattern = r'\\{(?:[^{}]|(?:\\{[^{}]*\\}))*"tool_use"(?:[^{}]|(?:\\{[^{}]*\\}))*\\}'
matches = re.findall(json_pattern, content, re.DOTALL)
if matches:
tool_calls = []
for i, match in enumerate(matches):
try:
data = json.loads(match)
if "tool_use" in data:
tool_use = data["tool_use"]
tool_call = {
"id": f"call_{i}",
"name": tool_use["name"],
"args": tool_use.get("arguments", {})
}
tool_calls.append(tool_call)
logger.info(f"Successfully parsed tool call (regex): {tool_call}")
except json.JSONDecodeError:
continue
if tool_calls:
return tool_calls
except Exception as e:
logger.debug(f"Regex extraction failed: {e}")
# Strategy 3: Look for tool intent without proper formatting
intent_phrases = [
"i searched", "i have searched", "searching for", "let me search",
"calculating", "let me calculate", "computing"
]
if any(phrase in content.lower() for phrase in intent_phrases):
logger.warning(
"Model expressed intent to use tools but didn't follow JSON format. "
"Consider reinforcing the prompt instructions."
)
return None
def _inject_tools_prompt(self, messages: List[BaseMessage]) -> List[BaseMessage]:
"""
Inject tool instructions into the messages in a clean way.
"""
if not self.tools:
return messages
# Check if we have tool results in the conversation
has_tool_results = any(
msg.type == "tool" for msg in messages
)
# If we already have tool results, don't inject tool prompts again
# This prevents the model from calling tools repeatedly
if has_tool_results:
# Find the tool results in the messages
tool_results_content = []
for msg in messages:
if msg.type == "tool":
tool_results_content.append(msg.content)
# Add a system message to prevent more tool calls
# But do NOT ask for synthesis - that's generate_response's job
result_processing_prompt = f"""
IMPORTANT: DO NOT Generate a response until told otherwise."""
messages_copy = messages.copy()
# Find the position after the last tool message
last_tool_idx = -1
for i, msg in enumerate(messages_copy):
if msg.type == "tool":
last_tool_idx = i
if last_tool_idx >= 0:
# Insert the processing prompt after the last tool message
messages_copy.insert(last_tool_idx + 1, SystemMessage(content=result_processing_prompt))
return messages_copy
# Original tool injection logic for first tool call
# Get the formatted tools prompt
tools_prompt = self._format_tools_prompt()
# Extract user query for context
user_query = self._extract_user_query(messages)
# Add specific encouragement based on context
if user_query and any(tool.name == "web_search" for tool in self.tools):
tools_prompt += f"\\n\\nTHE SYSTEM HAS IDENTIFIED THAT YOU NEED TO USE THE WEB SEARCH TOOL TO SEARCH FOR: {user_query}"
# Create a new message list with the tools prompt
messages_copy = messages.copy()
# Check if there's already a system message
if messages_copy and messages_copy[0].type == "system":
# Append to existing system message
messages_copy[0].content = f"{messages_copy[0].content}\\n\\n{tools_prompt}"
else:
# Create new system message at the beginning
system_msg = SystemMessage(content=tools_prompt)
messages_copy.insert(0, system_msg)
return messages_copy
def _generate(
self,
messages: List[BaseMessage],
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> ChatResult:
"""Generate with tool support"""
# Inject tool instructions if tools are bound
if self.tools:
messages = self._inject_tools_prompt(messages)
# Call the underlying model
result = self.model._generate(messages, stop, run_manager, **kwargs)
# Process the response for tool calls
if result.generations and self.tools:
generation = result.generations[0]
ai_message = generation.message
# Try to parse tool calls
tool_calls = self._parse_tool_calls(ai_message.content)
if tool_calls:
# Create a new AIMessage with tool_calls
new_message = AIMessage(
content=ai_message.content,
tool_calls=tool_calls,
additional_kwargs=getattr(ai_message, 'additional_kwargs', {})
)
# Update the generation with the new message
result.generations[0] = ChatGeneration(
message=new_message,
generation_info=generation.generation_info
)
return result
async def _agenerate(
self,
messages: List[BaseMessage],
stop: Optional[List[str]] = None,
run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> ChatResult:
"""Generate asynchronously with tool support"""
# Inject tool instructions if tools are bound
if self.tools:
messages = self._inject_tools_prompt(messages)
# Call the underlying model
result = await self.model._agenerate(messages, stop, run_manager, **kwargs)
# Process the response for tool calls
if result.generations and self.tools:
generation = result.generations[0]
ai_message = generation.message
# Try to parse tool calls
tool_calls = self._parse_tool_calls(ai_message.content)
if tool_calls:
# Create a new AIMessage with tool_calls
new_message = AIMessage(
content=ai_message.content,
tool_calls=tool_calls,
additional_kwargs=getattr(ai_message, 'additional_kwargs', {})
)
# Update the generation with the new message
result.generations[0] = ChatGeneration(
message=new_message,
generation_info=generation.generation_info
)
return result
@property
def _llm_type(self) -> str:
"""Return the type of language model"""
return f"tool_wrapper_{self.model._llm_type}"
@property
def _identifying_params(self) -> Dict[str, Any]:
"""Return the identifying parameters"""
return {
**self.model._identifying_params,
"wrapper": "ToolSupportWrapper",
"tools_count": len(self.tools)
}
def __getattr__(self, name):
"""Delegate unknown attributes to the underlying model"""
return getattr(self.model, name)
def with_multimodal_inputs(self, images=None, audios=None):
"""Pass through multimodal inputs to the underlying model if supported"""
if hasattr(self.model, 'with_multimodal_inputs'):
self.model = self.model.with_multimodal_inputs(images=images, audios=audios)
return self
def wrap_model_wi