@guyycodes/plugin-sdk

AI-powered plugin scaffolding tool - Create full-stack applications with 7+ AI models, 50+ business integrations, and production-ready infrastructure

1,295 lines (1,075 loc) • 262 kB
// Create a folder in the models folder and the sub-folders /Flux, /Phi4_multimodal, /Qwen25Math, /DeepHermes3,
// create the related Python scripts, and then use this inside createBackend.js.
const fs = require('fs-extra');
const path = require('path');
const chalk = require('chalk');

async function createModelFiles(serverPath) {
  console.log(chalk.blue('šŸ“¦ Creating model management files...'));

  // Create models directory structure
  const modelsPath = path.join(serverPath, 'models');
  fs.ensureDirSync(modelsPath);

  // Create subdirectories for each model
  const modelDirs = ['DeepHermes3', 'Flux', 'Phi4_multimodal', 'Qwen25Math', 'Qwen25VL', 'Qwen25Code', 'FluxKontext'];
  modelDirs.forEach(dir => {
    const modelDirPath = path.join(modelsPath, dir);
    fs.ensureDirSync(modelDirPath);
    // Create empty 'model' subdirectory inside each model directory
    fs.ensureDirSync(path.join(modelDirPath, 'model'));
    // Create images_output directory for the Flux model
    if (dir === 'Flux') {
      fs.ensureDirSync(path.join(modelDirPath, 'images_output'));
    }
  });

  // Create the model manager and shared infrastructure
  await createModelManager(modelsPath);
  await createModelLoader(modelsPath);
  await createToolAdapter(modelsPath);
  await createB2Downloader(modelsPath);
  await createDownloadTracker(modelsPath);
  await createMemoryOptimizer(modelsPath);
  await createAddModelReadme(modelsPath);

  // Create download scripts for each model
  await createQwen25MathDownloader(path.join(modelsPath, 'Qwen25Math'));
  await createQwen25MathEntrypoint(path.join(modelsPath, 'Qwen25Math'));
  await createQwen25MathReadme(path.join(modelsPath, 'Qwen25Math'));

  await createQwen25VLDownloader(path.join(modelsPath, 'Qwen25VL'));
  await createQwen25VLEntrypoint(path.join(modelsPath, 'Qwen25VL'));
  await createQwen25VLReadme(path.join(modelsPath, 'Qwen25VL'));

  await createFluxKontextDownloader(path.join(modelsPath, 'FluxKontext'));
  await createFluxKontextEntrypoint(path.join(modelsPath, 'FluxKontext'));
  await createFluxKontextReadme(path.join(modelsPath, 'FluxKontext'));

  await createFluxDownloader(path.join(modelsPath, 'Flux'));
  await createFluxEntrypoint(path.join(modelsPath, 'Flux'));
  await createFluxReadme(path.join(modelsPath, 'Flux'));

  await createQwen25CodeDownloader(path.join(modelsPath, 'Qwen25Code'));
  await createQwen25CodeEntrypoint(path.join(modelsPath, 'Qwen25Code')); // no readme for the code model

  await createDeepHermes3Downloader(path.join(modelsPath, 'DeepHermes3'));
  await createDeepHermes3Entrypoint(path.join(modelsPath, 'DeepHermes3'));
  await createDeepHermes3Readme(path.join(modelsPath, 'DeepHermes3'));

  await createPhi4MultimodalDownloader(path.join(modelsPath, 'Phi4_multimodal'));
  await createPhi4MultimodalEntrypoint(path.join(modelsPath, 'Phi4_multimodal'));
  await createPhi4MultimodalReadme(path.join(modelsPath, 'Phi4_multimodal'));

  console.log(chalk.green('āœ… Model management files created successfully'));
}

async function createAddModelReadme(modelsPath) {
  const readmePy = `# Adding a New Model to the System

This guide explains how to add a new model to the plugin system. Follow these steps in order:

## 1. Create Model Directory

- Create a new directory under \`/models/\` with your model name (e.g., \`YourModelName/\`)
- Inside this directory, create:
  - \`download_model.py\` - Handles model downloading from HuggingFace and B2 buckets
  - \`entrypoint.py\` - Implements the LangChain chat interface for your model
  - \`model/\` - Empty directory where the model files will be downloaded

## 2. Frontend Configuration

Update \`src/client/src/pages/Settings.jsx\`:
- Add your model to the \`models\` object with a key, label, size, and description
- The key you use here (e.g., \`YourModelName\`) must match the directory name
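
As a rough sketch (the values below are placeholders and the actual shape of the existing \`models\` object in \`Settings.jsx\` may differ), the new entry could look something like this:

\`\`\`jsx
const models = {
  // ...existing models...
  YourModelName: {                 // key: must match the model directory name
    label: 'Your Model Name',      // label shown in the Settings page
    size: '~15 GB',                // approximate download size (illustrative value)
    description: 'One-line summary of what the model is good at'
  }
};
\`\`\`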

## 3. Backend Configuration

### 3.1 App Configuration

Update \`app.config.json\`:
- Add your model to the \`models\` array with:
  - \`name\`: Must match the frontend key
  - \`entrypoint\`: Path to your model's entrypoint.py

### 3.2 Model Manager

Update \`src/server/models/model_manager.py\`:
- Add your model to the \`SUPPORTED_MODELS\` dictionary with:
  - HuggingFace model ID
  - Download script name (usually \`download_model.py\`)
- If your directory name differs from the model key, add special handling in the \`is_model_available()\` and \`download_model()\` methods

### 3.3 Model Loader

Update \`src/server/models/model_loader.py\`:
- Add your model's factory function (e.g., \`create_your_model_chat\`) to the \`_load_local_model()\` method
- Also add the class name as a fallback (e.g., \`YourModelChat\`)

## 4. Implementation Requirements

### download_model.py

Your download script should:
- Support both HuggingFace and B2 bucket downloads
- Use environment detection to choose the appropriate source
- Accept the \`--force-download\` flag
- Handle the model size appropriately

### entrypoint.py

Your entrypoint should:
- Implement \`BaseChatModel\` from LangChain
- Include a factory function (e.g., \`create_your_model_chat()\`)
- Handle device selection (MPS, CUDA, CPU)
- Implement the required methods: \`_generate\`, \`_agenerate\`, \`_stream\`, \`_astream\`
- Set appropriate model properties and parameters

## 5. Testing

After adding all configurations:
1. Check that the model appears in the Settings page
2. Test the download functionality
3. Verify the model loads correctly when selected
4. Test chat functionality with your model

## Directory Structure Example

\`\`\`
models/
ā”œā”€ā”€ YourModelName/
│   ā”œā”€ā”€ download_model.py
│   ā”œā”€ā”€ entrypoint.py
│   └── model/
│       └── (downloaded model files)
ā”œā”€ā”€ Qwen25Math/
ā”œā”€ā”€ Qwen25VL/
ā”œā”€ā”€ Phi4_multimodal/
└── README.md
\`\`\`

## Important Notes

- Model names must be consistent across all configuration files
- The frontend key, app.config.json name, and model_manager key must all match
- Special directory mappings (like phi4 → Phi4_multimodal) require additional handling
- Always test the full flow from download to usage
`;
  fs.writeFileSync(path.join(modelsPath, 'README.md'), readmePy);
}

async function createMemoryOptimizer(modelsPath) {
  const memoryOptimizerPy = `# /server/models/memory_optimizer.py
# Memory optimizer for models
import argparse
import logging
import psutil
import os
import gc
import sys
import torch
from typing import Dict, Any, Optional, Tuple
from transformers import AutoModelForCausalLM, AutoTokenizer

# ----------------------------------------------------------------------
# Logger configuration
# ----------------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ----------------------------------------------------------------------
# Core Utilities
# ----------------------------------------------------------------------
def get_device_info() -> Dict[str, Any]:
    """
    Get comprehensive device and memory information.

    Returns:
        device_info (dict): Dictionary containing RAM/VRAM availability,
        CUDA/MPS detection, and recommended device.
""" device_info = { "ram_total_gb": 0.0, "ram_available_gb": 0.0, "has_cuda": False, "has_mps": False, "cuda_devices": {}, "mps_info": {}, "recommended_device": "cpu" } # Get RAM info ram = psutil.virtual_memory() device_info["ram_total_gb"] = ram.total / (1024**3) device_info["ram_available_gb"] = ram.available / (1024**3) # Check CUDA availability and get VRAM info if torch.cuda.is_available(): device_info["has_cuda"] = True device_info["recommended_device"] = "cuda" logger.info(f"CUDA detected: {torch.cuda.device_count()} device(s)") for i in range(torch.cuda.device_count()): props = torch.cuda.get_device_properties(i) torch.cuda.set_device(i) total_memory = props.total_memory / (1024**3) allocated_memory = torch.cuda.memory_allocated(i) / (1024**3) available_memory = total_memory - allocated_memory device_info["cuda_devices"][f"cuda:{i}"] = { "name": props.name, "compute_capability": f"{props.major}.{props.minor}", "total_memory_gb": total_memory, "allocated_memory_gb": allocated_memory, "available_memory_gb": available_memory, "multiprocessor_count": props.multi_processor_count, "supports_bf16": torch.cuda.is_bf16_supported(i) } logger.info(f" GPU {i}: {props.name} | {available_memory:.1f}GB available / {total_memory:.1f}GB total") # Check MPS availability (Apple Silicon) if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): device_info["has_mps"] = True if not device_info["has_cuda"]: # Only use MPS if no CUDA device_info["recommended_device"] = "mps" device_info["mps_info"] = { "is_built": torch.backends.mps.is_built(), "is_available": torch.backends.mps.is_available() } logger.info(f"MPS detected: Built={torch.backends.mps.is_built()}, Available={torch.backends.mps.is_available()}") # Log summary logger.info(f"System Memory: {device_info['ram_available_gb']:.1f}GB available / {device_info['ram_total_gb']:.1f}GB total") logger.info(f"Recommended device: {device_info['recommended_device']}") return device_info def get_optimal_model_config(model_size_gb: float = 8.0) -> Dict[str, Any]: """ Get optimal model loading config for GPU-only deployment. Maximizes GPU memory usage for provisioned GPU workloads. If CPU is the only fallback, exit with an error message. Args: model_size_gb (float): Estimated model size in GB. Returns: config (dict): Dictionary containing recommended loading parameters (device_map, torch_dtype, quantization, etc.) optimized for GPU-only usage. """ device_info = get_device_info() ram_available = device_info["ram_available_gb"] has_cuda = device_info["has_cuda"] has_mps = device_info["has_mps"] # If no CUDA or MPS, exit cleanly as requested if not has_cuda and not has_mps: logger.error("āŒ CPU only - GPU-intensive operations CPU not supported. 
Exiting.") sys.exit(0) logger.info(f"Optimizing for a {model_size_gb:.1f}GB model") # Base configuration config = { "low_cpu_mem_usage": True, "trust_remote_code": True, } # === CUDA OPTIMIZATION SCENARIOS === if has_cuda: logger.info("šŸš€ CUDA detected - applying CUDA optimizations") # Summed VRAM across all GPUs total_vram = sum(gpu["available_memory_gb"] for gpu in device_info["cuda_devices"].values()) primary_gpu = list(device_info["cuda_devices"].values())[0] # Inspect first GPU for BF16 logger.info(f"Total VRAM available: {total_vram:.1f}GB across {len(device_info['cuda_devices'])} GPU(s)") # Choose optimal data type based on GPU capabilities # Choose BF16 if supported, else float16 if primary_gpu["supports_bf16"]: config["torch_dtype"] = torch.bfloat16 logger.info("āœ“ Using bfloat16") else: config["torch_dtype"] = torch.float16 logger.info("āœ“ Using float16") # Example scenarios (same logic you used, slightly consolidated): # Scenario 1: Very low RAM - Using 4-bit or 8-bit quantization. if ram_available < 16 and total_vram <= model_size_gb * 1.0: logger.info("šŸ—œļø Very low RAM - Using 4-bit or 8-bit quantization.") config.update({ "load_in_4bit": True, "bnb_4bit_compute_dtype": config["torch_dtype"], "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4", "device_map": "auto", "max_memory": _get_cuda_max_memory_config(device_info, ram_available), }) elif ram_available < 32 and total_vram <= model_size_gb * 1.0: logger.info("šŸŽÆ Low RAM + Sufficient VRAM - Using 8-bit quantization.") config.update({ "load_in_8bit": True, "llm_int8_enable_fp32_cpu_offload": True, "device_map": "auto", "max_memory": _get_cuda_max_memory_config(device_info, ram_available), }) # Scenario 3: Sufficient RAM - GPU-only loading else: logger.info("āœ… GPU-only mode - maximizing GPU memory usage.") config.update({ "device_map": "auto", "max_memory": _get_cuda_max_memory_config(device_info, ram_available), }) # === MPS FALLBACK (less optimized) === elif has_mps: logger.info("šŸŽ MPS detected - minimal MPS config.") config.update({ "torch_dtype": torch.float16, "device_map": None, }) logger.info(f"Final config: {config}") return config def _get_cuda_max_memory_config(device_info: Dict[str, Any], ram_available: float) -> Dict[str, str]: """ Generate max_memory configuration dictionary for CUDA device mapping. This function calculates optimal memory allocation for GPUs only, maximizing GPU usage for provisioned GPU workloads. 
Args: device_info (Dict[str, Any]): Device information from get_device_info() ram_available (float): Available system RAM in GB Returns: Dict[str, str]: Memory configuration dictionary where: - Keys: Device identifiers (integers for GPUs only) - Values: Memory allocation strings (e.g., "22GB", "18GB") - Example: {0: "22GB", 1: "18GB"} """ max_memory = {} # GPU-only configuration - no CPU allocation for offloading logger.info("šŸš€ GPU-only configuration - maximizing GPU memory usage.") # Maximize GPU memory allocation for each device for device_id, gpu in device_info["cuda_devices"].items(): # Extract integer device ID from string like "cuda:0" -> 0 gpu_id = int(device_id.split(":")[-1]) available_vram = gpu["available_memory_gb"] # More aggressive GPU memory allocation since we're GPU-only if available_vram > 32: # Reserve only 1GB for CUDA overhead on high-end GPUs vram_for_model = available_vram - 1 logger.info(f"GPU {device_id}: reserving {vram_for_model:.1f}GB for model (GPU-only mode).") elif available_vram > 16: # Reserve 1GB for CUDA overhead on mid-range GPUs vram_for_model = available_vram - 1 logger.info(f"GPU {device_id}: reserving {vram_for_model:.1f}GB for model (GPU-only mode).") else: # Reserve minimal overhead on smaller GPUs vram_for_model = max(0.5, available_vram - 0.5) logger.info(f"GPU {device_id}: reserving {vram_for_model:.1f}GB for model (GPU-only mode).") # Use integer key for GPU devices max_memory[gpu_id] = f"{vram_for_model:.1f}GB" logger.info(f"max_memory config: {max_memory}") return max_memory def optimize_for_generation(model, tokenizer) -> Tuple[Any, Any]: """ Optimize loaded model and tokenizer for inference. """ device_info = get_device_info() # If CPU only, exit as requested if not device_info["has_cuda"] and not device_info["has_mps"]: logger.error("āŒ CPU only - GPU-intensive ops not supported. Exiting.") sys.exit(0) if device_info["has_cuda"]: # Enable xformers memory efficient attention if available if hasattr(model, 'enable_xformers_memory_efficient_attention'): try: model.enable_xformers_memory_efficient_attention() logger.info("āœ“ Enabled xformers memory efficient attention.") except Exception as e: logger.info(f"⚠ xformers not available: {e}") # Check if Flash Attention 2 is active (some models/hubs may have it) if hasattr(model.config, '_attn_implementation') and model.config._attn_implementation == 'flash_attention_2': logger.info("āœ“ Flash Attention 2 is active.") # Enable attention slicing if available if hasattr(model, 'enable_attention_slicing'): model.enable_attention_slicing("max") logger.info("āœ“ Enabled attention slicing for CUDA.") # Enable gradient checkpointing for memory efficiency (during inference this is minor) if hasattr(model, 'gradient_checkpointing_enable'): model.gradient_checkpointing_enable() logger.info("āœ“ Enabled gradient checkpointing.") torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True logger.info("āœ“ Enabled TF32 for CUDA.") elif device_info["has_mps"]: logger.info("šŸŽ Applying MPS optimizations.") if hasattr(model, 'enable_attention_slicing'): model.enable_attention_slicing("max") logger.info("āœ“ Enabled attention slicing for MPS.") # === UNIVERSAL OPTIMIZATIONS === model.eval() for param in model.parameters(): param.requires_grad = False logger.info("āœ“ Model set to eval mode; gradients disabled.") return model, tokenizer def cleanup_memory(): """ Free GPU caches and run garbage collection. 
""" device_info = get_device_info() # If CPU only, just do a GC if not device_info["has_cuda"] and not device_info["has_mps"]: logger.error("āŒ CPU only - skipping GPU memory cleanup. Exiting.") sys.exit(0) if device_info["has_cuda"]: try: # Clear CUDA cache on all devices for i in range(torch.cuda.device_count()): torch.cuda.set_device(i) torch.cuda.empty_cache() torch.cuda.ipc_collect() # Reset memory stats torch.cuda.reset_peak_memory_stats() total_memory = sum( torch.cuda.get_device_properties(i).total_memory / (1024**3) for i in range(torch.cuda.device_count()) ) allocated_memory = sum( torch.cuda.memory_allocated(i) / (1024**3) for i in range(torch.cuda.device_count()) ) logger.info(f"āœ“ CUDA cleanup complete: {allocated_memory:.1f}GB / {total_memory:.1f}GB allocated.") except Exception as e: logger.warning(f"⚠ CUDA cleanup failed: {e}") elif device_info["has_mps"]: # MPS has limited cleanup capabilities try: if hasattr(torch.backends.mps, 'empty_cache'): torch.backends.mps.empty_cache() logger.info("āœ“ MPS cache cleared.") else: logger.info("MPS cleanup not available in this PyTorch version.") except Exception as e: logger.warning(f"⚠ MPS cleanup failed: {e}") collected = gc.collect() logger.info(f"āœ“ Python garbage collection: {collected} objects collected.") ram = psutil.virtual_memory() logger.info(f"āœ“ System RAM after cleanup: {ram.available / (1024**3):.1f}GB available / {ram.total / (1024**3):.1f}GB total.") # ---------------------------------------------------------------------- # Convenience Functions # ---------------------------------------------------------------------- def get_memory_info() -> Dict[str, Any]: """Backward compatibility function - returns device_info.""" return get_device_info() def log_memory_summary(): """ Log a comprehensive memory summary. """ device_info = get_device_info() logger.info("=== MEMORY SUMMARY ===") logger.info(f"RAM: {device_info['ram_available_gb']:.1f}GB / {device_info['ram_total_gb']:.1f}GB") logger.info(f"Recommended device: {device_info['recommended_device']}") if device_info["has_cuda"]: logger.info("CUDA GPUs:") for device_id, gpu in device_info["cuda_devices"].items(): logger.info(f" {device_id}: {gpu['name']}") logger.info(f" VRAM: {gpu['available_memory_gb']:.1f}GB / {gpu['total_memory_gb']:.1f}GB") logger.info(f" Compute: {gpu['compute_capability']}, BF16: {gpu['supports_bf16']}") if device_info["has_mps"]: logger.info(f"MPS: Built={device_info['mps_info']['is_built']}, " f"Available={device_info['mps_info']['is_available']}") logger.info("======================") def get_recommended_batch_size(model_size_gb: float = 8.0) -> int: """ Heuristic for recommended batch size based on VRAM. 
""" device_info = get_device_info() if device_info["has_cuda"]: max_vram = max(g["available_memory_gb"] for g in device_info["cuda_devices"].values()) available_for_batching = max_vram - model_size_gb if available_for_batching > 16: return 4 elif available_for_batching > 8: return 2 else: return 1 # Default for CPU/MPS return 1 # ---------------------------------------------------------------------- # Entry Point for Testing # ---------------------------------------------------------------------- def main(): parser = argparse.ArgumentParser(description="Memory optimizer test script.") parser.add_argument("--model_size_gb", type=float, default=8.0, help="Estimated model size in GB.") args = parser.parse_args() logger.info("Starting memory optimizer test...") device_info = get_device_info() config = get_optimal_model_config(args.model_size_gb) log_memory_summary() recommended_bs = get_recommended_batch_size(args.model_size_gb) logger.info(f"Recommended batch size: {recommended_bs}") logger.info("Loading a small test model (gpt2) to demonstrate optimization...") if not device_info["has_cuda"] and not device_info["has_mps"]: logger.error("āŒ CPU only - GPU-intensive ops not supported. Exiting.") sys.exit(0) model_name = "gpt2" model = AutoModelForCausalLM.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) model, tokenizer = optimize_for_generation(model, tokenizer) cleanup_memory() logger.info("Memory optimizer test complete.") # ---------------------------------------------------------------------- if __name__ == "__main__": main() `; fs.writeFileSync(path.join(modelsPath, 'memory_optimizer.py'), memoryOptimizerPy); } async function createDownloadTracker(modelsPath) { const downloadTrackerPy = `""" Download progress tracker for model downloads """ import time from pathlib import Path def track_download_progress(model_dir, expected_size_gb, stop_event): """Track download progress by monitoring directory size. 
Args: model_dir: Path to the model directory expected_size_gb: Expected total size in GB stop_event: Threading event to stop the tracker """ expected_bytes = expected_size_gb * 1024 * 1024 * 1024 model_dir = Path(model_dir) while not stop_event.is_set(): try: # Calculate current directory size current_bytes = sum(f.stat().st_size for f in model_dir.rglob('*') if f.is_file()) current_gb = current_bytes / (1024 * 1024 * 1024) percentage = min(100, (current_bytes / expected_bytes) * 100) print(f"\\ršŸ“¦ Progress: {percentage:.1f}% ({current_gb:.1f}GB / {expected_size_gb}GB)", end='', flush=True) time.sleep(2) # Update every 2 seconds except Exception: pass # Ignore errors during directory traversal print() # New line after progress tracking ends `; fs.writeFileSync(path.join(modelsPath, 'download_tracker.py'), downloadTrackerPy); } async function createB2Downloader(modelsPath) { const b2DownloaderPy = `""" Backblaze B2 model downloader """ import os import sys import subprocess from pathlib import Path import threading import time import logging # Import the download tracker sys.path.append(str(Path(__file__).parent)) from download_tracker import track_download_progress logger = logging.getLogger(__name__) # Environment variables - optional, checked at runtime B2_MODEL_BUCKET_ID_API_KEY = os.getenv("B2_MODEL_BUCKET_ID_API_KEY") B2_MODEL_BUCKET_API_KEY = os.getenv("B2_MODEL_BUCKET_API_KEY") def download_from_b2(model_dir: Path, model_name: str, force_download: bool = False, environment: str = "localhost", bucket_name: str = "bucket-models", model_size_gb: float = 10.0, use_custom_dirname: bool = False, custom_dirname: str = ""): """Download model from B2 bucket. Args: model_dir: Directory to download model to model_name: Name of the model (used as folder name in B2) force_download: Force re-download even if model exists environment: Environment name (affects progress tracking) bucket_name: B2 bucket name model_size_gb: Approximate model size in GB for progress tracking """ # Skip re-download if the directory is non-empty if model_dir.exists() and any(model_dir.iterdir()) and not force_download: print(f"Model already exists in {model_dir}. 
Skipping download (use --force-download to overwrite).") return print(f"Downloading {model_name} from B2 bucket...") print("This may take time depending on model size...") try: # Create model directory model_dir.mkdir(parents=True, exist_ok=True) # Make sure environment variables are set or at least try the default B2 ones b2_key_id = B2_MODEL_BUCKET_ID_API_KEY b2_key_value = B2_MODEL_BUCKET_API_KEY if not b2_key_id or not b2_key_value: print("āŒ Error: B2 credentials not found in environment.") print("B2 model downloading requires B2_MODEL_BUCKET_ID_API_KEY and B2_MODEL_BUCKET_API_KEY environment variables.") print("These are optional for basic app functionality but required for B2 model downloads.") raise ValueError("B2 credentials not available") # Authorize B2 (headless, no interactive prompt) auth_result = subprocess.run( ["b2", "account", "authorize", b2_key_id, b2_key_value], capture_output=True, text=True ) if auth_result.returncode != 0: print("āŒ Error: B2 authentication failed") print(auth_result.stderr) raise RuntimeError(f"B2 authentication failed: {auth_result.stderr}") # Decide how many threads to use in a container import multiprocessing cpu_count = multiprocessing.cpu_count() optimal_threads = min(4, max(1, cpu_count // 2)) print(f"Optimal threads: {optimal_threads} (Detected {cpu_count} CPUs)") # b2 sync command if use_custom_dirname: b2_path = f"b2://{bucket_name}/{model_name}/{custom_dirname}/" else: b2_path = f"b2://{bucket_name}/{model_name}/model/" print(f"Downloading from: {b2_path}") print(f"Downloading to: {model_dir}") sync_cmd = [ "b2", "sync", "--threads", str(optimal_threads), "--write-buffer-size", "100000000", # Adjust if you have enough memory b2_path, f"{model_dir}/" # trailing slash ] stop_event = None progress_thread = None try: if environment != "localhost": stop_event = threading.Event() progress_thread = threading.Thread( target=track_download_progress, args=(model_dir, model_size_gb, stop_event), daemon=False ) progress_thread.start() # Run download result = subprocess.run(sync_cmd, capture_output=True, text=True) finally: # Always clean up the thread, even if an exception occurred if progress_thread and stop_event: stop_event.set() progress_thread.join(timeout=1.0) # Properly wait for thread to finish if result.returncode != 0: print(f"āŒ Error downloading from B2: {result.stderr}") raise RuntimeError(f"B2 download failed: {result.stderr}") print(f"āœ… Successfully downloaded {model_name} from B2") print(f"Model saved to: {model_dir}") except Exception as e: print(f"Error downloading from B2: {e}") raise class B2ModelDownloader: """Download models from Backblaze B2 bucket""" `; fs.writeFileSync(path.join(modelsPath, 'b2_downloader.py'), b2DownloaderPy); } async function createModelManager(modelsPath) { const modelManagerPy = `""" /server/models/model_manager.py Simple model manager for checking and downloading models """ import os import subprocess from pathlib import Path from typing import Dict, Any, Optional import logging import json import sys sys.path.append(str(Path(__file__).parent.parent)) from config.utils import get_environment logger = logging.getLogger(__name__) class ModelManager: """Simple model manager for handling model downloads""" # Map of model names to their HuggingFace IDs and download scripts SUPPORTED_MODELS = { "Qwen25Math": { "hf_id": "Qwen/Qwen2.5-Math-7B", "download_script": "download_model.py" }, "Qwen25VL": { "hf_id": "Qwen/Qwen2.5-VL-7B-Instruct", "download_script": "download_model.py" }, "DeepHermes3": { "hf_id": 
"NousResearch/DeepHermes-3-Llama-3-8B-Preview", "download_script": "download_model.py" }, "Flux": { "hf_id": "black-forest-labs/FLUX.1-schnell", "download_script": "download_model.py" }, "FluxKontext": { "hf_id": "black-forest-labs/FLUX.1-Kontext-Dev", "download_script": "download_model.py" }, "phi4": { "hf_id": "microsoft/Phi-4-multimodal-instruct", "download_script": "download_model.py" }, "Qwen25Code": { "hf_id": "Qwen/Qwen2.5-Coder-32B-Instruct", "download_script": "download_model.py" }, # Add more models here as needed } def __init__(self): self.models_dir = Path(__file__).parent def is_model_available(self, model_name: str) -> bool: """Check if a model is downloaded and ready to use""" if model_name.startswith("gpt-"): # OpenAI models are always available (API-based) return True if model_name not in self.SUPPORTED_MODELS: return False # Check if model directory exists and has files # Handle special directory names model_dir_name = "Phi4_multimodal" if model_name == "phi4" else model_name model_dir = self.models_dir / model_dir_name / "model" if model_dir.exists() and any(model_dir.iterdir()): return True return False def download_model(self, model_name: str, progress_callback=None) -> Dict[str, Any]: """Download a model if it's supported Args: model_name: Name of the model to download progress_callback: Optional callback function to receive progress updates """ if model_name not in self.SUPPORTED_MODELS: return { "success": False, "error": f"Model '{model_name}' is not supported. Supported models: {list(self.SUPPORTED_MODELS.keys())}" } model_info = self.SUPPORTED_MODELS[model_name] # Handle special directory names model_dir_name = "Phi4_multimodal" if model_name == "phi4" else model_name download_script = self.models_dir / model_dir_name / model_info["download_script"] if not download_script.exists(): return { "success": False, "error": f"Download script not found for {model_name}" } try: logger.info(f"Starting download for {model_name}...") environment = get_environment() if environment == "localhost": # Run download with output directly to console (not captured) # This allows HuggingFace download progress to be visible in development result = subprocess.run( ["python", str(download_script)], cwd=download_script.parent, # Don't capture output - let it display directly capture_output=False, text=True ) if result.returncode == 0: logger.info(f"Successfully downloaded {model_name}") return { "success": True, "message": f"Model '{model_name}' downloaded successfully and is ready to use" } else: logger.error(f"Download failed with return code: {result.returncode}") return { "success": False, "error": f"Download failed with return code: {result.returncode}" } else: # Stream output for non-hugging-face environments if progress_callback: progress_callback(f"Starting download for {model_name}...") process = subprocess.Popen( ["python", str(download_script)], cwd=download_script.parent, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1 ) # Stream output line by line for line in process.stdout: line = line.strip() if line: logger.info(line) if progress_callback: # Send progress updates if "Progress:" in line or "Downloading" in line or "%" in line: progress_callback(line) process.wait() if process.returncode == 0: logger.info(f"Successfully downloaded {model_name}") return { "success": True, "message": f"Model '{model_name}' downloaded successfully and is ready to use" } else: logger.error(f"Download failed with return code: {process.returncode}") return { "success": 
False, "error": f"Download failed with return code: {process.returncode}" } except Exception as e: logger.error(f"Error downloading model: {e}") return { "success": False, "error": f"Error downloading model: {str(e)}" } def get_available_models(self) -> Dict[str, bool]: """Get a list of all supported models and their availability status""" models = {"gpt-4o-mini": True} # Always available for model_name in self.SUPPORTED_MODELS: models[model_name] = self.is_model_available(model_name) return models # Global instance model_manager = ModelManager()`; fs.writeFileSync(path.join(modelsPath, 'model_manager.py'), modelManagerPy); } async function createToolAdapter(modelsPath) { const toolAdapterPy = `""" /server/models/tool_adapter.py Tool adapter for models that don't support native tool binding """ import json import re import logging from typing import List, Dict, Any, Optional, Union from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import AIMessage, BaseMessage, SystemMessage from langchain_core.tools import BaseTool from langchain_core.outputs import ChatResult, ChatGeneration from langchain_core.callbacks.manager import CallbackManagerForLLMRun, AsyncCallbackManagerForLLMRun logger = logging.getLogger(__name__) class ToolSupportWrapper(BaseChatModel): """ Wrapper that adds tool support to models that don't have native tool binding. Uses prompt engineering to instruct the model how to call tools. """ model: BaseChatModel tools: List[BaseTool] = [] def __init__(self, model: BaseChatModel, **kwargs): # Include model in kwargs for Pydantic validation kwargs['model'] = model super().__init__(**kwargs) def bind_tools(self, tools: List[BaseTool]) -> "ToolSupportWrapper": """Bind tools to this wrapper""" self.tools = tools return self def _format_tool_instructions(self, tool: BaseTool) -> str: """ Format instructions for a specific tool with clear JSON format and examples. """ tool_name = tool.name tool_desc = tool.description # Tool-specific formatting with examples if tool_name == "web_search": return f""" You have access to a 'web_search' tool that can find current information. To use it, respond with ONLY this JSON format: {{ "tool_use": {{ "name": "web_search", "arguments": {{ "query": "search query" }} }} }} Example usage: {{ "tool_use": {{ "name": "web_search", "arguments": {{ "query": "latest AI research breakthroughs 2025" }} }} }}""" elif tool_name == "calculator": return f""" You have access to a 'calculator' tool for solving mathematical expressions. To use it, respond with ONLY this JSON format: {{ "tool_use": {{ "name": "calculator", "arguments": {{ "expression": "mathematical expression" }} }} }} Example usage: {{ "tool_use": {{ "name": "calculator", "arguments": {{ "expression": "sqrt(144) + 25 * 2" }} }} }}""" else: # Generic tool format return f""" You have access to the '{tool_name}' tool: {tool_desc} To use it, respond with ONLY this JSON format: {{ "tool_use": {{ "name": "{tool_name}", "arguments": {{ // Add required arguments here }} }} }}""" def _format_tools_prompt(self) -> str: """ Format all available tools into a clear, structured prompt. """ if not self.tools: return "" # Build instructions for each tool tool_instructions = [] for tool in self.tools: tool_instructions.append(self._format_tool_instructions(tool)) # dont confuse the role of tool_adapter, it orchestrates tools not create responses. prompt = f""" IMPORTANT: You have been provided with specialized tools to help answer the user's query. 
IMPORTANT: The system has determined you NEED to use these tools for this query. IMPORTANT: DO NOT Generate a response until told otherwise. Available tools: {''.join(tool_instructions)} """ return prompt def _extract_user_query(self, messages: List[BaseMessage]) -> Optional[str]: """Extract the most recent user query from messages.""" for msg in reversed(messages): if hasattr(msg, 'content') and msg.type == "human": return msg.content return None def _parse_tool_calls(self, content: str) -> Optional[List[Dict[str, Any]]]: """ Parse tool calls from model output with multiple strategies. Returns a list of tool call dicts, or None if none found. """ logger.debug(f"Parsing tool calls from content: {content[:200]}...") # Clean the content content = content.strip() # Strategy 1: Direct JSON parsing if content.startswith('{'): try: data = json.loads(content) if "tool_use" in data: tool_use = data["tool_use"] tool_call = { "id": "call_0", "name": tool_use["name"], "args": tool_use.get("arguments", {}) } logger.info(f"Successfully parsed tool call (direct): {tool_call}") return [tool_call] except json.JSONDecodeError as e: logger.debug(f"Direct JSON parsing failed: {e}") # Strategy 2: Extract JSON blocks with regex try: # More robust regex pattern that handles nested objects json_pattern = r'\\{(?:[^{}]|(?:\\{[^{}]*\\}))*"tool_use"(?:[^{}]|(?:\\{[^{}]*\\}))*\\}' matches = re.findall(json_pattern, content, re.DOTALL) if matches: tool_calls = [] for i, match in enumerate(matches): try: data = json.loads(match) if "tool_use" in data: tool_use = data["tool_use"] tool_call = { "id": f"call_{i}", "name": tool_use["name"], "args": tool_use.get("arguments", {}) } tool_calls.append(tool_call) logger.info(f"Successfully parsed tool call (regex): {tool_call}") except json.JSONDecodeError: continue if tool_calls: return tool_calls except Exception as e: logger.debug(f"Regex extraction failed: {e}") # Strategy 3: Look for tool intent without proper formatting intent_phrases = [ "i searched", "i have searched", "searching for", "let me search", "calculating", "let me calculate", "computing" ] if any(phrase in content.lower() for phrase in intent_phrases): logger.warning( "Model expressed intent to use tools but didn't follow JSON format. " "Consider reinforcing the prompt instructions." ) return None def _inject_tools_prompt(self, messages: List[BaseMessage]) -> List[BaseMessage]: """ Inject tool instructions into the messages in a clean way. 
""" if not self.tools: return messages # Check if we have tool results in the conversation has_tool_results = any( msg.type == "tool" for msg in messages ) # If we already have tool results, don't inject tool prompts again # This prevents the model from calling tools repeatedly if has_tool_results: # Find the tool results in the messages tool_results_content = [] for msg in messages: if msg.type == "tool": tool_results_content.append(msg.content) # Add a system message to prevent more tool calls # But do NOT ask for synthesis - that's generate_response's job result_processing_prompt = f""" IMPORTANT: DO NOT Generate a response until told otherwise.""" messages_copy = messages.copy() # Find the position after the last tool message last_tool_idx = -1 for i, msg in enumerate(messages_copy): if msg.type == "tool": last_tool_idx = i if last_tool_idx >= 0: # Insert the processing prompt after the last tool message messages_copy.insert(last_tool_idx + 1, SystemMessage(content=result_processing_prompt)) return messages_copy # Original tool injection logic for first tool call # Get the formatted tools prompt tools_prompt = self._format_tools_prompt() # Extract user query for context user_query = self._extract_user_query(messages) # Add specific encouragement based on context if user_query and any(tool.name == "web_search" for tool in self.tools): tools_prompt += f"\\n\\nTHE SYSTEM HAS IDENTIFIED THAT YOU NEED TO USE THE WEB SEARCH TOOL TO SEARCH FOR: {user_query}" # Create a new message list with the tools prompt messages_copy = messages.copy() # Check if there's already a system message if messages_copy and messages_copy[0].type == "system": # Append to existing system message messages_copy[0].content = f"{messages_copy[0].content}\\n\\n{tools_prompt}" else: # Create new system message at the beginning system_msg = SystemMessage(content=tools_prompt) messages_copy.insert(0, system_msg) return messages_copy def _generate( self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any, ) -> ChatResult: """Generate with tool support""" # Inject tool instructions if tools are bound if self.tools: messages = self._inject_tools_prompt(messages) # Call the underlying model result = self.model._generate(messages, stop, run_manager, **kwargs) # Process the response for tool calls if result.generations and self.tools: generation = result.generations[0] ai_message = generation.message # Try to parse tool calls tool_calls = self._parse_tool_calls(ai_message.content) if tool_calls: # Create a new AIMessage with tool_calls new_message = AIMessage( content=ai_message.content, tool_calls=tool_calls, additional_kwargs=getattr(ai_message, 'additional_kwargs', {}) ) # Update the generation with the new message result.generations[0] = ChatGeneration( message=new_message, generation_info=generation.generation_info ) return result async def _agenerate( self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, **kwargs: Any, ) -> ChatResult: """Generate asynchronously with tool support""" # Inject tool instructions if tools are bound if self.tools: messages = self._inject_tools_prompt(messages) # Call the underlying model result = await self.model._agenerate(messages, stop, run_manager, **kwargs) # Process the response for tool calls if result.generations and self.tools: generation = result.generations[0] ai_message = generation.message # Try to parse tool calls tool_calls = 
self._parse_tool_calls(ai_message.content) if tool_calls: # Create a new AIMessage with tool_calls new_message = AIMessage( content=ai_message.content, tool_calls=tool_calls, additional_kwargs=getattr(ai_message, 'additional_kwargs', {}) ) # Update the generation with the new message result.generations[0] = ChatGeneration( message=new_message, generation_info=generation.generation_info ) return result @property def _llm_type(self) -> str: """Return the type of language model""" return f"tool_wrapper_{self.model._llm_type}" @property def _identifying_params(self) -> Dict[str, Any]: """Return the identifying parameters""" return { **self.model._identifying_params, "wrapper": "ToolSupportWrapper", "tools_count": len(self.tools) } def __getattr__(self, name): """Delegate unknown attributes to the underlying model""" return getattr(self.model, name) def with_multimodal_inputs(self, images=None, audios=None): """Pass through multimodal inputs to the underlying model if supported""" if hasattr(self.model, 'with_multimodal_inputs'): self.model = self.model.with_multimodal_inputs(images=images, audios=audios) return self def wrap_model_wi