#!/bin/bash
#
# Entrypoint for the distributed-llama controller container.
# Either downloads a model (--download <name>, then exits) or launches the
# dllama-api server against a previously downloaded model, optionally
# fanning inference out to a list of worker nodes (--workers ...).
set -euo pipefail

# Default values
MODEL_NAME=""
API_PORT=9999
NTHREADS=4
MAX_SEQ_LEN=4096
WORKERS=()                 # worker addresses; filled from everything after --workers
BUFFER_FLOAT_TYPE="q80"

usage() {
  echo "Usage: docker run distributed-llama-controller [OPTIONS]"
  echo ""
  echo "Options:"
  echo "  --download            Download a model and exit"
  echo "  --model               Model name to use"
  echo "  --port                API server port (default: 9999)"
  echo "  --nthreads            Number of threads (default: 4)"
  echo "  --max-seq-len         Maximum sequence length (default: 4096)"
  echo "  --buffer-float-type   Buffer float type (default: q80)"
  echo "  --workers             Space-separated list of worker addresses"
  echo ""
  echo "Examples:"
  echo "  # Download a model"
  echo "  docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
  echo ""
  echo "  # Run API server with workers"
  echo "  docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \\"
  echo "    --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999"
}

# Parse command line arguments.
# ${2:?...} aborts with a clear message if an option's value is missing,
# instead of silently consuming the next flag or an empty string.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)
      MODEL_NAME="${2:?--model requires a value}"
      shift 2
      ;;
    --port)
      API_PORT="${2:?--port requires a value}"
      shift 2
      ;;
    --nthreads)
      NTHREADS="${2:?--nthreads requires a value}"
      shift 2
      ;;
    --max-seq-len)
      MAX_SEQ_LEN="${2:?--max-seq-len requires a value}"
      shift 2
      ;;
    --workers)
      # Everything after --workers is a worker address; keep each one as a
      # separate array element (WORKERS="$@" would flatten them into one word).
      shift
      WORKERS=("$@")
      break
      ;;
    --buffer-float-type)
      BUFFER_FLOAT_TYPE="${2:?--buffer-float-type requires a value}"
      shift 2
      ;;
    --download)
      MODEL_NAME="${2:?--download requires a model name}"
      echo "Downloading model: $MODEL_NAME"
      # exec propagates the downloader's exit status; the original
      # 'exit 0' masked download failures from the container runtime.
      exec /app/download-model.sh "$MODEL_NAME"
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      exit 1
      ;;
  esac
done

if [[ -z "$MODEL_NAME" ]]; then
  echo "Error: --model is required (unless using --download)"
  echo "Use --help for usage information"
  exit 1
fi

MODEL_PATH="/app/models/$MODEL_NAME/dllama_model_$MODEL_NAME.m"
TOKENIZER_PATH="/app/models/$MODEL_NAME/dllama_tokenizer_$MODEL_NAME.t"

if [[ ! -f "$MODEL_PATH" || ! -f "$TOKENIZER_PATH" ]]; then
  echo "Error: Model files not found for $MODEL_NAME"
  echo "Model path: $MODEL_PATH"
  echo "Tokenizer path: $TOKENIZER_PATH"
  echo ""
  echo "Please download the model first:"
  echo "docker run -v ./models:/app/models distributed-llama-controller --download $MODEL_NAME"
  exit 1
fi

# Build the command as an array so paths/values with spaces survive intact
# (string concatenation + unquoted 'exec $CMD' word-splits and globs).
CMD=(
  ./dllama-api
  --port "$API_PORT"
  --model "$MODEL_PATH"
  --tokenizer "$TOKENIZER_PATH"
  --buffer-float-type "$BUFFER_FLOAT_TYPE"
  --nthreads "$NTHREADS"
  --max-seq-len "$MAX_SEQ_LEN"
)

if (( ${#WORKERS[@]} > 0 )); then
  CMD+=(--workers "${WORKERS[@]}")
fi

echo "Starting API server with command:"
printf '%q ' "${CMD[@]}"      # shell-quoted, so the log line is re-runnable
printf '\n\n'
exec "${CMD[@]}"