# syntax=docker/dockerfile:1
# Dockerfile for Distributed Llama Controller (Raspberry Pi)
# This variant can download models and start the API server.
# Note: the COPY <<EOF heredocs below require BuildKit (Dockerfile frontend 1.4+).
FROM arm64v8/debian:bookworm-slim

# Install build and runtime dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    g++ \
    make \
    git \
    python3 \
    python3-pip \
    curl \
    wget \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy source code
COPY src/ ./src/
COPY Makefile ./
COPY launch.py ./

# Build the applications
RUN make dllama && make dllama-api

# Create models directory for volume mount
RUN mkdir -p /app/models

# Create a script to download models
COPY <<EOF /app/download-model.sh
#!/bin/bash
if [ -z "\$1" ]; then
    echo "Usage: /app/download-model.sh <model-name>"
    echo "Available models:"
    python3 launch.py
    exit 1
fi
python3 launch.py "\$1" -skip-run -skip-script -y
EOF
RUN chmod +x /app/download-model.sh

# Create entrypoint script
COPY <<EOF /app/entrypoint.sh
#!/bin/bash

# Defaults (overridable via the flags parsed below)
MODEL_NAME=""
API_PORT=9999
NTHREADS=4
MAX_SEQ_LEN=4096
BUFFER_FLOAT_TYPE=q80
WORKERS=""

while [ \$# -gt 0 ]; do
    case "\$1" in
        --download)
            # Delegate to the download script and exit with its status
            exec /app/download-model.sh "\$2"
            ;;
        --model)
            MODEL_NAME="\$2"
            shift 2
            ;;
        --port)
            API_PORT="\$2"
            shift 2
            ;;
        --nthreads)
            NTHREADS="\$2"
            shift 2
            ;;
        --max-seq-len)
            MAX_SEQ_LEN="\$2"
            shift 2
            ;;
        --buffer-float-type)
            BUFFER_FLOAT_TYPE="\$2"
            shift 2
            ;;
        --workers)
            # Collect addresses until the next --flag
            shift
            while [ \$# -gt 0 ]; do
                case "\$1" in
                    --*) break ;;
                esac
                WORKERS="\$WORKERS \$1"
                shift
            done
            ;;
        --help)
            echo "Usage: docker run distributed-llama-controller [OPTIONS]"
            echo ""
            echo "Options:"
            echo "  --download <model-name>  Download a model and exit"
            echo "  --model                  Model name to use"
            echo "  --port                   API server port (default: 9999)"
            echo "  --nthreads               Number of threads (default: 4)"
            echo "  --max-seq-len            Maximum sequence length (default: 4096)"
            echo "  --buffer-float-type      Buffer float type (default: q80)"
            echo "  --workers                Space-separated list of worker addresses (e.g., 10.0.0.2:9999 10.0.0.3:9999)"
            echo ""
            echo "Examples:"
            echo "  # Download a model"
            echo "  docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
            echo ""
            echo "  # Run API server with workers"
            echo '  docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \'
            echo '    --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999'
            exit 0
            ;;
        *)
            echo "Unknown option: \$1"
            exit 1
            ;;
    esac
done

if [ -z "\$MODEL_NAME" ]; then
    echo "Error: --model is required"
    echo "Use --help for usage information"
    exit 1
fi

MODEL_PATH="/app/models/\$MODEL_NAME/dllama_model_\$MODEL_NAME.m"
TOKENIZER_PATH="/app/models/\$MODEL_NAME/dllama_tokenizer_\$MODEL_NAME.t"

if [ ! -f "\$MODEL_PATH" ] || [ ! -f "\$TOKENIZER_PATH" ]; then
    echo "Error: Model files not found for \$MODEL_NAME"
    echo "Model path: \$MODEL_PATH"
    echo "Tokenizer path: \$TOKENIZER_PATH"
    echo ""
    echo "Please download the model first:"
    echo "docker run -v ./models:/app/models distributed-llama-controller --download \$MODEL_NAME"
    exit 1
fi

# Build the command
CMD="./dllama-api --port \$API_PORT --model \$MODEL_PATH --tokenizer \$TOKENIZER_PATH --buffer-float-type \$BUFFER_FLOAT_TYPE --nthreads \$NTHREADS --max-seq-len \$MAX_SEQ_LEN"

if [ ! -z "\$WORKERS" ]; then
    CMD="\$CMD --workers \$WORKERS"
fi

echo "Starting API server with command:"
echo "\$CMD"
echo ""
exec \$CMD
EOF
RUN chmod +x /app/entrypoint.sh

# Expose the default API port
EXPOSE 9999

# Use the entrypoint script
ENTRYPOINT ["/app/entrypoint.sh"]
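
# Example build and usage (a sketch, not part of the image build; the image
# tag "distributed-llama-controller" is an assumed name taken from the --help
# text above, as is the example model name):
#
#   docker build -t distributed-llama-controller .
#
#   # Download a model into a host-mounted volume
#   docker run -v ./models:/app/models distributed-llama-controller \
#       --download llama3_2_3b_instruct_q40
#
#   # Start the API server, pointing at two workers
#   docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \
#       --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999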