diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f0e32c1..db2e83d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
         include:
           - runner: ubuntu-22.04
             arch: amd64
-          - runner: ubuntu-24.04-arm
+          - runner: ubuntu-latest-arm64
             arch: arm64
     steps:
       - name: Checkout Repo
diff --git a/Dockerfile.controller b/Dockerfile.controller
index 070366a..92bbac1 100644
--- a/Dockerfile.controller
+++ b/Dockerfile.controller
@@ -23,136 +23,19 @@
 COPY src/ ./src/
 COPY Makefile ./
 COPY launch.py ./
 
+# Copy scripts
+COPY scripts/download-model.sh /app/download-model.sh
+COPY scripts/entrypoint-controller.sh /app/entrypoint.sh
+
+# Make scripts executable
+RUN chmod +x /app/download-model.sh /app/entrypoint.sh
+
 # Build the applications
 RUN make dllama && make dllama-api
 
 # Create models directory for volume mount
 RUN mkdir -p /app/models
 
-# Create a script to download models
-COPY <<EOF /app/download-model.sh
-#!/bin/bash
-if [ -z "\$1" ]; then
-    echo "Usage: download-model.sh <model-name>"
-    echo "Available models:"
-    python3 launch.py
-    exit 1
-fi
-
-python3 launch.py "\$1" -skip-run -skip-script -y
-EOF
-
-RUN chmod +x /app/download-model.sh
-
-# Create entrypoint script
-COPY <<EOF /app/entrypoint.sh
-#!/bin/bash
-
-# Default values
-MODEL_NAME=""
-API_PORT=9999
-NTHREADS=4
-MAX_SEQ_LEN=4096
-WORKERS=""
-BUFFER_FLOAT_TYPE="q80"
-
-# Parse command line arguments
-while [[ \$# -gt 0 ]]; do
-    case \$1 in
-        --model)
-            MODEL_NAME="\$2"
-            shift 2
-            ;;
-        --port)
-            API_PORT="\$2"
-            shift 2
-            ;;
-        --nthreads)
-            NTHREADS="\$2"
-            shift 2
-            ;;
-        --max-seq-len)
-            MAX_SEQ_LEN="\$2"
-            shift 2
-            ;;
-        --workers)
-            shift
-            WORKERS="\$@"
-            break
-            ;;
-        --buffer-float-type)
-            BUFFER_FLOAT_TYPE="\$2"
-            shift 2
-            ;;
-        --download)
-            MODEL_NAME="\$2"
-            echo "Downloading model: \$MODEL_NAME"
-            /app/download-model.sh "\$MODEL_NAME"
-            exit 0
-            ;;
-        --help)
-            echo "Usage: docker run distributed-llama-controller [OPTIONS]"
-            echo ""
-            echo "Options:"
-            echo "  --download <model>          Download a model and exit"
-            echo "  --model <model>             Model name to use"
-            echo "  --port <port>               API server port (default: 9999)"
-            echo "  --nthreads <n>              Number of threads (default: 4)"
-            echo "  --max-seq-len <n>           Maximum sequence length (default: 4096)"
-            echo "  --buffer-float-type <type>  Buffer float type (default: q80)"
-            echo "  --workers <list>            Space-separated list of worker addresses (e.g., 10.0.0.2:9999 10.0.0.3:9999)"
-            echo ""
-            echo "Examples:"
-            echo "  # Download a model"
-            echo "  docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
-            echo ""
-            echo "  # Run API server with workers"
-            echo "  docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \\"
-            echo "    --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999"
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: \$1"
-            exit 1
-            ;;
-    esac
-done
-
-if [ -z "\$MODEL_NAME" ]; then
-    echo "Error: --model is required"
-    echo "Use --help for usage information"
-    exit 1
-fi
-
-MODEL_PATH="/app/models/\$MODEL_NAME/dllama_model_\$MODEL_NAME.m"
-TOKENIZER_PATH="/app/models/\$MODEL_NAME/dllama_tokenizer_\$MODEL_NAME.t"
-
-if [ ! -f "\$MODEL_PATH" ] || [ ! -f "\$TOKENIZER_PATH" ]; then
-    echo "Error: Model files not found for \$MODEL_NAME"
-    echo "Model path: \$MODEL_PATH"
-    echo "Tokenizer path: \$TOKENIZER_PATH"
-    echo ""
-    echo "Please download the model first:"
-    echo "docker run -v ./models:/app/models distributed-llama-controller --download \$MODEL_NAME"
-    exit 1
-fi
-
-# Build the command
-CMD="./dllama-api --port \$API_PORT --model \$MODEL_PATH --tokenizer \$TOKENIZER_PATH --buffer-float-type \$BUFFER_FLOAT_TYPE --nthreads \$NTHREADS --max-seq-len \$MAX_SEQ_LEN"
-
-if [ ! -z "\$WORKERS" ]; then
-z "\$WORKERS" ]; then - CMD="\$CMD --workers \$WORKERS" -fi - -echo "Starting API server with command:" -echo "\$CMD" -echo "" - -exec \$CMD -EOF - -RUN chmod +x /app/entrypoint.sh - # Expose the default API port EXPOSE 9999 diff --git a/Dockerfile.worker b/Dockerfile.worker index 782e76e..0965806 100644 --- a/Dockerfile.worker +++ b/Dockerfile.worker @@ -16,58 +16,15 @@ WORKDIR /app COPY src/ ./src/ COPY Makefile ./ +# Copy worker entrypoint script +COPY scripts/entrypoint-worker.sh /app/entrypoint.sh + +# Make script executable +RUN chmod +x /app/entrypoint.sh + # Build only the worker application RUN make dllama -# Create entrypoint script -COPY < Worker port (default: 9999)" - echo " --nthreads Number of threads (default: 4)" - echo "" - echo "Example:" - echo " docker run -p 9999:9999 distributed-llama-worker --port 9999 --nthreads 4" - exit 0 - ;; - *) - echo "Unknown option: \$1" - exit 1 - ;; - esac -done - -# Build the command -CMD="./dllama worker --port \$PORT --nthreads \$NTHREADS" - -echo "Starting worker with command:" -echo "\$CMD" -echo "" - -exec \$CMD -EOF - -RUN chmod +x /app/entrypoint.sh - # Expose the default worker port EXPOSE 9999 diff --git a/docker-compose.yml b/docker-compose.yml index 77210f9..3ac5ed8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,13 +3,11 @@ version: '3.8' services: # Controller service - downloads models and runs API controller: - build: - context: . - dockerfile: Dockerfile.controller + image: registry.haschek.at/dllama-controller:latest ports: - "9999:9999" volumes: - - ./models:/app/models + - /srv/nfs/swarm/dllama:/app/models networks: distributed-llama: ipv4_address: 172.20.0.10 @@ -32,9 +30,7 @@ services: # Worker services worker1: - build: - context: . - dockerfile: Dockerfile.worker + image: registry.haschek.at/dllama-worker:latest networks: distributed-llama: ipv4_address: 172.20.0.11 @@ -43,39 +39,12 @@ services: command: > --port 9999 --nthreads ${WORKER_NTHREADS:-4} + deploy: + placement: + constraints: + - node.role == manager - worker2: - build: - context: . - dockerfile: Dockerfile.worker - networks: - distributed-llama: - ipv4_address: 172.20.0.12 - environment: - - NTHREADS=${WORKER_NTHREADS:-4} - command: > - --port 9999 - --nthreads ${WORKER_NTHREADS:-4} - worker3: - build: - context: . 
-      dockerfile: Dockerfile.worker
-    networks:
-      distributed-llama:
-        ipv4_address: 172.20.0.13
-    environment:
-      - NTHREADS=${WORKER_NTHREADS:-4}
-    command: >
-      --port 9999
-      --nthreads ${WORKER_NTHREADS:-4}
-
-networks:
-  distributed-llama:
-    driver: bridge
-    ipam:
-      config:
-        - subnet: 172.20.0.0/16
 
 volumes:
   models:
\ No newline at end of file
diff --git a/scripts/download-model.sh b/scripts/download-model.sh
new file mode 100644
index 0000000..53064e4
--- /dev/null
+++ b/scripts/download-model.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+if [ -z "$1" ]; then
+    echo "Usage: download-model.sh <model-name>"
+    echo "Available models:"
+    python3 launch.py
+    exit 1
+fi
+
+python3 launch.py "$1" -skip-run -skip-script -y
\ No newline at end of file
diff --git a/scripts/entrypoint-controller.sh b/scripts/entrypoint-controller.sh
new file mode 100644
index 0000000..6b79a24
--- /dev/null
+++ b/scripts/entrypoint-controller.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+# Default values
+MODEL_NAME=""
+API_PORT=9999
+NTHREADS=4
+MAX_SEQ_LEN=4096
+WORKERS=""
+BUFFER_FLOAT_TYPE="q80"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL_NAME="$2"
+            shift 2
+            ;;
+        --port)
+            API_PORT="$2"
+            shift 2
+            ;;
+        --nthreads)
+            NTHREADS="$2"
+            shift 2
+            ;;
+        --max-seq-len)
+            MAX_SEQ_LEN="$2"
+            shift 2
+            ;;
+        --workers)
+            shift
+            WORKERS="$@"
+            break
+            ;;
+        --buffer-float-type)
+            BUFFER_FLOAT_TYPE="$2"
+            shift 2
+            ;;
+        --download)
+            MODEL_NAME="$2"
+            echo "Downloading model: $MODEL_NAME"
+            /app/download-model.sh "$MODEL_NAME"
+            exit 0
+            ;;
+        --help)
+            echo "Usage: docker run distributed-llama-controller [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  --download <model>          Download a model and exit"
+            echo "  --model <model>             Model name to use"
+            echo "  --port <port>               API server port (default: 9999)"
+            echo "  --nthreads <n>              Number of threads (default: 4)"
+            echo "  --max-seq-len <n>           Maximum sequence length (default: 4096)"
+            echo "  --buffer-float-type <type>  Buffer float type (default: q80)"
+            echo "  --workers <list>            Space-separated list of worker addresses"
+            echo ""
+            echo "Examples:"
+            echo "  # Download a model"
+            echo "  docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
+            echo ""
+            echo "  # Run API server with workers"
+            echo "  docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \\"
+            echo "    --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+if [ -z "$MODEL_NAME" ]; then
+    echo "Error: --model is required (unless using --download)"
+    echo "Use --help for usage information"
+    exit 1
+fi
+
+MODEL_PATH="/app/models/$MODEL_NAME/dllama_model_$MODEL_NAME.m"
+TOKENIZER_PATH="/app/models/$MODEL_NAME/dllama_tokenizer_$MODEL_NAME.t"
+
+if [ ! -f "$MODEL_PATH" ] || [ ! -f "$TOKENIZER_PATH" ]; then
+    echo "Error: Model files not found for $MODEL_NAME"
+    echo "Model path: $MODEL_PATH"
+    echo "Tokenizer path: $TOKENIZER_PATH"
+    echo ""
+    echo "Please download the model first:"
+    echo "docker run -v ./models:/app/models distributed-llama-controller --download $MODEL_NAME"
+    exit 1
+fi
+
+# Build the command
+CMD="./dllama-api --port $API_PORT --model $MODEL_PATH --tokenizer $TOKENIZER_PATH --buffer-float-type $BUFFER_FLOAT_TYPE --nthreads $NTHREADS --max-seq-len $MAX_SEQ_LEN"
+
+if [ ! -z "$WORKERS" ]; then
-z "$WORKERS" ]; then + CMD="$CMD --workers $WORKERS" +fi + +echo "Starting API server with command:" +echo "$CMD" +echo "" + +exec $CMD \ No newline at end of file diff --git a/scripts/entrypoint-worker.sh b/scripts/entrypoint-worker.sh new file mode 100644 index 0000000..61a3470 --- /dev/null +++ b/scripts/entrypoint-worker.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# Default values +PORT=9999 +NTHREADS=4 + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --port) + PORT="$2" + shift 2 + ;; + --nthreads) + NTHREADS="$2" + shift 2 + ;; + --help) + echo "Usage: docker run distributed-llama-worker [OPTIONS]" + echo "" + echo "Options:" + echo " --port Worker port (default: 9999)" + echo " --nthreads Number of threads (default: 4)" + echo "" + echo "Example:" + echo " docker run -p 9999:9999 distributed-llama-worker --port 9999 --nthreads 4" + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Build the command +CMD="./dllama worker --port $PORT --nthreads $NTHREADS" + +echo "Starting worker with command:" +echo "$CMD" +echo "" + +exec $CMD \ No newline at end of file