new stuff
Some checks failed
main / Linux (amd64, ubuntu-22.04) (push) Successful in 24s
main / Linux (arm64, ubuntu-latest-arm64) (push) Successful in 1m9s
main / Windows (push) Has been cancelled

2025-10-24 12:10:58 +02:00
parent 42172cbb6f
commit c571a82356
7 changed files with 176 additions and 212 deletions

CI workflow

@@ -17,7 +17,7 @@ jobs:
         include:
           - runner: ubuntu-22.04
             arch: amd64
-          - runner: ubuntu-24.04-arm
+          - runner: ubuntu-latest-arm64
             arch: arm64
     steps:
       - name: Checkout Repo

Dockerfile.controller

@@ -23,136 +23,19 @@ COPY src/ ./src/
 COPY Makefile ./
 COPY launch.py ./
+# Copy scripts
+COPY scripts/download-model.sh /app/download-model.sh
+COPY scripts/entrypoint-controller.sh /app/entrypoint.sh
+# Make scripts executable
+RUN chmod +x /app/download-model.sh /app/entrypoint.sh
 # Build the applications
 RUN make dllama && make dllama-api
 # Create models directory for volume mount
 RUN mkdir -p /app/models
-# Create a script to download models
-COPY <<EOF /app/download-model.sh
-#!/bin/bash
-if [ -z "\$1" ]; then
-echo "Usage: download-model.sh <model_name>"
-echo "Available models:"
-python3 launch.py
-exit 1
-fi
-python3 launch.py "\$1" -skip-run -skip-script -y
-EOF
-RUN chmod +x /app/download-model.sh
-# Create entrypoint script
-COPY <<EOF /app/entrypoint.sh
-#!/bin/bash
-# Default values
-MODEL_NAME=""
-API_PORT=9999
-NTHREADS=4
-MAX_SEQ_LEN=4096
-WORKERS=""
-BUFFER_FLOAT_TYPE="q80"
-# Parse command line arguments
-while [[ \$# -gt 0 ]]; do
-case \$1 in
---model)
-MODEL_NAME="\$2"
-shift 2
-;;
---port)
-API_PORT="\$2"
-shift 2
-;;
---nthreads)
-NTHREADS="\$2"
-shift 2
-;;
---max-seq-len)
-MAX_SEQ_LEN="\$2"
-shift 2
-;;
---workers)
-shift
-WORKERS="\$@"
-break
-;;
---buffer-float-type)
-BUFFER_FLOAT_TYPE="\$2"
-shift 2
-;;
---download)
-MODEL_NAME="\$2"
-echo "Downloading model: \$MODEL_NAME"
-/app/download-model.sh "\$MODEL_NAME"
-exit 0
-;;
---help)
-echo "Usage: docker run distributed-llama-controller [OPTIONS]"
-echo ""
-echo "Options:"
-echo " --download <model> Download a model and exit"
-echo " --model <model> Model name to use"
-echo " --port <port> API server port (default: 9999)"
-echo " --nthreads <n> Number of threads (default: 4)"
-echo " --max-seq-len <n> Maximum sequence length (default: 4096)"
-echo " --buffer-float-type <type> Buffer float type (default: q80)"
-echo " --workers <workers> Space-separated list of worker addresses (e.g., 10.0.0.2:9999 10.0.0.3:9999)"
-echo ""
-echo "Examples:"
-echo " # Download a model"
-echo " docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
-echo ""
-echo " # Run API server with workers"
-echo " docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \\"
-echo " --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999"
-exit 0
-;;
-*)
-echo "Unknown option: \$1"
-exit 1
-;;
-esac
-done
-if [ -z "\$MODEL_NAME" ]; then
-echo "Error: --model is required"
-echo "Use --help for usage information"
-exit 1
-fi
-MODEL_PATH="/app/models/\$MODEL_NAME/dllama_model_\$MODEL_NAME.m"
-TOKENIZER_PATH="/app/models/\$MODEL_NAME/dllama_tokenizer_\$MODEL_NAME.t"
-if [ ! -f "\$MODEL_PATH" ] || [ ! -f "\$TOKENIZER_PATH" ]; then
-echo "Error: Model files not found for \$MODEL_NAME"
-echo "Model path: \$MODEL_PATH"
-echo "Tokenizer path: \$TOKENIZER_PATH"
-echo ""
-echo "Please download the model first:"
-echo "docker run -v ./models:/app/models distributed-llama-controller --download \$MODEL_NAME"
-exit 1
-fi
-# Build the command
-CMD="./dllama-api --port \$API_PORT --model \$MODEL_PATH --tokenizer \$TOKENIZER_PATH --buffer-float-type \$BUFFER_FLOAT_TYPE --nthreads \$NTHREADS --max-seq-len \$MAX_SEQ_LEN"
-if [ ! -z "\$WORKERS" ]; then
-CMD="\$CMD --workers \$WORKERS"
-fi
-echo "Starting API server with command:"
-echo "\$CMD"
-echo ""
-exec \$CMD
-EOF
-RUN chmod +x /app/entrypoint.sh
 # Expose the default API port
 EXPOSE 9999
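
The controller image now gets both helper scripts from scripts/ instead of inlined heredocs. A minimal local smoke test, assuming /app/entrypoint.sh is registered as the image's ENTRYPOINT in a part of the Dockerfile not shown in this hunk, and with an illustrative tag:

  # Build from the repo root
  docker build -f Dockerfile.controller -t dllama-controller .
  # Download a model first; the entrypoint exits if the model files are missing
  docker run --rm -v "$PWD/models":/app/models dllama-controller --download llama3_2_3b_instruct_q40
  # Then start the API server on port 9999
  docker run -p 9999:9999 -v "$PWD/models":/app/models dllama-controller --model llama3_2_3b_instruct_q40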

Dockerfile.worker

@@ -16,58 +16,15 @@ WORKDIR /app
 COPY src/ ./src/
 COPY Makefile ./
+# Copy worker entrypoint script
+COPY scripts/entrypoint-worker.sh /app/entrypoint.sh
+# Make script executable
+RUN chmod +x /app/entrypoint.sh
 # Build only the worker application
 RUN make dllama
-# Create entrypoint script
-COPY <<EOF /app/entrypoint.sh
-#!/bin/bash
-# Default values
-PORT=9999
-NTHREADS=4
-# Parse command line arguments
-while [[ \$# -gt 0 ]]; do
-case \$1 in
---port)
-PORT="\$2"
-shift 2
-;;
---nthreads)
-NTHREADS="\$2"
-shift 2
-;;
---help)
-echo "Usage: docker run distributed-llama-worker [OPTIONS]"
-echo ""
-echo "Options:"
-echo " --port <port> Worker port (default: 9999)"
-echo " --nthreads <n> Number of threads (default: 4)"
-echo ""
-echo "Example:"
-echo " docker run -p 9999:9999 distributed-llama-worker --port 9999 --nthreads 4"
-exit 0
-;;
-*)
-echo "Unknown option: \$1"
-exit 1
-;;
-esac
-done
-# Build the command
-CMD="./dllama worker --port \$PORT --nthreads \$NTHREADS"
-echo "Starting worker with command:"
-echo "\$CMD"
-echo ""
-exec \$CMD
-EOF
-RUN chmod +x /app/entrypoint.sh
 # Expose the default worker port
 EXPOSE 9999
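
The worker image follows the same pattern but only builds the dllama binary. A matching sketch, under the same ENTRYPOINT assumption and with an illustrative tag:

  docker build -f Dockerfile.worker -t dllama-worker .
  docker run -d -p 9999:9999 dllama-worker --port 9999 --nthreads 4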

docker-compose.yml

@@ -3,13 +3,11 @@ version: '3.8'
 services:
   # Controller service - downloads models and runs API
   controller:
-    build:
-      context: .
-      dockerfile: Dockerfile.controller
+    image: registry.haschek.at/dllama-controller:latest
     ports:
       - "9999:9999"
     volumes:
-      - ./models:/app/models
+      - /srv/nfs/swarm/dllama:/app/models
     networks:
       distributed-llama:
         ipv4_address: 172.20.0.10
@@ -32,9 +30,7 @@ services:
   # Worker services
   worker1:
-    build:
-      context: .
-      dockerfile: Dockerfile.worker
+    image: registry.haschek.at/dllama-worker:latest
     networks:
       distributed-llama:
         ipv4_address: 172.20.0.11
@@ -43,39 +39,12 @@ services:
     command: >
       --port 9999
       --nthreads ${WORKER_NTHREADS:-4}
-    deploy:
-      placement:
-        constraints:
-          - node.role == manager
-  worker2:
-    build:
-      context: .
-      dockerfile: Dockerfile.worker
-    networks:
-      distributed-llama:
-        ipv4_address: 172.20.0.12
-    environment:
-      - NTHREADS=${WORKER_NTHREADS:-4}
-    command: >
-      --port 9999
-      --nthreads ${WORKER_NTHREADS:-4}
-  worker3:
-    build:
-      context: .
-      dockerfile: Dockerfile.worker
-    networks:
-      distributed-llama:
-        ipv4_address: 172.20.0.13
-    environment:
-      - NTHREADS=${WORKER_NTHREADS:-4}
-    command: >
-      --port 9999
-      --nthreads ${WORKER_NTHREADS:-4}
 networks:
   distributed-llama:
     driver: bridge
     ipam:
       config:
         - subnet: 172.20.0.0/16
 volumes:
   models:
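
With the build: blocks swapped for registry images and the model directory moved to an NFS-backed host path, a node no longer builds anything locally. One plausible rollout, assuming the images were already pushed to registry.haschek.at (for example by the CI run above) and that /srv/nfs/swarm/dllama is mounted on the host:

  docker compose pull
  docker compose up -d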

scripts/download-model.sh

@@ -0,0 +1,9 @@
#!/bin/bash
if [ -z "$1" ]; then
  echo "Usage: download-model.sh <model_name>"
  echo "Available models:"
  python3 launch.py
  exit 1
fi
python3 launch.py "$1" -skip-run -skip-script -y

scripts/entrypoint-controller.sh

@@ -0,0 +1,103 @@
#!/bin/bash
# Default values
MODEL_NAME=""
API_PORT=9999
NTHREADS=4
MAX_SEQ_LEN=4096
WORKERS=""
BUFFER_FLOAT_TYPE="q80"

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case $1 in
    --model)
      MODEL_NAME="$2"
      shift 2
      ;;
    --port)
      API_PORT="$2"
      shift 2
      ;;
    --nthreads)
      NTHREADS="$2"
      shift 2
      ;;
    --max-seq-len)
      MAX_SEQ_LEN="$2"
      shift 2
      ;;
    --workers)
      shift
      WORKERS="$@"
      break
      ;;
    --buffer-float-type)
      BUFFER_FLOAT_TYPE="$2"
      shift 2
      ;;
    --download)
      MODEL_NAME="$2"
      echo "Downloading model: $MODEL_NAME"
      /app/download-model.sh "$MODEL_NAME"
      exit 0
      ;;
    --help)
      echo "Usage: docker run distributed-llama-controller [OPTIONS]"
      echo ""
      echo "Options:"
      echo "  --download <model>          Download a model and exit"
      echo "  --model <model>             Model name to use"
      echo "  --port <port>               API server port (default: 9999)"
      echo "  --nthreads <n>              Number of threads (default: 4)"
      echo "  --max-seq-len <n>           Maximum sequence length (default: 4096)"
      echo "  --buffer-float-type <type>  Buffer float type (default: q80)"
      echo "  --workers <workers>         Space-separated list of worker addresses"
      echo ""
      echo "Examples:"
      echo "  # Download a model"
      echo "  docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
      echo ""
      echo "  # Run API server with workers"
      echo "  docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \\"
      echo "    --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      exit 1
      ;;
  esac
done

if [ -z "$MODEL_NAME" ]; then
  echo "Error: --model is required (unless using --download)"
  echo "Use --help for usage information"
  exit 1
fi

MODEL_PATH="/app/models/$MODEL_NAME/dllama_model_$MODEL_NAME.m"
TOKENIZER_PATH="/app/models/$MODEL_NAME/dllama_tokenizer_$MODEL_NAME.t"

if [ ! -f "$MODEL_PATH" ] || [ ! -f "$TOKENIZER_PATH" ]; then
  echo "Error: Model files not found for $MODEL_NAME"
  echo "Model path: $MODEL_PATH"
  echo "Tokenizer path: $TOKENIZER_PATH"
  echo ""
  echo "Please download the model first:"
  echo "docker run -v ./models:/app/models distributed-llama-controller --download $MODEL_NAME"
  exit 1
fi

# Build the command
CMD="./dllama-api --port $API_PORT --model $MODEL_PATH --tokenizer $TOKENIZER_PATH --buffer-float-type $BUFFER_FLOAT_TYPE --nthreads $NTHREADS --max-seq-len $MAX_SEQ_LEN"
if [ ! -z "$WORKERS" ]; then
  CMD="$CMD --workers $WORKERS"
fi

echo "Starting API server with command:"
echo "$CMD"
echo ""
exec $CMD
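
One parsing quirk is worth calling out: the --workers branch shifts once, captures every remaining argument via "$@", and breaks out of the loop, so --workers only works as the last option. For example (addresses are placeholders):

  # OK: both addresses end up in WORKERS
  --model llama3_2_3b_instruct_q40 --port 8080 --workers 10.0.0.2:9999 10.0.0.3:9999
  # Broken: --port 8080 is captured as two extra worker entries
  --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 --port 8080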

scripts/entrypoint-worker.sh

@@ -0,0 +1,43 @@
#!/bin/bash
# Default values
PORT=9999
NTHREADS=4

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case $1 in
    --port)
      PORT="$2"
      shift 2
      ;;
    --nthreads)
      NTHREADS="$2"
      shift 2
      ;;
    --help)
      echo "Usage: docker run distributed-llama-worker [OPTIONS]"
      echo ""
      echo "Options:"
      echo "  --port <port>    Worker port (default: 9999)"
      echo "  --nthreads <n>   Number of threads (default: 4)"
      echo ""
      echo "Example:"
      echo "  docker run -p 9999:9999 distributed-llama-worker --port 9999 --nthreads 4"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      exit 1
      ;;
  esac
done

# Build the command
CMD="./dllama worker --port $PORT --nthreads $NTHREADS"
echo "Starting worker with command:"
echo "$CMD"
echo ""
exec $CMD