new stuff
.github/workflows/main.yml (vendored, 2 changed lines)
@@ -17,7 +17,7 @@ jobs:
         include:
           - runner: ubuntu-22.04
             arch: amd64
-          - runner: ubuntu-24.04-arm
+          - runner: ubuntu-latest-arm64
            arch: arm64
     steps:
       - name: Checkout Repo
Dockerfile.controller
@@ -23,136 +23,19 @@ COPY src/ ./src/
 COPY Makefile ./
 COPY launch.py ./
 
+# Copy scripts
+COPY scripts/download-model.sh /app/download-model.sh
+COPY scripts/entrypoint-controller.sh /app/entrypoint.sh
+
+# Make scripts executable
+RUN chmod +x /app/download-model.sh /app/entrypoint.sh
+
 # Build the applications
 RUN make dllama && make dllama-api
 
 # Create models directory for volume mount
 RUN mkdir -p /app/models
 
-# Create a script to download models
-COPY <<EOF /app/download-model.sh
-#!/bin/bash
-if [ -z "\$1" ]; then
-    echo "Usage: download-model.sh <model_name>"
-    echo "Available models:"
-    python3 launch.py
-    exit 1
-fi
-
-python3 launch.py "\$1" -skip-run -skip-script -y
-EOF
-
-RUN chmod +x /app/download-model.sh
-
-# Create entrypoint script
-COPY <<EOF /app/entrypoint.sh
-#!/bin/bash
-
-# Default values
-MODEL_NAME=""
-API_PORT=9999
-NTHREADS=4
-MAX_SEQ_LEN=4096
-WORKERS=""
-BUFFER_FLOAT_TYPE="q80"
-
-# Parse command line arguments
-while [[ \$# -gt 0 ]]; do
-    case \$1 in
-        --model)
-            MODEL_NAME="\$2"
-            shift 2
-            ;;
-        --port)
-            API_PORT="\$2"
-            shift 2
-            ;;
-        --nthreads)
-            NTHREADS="\$2"
-            shift 2
-            ;;
-        --max-seq-len)
-            MAX_SEQ_LEN="\$2"
-            shift 2
-            ;;
-        --workers)
-            shift
-            WORKERS="\$@"
-            break
-            ;;
-        --buffer-float-type)
-            BUFFER_FLOAT_TYPE="\$2"
-            shift 2
-            ;;
-        --download)
-            MODEL_NAME="\$2"
-            echo "Downloading model: \$MODEL_NAME"
-            /app/download-model.sh "\$MODEL_NAME"
-            exit 0
-            ;;
-        --help)
-            echo "Usage: docker run distributed-llama-controller [OPTIONS]"
-            echo ""
-            echo "Options:"
-            echo "  --download <model>           Download a model and exit"
-            echo "  --model <model>              Model name to use"
-            echo "  --port <port>                API server port (default: 9999)"
-            echo "  --nthreads <n>               Number of threads (default: 4)"
-            echo "  --max-seq-len <n>            Maximum sequence length (default: 4096)"
-            echo "  --buffer-float-type <type>   Buffer float type (default: q80)"
-            echo "  --workers <workers>          Space-separated list of worker addresses (e.g., 10.0.0.2:9999 10.0.0.3:9999)"
-            echo ""
-            echo "Examples:"
-            echo "  # Download a model"
-            echo "  docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
-            echo ""
-            echo "  # Run API server with workers"
-            echo "  docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \\"
-            echo "    --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999"
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: \$1"
-            exit 1
-            ;;
-    esac
-done
-
-if [ -z "\$MODEL_NAME" ]; then
-    echo "Error: --model is required"
-    echo "Use --help for usage information"
-    exit 1
-fi
-
-MODEL_PATH="/app/models/\$MODEL_NAME/dllama_model_\$MODEL_NAME.m"
-TOKENIZER_PATH="/app/models/\$MODEL_NAME/dllama_tokenizer_\$MODEL_NAME.t"
-
-if [ ! -f "\$MODEL_PATH" ] || [ ! -f "\$TOKENIZER_PATH" ]; then
-    echo "Error: Model files not found for \$MODEL_NAME"
-    echo "Model path: \$MODEL_PATH"
-    echo "Tokenizer path: \$TOKENIZER_PATH"
-    echo ""
-    echo "Please download the model first:"
-    echo "docker run -v ./models:/app/models distributed-llama-controller --download \$MODEL_NAME"
-    exit 1
-fi
-
-# Build the command
-CMD="./dllama-api --port \$API_PORT --model \$MODEL_PATH --tokenizer \$TOKENIZER_PATH --buffer-float-type \$BUFFER_FLOAT_TYPE --nthreads \$NTHREADS --max-seq-len \$MAX_SEQ_LEN"
-
-if [ ! -z "\$WORKERS" ]; then
-    CMD="\$CMD --workers \$WORKERS"
-fi
-
-echo "Starting API server with command:"
-echo "\$CMD"
-echo ""
-
-exec \$CMD
-EOF
-
-RUN chmod +x /app/entrypoint.sh
-
 # Expose the default API port
 EXPOSE 9999
 
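The controller image now copies its helper scripts from scripts/ instead of generating them with Dockerfile heredocs. A minimal sketch of producing the image that docker-compose.yml later pulls, assuming it is built from the repository root and pushed by hand (the commit does not show how the registry is actually populated):

    docker build -f Dockerfile.controller -t registry.haschek.at/dllama-controller:latest .
    docker push registry.haschek.at/dllama-controller:latest
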
Dockerfile.worker
@@ -16,58 +16,15 @@ WORKDIR /app
 COPY src/ ./src/
 COPY Makefile ./
 
+# Copy worker entrypoint script
+COPY scripts/entrypoint-worker.sh /app/entrypoint.sh
+
+# Make script executable
+RUN chmod +x /app/entrypoint.sh
+
 # Build only the worker application
 RUN make dllama
 
-# Create entrypoint script
-COPY <<EOF /app/entrypoint.sh
-#!/bin/bash
-
-# Default values
-PORT=9999
-NTHREADS=4
-
-# Parse command line arguments
-while [[ \$# -gt 0 ]]; do
-    case \$1 in
-        --port)
-            PORT="\$2"
-            shift 2
-            ;;
-        --nthreads)
-            NTHREADS="\$2"
-            shift 2
-            ;;
-        --help)
-            echo "Usage: docker run distributed-llama-worker [OPTIONS]"
-            echo ""
-            echo "Options:"
-            echo "  --port <port>       Worker port (default: 9999)"
-            echo "  --nthreads <n>      Number of threads (default: 4)"
-            echo ""
-            echo "Example:"
-            echo "  docker run -p 9999:9999 distributed-llama-worker --port 9999 --nthreads 4"
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: \$1"
-            exit 1
-            ;;
-    esac
-done
-
-# Build the command
-CMD="./dllama worker --port \$PORT --nthreads \$NTHREADS"
-
-echo "Starting worker with command:"
-echo "\$CMD"
-echo ""
-
-exec \$CMD
-EOF
-
-RUN chmod +x /app/entrypoint.sh
-
 # Expose the default worker port
 EXPOSE 9999
 
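The worker image follows the same pattern with a single entrypoint script. A minimal sketch of building it and sanity-checking the entrypoint, assuming the same manual build-and-push flow as above and that the image's entrypoint is the copied script:

    docker build -f Dockerfile.worker -t registry.haschek.at/dllama-worker:latest .
    docker push registry.haschek.at/dllama-worker:latest
    # Print the entrypoint's usage text without starting a worker
    docker run --rm registry.haschek.at/dllama-worker:latest --help
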
docker-compose.yml
@@ -3,13 +3,11 @@ version: '3.8'
 services:
   # Controller service - downloads models and runs API
   controller:
-    build:
-      context: .
-      dockerfile: Dockerfile.controller
+    image: registry.haschek.at/dllama-controller:latest
     ports:
       - "9999:9999"
     volumes:
-      - ./models:/app/models
+      - /srv/nfs/swarm/dllama:/app/models
     networks:
       distributed-llama:
         ipv4_address: 172.20.0.10
@@ -32,9 +30,7 @@ services:
 
   # Worker services
   worker1:
-    build:
-      context: .
-      dockerfile: Dockerfile.worker
+    image: registry.haschek.at/dllama-worker:latest
     networks:
       distributed-llama:
         ipv4_address: 172.20.0.11
@@ -43,39 +39,12 @@ services:
     command: >
       --port 9999
       --nthreads ${WORKER_NTHREADS:-4}
+    deploy:
+      placement:
+        constraints:
+          - node.role == manager
 
-  worker2:
-    build:
-      context: .
-      dockerfile: Dockerfile.worker
-    networks:
-      distributed-llama:
-        ipv4_address: 172.20.0.12
-    environment:
-      - NTHREADS=${WORKER_NTHREADS:-4}
-    command: >
-      --port 9999
-      --nthreads ${WORKER_NTHREADS:-4}
-
-  worker3:
-    build:
-      context: .
-      dockerfile: Dockerfile.worker
-    networks:
-      distributed-llama:
-        ipv4_address: 172.20.0.13
-    environment:
-      - NTHREADS=${WORKER_NTHREADS:-4}
-    command: >
-      --port 9999
-      --nthreads ${WORKER_NTHREADS:-4}
-
-networks:
-  distributed-llama:
-    driver: bridge
-    ipam:
-      config:
-        - subnet: 172.20.0.0/16
-
 volumes:
   models:
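The compose file now pulls prebuilt images, mounts models from an NFS path, and pins worker1 to a manager node; the deploy.placement constraint only takes effect under Docker Swarm. A minimal deployment sketch, assuming a Swarm whose nodes can reach registry.haschek.at and mount /srv/nfs/swarm/dllama; the stack name "dllama" is illustrative, not taken from the commit:

    docker stack deploy -c docker-compose.yml dllama
    docker service ls                      # expect <stack>_controller and <stack>_worker1
    docker service logs dllama_controller  # service names are derived from the stack name
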
scripts/download-model.sh (new file, 9 lines)
@@ -0,0 +1,9 @@
+#!/bin/bash
+if [ -z "$1" ]; then
+    echo "Usage: download-model.sh <model_name>"
+    echo "Available models:"
+    python3 launch.py
+    exit 1
+fi
+
+python3 launch.py "$1" -skip-run -skip-script -y
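Because the controller entrypoint exits after --download, this script can be used to pre-seed the shared models directory before the stack starts. A sketch based on the entrypoint's own help example, with the volume swapped for the NFS path used in docker-compose.yml:

    docker run --rm -v /srv/nfs/swarm/dllama:/app/models \
        registry.haschek.at/dllama-controller:latest \
        --download llama3_2_3b_instruct_q40
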
scripts/entrypoint-controller.sh (new file, 103 lines)
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+# Default values
+MODEL_NAME=""
+API_PORT=9999
+NTHREADS=4
+MAX_SEQ_LEN=4096
+WORKERS=""
+BUFFER_FLOAT_TYPE="q80"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL_NAME="$2"
+            shift 2
+            ;;
+        --port)
+            API_PORT="$2"
+            shift 2
+            ;;
+        --nthreads)
+            NTHREADS="$2"
+            shift 2
+            ;;
+        --max-seq-len)
+            MAX_SEQ_LEN="$2"
+            shift 2
+            ;;
+        --workers)
+            shift
+            WORKERS="$@"
+            break
+            ;;
+        --buffer-float-type)
+            BUFFER_FLOAT_TYPE="$2"
+            shift 2
+            ;;
+        --download)
+            MODEL_NAME="$2"
+            echo "Downloading model: $MODEL_NAME"
+            /app/download-model.sh "$MODEL_NAME"
+            exit 0
+            ;;
+        --help)
+            echo "Usage: docker run distributed-llama-controller [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  --download <model>           Download a model and exit"
+            echo "  --model <model>              Model name to use"
+            echo "  --port <port>                API server port (default: 9999)"
+            echo "  --nthreads <n>               Number of threads (default: 4)"
+            echo "  --max-seq-len <n>            Maximum sequence length (default: 4096)"
+            echo "  --buffer-float-type <type>   Buffer float type (default: q80)"
+            echo "  --workers <workers>          Space-separated list of worker addresses"
+            echo ""
+            echo "Examples:"
+            echo "  # Download a model"
+            echo "  docker run -v ./models:/app/models distributed-llama-controller --download llama3_2_3b_instruct_q40"
+            echo ""
+            echo "  # Run API server with workers"
+            echo "  docker run -p 9999:9999 -v ./models:/app/models distributed-llama-controller \\"
+            echo "    --model llama3_2_3b_instruct_q40 --workers 10.0.0.2:9999 10.0.0.3:9999"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+if [ -z "$MODEL_NAME" ]; then
+    echo "Error: --model is required (unless using --download)"
+    echo "Use --help for usage information"
+    exit 1
+fi
+
+MODEL_PATH="/app/models/$MODEL_NAME/dllama_model_$MODEL_NAME.m"
+TOKENIZER_PATH="/app/models/$MODEL_NAME/dllama_tokenizer_$MODEL_NAME.t"
+
+if [ ! -f "$MODEL_PATH" ] || [ ! -f "$TOKENIZER_PATH" ]; then
+    echo "Error: Model files not found for $MODEL_NAME"
+    echo "Model path: $MODEL_PATH"
+    echo "Tokenizer path: $TOKENIZER_PATH"
+    echo ""
+    echo "Please download the model first:"
+    echo "docker run -v ./models:/app/models distributed-llama-controller --download $MODEL_NAME"
+    exit 1
+fi
+
+# Build the command
+CMD="./dllama-api --port $API_PORT --model $MODEL_PATH --tokenizer $TOKENIZER_PATH --buffer-float-type $BUFFER_FLOAT_TYPE --nthreads $NTHREADS --max-seq-len $MAX_SEQ_LEN"
+
+if [ ! -z "$WORKERS" ]; then
+    CMD="$CMD --workers $WORKERS"
+fi
+
+echo "Starting API server with command:"
+echo "$CMD"
+echo ""
+
+exec $CMD
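Outside the stack, the controller can also be started directly. This sketch mirrors the script's own --help example, with the registry image and NFS mount from docker-compose.yml substituted in; the worker addresses are the placeholder values from the help text, not real hosts:

    docker run -p 9999:9999 -v /srv/nfs/swarm/dllama:/app/models \
        registry.haschek.at/dllama-controller:latest \
        --model llama3_2_3b_instruct_q40 \
        --workers 10.0.0.2:9999 10.0.0.3:9999
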
scripts/entrypoint-worker.sh (new file, 43 lines)
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Default values
+PORT=9999
+NTHREADS=4
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        --nthreads)
+            NTHREADS="$2"
+            shift 2
+            ;;
+        --help)
+            echo "Usage: docker run distributed-llama-worker [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  --port <port>       Worker port (default: 9999)"
+            echo "  --nthreads <n>      Number of threads (default: 4)"
+            echo ""
+            echo "Example:"
+            echo "  docker run -p 9999:9999 distributed-llama-worker --port 9999 --nthreads 4"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Build the command
+CMD="./dllama worker --port $PORT --nthreads $NTHREADS"
+
+echo "Starting worker with command:"
+echo "$CMD"
+echo ""
+
+exec $CMD
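A standalone worker can be started the same way. This sketch mirrors the script's --help example, using the registry image referenced in docker-compose.yml:

    docker run -p 9999:9999 registry.haschek.at/dllama-worker:latest --port 9999 --nthreads 4
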