---
# Distributed Llama stack: one controller plus three workers attached to a
# dedicated network with static addresses, so the controller can reach the
# workers at the fixed IPs passed to its --workers flag.
#
# Tunables (read from the environment, with defaults):
#   MODEL_NAME           model passed to the controller (--model)
#   CONTROLLER_NTHREADS  controller thread count (--nthreads)
#   WORKER_NTHREADS      worker thread count (--nthreads)
#   MAX_SEQ_LEN          controller --max-seq-len
#   BUFFER_FLOAT_TYPE    controller --buffer-float-type
version: '3.8'

services:
  # Controller service - downloads models and runs API
  controller:
    image: registry.haschek.at/dllama-controller:latest
    ports:
      - "9999:9999"
    volumes:
      # Host path bind-mounted as the model directory inside the container.
      - /srv/nfs/swarm/dllama:/app/models
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.10
    environment:
      - MODEL_NAME=${MODEL_NAME:-llama3_2_3b_instruct_q40}
      - NTHREADS=${CONTROLLER_NTHREADS:-4}
      - MAX_SEQ_LEN=${MAX_SEQ_LEN:-4096}
      - BUFFER_FLOAT_TYPE=${BUFFER_FLOAT_TYPE:-q80}
    # Folded scalar: the lines below are joined into one argument string.
    # The --workers addresses must match the workers' ipv4_address values.
    command: >-
      --model ${MODEL_NAME:-llama3_2_3b_instruct_q40}
      --port 9999
      --nthreads ${CONTROLLER_NTHREADS:-4}
      --max-seq-len ${MAX_SEQ_LEN:-4096}
      --buffer-float-type ${BUFFER_FLOAT_TYPE:-q80}
      --workers 172.20.0.11:9999 172.20.0.12:9999 172.20.0.13:9999
    depends_on:
      - worker1
      - worker2
      - worker3

  # Worker services
  worker1:
    image: registry.haschek.at/dllama-worker:latest
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.11
    environment:
      - NTHREADS=${WORKER_NTHREADS:-4}
    command: >-
      --port 9999
      --nthreads ${WORKER_NTHREADS:-4}
    deploy:
      placement:
        constraints:
          - node.role == manager

  # worker2 and worker3 mirror worker1; only the static address differs.
  # NOTE(review): the placement constraint is copied from worker1 - confirm
  # whether every worker is really meant to be pinned to a manager node.
  worker2:
    image: registry.haschek.at/dllama-worker:latest
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.12
    environment:
      - NTHREADS=${WORKER_NTHREADS:-4}
    command: >-
      --port 9999
      --nthreads ${WORKER_NTHREADS:-4}
    deploy:
      placement:
        constraints:
          - node.role == manager

  worker3:
    image: registry.haschek.at/dllama-worker:latest
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.13
    environment:
      - NTHREADS=${WORKER_NTHREADS:-4}
    command: >-
      --port 9999
      --nthreads ${WORKER_NTHREADS:-4}
    deploy:
      placement:
        constraints:
          - node.role == manager

networks:
  # Static ipv4_address assignments require a subnet that covers them.
  distributed-llama:
    ipam:
      config:
        - subnet: 172.20.0.0/24

volumes:
  # Named volume kept for compatibility; no service in this file mounts it
  # (the controller uses a host bind mount instead).
  models: {}