---
# Docker Compose stack: one controller and three workers attached to a
# private bridge network with statically assigned IPv4 addresses.
#
# NOTE(review): recent Docker Compose releases treat the top-level `version`
# key as informational only; it is kept for older tooling - confirm whether
# it can be dropped.
version: '3.8'

services:
  # Controller service - downloads models and runs API
  controller:
    build:
      context: .
      dockerfile: Dockerfile.controller
    ports:
      - "9999:9999"
    volumes:
      # Bind mount of the host ./models directory into the container.
      - ./models:/app/models
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.10
    # These variables carry the same values (and defaults) as the flags
    # passed in `command` below.
    # NOTE(review): whether the image itself reads them is not visible
    # here - confirm against Dockerfile.controller.
    environment:
      - MODEL_NAME=${MODEL_NAME:-llama3_2_3b_instruct_q40}
      - NTHREADS=${CONTROLLER_NTHREADS:-4}
      - MAX_SEQ_LEN=${MAX_SEQ_LEN:-4096}
      - BUFFER_FLOAT_TYPE=${BUFFER_FLOAT_TYPE:-q80}
    # Folded scalar (`>-`): the lines are joined with single spaces into one
    # argument string with no trailing newline. The command consists only of
    # flags, so the executable presumably comes from the image ENTRYPOINT.
    # The --workers addresses must match the ipv4_address of worker1..3.
    command: >-
      --model ${MODEL_NAME:-llama3_2_3b_instruct_q40}
      --port 9999
      --nthreads ${CONTROLLER_NTHREADS:-4}
      --max-seq-len ${MAX_SEQ_LEN:-4096}
      --buffer-float-type ${BUFFER_FLOAT_TYPE:-q80}
      --workers 172.20.0.11:9999 172.20.0.12:9999 172.20.0.13:9999
    depends_on:
      - worker1
      - worker2
      - worker3

  # Worker services
  # The three workers are identical except for their static IP address.
  worker1:
    build:
      context: .
      dockerfile: Dockerfile.worker
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.11
    environment:
      - NTHREADS=${WORKER_NTHREADS:-4}
    command: >-
      --port 9999
      --nthreads ${WORKER_NTHREADS:-4}

  worker2:
    build:
      context: .
      dockerfile: Dockerfile.worker
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.12
    environment:
      - NTHREADS=${WORKER_NTHREADS:-4}
    command: >-
      --port 9999
      --nthreads ${WORKER_NTHREADS:-4}

  worker3:
    build:
      context: .
      dockerfile: Dockerfile.worker
    networks:
      distributed-llama:
        ipv4_address: 172.20.0.13
    environment:
      - NTHREADS=${WORKER_NTHREADS:-4}
    command: >-
      --port 9999
      --nthreads ${WORKER_NTHREADS:-4}

networks:
  # User-defined bridge network; the explicit subnet is what the static
  # ipv4_address values above are allocated from.
  distributed-llama:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16

volumes:
  # Named volume declaration. No service in this file references it (the
  # controller uses a bind mount of ./models instead).
  models: {}