import os
import sys
import time
import socket
import multiprocessing
from urllib.request import urlopen


def parts(length):
    result = []
    for i in range(length):
        a = chr(97 + (i // 26))
        b = chr(97 + (i % 26))
        result.append(a + b)
    return result

# [['model-url-0', 'model-url-1', ...], 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type', 'extra-args']
MODELS = {
    'llama3_1_8b_instruct_q40': [
        ['https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.1_instruct_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'llama3_1_405b_instruct_q40': [
        list(map(lambda suffix: f'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama31_405b_q40_{suffix}?download=true', parts(56))),
        'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'llama3_2_1b_instruct_q40': [
        ['https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.2-1b-instruct_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3_2.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'llama3_2_3b_instruct_q40': [
        ['https://huggingface.co/b4rtaz/Llama-3_2-3B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.2-3b-instruct_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Llama-3_2-3B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3_2.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'llama3_3_70b_instruct_q40': [
        list(map(lambda suffix: f'https://huggingface.co/b4rtaz/Llama-3_3-70B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama-3.3-70b_q40{suffix}?download=true', parts(11))),
        'https://huggingface.co/b4rtaz/Llama-3_3-70B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama-3.3-70b.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'deepseek_r1_distill_llama_8b_q40': [
        ['https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_model_deepseek-r1-distill-llama-8b_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_tokenizer_deepseek-r1-distill-llama-8b.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'qwen3_0.6b_q40': [
        ['https://huggingface.co/b4rtaz/Qwen3-0.6B-Q40-Distributed-Llama/resolve/main/dllama_model_qwen3_0.6b_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Qwen3-0.6B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_qwen3_0.6b.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'qwen3_1.7b_q40': [
        ['https://huggingface.co/b4rtaz/Qwen3-1.7B-Q40-Distributed-Llama/resolve/main/dllama_model_qwen3_1.7b_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Qwen3-1.7B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_qwen3_1.7b.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'qwen3_8b_q40': [
        ['https://huggingface.co/b4rtaz/Qwen3-8B-Q40-Distributed-Llama/resolve/main/dllama_model_qwen3_8b_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Qwen3-8B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_qwen3_8b.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'qwen3_14b_q40': [
        list(map(lambda suffix: f'https://huggingface.co/b4rtaz/Qwen3-14B-Q40-Distributed-Llama/resolve/main/dllama_model_qwen3_14b_q40_{suffix}?download=true', parts(2))),
        'https://huggingface.co/b4rtaz/Qwen3-14B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_qwen3_14b.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
    'qwen3_30b_a3b_q40': [
        list(map(lambda suffix: f'https://huggingface.co/b4rtaz/Qwen3-30B-A3B-Q40-Distributed-Llama/resolve/main/dllama_model_qwen3_30b_a3b_{suffix}?download=true', parts(5))),
        'https://huggingface.co/b4rtaz/Qwen3-30B-A3B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_qwen3_30b_a3b.t?download=true',
        'q40', 'q80', 'chat', '--max-seq-len 4096'
    ],
}
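# A multi-part model is published as sequential shards whose URLs end with the
# two-letter suffixes produced by parts(): e.g. parts(3) == ['aa', 'ab', 'ac'],
# so the 405B model above is fetched as 56 shards ('aa' .. 'cd') that
# downloadFile() below appends into a single .m file.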

def confirm(message: str):
    alwaysYes = sys.argv.count('-y') > 0
    if alwaysYes:
        return True
    result = input(f'❓ {message} ("Y" if yes): ').upper()
    return result == 'Y' or result == 'YES'

def downloadFile(urls, path: str):
    if os.path.isfile(path):
        fileName = os.path.basename(path)
        if not confirm(f'{fileName} already exists, do you want to download again?'):
            return
    socket.setdefaulttimeout(30)
    lastSizeMb = 0
    with open(path, 'wb') as file:
        for url in urls:
            startPosition = file.tell()
            success = False
            for attempt in range(8):
                print(f'šŸ“„ {url} (attempt: {attempt})')
                try:
                    with urlopen(url) as response:
                        while True:
                            chunk = response.read(4096)
                            if not chunk:
                                break
                            file.write(chunk)
                            sizeMb = file.tell() // (1024 * 1024)
                            if sizeMb != lastSizeMb:
                                sys.stdout.write("\rDownloaded %i MB" % sizeMb)
                                lastSizeMb = sizeMb
                    sys.stdout.write('\n')
                    success = True
                    break
                except Exception as e:
                    print(f'\n❌ Error downloading {url}: {e}')
                    file.seek(startPosition)
                    file.truncate()
                    time.sleep(1 * attempt)
            if not success:
                raise Exception(f'Failed to download {url}')
    sys.stdout.write(' ✅\n')
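# Note on downloadFile() above: each shard gets up to 8 attempts with a
# linearly growing delay (0 s .. 7 s). On failure the file is rewound to the
# start of the current shard and truncated, so a shard always restarts from
# scratch and the output file never contains a partially written shard.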

def download(modelName: str, model: list):
    dirPath = os.path.join('models', modelName)
    print(f'šŸ“€ Downloading {modelName} to {dirPath}...')
    os.makedirs(dirPath, exist_ok=True)
    modelUrls = model[0]
    tokenizerUrl = model[1]
    modelPath = os.path.join(dirPath, f'dllama_model_{modelName}.m')
    tokenizerPath = os.path.join(dirPath, f'dllama_tokenizer_{modelName}.t')
    downloadFile(modelUrls, modelPath)
    downloadFile([tokenizerUrl], tokenizerPath)
    print('šŸ“€ All files are downloaded')
    return (modelPath, tokenizerPath)

def writeRunFile(modelName: str, command: str):
    filePath = f'run_{modelName}.sh'
    with open(filePath, 'w') as file:
        file.write('#!/bin/sh\n')
        file.write('\n')
        file.write(f'{command}\n')
    return filePath

def printUsage():
    print('Usage: python download-model.py <model>')
    print()
    print('Options:')
    print('  <model>       The name of the model to download')
    print('  -skip-run     Do not run the model after download')
    print('  -skip-script  Do not create a script to run the model')
    print('  -y            Skip confirmation prompts')
    print()
    print('Available models:')
    for model in MODELS:
        print(f'  {model}')

if __name__ == '__main__':
    if len(sys.argv) < 2:
        printUsage()
        exit(1)
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    modelName = sys.argv[1].replace('-', '_')  # accept dashes or underscores in the model name
    if modelName not in MODELS:
        print(f'Model is not supported: {modelName}')
        exit(1)
    model = MODELS[modelName]
    (modelPath, tokenizerPath) = download(modelName, model)
    nThreads = multiprocessing.cpu_count()
    if model[4] == 'chat':
        command = './dllama chat'
    else:
        command = './dllama inference --steps 64 --prompt "Hello world"'
    command += f' --model {modelPath} --tokenizer {tokenizerPath} --buffer-float-type {model[3]} --nthreads {nThreads}'
    if len(model) > 5:
        command += f' {model[5]}'
    print('To run Distributed Llama you need to execute:')
    print('--- copy start ---')
    print()
    print('\033[96m' + command + '\033[0m')
    print()
    print('--- copy end ---')
    skipRun = sys.argv.count('-skip-run') > 0
    skipScript = sys.argv.count('-skip-script') > 0
    if not skipScript:
        runFilePath = writeRunFile(modelName, command)
        print(f'🌻 Created {runFilePath} script to run the model easily')
    if not skipRun:
        if confirm('Do you want to run Distributed Llama?'):
            if not os.path.isfile('dllama'):
                os.system('make dllama')
            os.system(command)
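# Example invocations (model names come from the MODELS dict above; dashes in
# the name are normalized to underscores):
#   python download-model.py llama3_2_1b_instruct_q40
#   python download-model.py qwen3_8b_q40 -skip-run -y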