dllama/converter/convert-llama.py

import os
import sys
import json
import torch
import math
import numpy as np
from writer import writeTensor, writeHeader, parseFloatType, strFloatType, FloatType
from pathlib import Path
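
# Number of layer tensors processed per pass over the checkpoint shards; this
# bounds peak memory at the cost of re-reading every shard once per chunk.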
LAYER_CHUNK_SIZE = 48

def convert(modelPath, outputPath, targetFloatType):
    paramsPath = os.path.join(modelPath, 'params.json')
    with open(paramsPath) as f:
        params = json.load(f)
        if (params['vocab_size'] < 1):
            raise Exception('vocab_size is invalid, please update params.json file')
        if (params.get('max_seq_len') is None):
            raise Exception('max_seq_len is required, please update params.json file')
        params['n_kv_heads'] = params.get('n_kv_heads') or params['n_heads']
        params['head_size'] = params['dim'] / params['n_heads']
        params['arch_type'] = 0xABCD00
        params['n_experts'] = 0
        params['n_active_experts'] = 0
        params['weights_float_type'] = targetFloatType
        if ('rope_theta' in params):
            params['rope_theta'] = int(params['rope_theta'])
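
    # Each consolidated.*.pth file is one shard of the checkpoint; the shard
    # count (nSlices) is used below to recover the full hidden_dim.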
    modelPaths = sorted(list(Path(modelPath).glob('consolidated.*.pth')))
    nSlices = len(modelPaths)
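
    # Ordered list of tensor names expected in the output file.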
    layers = []
    layers.append('tok_embeddings.weight')
    for layerIndex in range(0, params['n_layers']):
        layers.append(f'layers.{layerIndex}.attention.wq.weight')
        layers.append(f'layers.{layerIndex}.attention.wk.weight')
        layers.append(f'layers.{layerIndex}.attention.wv.weight')
        layers.append(f'layers.{layerIndex}.attention.wo.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w1.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w2.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w3.weight')
        layers.append(f'layers.{layerIndex}.attention_norm.weight')
        layers.append(f'layers.{layerIndex}.ffn_norm.weight')
    layers.append('norm.weight')
    layers.append('output.weight')

    isHeaderWrote = False
    outFile = open(outputPath, 'wb')

    nChunks = math.ceil(len(layers) / LAYER_CHUNK_SIZE)
    for chunkIndex in range(0, nChunks):
        chunkLayerNames = layers[LAYER_CHUNK_SIZE * chunkIndex:LAYER_CHUNK_SIZE * (chunkIndex + 1)]
        models = {}
        for layerName in chunkLayerNames:
            models[layerName] = []

        print(f'💿 Chunking model {chunkIndex + 1}/{nChunks}...')
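
        # Collect this chunk's tensors from every shard; the header is written
        # once, after the first shard reveals the per-shard hidden_dim.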
        for modelPath in modelPaths:
            model = torch.load(modelPath, map_location='cpu')
            for modelKey in model:
                if (modelKey in chunkLayerNames):
                    models[modelKey].append(model[modelKey])
            if not isHeaderWrote:
                params['hidden_dim'] = model['layers.0.feed_forward.w1.weight'].shape[0] * nSlices
                writeHeader(outFile, params)
                isHeaderWrote = True
            del model
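
        # Merge the per-shard slices and write each tensor: wo, w2 and the
        # token embeddings are concatenated along dim 1, everything else along
        # dim 0; norm weights and token embeddings are always stored as F32.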
        for layerName in chunkLayerNames:
            if layerName == 'rope.freqs':
                continue

            isAxis1 = (
                layerName == 'tok_embeddings.weight' or
                layerName.endswith('.attention.wo.weight') or
                layerName.endswith('.feed_forward.w2.weight')
            )
            isAlwaysF32 = (
                layerName == 'tok_embeddings.weight' or
                layerName.endswith('.attention_norm.weight') or
                layerName.endswith('.ffn_norm.weight') or
                layerName == 'norm.weight'
            )
            floatType = FloatType.F32 if isAlwaysF32 else targetFloatType

            tensors = models[layerName]
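
            # 1D tensors (norms) and single-shard checkpoints need no merging.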
            if len(tensors) == 1 or len(tensors[0].shape) == 1:
                tensor = tensors[0]
            else:
                tensor = torch.cat(tensors, dim=(1 if isAxis1 else 0))

            print(f'🔶 Exporting {layerName} {tensor.shape}...')
            writeTensor(outFile, tensor, floatType)

        del models

    outFile.close()

def usage():
    print('Usage: python convert-llama.py <modelPath> <targetFloatType>')
    exit(1)
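
# Example invocation (paths are illustrative; the accepted float-type names
# are whatever writer.parseFloatType understands, e.g. 'q40'):
#   python convert-llama.py /models/Llama-2-7b q40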
if __name__ == '__main__':
    if (len(sys.argv) < 3):
        usage()

    modelPath = sys.argv[1]
    targetFloatType = parseFloatType(sys.argv[2])
    targetFloatTypeStr = strFloatType(targetFloatType)

    modelName = os.path.basename(modelPath)
    outputFileName = f'dllama_model_{modelName.lower()}_{targetFloatTypeStr}.m'

    print(f'Model name: {modelName}')
    print(f'Target float type: {targetFloatTypeStr}')
    print(f'Target file: {outputFileName}')

    convert(modelPath, outputFileName, targetFloatType)

    print('Done!')