# Converts a Meta Llama checkpoint (params.json + one or more consolidated.*.pth
# slices) into a single dllama model file.

import os
import sys
import json
import torch
import math
import numpy as np
from writer import writeTensor, writeHeader, parseFloatType, strFloatType, FloatType
from pathlib import Path

# Number of layer tensors processed per pass over the checkpoint slices;
# chunking keeps peak memory usage bounded for large models.
LAYER_CHUNK_SIZE = 48

def convert(modelPath, outputPath, targetFloatType):
    paramsPath = os.path.join(modelPath, 'params.json')
    with open(paramsPath) as f:
        params = json.load(f)
        if (params['vocab_size'] < 1):
            raise Exception('vocab_size is invalid, please update params.json file')
        if (params.get('max_seq_len') is None):
            raise Exception('max_seq_len is required, please update params.json file')
        # Fall back to n_heads when the checkpoint does not define n_kv_heads.
        params['n_kv_heads'] = params.get('n_kv_heads') or params['n_heads']
        params['head_size'] = params['dim'] / params['n_heads']
        params['arch_type'] = 0xABCD00
        params['n_experts'] = 0
        params['n_active_experts'] = 0
        params['weights_float_type'] = targetFloatType
        if ('rope_theta' in params):
            params['rope_theta'] = int(params['rope_theta'])

    modelPaths = sorted(list(Path(modelPath).glob('consolidated.*.pth')))
    nSlices = len(modelPaths)

    # Tensor names in the order they are written to the output file.
    layers = []
    layers.append('tok_embeddings.weight')
    for layerIndex in range(0, params['n_layers']):
        layers.append(f'layers.{layerIndex}.attention.wq.weight')
        layers.append(f'layers.{layerIndex}.attention.wk.weight')
        layers.append(f'layers.{layerIndex}.attention.wv.weight')
        layers.append(f'layers.{layerIndex}.attention.wo.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w1.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w2.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w3.weight')
        layers.append(f'layers.{layerIndex}.attention_norm.weight')
        layers.append(f'layers.{layerIndex}.ffn_norm.weight')
    layers.append('norm.weight')
    layers.append('output.weight')

    isHeaderWrote = False
    outFile = open(outputPath, 'wb')

    nChunks = math.ceil(len(layers) / LAYER_CHUNK_SIZE)
    for chunkIndex in range(0, nChunks):
        chunkLayerNames = layers[LAYER_CHUNK_SIZE * chunkIndex:LAYER_CHUNK_SIZE * (chunkIndex + 1)]
        models = {}
        for layerName in chunkLayerNames:
            models[layerName] = []

        print(f'💿 Chunking model {chunkIndex + 1}/{nChunks}...')

        # Collect this chunk's tensors from every checkpoint slice.
        for slicePath in modelPaths:
            model = torch.load(slicePath, map_location='cpu')
            for modelKey in model:
                if (modelKey in chunkLayerNames):
                    models[modelKey].append(model[modelKey])
            if not isHeaderWrote:
                # The FFN hidden dimension is sharded across slices.
                params['hidden_dim'] = model['layers.0.feed_forward.w1.weight'].shape[0] * nSlices
                writeHeader(outFile, params)
                isHeaderWrote = True
            del model

        for layerName in chunkLayerNames:
            if layerName == 'rope.freqs':
                continue

            # Weights that are sharded along dim 1 across slices.
            isAxis1 = (
                layerName == 'tok_embeddings.weight' or
                layerName.endswith('.attention.wo.weight') or
                layerName.endswith('.feed_forward.w2.weight')
            )
            # Norm weights and token embeddings are always exported as F32.
            isAlwaysF32 = (
                layerName == 'tok_embeddings.weight' or
                layerName.endswith('.attention_norm.weight') or
                layerName.endswith('.ffn_norm.weight') or
                layerName == 'norm.weight'
            )
            floatType = FloatType.F32 if isAlwaysF32 else targetFloatType

            tensors = models[layerName]

            # 1D tensors (norms) and single-slice checkpoints need no concatenation.
            if len(tensors) == 1 or len(tensors[0].shape) == 1:
                tensor = tensors[0]
            else:
                tensor = torch.cat(tensors, dim=(1 if isAxis1 else 0))

            print(f'🔶 Exporting {layerName} {tensor.shape}...')
            writeTensor(outFile, tensor, floatType)

        del models

    outFile.close()

def usage():
    print('Usage: python convert-llama.py <model-path> <target-float-type>')
    exit(1)

if __name__ == '__main__':
    if (len(sys.argv) < 3):
        usage()

    modelPath = sys.argv[1]
    targetFloatType = parseFloatType(sys.argv[2])
    targetFloatTypeStr = strFloatType(targetFloatType)

    modelName = os.path.basename(modelPath)
    outputFileName = f'dllama_model_{modelName.lower()}_{targetFloatTypeStr}.m'

    print(f'Model name: {modelName}')
    print(f'Target float type: {targetFloatTypeStr}')
    print(f'Target file: {outputFileName}')

    convert(modelPath, outputFileName, targetFloatType)

    print('Done!')
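# Example invocation (illustrative only; the model path is an assumption and the
# accepted float-type names depend on parseFloatType in writer.py):
#
#   python convert-llama.py /path/to/Llama-2-7B q40
#
# Given a folder containing params.json and consolidated.*.pth slices, this
# would write dllama_model_llama-2-7b_q40.m to the current working directory.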