init

2025-10-24 11:42:14 +02:00
commit 42172cbb6f
85 changed files with 40316 additions and 0 deletions

4
converter/.gitignore vendored Normal file

@@ -0,0 +1,4 @@
*.t
*.m
*.bin
*/

265
converter/convert-hf.py Normal file

@@ -0,0 +1,265 @@
import gc
import json
import sys
import os
from writer import parseFloatType, writeTensor, writeHeader, FloatType
from safetensors import safe_open
class ArchType:
LLAMA = 0xABCD00
QWEN3 = 0xABCD01
QWEN3_MOE = 0xABCD02
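# Reorders rotary q/k projection rows between the Hugging Face half-split
# layout and interleaved pairs via a per-head reshape and axis swap; for k
# projections the kv-head count is used as the effective head count.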
def permute(tensor, nHeads: int, nKvHeads: int):
if nHeads != nKvHeads:
nHeads = nKvHeads
return (tensor.reshape(nHeads, 2, tensor.shape[0] // nHeads // 2, *tensor.shape[1:]).swapaxes(1, 2).reshape(tensor.shape))
class Processor:
def __init__(self, config):
self.config = config
self.archType = config['arch_type']
self.currentModelIndex = None
self.currentModel = None
self.currentModelKeys = None
self.layerMap = {}
self.plan = []
def __unloadModel(self):
if self.currentModel:
del self.currentModel
self.currentModel = None
gc.collect()
self.currentModelIndex = None
def __loadModel(self, index: int):
if (self.currentModelIndex == index):
return
self.__unloadModel()
filePath = self.config['files'][index]
fileName = os.path.basename(filePath)
print(f'💿 Loading file {fileName}...')
self.currentModel = safe_open(filePath, framework='pt', device='cpu')
self.currentModelKeys = list(self.currentModel.keys())
for key in self.currentModelKeys:
self.layerMap[key] = index
print(f'Found {len(self.currentModelKeys)} layers')
self.currentModelIndex = index
def __transformQ(self, tensor):
if self.archType == ArchType.LLAMA:
return permute(tensor, self.config['n_heads'], self.config['n_heads'])
return tensor
def __transformK(self, tensor):
if self.archType == ArchType.LLAMA:
return permute(tensor, self.config['n_heads'], self.config['n_kv_heads'])
return tensor
def __preparePlan(self):
wt = self.config['weights_float_type']
p = self.plan
p.append([FloatType.F32,
'model.embed_tokens.weight'])
for l in range(0, self.config['n_layers']):
p.append([wt, self.__transformQ,
f'model.layers.{l}.self_attn.q_proj.weight'])
p.append([wt, self.__transformK,
f'model.layers.{l}.self_attn.k_proj.weight'])
p.append([wt,
f'model.layers.{l}.self_attn.v_proj.weight'])
p.append([wt,
f'model.layers.{l}.self_attn.o_proj.weight'])
if (self.config['n_experts'] > 0):
p.append([FloatType.F32, f'model.layers.{l}.mlp.gate.weight'])
for e in range(self.config['n_experts']):
p.append([wt,
f'model.layers.{l}.mlp.experts.{e}.gate_proj.weight'])
p.append([wt,
f'model.layers.{l}.mlp.experts.{e}.down_proj.weight'])
p.append([wt,
f'model.layers.{l}.mlp.experts.{e}.up_proj.weight'])
else:
p.append([wt,
f'model.layers.{l}.mlp.gate_proj.weight'])
p.append([wt,
f'model.layers.{l}.mlp.down_proj.weight'])
p.append([wt,
f'model.layers.{l}.mlp.up_proj.weight'])
if (self.archType == ArchType.QWEN3 or self.archType == ArchType.QWEN3_MOE):
p.append([FloatType.F32,
f'model.layers.{l}.self_attn.q_norm.weight'])
p.append([FloatType.F32,
f'model.layers.{l}.self_attn.k_norm.weight'])
p.append([FloatType.F32,
f'model.layers.{l}.input_layernorm.weight'])
p.append([FloatType.F32,
f'model.layers.{l}.post_attention_layernorm.weight'])
p.append([FloatType.F32,
'model.norm.weight'])
p.append([wt,
'lm_head.weight', 'model.embed_tokens.weight'])
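    # Each plan item is [floatType, optional transform, *candidate tensor names];
    # write() below applies the transform (if any) to the first candidate found
    # in the checkpoint files.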
def write(self, outputFile: str):
self.__preparePlan()
# Loading the last model file to get the layer names
self.__loadModel(len(self.config['files']) - 1)
self.__unloadModel()
for planItem in self.plan:
lookup = planItem[1:]
transform = None
if (callable(lookup[0])):
transform = lookup[0]
lookup = lookup[1:]
            if (self.currentModelIndex is None):
modelIndex = 0
else:
modelIndex = None
for layerName in lookup:
if (layerName in self.layerMap):
modelIndex = self.layerMap[layerName]
break
if (modelIndex is None):
modelIndex = self.currentModelIndex + 1
self.__loadModel(modelIndex)
tensor = None
for layerName in lookup:
if (layerName in self.currentModelKeys):
tensor = self.currentModel.get_tensor(layerName)
break
if tensor is None:
raise Exception(f'Layer {lookup[0]} not found')
print(f'🔶 Writing tensor {layerName} {tensor.shape}...')
floatType = planItem[0]
if (transform):
tensor = transform(tensor)
writeTensor(outputFile, tensor, floatType)
def parseArchType(type: str):
archType = {
'llama': ArchType.LLAMA,
'mistral': ArchType.LLAMA,
'qwen3': ArchType.QWEN3,
'qwen3_moe': ArchType.QWEN3_MOE,
}.get(type)
if (archType is None):
raise Exception(f'Unsupported arch type: {type}')
return archType
def parseHiddenAct(act: str):
hiddenAct = {
'gelu': 0,
'silu': 1
}.get(act)
if (hiddenAct is None):
raise Exception(f'Unsupported hidden act: {act}')
return hiddenAct
def parseRopeType(rt: str):
ropeType = {
'llama3': 2, # LLAMA3_1
}.get(rt)
if (ropeType is None):
        raise Exception(f'Unsupported rope type: {rt}')
return ropeType
def parseRmsNormEpsilon(epsilon: float):
if (epsilon == 1e-05):
return 5
elif (epsilon == 1e-06):
return 6
raise Exception(f'Unsupported epsilon: {epsilon}')
def loadConfig(folderPath: str, weightsFloatType: int):
allFiles = os.listdir(folderPath)
allFiles.sort()
with open(os.path.join(folderPath, 'config.json')) as fc:
config = json.load(fc)
files = []
for fileName in allFiles:
if fileName.endswith('.safetensors') and not fileName.startswith('.'):
files.append(os.path.join(folderPath, fileName))
if (len(files) == 0):
        raise Exception('No model files found')
result = {
'version': 0,
'arch_type': parseArchType(config['model_type']),
'hidden_act': parseHiddenAct(config['hidden_act']),
'dim': config['hidden_size'],
'hidden_dim': config['intermediate_size'],
'n_layers': config['num_hidden_layers'],
'n_heads': config['num_attention_heads'],
'n_kv_heads': config['num_key_value_heads'],
'weights_float_type': weightsFloatType,
'max_seq_len': config['max_position_embeddings'],
'vocab_size': config['vocab_size'],
'files': files,
}
nExperts = config.get('num_experts')
nActiveExperts = config.get('num_experts_per_tok')
result['n_experts'] = int(nExperts) if nExperts is not None else 0
result['n_active_experts'] = int(nActiveExperts) if nActiveExperts is not None else 0
ropeTheta = config.get('rope_theta')
if (ropeTheta is not None):
result['rope_theta'] = int(ropeTheta)
ropeScaling = config.get('rope_scaling')
if (ropeScaling is not None):
result['rope_scaling_factor'] = int(ropeScaling['factor'])
result['rope_scaling_low_freq_factor'] = int(ropeScaling['low_freq_factor'])
        result['rope_scaling_high_freq_factor'] = int(ropeScaling['high_freq_factor'])
result['rope_scaling_orig_max_seq_len'] = int(ropeScaling['original_max_position_embeddings'])
result['rope_type'] = parseRopeType(ropeScaling['rope_type'])
headDim = config.get('head_dim')
if (headDim is not None):
result['head_dim'] = headDim
rmsNormEps = config.get('rms_norm_eps')
if (rmsNormEps is not None):
result['norm_epsilon'] = parseRmsNormEpsilon(rmsNormEps)
moeHiddenDim = config.get('moe_intermediate_size')
if (moeHiddenDim is not None):
result['moe_hidden_dim'] = int(moeHiddenDim)
return result
def printUsage():
print('Usage: python convert-hf.py <sourceFolderPath> <weightsFloatType> <name>')
print()
print('Options:')
print(' <sourceFolderPath> The path to the folder containing the model files')
print(' <weightsFloatType> The float type of the weights (e.g. "q40")')
print(' <name> The name of the model (e.g. "llama3")')
if __name__ == '__main__':
if (len(sys.argv) < 4):
printUsage()
exit(1)
sourceFolderPath = sys.argv[1]
weightsFloatType = parseFloatType(sys.argv[2])
name = sys.argv[3]
outputFileName = f'dllama_model_{name}_{sys.argv[2]}.m'
print(f'Output file: {outputFileName}')
config = loadConfig(sourceFolderPath, weightsFloatType)
with open(outputFileName, 'wb') as outputFile:
writeHeader(outputFile, config)
processor = Processor(config)
processor.write(outputFile)
print(f'{outputFileName} created successfully')

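For reference, a typical invocation of this converter (the model path and name below are illustrative) is:

    python convert-hf.py /path/to/Qwen3-8B q40 qwen3

This would produce dllama_model_qwen3_q40.m in the working directory.
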
121
converter/convert-llama.py Normal file

@@ -0,0 +1,121 @@
import os
import sys
import json
import torch
import math
import numpy as np
from writer import writeTensor, writeHeader, parseFloatType, strFloatType, FloatType
from pathlib import Path
LAYER_CHUNK_SIZE = 48
def convert(modelPath, outputPath, targetFloatType):
paramsPath = os.path.join(modelPath, 'params.json')
with open(paramsPath) as f:
params = json.load(f)
if (params['vocab_size'] < 1):
raise Exception('vocab_size is invalid, please update params.json file')
if (params.get('max_seq_len') is None):
raise Exception('max_seq_len is required, please update params.json file')
params['n_kv_heads'] = params.get('n_kv_heads') or params['n_heads']
params['head_size'] = params['dim'] / params['n_heads']
params['arch_type'] = 0xABCD00
params['n_experts'] = 0
params['n_active_experts'] = 0
params['weights_float_type'] = targetFloatType
if ('rope_theta' in params):
params['rope_theta'] = int(params['rope_theta'])
modelPaths = sorted(list(Path(modelPath).glob('consolidated.*.pth')))
nSlices = len(modelPaths)
layers = []
layers.append('tok_embeddings.weight')
for layerIndex in range(0, params['n_layers']):
layers.append(f'layers.{layerIndex}.attention.wq.weight')
layers.append(f'layers.{layerIndex}.attention.wk.weight')
layers.append(f'layers.{layerIndex}.attention.wv.weight')
layers.append(f'layers.{layerIndex}.attention.wo.weight')
layers.append(f'layers.{layerIndex}.feed_forward.w1.weight')
layers.append(f'layers.{layerIndex}.feed_forward.w2.weight')
layers.append(f'layers.{layerIndex}.feed_forward.w3.weight')
layers.append(f'layers.{layerIndex}.attention_norm.weight')
layers.append(f'layers.{layerIndex}.ffn_norm.weight')
layers.append('norm.weight')
layers.append('output.weight')
    isHeaderWritten = False
outFile = open(outputPath, 'wb')
nChunks = math.ceil(len(layers) / LAYER_CHUNK_SIZE)
for chunkIndex in range(0, nChunks):
chunkLayerNames = layers[LAYER_CHUNK_SIZE * chunkIndex:LAYER_CHUNK_SIZE * (chunkIndex + 1)]
models = {}
for layerName in chunkLayerNames:
models[layerName] = []
print(f'💿 Chunking model {chunkIndex + 1}/{nChunks}...')
for modelPath in modelPaths:
model = torch.load(modelPath, map_location='cpu')
for modelKey in model:
if (modelKey in chunkLayerNames):
models[modelKey].append(model[modelKey])
            if not isHeaderWritten:
                params['hidden_dim'] = model['layers.0.feed_forward.w1.weight'].shape[0] * nSlices
                writeHeader(outFile, params)
                isHeaderWritten = True
del model
for layerName in chunkLayerNames:
if layerName == 'rope.freqs':
continue
isAxis1 = (
layerName == 'tok_embeddings.weight' or
layerName.endswith('.attention.wo.weight') or
layerName.endswith('.feed_forward.w2.weight')
)
isAlwaysF32 = (
layerName == 'tok_embeddings.weight' or
layerName.endswith('.attention_norm.weight') or
layerName.endswith('.ffn_norm.weight') or
layerName == 'norm.weight'
)
floatType = FloatType.F32 if isAlwaysF32 else targetFloatType
tensors = models[layerName]
if len(tensors) == 1 or len(tensors[0].shape) == 1:
tensor = tensors[0]
else:
tensor = torch.cat(tensors, dim=(1 if isAxis1 else 0))
print(f'🔶 Exporting {layerName} {tensor.shape}...')
writeTensor(outFile, tensor, floatType)
del models
outFile.close()
def usage():
print('Usage: python convert-llama.py <modelPath> <targetFloatType>')
exit(1)
if __name__ == '__main__':
if (len(sys.argv) < 3):
usage()
modelPath = sys.argv[1]
targetFloatType = parseFloatType(sys.argv[2])
targetFloatTypeStr = strFloatType(targetFloatType)
modelName = os.path.basename(modelPath)
outputFileName = f'dllama_model_{modelName.lower()}_{targetFloatTypeStr}.m'
print(f'Model name: {modelName}')
print(f'Target float type: {targetFloatTypeStr}')
print(f'Target file: {outputFileName}')
convert(modelPath, outputFileName, targetFloatType)
print('Done!')

137
converter/convert-tokenizer-hf.py Normal file

@@ -0,0 +1,137 @@
import sys
import json
import os
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizerFast
writer = __import__('tokenizer-writer')
def openJson(path):
with open(path, 'r', encoding='utf-8') as file:
return json.load(file)
def unicodeToBytes():
# https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/encoder.py#L9
bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
cs = bs[:]
n = 0
for b in range(2 ** 8):
if b not in bs:
bs.append(b)
cs.append(2 ** 8 + n)
n += 1
cs = [chr(n) for n in cs]
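    # zip(cs, bs) inverts GPT-2's byte-to-unicode table: printable token
    # characters map back to their raw byte values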
return dict(zip(cs, bs))
class TokensResolver:
def __init__(self, dirPath, tokenizerConfig):
self.dirPath = dirPath
self.tokenizerConfig = tokenizerConfig
self.bosId = None
self.eosIds = None
self.tokens = []
self.scores = []
def resolvePreTrainedTokenizerFast(self):
utb = unicodeToBytes()
tokenizer = PreTrainedTokenizerFast(tokenizer_file = os.path.join(self.dirPath, 'tokenizer.json'))
vocabLen = len(tokenizer.get_vocab())
for i in range(vocabLen):
tokenChars = list(tokenizer.convert_ids_to_tokens([i])[0])
tokenBytes = []
for chr in tokenChars:
if (chr in utb):
tokenBytes.append(utb[chr])
else:
tokenBytes += list(chr.encode('utf-8'))
self.tokens.append(bytes(tokenBytes))
self.scores.append(-float(i))
self.bosId = tokenizer.bos_token_id
        if (tokenizer.eos_token_id is not None):
            self.eosIds = [tokenizer.eos_token_id]
        if (self.bosId is None or self.eosIds is None):
config = openJson(os.path.join(self.dirPath, 'config.json'))
if (self.bosId is None):
self.bosId = config['bos_token_id']
if (self.eosIds is None):
self.eosIds = config['eos_token_id']
                if not isinstance(self.eosIds, list):
                    self.eosIds = [self.eosIds]
def resolveLlamaTokenizer(self):
modelPath = os.path.join(self.dirPath, 'tokenizer.model')
processor = SentencePieceProcessor(model_file=modelPath)
assert processor.vocab_size() == processor.get_piece_size()
self.bosId = processor.bos_id()
self.eosIds = [processor.eos_id()]
vocabSize = processor.vocab_size()
for i in range(vocabSize):
t = processor.id_to_piece(i)
s = processor.get_score(i)
            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
# Check for byte characters
if len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
                # For example, "<0x0A>" is a newline character
b = bytearray.fromhex(t[3:-1])
else:
b = t.encode('utf-8')
self.tokens.append(b)
self.scores.append(s)
def resolve(self):
cls = self.tokenizerConfig['tokenizer_class']
if (cls == 'PreTrainedTokenizerFast' or
cls == 'LlamaTokenizerFast' or
cls == 'Qwen2Tokenizer'):
return self.resolvePreTrainedTokenizerFast()
if (cls == 'LlamaTokenizer'):
return self.resolveLlamaTokenizer()
raise Exception(f'Tokenizer {cls} is not supported')
def printUsage():
print('Usage: python convert-tokenizer-hf.py <tokenizerFolderPath> <name>')
print()
print('Options:')
print(' <tokenizerFolderPath> The path to the folder with tokenizer_config.json')
print(' <name> The name of the tokenizer (e.g. "llama3")')
if __name__ == '__main__':
    if (len(sys.argv) < 3):
printUsage()
exit(1)
dirPath = sys.argv[1]
name = sys.argv[2]
tokenizerConfig = openJson(os.path.join(dirPath, 'tokenizer_config.json'))
resolver = TokensResolver(dirPath, tokenizerConfig)
resolver.resolve()
if (resolver.bosId is None or resolver.eosIds is None):
raise Exception('Cannot resolve bosId or eosIds')
print(f'bosId: {resolver.bosId} ({resolver.tokens[resolver.bosId]})')
for eosId in resolver.eosIds:
print(f'eosId: {eosId} ({resolver.tokens[eosId]})')
chatTemplate = None
if ('chat_template' in tokenizerConfig):
chatTemplate = tokenizerConfig['chat_template'].encode('utf-8')
addBos = True
if ('add_bos_token' in tokenizerConfig):
addBos = tokenizerConfig['add_bos_token']
outputFileName = f'dllama_tokenizer_{name}.t'
with open(outputFileName, 'wb') as outputFile:
writer.writeTokenizer(
outputFile,
resolver.tokens,
resolver.scores,
chatTemplate,
resolver.bosId,
addBos,
resolver.eosIds)
print(f'✅ Created {outputFileName}')

44
converter/convert-tokenizer-llama2.py Normal file

@@ -0,0 +1,44 @@
import sys
import os
from sentencepiece import SentencePieceProcessor
writer = __import__('tokenizer-writer')
chatTemplate = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
def printUsage():
print('Usage: python convert-tokenizer-llama2.py <llama2FolderPath>')
print()
print('Options:')
print(' <llama2FolderPath> The path to the folder with llama2 folder path')
if __name__ == '__main__':
if (len(sys.argv) < 2):
printUsage()
exit(1)
dirPath = sys.argv[1]
modelPath = os.path.join(dirPath, 'tokenizer.model')
processor = SentencePieceProcessor(model_file=modelPath)
vocabSize = processor.vocab_size()
tokens = []
scores = []
for i in range(vocabSize):
t = processor.id_to_piece(i)
s = processor.get_score(i)
        t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
b = t.encode('utf-8')
tokens.append(b)
scores.append(s)
outputFileName = 'dllama_tokenizer_llama2.t'
with open(outputFileName, 'wb') as outputFile:
writer.writeTokenizer(
outputFile,
tokens,
scores,
chatTemplate.encode('utf-8'),
            processor.bos_id(),
            True, # addBos (assumed; writeTokenizer requires this argument)
            [processor.eos_id()])
print(f'✅ Created {outputFileName}')

78
converter/convert-tokenizer-llama3.py Normal file

@@ -0,0 +1,78 @@
import sys
import base64
writer = __import__('tokenizer-writer')
# Format of input file:
# ```
# IQ== 0
# Ig== 1
# Iw== 2
# ...
# ```
nSpecialTokens = 256
specialTokens = [
'<|begin_of_text|>',
'<|end_of_text|>',
'<|reserved_special_token_0|>',
'<|reserved_special_token_1|>',
'<|reserved_special_token_2|>',
'<|reserved_special_token_3|>',
'<|start_header_id|>',
'<|end_header_id|>',
'<|reserved_special_token_4|>',
'<|eot_id|>',
] + [
f'<|reserved_special_token_{i}|>'
for i in range(5, nSpecialTokens - 5)
]
bosId = 128000
eosId = 128001
chatEosId = 128009
chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
def printUsage():
print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
print()
print('Options:')
print(' <tokenizerPath> The path to the Llama 3 tokenizer model (tokenizer.model)')
if __name__ == '__main__':
if (len(sys.argv) < 2):
printUsage()
exit(1)
modelPath = sys.argv[1]
outputFileName = 'dllama_tokenizer_llama3.t'
with open(modelPath, 'r') as inputFile:
with open(outputFileName, 'wb') as outputFile:
inputLines = inputFile.readlines()
nLines = len(inputLines)
tokens = []
scores = []
for line in inputLines:
s = line.split(' ')
bytes = base64.b64decode(s[0])
score = -float(s[1])
tokens.append(bytes)
scores.append(score)
specialTokenIndex = nLines
for token in specialTokens:
bytes = token.encode('utf-8')
score = -float(specialTokenIndex)
tokens.append(bytes)
scores.append(score)
specialTokenIndex += 1
writer.writeTokenizer(
outputFile,
tokens,
scores,
chatTemplate.encode('utf-8'),
                bosId,
                True, # addBos (assumed; writeTokenizer requires this argument)
                [eosId, chatEosId])
print(f'✅ Created {outputFileName}')

5
converter/requirements.txt Normal file

@@ -0,0 +1,5 @@
python>=3.9
numpy==1.23.5
torch==2.0.1
safetensors==0.4.2
sentencepiece==0.1.99

57
converter/tokenizer-writer.py Normal file

@@ -0,0 +1,57 @@
import struct
def writeTokenizer(file, tokens, scores, chatTemplate, bosId, addBos, eosTokens):
headerKeys = {
'version': 0,
'vocab_size': 1,
'max_token_length': 2,
'bos_id': 3,
'chat_template': 7,
'n_eos_tokens': 9,
'add_bos': 10,
}
header = struct.pack('i', 0x567124)
nTokens = len(tokens)
maxTokenLength = max(len(t) for t in tokens)
params = {}
params['bos_id'] = bosId
params['version'] = 1
params['vocab_size'] = nTokens
params['max_token_length'] = maxTokenLength
if (chatTemplate):
params['chat_template'] = len(chatTemplate)
params['n_eos_tokens'] = len(eosTokens)
params['add_bos'] = 1 if addBos else 0
data = b''
for key in params:
value = params[key]
if value is None:
continue
if key in headerKeys:
data += struct.pack('ii', headerKeys[key], params[key])
else:
print(f'Unknown header key: {key}')
print('⭐ Params:')
print(params)
if (chatTemplate):
print('⭐ Chat template:')
print(chatTemplate)
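    # 'header' still holds only the 4-byte magic; the finished header is magic +
    # size (len(header) * 2 bytes), and the size field counts that plus the
    # key/value data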
header += struct.pack('i', len(header) * 2 + len(data))
file.write(header)
file.write(data)
if chatTemplate:
file.write(chatTemplate)
for eosToken in eosTokens:
file.write(struct.pack('i', eosToken))
for i in range(0, nTokens):
size = len(tokens[i])
assert(size > 0)
file.write(struct.pack('fI', scores[i], size))
file.write(tokens[i])

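To clarify the layout writeTokenizer produces, here is a minimal reader sketch (not part of this commit; readTokenizerHeader is a hypothetical helper) that walks the header back:

import struct

def readTokenizerHeader(file):
    # magic, then total header size (magic + size field + key/value pairs),
    # then int32 (keyId, value) pairs using the ids from headerKeys above
    magic, headerSize = struct.unpack('ii', file.read(8))
    assert magic == 0x567124
    params = {}
    for _ in range((headerSize - 8) // 8):
        keyId, value = struct.unpack('ii', file.read(8))
        params[keyId] = value
    return params

The chat template bytes, the EOS token ids (one int32 each), and the (score, size, bytes) token records follow immediately after the header, in that order.
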
35
converter/writer-test.py Normal file

@@ -0,0 +1,35 @@
import sys
import time
import torch
from writer import writeQuantizedQ40Tensor
TEMP_FILE_NAME = 'writer-test.temp'
def readHexFromFile(path):
    with open(path, 'rb') as file:
        return file.read().hex()
def testWriteQuantizedQ40Tensor():
EXPECTED_OUTPUT = '7e346345a692b89665b2c5790537876e598aaa366d988876a898b8d788a98868ce660c66f6b3a88cba5ce9a871987ba9cc5bcaaa760c1eb556a4455b747b6b9504968828ef2a8d7c1db5c6be3764799e66db6d8e76463126a30e4333cad7a4f645947c6cf97f9de086d468c8d535a6ba7dc799d3d0c657bab6799468cad8bb349eb7d7635c7c798998696bb38e4085a9eb34444ba96a7f8ba7b2b42d746a96cf9660aeb4499d8708ad5c7b9a7558947645f3bbb6b0346a656887ad9a86059baac5c596ab781c703569bb8a4356a4bd58cb78736ba09759bb0e34a6274e827b957d7a67dfa86846955660d234b6d9d78a378094a8a8708a7a774ae92f8a36b8c999a9b77a7d958a69747c807963941235379886d69a7a8767b3a6a4ac71999760'
torch.manual_seed(seed=1)
tensor = torch.randn(32, 16)
with open(TEMP_FILE_NAME, 'wb') as file:
writeQuantizedQ40Tensor(file, tensor)
    contentHex = readHexFromFile(TEMP_FILE_NAME)
    assert contentHex == EXPECTED_OUTPUT, f'Received: {contentHex}'
print('✅ writeQuantizedQ40Tensor')
def runWriteQuantizedQ40TensorBenchmark():
tensor = torch.randn(8192, 4096)
t0 = time.time()
with open(TEMP_FILE_NAME, 'wb') as file:
writeQuantizedQ40Tensor(file, tensor)
t1 = time.time()
print(f'🕐 writeQuantizedQ40Tensor: {t1 - t0:.4f}s')
if __name__ == '__main__':
testWriteQuantizedQ40Tensor()
runWriteQuantizedQ40TensorBenchmark()

148
converter/writer.py Normal file

@@ -0,0 +1,148 @@
import struct
import torch
import time
import numpy as np
class FloatType:
F32 = 0
F16 = 1
Q40 = 2
Q80 = 3
floatTypeMap = {
'f32': FloatType.F32,
'f16': FloatType.F16,
'q40': FloatType.Q40,
'q80': FloatType.Q80,
}
floatTypeNames = list(floatTypeMap.keys())
def parseFloatType(type):
floatType = floatTypeMap.get(type)
if floatType is not None:
return floatType
raise Exception(f'{type} is not supported')
def strFloatType(type):
return floatTypeNames[type]
def writeQuantizedQ40Tensor(file, x):
x = x.to(torch.float32).numpy().astype(np.float32)
blockSize = 32
blockHalfSize = blockSize // 2
assert(x.shape[0] % blockSize == 0)
groups = x.reshape(-1, blockSize)
gmax = np.max(groups, axis=1)
gmin = np.min(groups, axis=1)
deltas = np.divide(np.where(-gmin > gmax, gmin, gmax), -8)
deltas16 = deltas.astype(np.float16)
ids = np.where(deltas != 0, 1.0 / deltas, 0)
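    # adding 8.5 shifts the ~[-8, 8] scaled range to ~[0.5, 16.5] so the
    # truncating int cast rounds to nearest; clip then keeps the 4-bit range [0, 15]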
groups = np.add(groups * ids[:, np.newaxis], 8.5)
groups = np.clip(groups, 0, 15).astype(int)
gLow = groups[:, :blockHalfSize] & 0xF
gHigh = (groups[:, blockHalfSize:] & 0xF) << 4
gCombined = gLow | gHigh
nBytes = 0
for groupIndex in range(0, len(groups)):
delta16 = deltas16[groupIndex]
buffer = struct.pack(f'e{blockHalfSize}B', delta16, *gCombined[groupIndex])
file.write(buffer)
nBytes += len(buffer)
return nBytes
def writeQuantizedQ80Tensor(file, x):
x = x.to(torch.float32).numpy().astype(np.float32)
blockSize = 32
assert(x.shape[0] % blockSize == 0)
groups = x.reshape(-1, blockSize)
gmax = np.max(groups, axis=1)
gmin = np.min(groups, axis=1)
gabsMax = np.where(-gmin > gmax, -gmin, gmax)
deltas = gabsMax / ((1 << 7) - 1)
deltas16 = deltas.astype(np.float16)
ids = np.where(deltas != 0, 1.0 / deltas, 0)
groups = groups * ids[:, np.newaxis]
groups8 = np.round(groups).astype(np.int8)
nBytes = 0
for groupIndex in range(0, len(groups)):
buffer = struct.pack(f'e{blockSize}b', deltas16[groupIndex], *groups8[groupIndex])
file.write(buffer)
nBytes += len(buffer)
return nBytes
def writeF32Tensor(file, d):
chunkSize = 10000
nBytes = 0
for i in range(0, len(d), chunkSize):
chunk = d[i:i+chunkSize].to(torch.float32).numpy().astype(np.float32)
b = struct.pack(f'{len(chunk)}f', *chunk)
nBytes += len(b)
file.write(b)
return nBytes
def writeF16Tensor(file, d):
d = d.to(torch.float16).numpy().astype(np.float16)
b = struct.pack(f'{len(d)}e', *d)
file.write(b)
return len(b)
def writeTensor(file, tensor, floatType):
d = tensor.detach().cpu().view(-1)
t0 = time.time()
nBytes = 0
if (floatType == FloatType.F16):
nBytes = writeF16Tensor(file, d)
elif (floatType == FloatType.F32):
nBytes = writeF32Tensor(file, d)
elif (floatType == FloatType.Q40):
nBytes = writeQuantizedQ40Tensor(file, d)
elif (floatType == FloatType.Q80):
nBytes = writeQuantizedQ80Tensor(file, d)
else:
        raise Exception(f'Unknown float type: {floatType}')
t1 = time.time()
print(f'Saved {strFloatType(floatType)} tensor in {t1 - t0:.2f}s, {nBytes} bytes')
def writeHeader(file, params):
headerKeys = {
'version': 0,
'arch_type': 1,
'dim': 2,
'hidden_dim': 3,
'n_layers': 4,
'n_heads': 5,
'n_kv_heads': 6,
'n_experts': 7,
'n_active_experts': 8,
'vocab_size': 9,
'max_seq_len': 10,
'hidden_act': 11,
'rope_theta': 12,
'weights_float_type': 13,
'rope_scaling_factor': 14,
'rope_scaling_low_freq_factor': 15,
        'rope_scaling_high_freq_factor': 16,
'rope_scaling_orig_max_seq_len': 17,
'rope_type': 18,
'head_dim': 19,
'norm_epsilon': 20,
'moe_hidden_dim': 21,
}
header = struct.pack('i', 0xA00ABCD)
data = b''
for key in params:
if key in headerKeys:
data += struct.pack('ii', headerKeys[key], params[key])
else:
print(f'Warning: Unknown header key: {key}')
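    # 'header' holds only the 4-byte magic here; len(header) * 2 accounts for the
    # magic plus this size field, so the stored value is the full header byte count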
header += struct.pack('i', len(header) * 2 + len(data))
file.write(header)
file.write(data)
for key in params:
print(f'🎓 {key}: {params[key]}')
print()
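
To make the Q40 container format concrete, here is a minimal dequantization sketch (not part of this commit; readQ40Tensor is a hypothetical helper) mirroring writeQuantizedQ40Tensor:

import struct
import numpy as np

def readQ40Tensor(file, nValues):
    # each 32-value block is one float16 delta followed by 16 packed bytes;
    # low nibbles hold block values 0..15, high nibbles hold values 16..31
    blockSize = 32
    assert nValues % blockSize == 0
    out = np.empty(nValues, dtype=np.float32)
    for g in range(nValues // blockSize):
        delta = struct.unpack('e', file.read(2))[0]
        packed = np.frombuffer(file.read(blockSize // 2), dtype=np.uint8)
        q = np.concatenate([packed & 0xF, packed >> 4]).astype(np.float32)
        out[g * blockSize:(g + 1) * blockSize] = (q - 8.0) * delta
    return out

Each value is recovered as (q - 8) * delta, the inverse of the add-8.5-and-truncate rounding used on the write side.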