init

converter/.gitignore (vendored, Normal file)
@@ -0,0 +1,4 @@
*.t
*.m
*.bin
*/

converter/convert-hf.py (Normal file)
@@ -0,0 +1,265 @@
import gc
import json
import sys
import os
from writer import parseFloatType, writeTensor, writeHeader, FloatType
from safetensors import safe_open

class ArchType:
    LLAMA = 0xABCD00
    QWEN3 = 0xABCD01
    QWEN3_MOE = 0xABCD02

def permute(tensor, nHeads: int, nKvHeads: int):
    if nHeads != nKvHeads:
        nHeads = nKvHeads
    return (tensor.reshape(nHeads, 2, tensor.shape[0] // nHeads // 2, *tensor.shape[1:]).swapaxes(1, 2).reshape(tensor.shape))
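# Editor's note (a reading of the code, not verified against the runtime): this
# appears to reorder the rotary-embedding rows of the q/k projection weights from
# the layout stored in the HF safetensors files to the layout the runtime expects;
# for k_proj the effective head count is n_kv_heads, hence the override above.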

class Processor:
    def __init__(self, config):
        self.config = config
        self.archType = config['arch_type']
        self.currentModelIndex = None
        self.currentModel = None
        self.currentModelKeys = None
        self.layerMap = {}
        self.plan = []

    def __unloadModel(self):
        if self.currentModel:
            del self.currentModel
            self.currentModel = None
            gc.collect()
        self.currentModelIndex = None

    def __loadModel(self, index: int):
        if (self.currentModelIndex == index):
            return
        self.__unloadModel()
        filePath = self.config['files'][index]
        fileName = os.path.basename(filePath)
        print(f'💿 Loading file {fileName}...')
        self.currentModel = safe_open(filePath, framework='pt', device='cpu')
        self.currentModelKeys = list(self.currentModel.keys())
        for key in self.currentModelKeys:
            self.layerMap[key] = index
        print(f'Found {len(self.currentModelKeys)} layers')
        self.currentModelIndex = index

    def __transformQ(self, tensor):
        if self.archType == ArchType.LLAMA:
            return permute(tensor, self.config['n_heads'], self.config['n_heads'])
        return tensor

    def __transformK(self, tensor):
        if self.archType == ArchType.LLAMA:
            return permute(tensor, self.config['n_heads'], self.config['n_kv_heads'])
        return tensor

    def __preparePlan(self):
        wt = self.config['weights_float_type']
        p = self.plan
        p.append([FloatType.F32,
            'model.embed_tokens.weight'])
        for l in range(0, self.config['n_layers']):
            p.append([wt, self.__transformQ,
                f'model.layers.{l}.self_attn.q_proj.weight'])
            p.append([wt, self.__transformK,
                f'model.layers.{l}.self_attn.k_proj.weight'])
            p.append([wt,
                f'model.layers.{l}.self_attn.v_proj.weight'])
            p.append([wt,
                f'model.layers.{l}.self_attn.o_proj.weight'])

            if (self.config['n_experts'] > 0):
                p.append([FloatType.F32, f'model.layers.{l}.mlp.gate.weight'])
                for e in range(self.config['n_experts']):
                    p.append([wt,
                        f'model.layers.{l}.mlp.experts.{e}.gate_proj.weight'])
                    p.append([wt,
                        f'model.layers.{l}.mlp.experts.{e}.down_proj.weight'])
                    p.append([wt,
                        f'model.layers.{l}.mlp.experts.{e}.up_proj.weight'])
            else:
                p.append([wt,
                    f'model.layers.{l}.mlp.gate_proj.weight'])
                p.append([wt,
                    f'model.layers.{l}.mlp.down_proj.weight'])
                p.append([wt,
                    f'model.layers.{l}.mlp.up_proj.weight'])

            if (self.archType == ArchType.QWEN3 or self.archType == ArchType.QWEN3_MOE):
                p.append([FloatType.F32,
                    f'model.layers.{l}.self_attn.q_norm.weight'])
                p.append([FloatType.F32,
                    f'model.layers.{l}.self_attn.k_norm.weight'])

            p.append([FloatType.F32,
                f'model.layers.{l}.input_layernorm.weight'])
            p.append([FloatType.F32,
                f'model.layers.{l}.post_attention_layernorm.weight'])
        p.append([FloatType.F32,
            'model.norm.weight'])
        p.append([wt,
            'lm_head.weight', 'model.embed_tokens.weight'])

    def write(self, outputFile: str):
        self.__preparePlan()

        # Loading the last model file to get the layer names
        self.__loadModel(len(self.config['files']) - 1)
        self.__unloadModel()

        for planItem in self.plan:
            lookup = planItem[1:]
            transform = None
            if (callable(lookup[0])):
                transform = lookup[0]
                lookup = lookup[1:]

            if (self.currentModelIndex is None):
                modelIndex = 0
            else:
                modelIndex = None
                for layerName in lookup:
                    if (layerName in self.layerMap):
                        modelIndex = self.layerMap[layerName]
                        break
                if (modelIndex is None):
                    modelIndex = self.currentModelIndex + 1
            self.__loadModel(modelIndex)

            tensor = None
            for layerName in lookup:
                if (layerName in self.currentModelKeys):
                    tensor = self.currentModel.get_tensor(layerName)
                    break
            if tensor is None:
                raise Exception(f'Layer {lookup[0]} not found')
            print(f'🔶 Writing tensor {layerName} {tensor.shape}...')

            floatType = planItem[0]
            if (transform):
                tensor = transform(tensor)
            writeTensor(outputFile, tensor, floatType)

def parseArchType(type: str):
    archType = {
        'llama': ArchType.LLAMA,
        'mistral': ArchType.LLAMA,
        'qwen3': ArchType.QWEN3,
        'qwen3_moe': ArchType.QWEN3_MOE,
    }.get(type)
    if (archType is None):
        raise Exception(f'Unsupported arch type: {type}')
    return archType

def parseHiddenAct(act: str):
    hiddenAct = {
        'gelu': 0,
        'silu': 1
    }.get(act)
    if (hiddenAct is None):
        raise Exception(f'Unsupported hidden act: {act}')
    return hiddenAct

def parseRopeType(rt: str):
    ropeType = {
        'llama3': 2, # LLAMA3_1
    }.get(rt)
    if (ropeType is None):
        raise Exception(f'Unsupported rope type: {rt}')
    return ropeType

def parseRmsNormEpsilon(epsilon: float):
    if (epsilon == 1e-05):
        return 5
    elif (epsilon == 1e-06):
        return 6
    raise Exception(f'Unsupported epsilon: {epsilon}')

def loadConfig(folderPath: str, weightsFloatType: int):
    allFiles = os.listdir(folderPath)
    allFiles.sort()
    with open(os.path.join(folderPath, 'config.json')) as fc:
        config = json.load(fc)
    files = []
    for fileName in allFiles:
        if fileName.endswith('.safetensors') and not fileName.startswith('.'):
            files.append(os.path.join(folderPath, fileName))
    if (len(files) == 0):
        raise Exception('No model files found')

    result = {
        'version': 0,
        'arch_type': parseArchType(config['model_type']),
        'hidden_act': parseHiddenAct(config['hidden_act']),
        'dim': config['hidden_size'],
        'hidden_dim': config['intermediate_size'],
        'n_layers': config['num_hidden_layers'],
        'n_heads': config['num_attention_heads'],
        'n_kv_heads': config['num_key_value_heads'],
        'weights_float_type': weightsFloatType,
        'max_seq_len': config['max_position_embeddings'],
        'vocab_size': config['vocab_size'],
        'files': files,
    }

    nExperts = config.get('num_experts')
    nActiveExperts = config.get('num_experts_per_tok')
    result['n_experts'] = int(nExperts) if nExperts is not None else 0
    result['n_active_experts'] = int(nActiveExperts) if nActiveExperts is not None else 0

    ropeTheta = config.get('rope_theta')
    if (ropeTheta is not None):
        result['rope_theta'] = int(ropeTheta)

    ropeScaling = config.get('rope_scaling')
    if (ropeScaling is not None):
        result['rope_scaling_factor'] = int(ropeScaling['factor'])
        result['rope_scaling_low_freq_factor'] = int(ropeScaling['low_freq_factor'])
        result['rope_scaling_high_freq_factor'] = int(ropeScaling['high_freq_factor'])
        result['rope_scaling_orig_max_seq_len'] = int(ropeScaling['original_max_position_embeddings'])
        result['rope_type'] = parseRopeType(ropeScaling['rope_type'])

    headDim = config.get('head_dim')
    if (headDim is not None):
        result['head_dim'] = headDim

    rmsNormEps = config.get('rms_norm_eps')
    if (rmsNormEps is not None):
        result['norm_epsilon'] = parseRmsNormEpsilon(rmsNormEps)

    moeHiddenDim = config.get('moe_intermediate_size')
    if (moeHiddenDim is not None):
        result['moe_hidden_dim'] = int(moeHiddenDim)
    return result

def printUsage():
    print('Usage: python convert-hf.py <sourceFolderPath> <weightsFloatType> <name>')
    print()
    print('Options:')
    print('  <sourceFolderPath>  The path to the folder containing the model files')
    print('  <weightsFloatType>  The float type of the weights (e.g. "q40")')
    print('  <name>              The name of the model (e.g. "llama3")')

if __name__ == '__main__':
    if (len(sys.argv) < 4):
        printUsage()
        exit(1)

    sourceFolderPath = sys.argv[1]
    weightsFloatType = parseFloatType(sys.argv[2])
    name = sys.argv[3]
    outputFileName = f'dllama_model_{name}_{sys.argv[2]}.m'

    print(f'Output file: {outputFileName}')

    config = loadConfig(sourceFolderPath, weightsFloatType)

    with open(outputFileName, 'wb') as outputFile:
        writeHeader(outputFile, config)
        processor = Processor(config)
        processor.write(outputFile)

    print(f'✅ {outputFileName} created successfully')

converter/convert-llama.py (Normal file)
@@ -0,0 +1,121 @@
import os
import sys
import json
import torch
import math
import numpy as np
from writer import writeTensor, writeHeader, parseFloatType, strFloatType, FloatType
from pathlib import Path

LAYER_CHUNK_SIZE = 48

def convert(modelPath, outputPath, targetFloatType):
    paramsPath = os.path.join(modelPath, 'params.json')
    with open(paramsPath) as f:
        params = json.load(f)
        if (params['vocab_size'] < 1):
            raise Exception('vocab_size is invalid, please update params.json file')
        if (params.get('max_seq_len') is None):
            raise Exception('max_seq_len is required, please update params.json file')
        params['n_kv_heads'] = params.get('n_kv_heads') or params['n_heads']
        params['head_size'] = params['dim'] / params['n_heads']
        params['arch_type'] = 0xABCD00
        params['n_experts'] = 0
        params['n_active_experts'] = 0
        params['weights_float_type'] = targetFloatType
        if ('rope_theta' in params):
            params['rope_theta'] = int(params['rope_theta'])

    modelPaths = sorted(list(Path(modelPath).glob('consolidated.*.pth')))
    nSlices = len(modelPaths)

    layers = []
    layers.append('tok_embeddings.weight')
    for layerIndex in range(0, params['n_layers']):
        layers.append(f'layers.{layerIndex}.attention.wq.weight')
        layers.append(f'layers.{layerIndex}.attention.wk.weight')
        layers.append(f'layers.{layerIndex}.attention.wv.weight')
        layers.append(f'layers.{layerIndex}.attention.wo.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w1.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w2.weight')
        layers.append(f'layers.{layerIndex}.feed_forward.w3.weight')
        layers.append(f'layers.{layerIndex}.attention_norm.weight')
        layers.append(f'layers.{layerIndex}.ffn_norm.weight')
    layers.append('norm.weight')
    layers.append('output.weight')

    isHeaderWritten = False
    outFile = open(outputPath, 'wb')

    nChunks = math.ceil(len(layers) / LAYER_CHUNK_SIZE)
    for chunkIndex in range(0, nChunks):
        chunkLayerNames = layers[LAYER_CHUNK_SIZE * chunkIndex:LAYER_CHUNK_SIZE * (chunkIndex + 1)]
        models = {}
        for layerName in chunkLayerNames:
            models[layerName] = []

        print(f'💿 Chunking model {chunkIndex + 1}/{nChunks}...')

        for modelPath in modelPaths:
            model = torch.load(modelPath, map_location='cpu')
            for modelKey in model:
                if (modelKey in chunkLayerNames):
                    models[modelKey].append(model[modelKey])
            if not isHeaderWritten:
                params['hidden_dim'] = model['layers.0.feed_forward.w1.weight'].shape[0] * nSlices
                writeHeader(outFile, params)
                isHeaderWritten = True
            del model

        for layerName in chunkLayerNames:
            if layerName == 'rope.freqs':
                continue

            isAxis1 = (
                layerName == 'tok_embeddings.weight' or
                layerName.endswith('.attention.wo.weight') or
                layerName.endswith('.feed_forward.w2.weight')
            )
            isAlwaysF32 = (
                layerName == 'tok_embeddings.weight' or
                layerName.endswith('.attention_norm.weight') or
                layerName.endswith('.ffn_norm.weight') or
                layerName == 'norm.weight'
            )
            floatType = FloatType.F32 if isAlwaysF32 else targetFloatType

            tensors = models[layerName]
            if len(tensors) == 1 or len(tensors[0].shape) == 1:
                tensor = tensors[0]
            else:
                tensor = torch.cat(tensors, dim=(1 if isAxis1 else 0))

            print(f'🔶 Exporting {layerName} {tensor.shape}...')
            writeTensor(outFile, tensor, floatType)

        del models

    outFile.close()

def usage():
    print('Usage: python convert-llama.py <modelPath> <targetFloatType>')
    exit(1)

if __name__ == '__main__':
    if (len(sys.argv) < 3):
        usage()

    modelPath = sys.argv[1]
    targetFloatType = parseFloatType(sys.argv[2])
    targetFloatTypeStr = strFloatType(targetFloatType)

    modelName = os.path.basename(modelPath)
    outputFileName = f'dllama_model_{modelName.lower()}_{targetFloatTypeStr}.m'

    print(f'Model name: {modelName}')
    print(f'Target float type: {targetFloatTypeStr}')
    print(f'Target file: {outputFileName}')

    convert(modelPath, outputFileName, targetFloatType)

    print('Done!')

converter/convert-tokenizer-hf.py (Normal file)
@@ -0,0 +1,137 @@
import sys
import json
import os
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizerFast
writer = __import__('tokenizer-writer')

def openJson(path):
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

def unicodeToBytes():
    # https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/encoder.py#L9
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(cs, bs))
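# This is the inverse of GPT-2's bytes_to_unicode map (see the URL above): it maps
# the printable characters used in BPE token strings back to their raw byte values.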

class TokensResolver:
    def __init__(self, dirPath, tokenizerConfig):
        self.dirPath = dirPath
        self.tokenizerConfig = tokenizerConfig
        self.bosId = None
        self.eosIds = None
        self.tokens = []
        self.scores = []

    def resolvePreTrainedTokenizerFast(self):
        utb = unicodeToBytes()
        tokenizer = PreTrainedTokenizerFast(tokenizer_file = os.path.join(self.dirPath, 'tokenizer.json'))
        vocabLen = len(tokenizer.get_vocab())
        for i in range(vocabLen):
            tokenChars = list(tokenizer.convert_ids_to_tokens([i])[0])
            tokenBytes = []
            for ch in tokenChars:
                if (ch in utb):
                    tokenBytes.append(utb[ch])
                else:
                    tokenBytes += list(ch.encode('utf-8'))
            self.tokens.append(bytes(tokenBytes))
            self.scores.append(-float(i))

        self.bosId = tokenizer.bos_token_id
        if (tokenizer.eos_token_id is not None):
            self.eosIds = [tokenizer.eos_token_id]
        if (self.bosId is None or self.eosIds is None):
            config = openJson(os.path.join(self.dirPath, 'config.json'))
            if (self.bosId is None):
                self.bosId = config['bos_token_id']
            if (self.eosIds is None):
                self.eosIds = config['eos_token_id']
                if not isinstance(self.eosIds, list):
                    self.eosIds = [self.eosIds]

    def resolveLlamaTokenizer(self):
        modelPath = os.path.join(self.dirPath, 'tokenizer.model')
        processor = SentencePieceProcessor(model_file=modelPath)

        assert processor.vocab_size() == processor.get_piece_size()
        self.bosId = processor.bos_id()
        self.eosIds = [processor.eos_id()]
        vocabSize = processor.vocab_size()
        for i in range(vocabSize):
            t = processor.id_to_piece(i)
            s = processor.get_score(i)
            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
            # Check for byte tokens
            if len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
                # For example, "<0x0A>" is a newline character
                b = bytearray.fromhex(t[3:-1])
            else:
                b = t.encode('utf-8')
            self.tokens.append(b)
            self.scores.append(s)

    def resolve(self):
        cls = self.tokenizerConfig['tokenizer_class']
        if (cls == 'PreTrainedTokenizerFast' or
            cls == 'LlamaTokenizerFast' or
            cls == 'Qwen2Tokenizer'):
            return self.resolvePreTrainedTokenizerFast()
        if (cls == 'LlamaTokenizer'):
            return self.resolveLlamaTokenizer()
        raise Exception(f'Tokenizer {cls} is not supported')

def printUsage():
    print('Usage: python convert-tokenizer-hf.py <tokenizerFolderPath> <name>')
    print()
    print('Options:')
    print('  <tokenizerFolderPath>  The path to the folder with tokenizer_config.json')
    print('  <name>                 The name of the tokenizer (e.g. "llama3")')

if __name__ == '__main__':
    if (len(sys.argv) < 3):
        printUsage()
        exit(1)

    dirPath = sys.argv[1]
    name = sys.argv[2]
    tokenizerConfig = openJson(os.path.join(dirPath, 'tokenizer_config.json'))

    resolver = TokensResolver(dirPath, tokenizerConfig)
    resolver.resolve()

    if (resolver.bosId is None or resolver.eosIds is None):
        raise Exception('Cannot resolve bosId or eosIds')
    print(f'bosId: {resolver.bosId} ({resolver.tokens[resolver.bosId]})')
    for eosId in resolver.eosIds:
        print(f'eosId: {eosId} ({resolver.tokens[eosId]})')

    chatTemplate = None
    if ('chat_template' in tokenizerConfig):
        chatTemplate = tokenizerConfig['chat_template'].encode('utf-8')

    addBos = True
    if ('add_bos_token' in tokenizerConfig):
        addBos = tokenizerConfig['add_bos_token']

    outputFileName = f'dllama_tokenizer_{name}.t'
    with open(outputFileName, 'wb') as outputFile:
        writer.writeTokenizer(
            outputFile,
            resolver.tokens,
            resolver.scores,
            chatTemplate,
            resolver.bosId,
            addBos,
            resolver.eosIds)
    print(f'✅ Created {outputFileName}')

converter/convert-tokenizer-llama2.py (Normal file)
@@ -0,0 +1,44 @@
import sys
import os
from sentencepiece import SentencePieceProcessor
writer = __import__('tokenizer-writer')

chatTemplate = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"

def printUsage():
    print('Usage: python convert-tokenizer-llama2.py <llama2FolderPath>')
    print()
    print('Options:')
    print('  <llama2FolderPath>  The path to the Llama 2 model folder containing tokenizer.model')

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        printUsage()
        exit(1)

    dirPath = sys.argv[1]
    modelPath = os.path.join(dirPath, 'tokenizer.model')
    processor = SentencePieceProcessor(model_file=modelPath)

    vocabSize = processor.vocab_size()
    tokens = []
    scores = []
    for i in range(vocabSize):
        t = processor.id_to_piece(i)
        s = processor.get_score(i)
        t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
        b = t.encode('utf-8')
        tokens.append(b)
        scores.append(s)

    outputFileName = 'dllama_tokenizer_llama2.t'
    with open(outputFileName, 'wb') as outputFile:
        writer.writeTokenizer(
            outputFile,
            tokens,
            scores,
            chatTemplate.encode('utf-8'),
            processor.bos_id(),
            True,  # addBos: the writer expects this argument; Llama 2 prepends BOS by default
            [processor.eos_id()])

    print(f'✅ Created {outputFileName}')

converter/convert-tokenizer-llama3.py (Normal file)
@@ -0,0 +1,78 @@
import sys
import base64
writer = __import__('tokenizer-writer')

# Format of input file:
# ```
# IQ== 0
# Ig== 1
# Iw== 2
# ...
# ```

nSpecialTokens = 256
specialTokens = [
    '<|begin_of_text|>',
    '<|end_of_text|>',
    '<|reserved_special_token_0|>',
    '<|reserved_special_token_1|>',
    '<|reserved_special_token_2|>',
    '<|reserved_special_token_3|>',
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|reserved_special_token_4|>',
    '<|eot_id|>',
] + [
    f'<|reserved_special_token_{i}|>'
    for i in range(5, nSpecialTokens - 5)
]
bosId = 128000
eosId = 128001
chatEosId = 128009
chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

def printUsage():
    print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
    print()
    print('Options:')
    print('  <tokenizerPath>  The path to the Llama 3 tokenizer model (tokenizer.model)')

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        printUsage()
        exit(1)

    modelPath = sys.argv[1]
    outputFileName = 'dllama_tokenizer_llama3.t'

    with open(modelPath, 'r') as inputFile:
        with open(outputFileName, 'wb') as outputFile:
            inputLines = inputFile.readlines()
            nLines = len(inputLines)

            tokens = []
            scores = []
            for line in inputLines:
                s = line.split(' ')
                bytes = base64.b64decode(s[0])
                score = -float(s[1])
                tokens.append(bytes)
                scores.append(score)

            specialTokenIndex = nLines
            for token in specialTokens:
                bytes = token.encode('utf-8')
                score = -float(specialTokenIndex)
                tokens.append(bytes)
                scores.append(score)
                specialTokenIndex += 1

            writer.writeTokenizer(
                outputFile,
                tokens,
                scores,
                chatTemplate.encode('utf-8'),
                bosId,
                True,  # addBos: the writer expects this argument; Llama 3 prepends BOS by default
                [eosId, chatEosId])

    print(f'✅ Created {outputFileName}')

converter/requirements.txt (Normal file)
@@ -0,0 +1,5 @@
# Requires Python >= 3.9
numpy==1.23.5
torch==2.0.1
safetensors==0.4.2
sentencepiece==0.1.99

converter/tokenizer-writer.py (Normal file)
@@ -0,0 +1,57 @@
import struct

def writeTokenizer(file, tokens, scores, chatTemplate, bosId, addBos, eosTokens):
    headerKeys = {
        'version': 0,
        'vocab_size': 1,
        'max_token_length': 2,
        'bos_id': 3,
        'chat_template': 7,
        'n_eos_tokens': 9,
        'add_bos': 10,
    }
    header = struct.pack('i', 0x567124)

    nTokens = len(tokens)
    maxTokenLength = max(len(t) for t in tokens)

    params = {}
    params['bos_id'] = bosId
    params['version'] = 1
    params['vocab_size'] = nTokens
    params['max_token_length'] = maxTokenLength
    if (chatTemplate):
        params['chat_template'] = len(chatTemplate)
    params['n_eos_tokens'] = len(eosTokens)
    params['add_bos'] = 1 if addBos else 0

    data = b''
    for key in params:
        value = params[key]
        if value is None:
            continue
        if key in headerKeys:
            data += struct.pack('ii', headerKeys[key], params[key])
        else:
            print(f'Unknown header key: {key}')

    print('⭐ Params:')
    print(params)
    if (chatTemplate):
        print('⭐ Chat template:')
        print(chatTemplate)

    header += struct.pack('i', len(header) * 2 + len(data))
    file.write(header)
    file.write(data)
    if chatTemplate:
        file.write(chatTemplate)

    for eosToken in eosTokens:
        file.write(struct.pack('i', eosToken))

    for i in range(0, nTokens):
        size = len(tokens[i])
        assert(size > 0)
        file.write(struct.pack('fI', scores[i], size))
        file.write(tokens[i])
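
# Resulting tokenizer file layout, as written above:
#   int32 magic (0x567124)
#   int32 header size (8 + length of the key/value section)
#   key/value section: int32 key id (from headerKeys) followed by an int32 value
#   chat template bytes (if any; the length is stored under the 'chat_template' key)
#   int32 eos token id, one per entry in eosTokens
#   per token: float32 score, uint32 length, then the raw token bytes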

converter/writer-test.py (Normal file)
@@ -0,0 +1,35 @@
import sys
import time
import torch
from writer import writeQuantizedQ40Tensor

TEMP_FILE_NAME = 'writer-test.temp'

def readHexFromFile(path):
    with open(path, 'rb') as file:
        return file.read().hex()

def testWriteQuantizedQ40Tensor():
    EXPECTED_OUTPUT = '7e346345a692b89665b2c5790537876e598aaa366d988876a898b8d788a98868ce660c66f6b3a88cba5ce9a871987ba9cc5bcaaa760c1eb556a4455b747b6b9504968828ef2a8d7c1db5c6be3764799e66db6d8e76463126a30e4333cad7a4f645947c6cf97f9de086d468c8d535a6ba7dc799d3d0c657bab6799468cad8bb349eb7d7635c7c798998696bb38e4085a9eb34444ba96a7f8ba7b2b42d746a96cf9660aeb4499d8708ad5c7b9a7558947645f3bbb6b0346a656887ad9a86059baac5c596ab781c703569bb8a4356a4bd58cb78736ba09759bb0e34a6274e827b957d7a67dfa86846955660d234b6d9d78a378094a8a8708a7a774ae92f8a36b8c999a9b77a7d958a69747c807963941235379886d69a7a8767b3a6a4ac71999760'

    torch.manual_seed(seed=1)
    tensor = torch.randn(32, 16)

    with open(TEMP_FILE_NAME, 'wb') as file:
        writeQuantizedQ40Tensor(file, tensor)

    contentHex = readHexFromFile(TEMP_FILE_NAME)
    assert contentHex == EXPECTED_OUTPUT, f'Received: {contentHex}'
    print('✅ writeQuantizedQ40Tensor')

def runWriteQuantizedQ40TensorBenchmark():
    tensor = torch.randn(8192, 4096)
    t0 = time.time()
    with open(TEMP_FILE_NAME, 'wb') as file:
        writeQuantizedQ40Tensor(file, tensor)
    t1 = time.time()
    print(f'🕐 writeQuantizedQ40Tensor: {t1 - t0:.4f}s')

if __name__ == '__main__':
    testWriteQuantizedQ40Tensor()
    runWriteQuantizedQ40TensorBenchmark()

converter/writer.py (Normal file)
@@ -0,0 +1,148 @@
import struct
import torch
import time
import numpy as np

class FloatType:
    F32 = 0
    F16 = 1
    Q40 = 2
    Q80 = 3

floatTypeMap = {
    'f32': FloatType.F32,
    'f16': FloatType.F16,
    'q40': FloatType.Q40,
    'q80': FloatType.Q80,
}
floatTypeNames = list(floatTypeMap.keys())

def parseFloatType(type):
    floatType = floatTypeMap.get(type)
    if floatType is not None:
        return floatType
    raise Exception(f'{type} is not supported')

def strFloatType(type):
    return floatTypeNames[type]

def writeQuantizedQ40Tensor(file, x):
    x = x.to(torch.float32).numpy().astype(np.float32)
    blockSize = 32
    blockHalfSize = blockSize // 2
    assert(x.shape[0] % blockSize == 0)
    groups = x.reshape(-1, blockSize)
    gmax = np.max(groups, axis=1)
    gmin = np.min(groups, axis=1)
    deltas = np.divide(np.where(-gmin > gmax, gmin, gmax), -8)
    deltas16 = deltas.astype(np.float16)
    ids = np.where(deltas != 0, 1.0 / deltas, 0)
    groups = np.add(groups * ids[:, np.newaxis], 8.5)
    groups = np.clip(groups, 0, 15).astype(int)

    gLow = groups[:, :blockHalfSize] & 0xF
    gHigh = (groups[:, blockHalfSize:] & 0xF) << 4
    gCombined = gLow | gHigh

    nBytes = 0
    for groupIndex in range(0, len(groups)):
        delta16 = deltas16[groupIndex]
        buffer = struct.pack(f'e{blockHalfSize}B', delta16, *gCombined[groupIndex])
        file.write(buffer)
        nBytes += len(buffer)
    return nBytes
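
# Editor's addition, not part of the original commit: a minimal round-trip sketch
# showing how one Q40 block written above could be decoded again, assuming the
# layout produced by writeQuantizedQ40Tensor (per 32 values: one float16 delta,
# then 16 bytes; low nibbles hold values 0..15, high nibbles hold values 16..31).
def _dequantizeQ40Block(blockBytes):
    delta = struct.unpack('e', blockBytes[0:2])[0]
    packed = np.frombuffer(blockBytes[2:18], dtype=np.uint8)
    low = (packed & 0xF).astype(np.int32)
    high = (packed >> 4).astype(np.int32)
    # Quantization stored clip(x / delta + 8.5, 0, 15), so subtract 8 and rescale.
    return (np.concatenate([low, high]) - 8) * delta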

def writeQuantizedQ80Tensor(file, x):
    x = x.to(torch.float32).numpy().astype(np.float32)
    blockSize = 32
    assert(x.shape[0] % blockSize == 0)
    groups = x.reshape(-1, blockSize)
    gmax = np.max(groups, axis=1)
    gmin = np.min(groups, axis=1)
    gabsMax = np.where(-gmin > gmax, -gmin, gmax)
    deltas = gabsMax / ((1 << 7) - 1)
    deltas16 = deltas.astype(np.float16)
    ids = np.where(deltas != 0, 1.0 / deltas, 0)
    groups = groups * ids[:, np.newaxis]
    groups8 = np.round(groups).astype(np.int8)

    nBytes = 0
    for groupIndex in range(0, len(groups)):
        buffer = struct.pack(f'e{blockSize}b', deltas16[groupIndex], *groups8[groupIndex])
        file.write(buffer)
        nBytes += len(buffer)
    return nBytes

def writeF32Tensor(file, d):
    chunkSize = 10000
    nBytes = 0
    for i in range(0, len(d), chunkSize):
        chunk = d[i:i+chunkSize].to(torch.float32).numpy().astype(np.float32)
        b = struct.pack(f'{len(chunk)}f', *chunk)
        nBytes += len(b)
        file.write(b)
    return nBytes

def writeF16Tensor(file, d):
    d = d.to(torch.float16).numpy().astype(np.float16)
    b = struct.pack(f'{len(d)}e', *d)
    file.write(b)
    return len(b)

def writeTensor(file, tensor, floatType):
    d = tensor.detach().cpu().view(-1)
    t0 = time.time()
    nBytes = 0
    if (floatType == FloatType.F16):
        nBytes = writeF16Tensor(file, d)
    elif (floatType == FloatType.F32):
        nBytes = writeF32Tensor(file, d)
    elif (floatType == FloatType.Q40):
        nBytes = writeQuantizedQ40Tensor(file, d)
    elif (floatType == FloatType.Q80):
        nBytes = writeQuantizedQ80Tensor(file, d)
    else:
        raise Exception(f'Unknown float type: {floatType}')
    t1 = time.time()
    print(f'Saved {strFloatType(floatType)} tensor in {t1 - t0:.2f}s, {nBytes} bytes')

def writeHeader(file, params):
    headerKeys = {
        'version': 0,
        'arch_type': 1,
        'dim': 2,
        'hidden_dim': 3,
        'n_layers': 4,
        'n_heads': 5,
        'n_kv_heads': 6,
        'n_experts': 7,
        'n_active_experts': 8,
        'vocab_size': 9,
        'max_seq_len': 10,
        'hidden_act': 11,
        'rope_theta': 12,
        'weights_float_type': 13,
        'rope_scaling_factor': 14,
        'rope_scaling_low_freq_factor': 15,
        'rope_scaling_high_freq_factor': 16,
        'rope_scaling_orig_max_seq_len': 17,
        'rope_type': 18,
        'head_dim': 19,
        'norm_epsilon': 20,
        'moe_hidden_dim': 21,
    }
    header = struct.pack('i', 0xA00ABCD)

    data = b''
    for key in params:
        if key in headerKeys:
            data += struct.pack('ii', headerKeys[key], params[key])
        else:
            print(f'Warning: Unknown header key: {key}')

    header += struct.pack('i', len(header) * 2 + len(data))
    file.write(header)
    file.write(data)
    for key in params:
        print(f'🎓 {key}: {params[key]}')
    print()
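
# Model file layout, as produced by writeHeader plus the tensor writers above:
#   int32 magic (0xA00ABCD)
#   int32 header size (8 + length of the key/value section)
#   key/value section: int32 key id (from headerKeys) followed by an int32 value
#   then the tensor data, written by writeTensor in the order planned by the converters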