init
Some checks failed
main / Linux (amd64, ubuntu-22.04) (push) Successful in 49s
main / Linux (arm64, ubuntu-24.04-arm) (push) Has been cancelled
main / Windows (push) Has been cancelled

This commit is contained in:
2025-10-24 11:42:14 +02:00
commit 42172cbb6f
85 changed files with 40316 additions and 0 deletions

View File

@@ -0,0 +1,57 @@
import struct
def writeTokenizer(file, tokens, scores, chatTemplate, bosId, addBos, eosTokens):
    """Serialize a tokenizer vocabulary to an already-open binary file object.

    Bytes are emitted in this order, all packed with ``struct`` in native
    byte order (the format strings carry no byte-order prefix):

    1. magic number ``0x567124`` as an int,
    2. total header size in bytes as an int (magic + this field + pairs),
    3. one ``(key id, value)`` int pair per header parameter that is not None,
    4. the raw chat template, only when ``chatTemplate`` is truthy,
    5. each entry of ``eosTokens`` as an int,
    6. for every token: float score, unsigned int byte length, token bytes.

    All input checks and all packing happen before the first ``file.write``,
    so invalid input cannot leave a partially written file behind.

    Args:
        file: Object with a ``write`` method that accepts bytes.
        tokens: Sequence of non-empty bytes-like tokens.
        scores: Sequence of numeric scores; ``scores[i]`` belongs to
            ``tokens[i]``. Entries past ``len(tokens)`` are ignored.
        chatTemplate: Bytes-like chat template, or a falsy value to omit it.
        bosId: Integer written under the ``bos_id`` key, or None to omit it.
        addBos: Truthy to store 1 under the ``add_bos`` key, otherwise 0.
        eosTokens: Sequence of integers written after the header.

    Raises:
        ValueError: If ``tokens`` is empty, contains a zero-length token, or
            ``scores`` has fewer entries than ``tokens``.
        struct.error: If a value does not fit its packed field.
    """
    headerKeys = {
        'version': 0,
        'vocab_size': 1,
        'max_token_length': 2,
        'bos_id': 3,
        'chat_template': 7,
        'n_eos_tokens': 9,
        'add_bos': 10,
    }

    # Validate everything up front; explicit raises (unlike assert) survive
    # `python -O`, and nothing has been written yet if one of them fires.
    nTokens = len(tokens)
    if nTokens == 0:
        raise ValueError('tokens must not be empty')
    if len(scores) < nTokens:
        raise ValueError(
            f'Expected at least {nTokens} scores, got {len(scores)}')
    for index, token in enumerate(tokens):
        if len(token) == 0:
            raise ValueError(f'Token at index {index} is empty')

    # Insertion order matters: it is both the order of the header pairs in
    # the file and the order shown by print(params) below.
    params = {
        'bos_id': bosId,
        'version': 1,
        'vocab_size': nTokens,
        'max_token_length': max(len(token) for token in tokens),
    }
    if chatTemplate:
        params['chat_template'] = len(chatTemplate)
    params['n_eos_tokens'] = len(eosTokens)
    params['add_bos'] = 1 if addBos else 0

    pairs = []
    for key, value in params.items():
        if value is None:
            # Optional parameter (e.g. bosId=None) is left out of the header.
            continue
        if key in headerKeys:
            pairs.append(struct.pack('ii', headerKeys[key], value))
        else:
            print(f'Unknown header key: {key}')
    data = b''.join(pairs)

    print('⭐ Params:')
    print(params)
    if chatTemplate:
        print('⭐ Chat template:')
        print(chatTemplate)

    magic = struct.pack('i', 0x567124)
    # Header size counts the magic, the size field itself (same width as the
    # magic) and all key/value pairs.
    header = magic + struct.pack('i', len(magic) * 2 + len(data))

    chunks = [header, data]
    if chatTemplate:
        chunks.append(chatTemplate)
    chunks.extend(struct.pack('i', eosToken) for eosToken in eosTokens)

    packRecord = struct.Struct('fI').pack
    for index, token in enumerate(tokens):
        chunks.append(packRecord(scores[index], len(token)))
        chunks.append(token)

    # Join first so a bad chunk fails before anything reaches the file.
    file.write(b''.join(chunks))