58 lines
1.5 KiB
Python
58 lines
1.5 KiB
Python
import struct
|
|
|
|
def writeTokenizer(file, tokens, scores, chatTemplate, bosId, addBos, eosTokens):
    """Serialize a tokenizer to ``file`` in a binary key/value-header format.

    Layout written, in order:

    1. A 4-byte magic value (``0x567124``).
    2. A 4-byte header size: magic + this size field + all key/value pairs.
    3. One ``(key id, value)`` pair of ints per header parameter.
    4. The raw chat template, if one is given.
    5. One int per EOS token id.
    6. For every token: a float score, an unsigned int length, then the
       token's raw content.

    All numbers are packed with ``struct`` native formats (no byte-order
    prefix), so the output uses the byte order of the machine that runs this.

    Args:
        file: Object with a ``write`` method accepting bytes
            (presumably a file opened in binary mode).
        tokens: Sized, indexable collection of non-empty tokens. Each token
            is passed straight to ``file.write`` (presumably ``bytes``).
        scores: Indexable collection of float scores, aligned with ``tokens``.
        chatTemplate: Chat template written verbatim after the header
            (presumably ``bytes``); any falsy value omits it.
        bosId: Integer BOS token id, or ``None`` to omit it from the header.
        addBos: Truthy to store ``add_bos = 1`` in the header, else ``0``.
        eosTokens: Sized iterable of integer EOS token ids.

    Raises:
        ValueError: If ``tokens`` is empty or contains an empty token.
            Raised before anything is written to ``file``.
    """
    headerKeys = {
        'version': 0,
        'vocab_size': 1,
        'max_token_length': 2,
        'bos_id': 3,
        'chat_template': 7,
        'n_eos_tokens': 9,
        'add_bos': 10,
    }

    # Validate up front with real exceptions: `assert` is stripped under
    # `python -O`, and failing midway would leave a truncated file behind.
    nTokens = len(tokens)
    if nTokens == 0:
        raise ValueError('tokens must not be empty')
    for index, token in enumerate(tokens):
        if len(token) == 0:
            raise ValueError(f'Token {index} is empty')

    maxTokenLength = max(len(t) for t in tokens)

    # Insertion order matters: it is the order the pairs appear in the file.
    params = {}
    params['bos_id'] = bosId
    params['version'] = 1
    params['vocab_size'] = nTokens
    params['max_token_length'] = maxTokenLength
    if chatTemplate:
        params['chat_template'] = len(chatTemplate)
    params['n_eos_tokens'] = len(eosTokens)
    params['add_bos'] = 1 if addBos else 0

    pairs = []
    for key, value in params.items():
        if value is None:
            # Optional parameter not provided (e.g. bosId=None): skip it.
            continue
        if key in headerKeys:
            pairs.append(struct.pack('ii', headerKeys[key], value))
        else:
            print(f'Unknown header key: {key}')
    data = b''.join(pairs)

    print('⭐ Params:')
    print(params)
    if chatTemplate:
        print('⭐ Chat template:')
        print(chatTemplate)

    header = struct.pack('i', 0x567124)
    # At this point `header` holds only the magic value; doubling its length
    # accounts for the equally sized header-size field appended here.
    header += struct.pack('i', len(header) * 2 + len(data))
    file.write(header)
    file.write(data)
    if chatTemplate:
        file.write(chatTemplate)

    for eosToken in eosTokens:
        file.write(struct.pack('i', eosToken))

    for i, token in enumerate(tokens):
        file.write(struct.pack('fI', scores[i], len(token)))
        file.write(token)
|