init
converter/convert-tokenizer-llama3.py (new file, 78 lines)
@@ -0,0 +1,78 @@
import sys
import base64
writer = __import__('tokenizer-writer')
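# NOTE: __import__ is used here because the helper module lives in 'tokenizer-writer.py';
# the hyphen in the file name cannot be expressed with a plain import statement.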

# Format of input file:
# ```
# IQ== 0
# Ig== 1
# Iw== 2
# ...
# ```
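# Each line is '<base64-encoded token bytes> <rank>'; e.g. 'IQ==' decodes to '!'.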

nSpecialTokens = 256
specialTokens = [
    '<|begin_of_text|>',
    '<|end_of_text|>',
    '<|reserved_special_token_0|>',
    '<|reserved_special_token_1|>',
    '<|reserved_special_token_2|>',
    '<|reserved_special_token_3|>',
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|reserved_special_token_4|>',
    '<|eot_id|>',
] + [
    f'<|reserved_special_token_{i}|>'
    for i in range(5, nSpecialTokens - 5)
]
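# 10 named tokens + 246 reserved tokens = 256 special tokens in total, matching
# Llama 3's layout: ids 128000-128255 follow the 128,000 regular BPE tokens.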
bosId = 128000   # <|begin_of_text|>
eosId = 128001   # <|end_of_text|>
chatEosId = 128009   # <|eot_id|>, ends a single chat turn
chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
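# The Jinja template above is the Llama 3 Instruct chat format; it is embedded
# verbatim in the output file (see the writeTokenizer call below).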

def printUsage():
    print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
    print()
    print('Options:')
    print('  <tokenizerPath>  The path to the Llama 3 tokenizer model (tokenizer.model)')

if __name__ == '__main__':
    if len(sys.argv) < 2:
        printUsage()
        exit(1)

    modelPath = sys.argv[1]
    outputFileName = 'dllama_tokenizer_llama3.t'

    with open(modelPath, 'r') as inputFile:
        with open(outputFileName, 'wb') as outputFile:
            inputLines = inputFile.readlines()
            nLines = len(inputLines)

            tokens = []
            scores = []
            for line in inputLines:
                s = line.split(' ')
                tokenBytes = base64.b64decode(s[0])
                score = -float(s[1])  # negate the rank so lower ranks map to higher scores
                tokens.append(tokenBytes)
                scores.append(score)

            # Append the special tokens after the regular vocabulary,
            # continuing the id sequence from nLines upward.
            specialTokenIndex = nLines
            for token in specialTokens:
                tokenBytes = token.encode('utf-8')
                score = -float(specialTokenIndex)
                tokens.append(tokenBytes)
                scores.append(score)
                specialTokenIndex += 1

            writer.writeTokenizer(
                outputFile,
                tokens,
                scores,
                chatTemplate.encode('utf-8'),
                bosId,
                [eosId, chatEosId])  # both end-of-text and end-of-turn ids are written as EOS tokens

    print(f'✅ Created {outputFileName}')
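Assuming a tiktoken-format tokenizer.model extracted from the Llama 3 distribution, a typical invocation is `python converter/convert-tokenizer-llama3.py path/to/tokenizer.model`, which writes dllama_tokenizer_llama3.t to the current directory.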