init

2025-10-24 11:42:14 +02:00
commit 42172cbb6f
85 changed files with 40316 additions and 0 deletions

convert-tokenizer-llama3.py

@@ -0,0 +1,78 @@
import sys
import base64
# The writer module lives in a file named 'tokenizer-writer.py'; the hyphen
# is not a valid identifier character, so a plain `import` statement cannot
# load it and __import__ is used instead.
writer = __import__('tokenizer-writer')
# Format of input file:
# ```
# IQ== 0
# Ig== 1
# Iw== 2
# ...
# ```
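# For example, base64.b64decode('IQ==') == b'!', so the first line maps the
# byte b'!' to rank 0; the loop below negates that rank into a score.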
nSpecialTokens = 256
specialTokens = [
    '<|begin_of_text|>',
    '<|end_of_text|>',
    '<|reserved_special_token_0|>',
    '<|reserved_special_token_1|>',
    '<|reserved_special_token_2|>',
    '<|reserved_special_token_3|>',
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|reserved_special_token_4|>',
    '<|eot_id|>',
] + [
    f'<|reserved_special_token_{i}|>'
    for i in range(5, nSpecialTokens - 5)
]
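# 10 named tokens plus reserved tokens 5..250 from the range (246 entries)
# = 256 special tokens in total, matching nSpecialTokens.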
bosId = 128000
eosId = 128001
chatEosId = 128009
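# The base Llama 3 vocabulary has 128000 entries and the special tokens are
# appended after it, so <|begin_of_text|> lands at id 128000 (bosId),
# <|end_of_text|> at 128001 (eosId) and <|eot_id|> at 128009 (chatEosId).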
chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
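# Example: for messages = [{'role': 'user', 'content': 'Hi'}] with
# add_generation_prompt=True, the template above renders to:
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n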
def printUsage():
    print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
    print()
    print('Options:')
    print('  <tokenizerPath>  The path to the Llama 3 tokenizer model (tokenizer.model)')
if __name__ == '__main__':
    if (len(sys.argv) < 2):
        printUsage()
        exit(1)

    modelPath = sys.argv[1]
    outputFileName = 'dllama_tokenizer_llama3.t'

    with open(modelPath, 'r') as inputFile:
        with open(outputFileName, 'wb') as outputFile:
            inputLines = inputFile.readlines()
            nLines = len(inputLines)

            tokens = []
            scores = []
            for line in inputLines:
                s = line.split(' ')
                # Each line is '<base64 token> <rank>'; decode the token
                # bytes and store the negated rank as the token's score.
                tokenBytes = base64.b64decode(s[0])
                score = -float(s[1])
                tokens.append(tokenBytes)
                scores.append(score)

            # Special tokens are appended after the base vocabulary, so
            # their ids continue from nLines.
            specialTokenIndex = nLines
            for token in specialTokens:
                tokenBytes = token.encode('utf-8')
                score = -float(specialTokenIndex)
                tokens.append(tokenBytes)
                scores.append(score)
                specialTokenIndex += 1

            writer.writeTokenizer(
                outputFile,
                tokens,
                scores,
                chatTemplate.encode('utf-8'),
                bosId,
                [eosId, chatEosId])

    print(f'✅ Created {outputFileName}')
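
A typical invocation, assuming the tokenizer-writer module sits next to this script (the model path below is illustrative):

    python convert-tokenizer-llama3.py Meta-Llama-3-8B/tokenizer.model

On success this writes dllama_tokenizer_llama3.t to the current directory.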