# Helpers for serializing model tensors and a key/value header to a binary
# file in one of four float formats: f32, f16, q40 (4-bit blocks) or q80
# (8-bit blocks).
import struct
import torch
import time
import numpy as np


class FloatType:
    F32 = 0
    F16 = 1
    Q40 = 2
    Q80 = 3


floatTypeMap = {
    'f32': FloatType.F32,
    'f16': FloatType.F16,
    'q40': FloatType.Q40,
    'q80': FloatType.Q80,
}
floatTypeNames = list(floatTypeMap.keys())


def parseFloatType(type):
    # Maps a type name like 'q40' to its FloatType constant.
    floatType = floatTypeMap.get(type)
    if floatType is not None:
        return floatType
    raise Exception(f'{type} is not supported')


def strFloatType(type):
    # Maps a FloatType constant back to its name, e.g. FloatType.Q40 -> 'q40'.
    return floatTypeNames[type]


def writeQuantizedQ40Tensor(file, x):
    # Q40: groups of 32 values are quantized to 4 bits each and written as a
    # float16 scale (delta) followed by 16 bytes of packed nibbles.
    x = x.to(torch.float32).numpy().astype(np.float32)
    blockSize = 32
    blockHalfSize = blockSize // 2
    assert(x.shape[0] % blockSize == 0)
    groups = x.reshape(-1, blockSize)
    # The per-group scale maps the value with the largest magnitude to -8.
    gmax = np.max(groups, axis=1)
    gmin = np.min(groups, axis=1)
    deltas = np.divide(np.where(-gmin > gmax, gmin, gmax), -8)
    deltas16 = deltas.astype(np.float16)
    ids = np.where(deltas != 0, 1.0 / deltas, 0)
    # Scale, shift into [0, 16) and truncate to 4-bit values.
    groups = np.add(groups * ids[:, np.newaxis], 8.5)
    groups = np.clip(groups, 0, 15).astype(int)

    # Pack two 4-bit values per byte: element i goes into the low nibble,
    # element i + 16 into the high nibble.
    gLow = groups[:, :blockHalfSize] & 0xF
    gHigh = (groups[:, blockHalfSize:] & 0xF) << 4
    gCombined = gLow | gHigh

    nBytes = 0
    for groupIndex in range(0, len(groups)):
        delta16 = deltas16[groupIndex]
        buffer = struct.pack(f'e{blockHalfSize}B', delta16, *gCombined[groupIndex])
        file.write(buffer)
        nBytes += len(buffer)
    return nBytes


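# The function below is not part of the original file; it is a minimal sketch
# of how a Q40 block written by writeQuantizedQ40Tensor could be decoded, kept
# here purely to document the on-disk layout. The name readQuantizedQ40Tensor
# and the (file, nElements) signature are assumptions, not an existing API.
def readQuantizedQ40Tensor(file, nElements):
    blockSize = 32
    blockHalfSize = blockSize // 2
    assert nElements % blockSize == 0
    out = np.empty(nElements, dtype=np.float32)
    for blockIndex in range(nElements // blockSize):
        # Each 18-byte block: float16 delta + 16 bytes of packed 4-bit values.
        unpacked = struct.unpack(f'e{blockHalfSize}B', file.read(2 + blockHalfSize))
        delta = unpacked[0]
        packed = np.array(unpacked[1:], dtype=np.uint8)
        # Low nibbles hold elements 0..15, high nibbles hold elements 16..31.
        q = np.concatenate((packed & 0xF, packed >> 4)).astype(np.float32)
        out[blockIndex * blockSize:(blockIndex + 1) * blockSize] = (q - 8) * delta
    return out

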
def writeQuantizedQ80Tensor(file, x):
    # Q80: groups of 32 values are quantized to signed 8-bit integers and
    # written as a float16 scale (delta) followed by 32 int8 values.
    x = x.to(torch.float32).numpy().astype(np.float32)
    blockSize = 32
    assert(x.shape[0] % blockSize == 0)
    groups = x.reshape(-1, blockSize)
    # The per-group scale maps the largest absolute value to 127.
    gmax = np.max(groups, axis=1)
    gmin = np.min(groups, axis=1)
    gabsMax = np.where(-gmin > gmax, -gmin, gmax)
    deltas = gabsMax / ((1 << 7) - 1)
    deltas16 = deltas.astype(np.float16)
    ids = np.where(deltas != 0, 1.0 / deltas, 0)
    groups = groups * ids[:, np.newaxis]
    groups8 = np.round(groups).astype(np.int8)

    nBytes = 0
    for groupIndex in range(0, len(groups)):
        buffer = struct.pack(f'e{blockSize}b', deltas16[groupIndex], *groups8[groupIndex])
        file.write(buffer)
        nBytes += len(buffer)
    return nBytes


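# As above, the function below is not part of the original file; it is a
# minimal sketch of how a Q80 block could be decoded, documenting the layout
# produced by writeQuantizedQ80Tensor. The name and signature are assumptions.
def readQuantizedQ80Tensor(file, nElements):
    blockSize = 32
    assert nElements % blockSize == 0
    out = np.empty(nElements, dtype=np.float32)
    for blockIndex in range(nElements // blockSize):
        # Each 34-byte block: float16 delta + 32 signed int8 values.
        unpacked = struct.unpack(f'e{blockSize}b', file.read(2 + blockSize))
        q = np.array(unpacked[1:], dtype=np.float32)
        out[blockIndex * blockSize:(blockIndex + 1) * blockSize] = q * unpacked[0]
    return out

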
def writeF32Tensor(file, d):
    # Writes the tensor as raw float32 values, packing in fixed-size chunks.
    chunkSize = 10000
    nBytes = 0
    for i in range(0, len(d), chunkSize):
        chunk = d[i:i+chunkSize].to(torch.float32).numpy().astype(np.float32)
        b = struct.pack(f'{len(chunk)}f', *chunk)
        nBytes += len(b)
        file.write(b)
    return nBytes


def writeF16Tensor(file, d):
    d = d.to(torch.float16).numpy().astype(np.float16)
    b = struct.pack(f'{len(d)}e', *d)
    file.write(b)
    return len(b)


def writeTensor(file, tensor, floatType):
    # Flattens the tensor and writes it in the requested float type.
    d = tensor.detach().cpu().view(-1)
    t0 = time.time()
    nBytes = 0
    if (floatType == FloatType.F16):
        nBytes = writeF16Tensor(file, d)
    elif (floatType == FloatType.F32):
        nBytes = writeF32Tensor(file, d)
    elif (floatType == FloatType.Q40):
        nBytes = writeQuantizedQ40Tensor(file, d)
    elif (floatType == FloatType.Q80):
        nBytes = writeQuantizedQ80Tensor(file, d)
    else:
        raise Exception(f'Unknown float type: {floatType}')
    t1 = time.time()
    print(f'Saved {strFloatType(floatType)} tensor in {t1 - t0:.2f}s, {nBytes} bytes')


def writeHeader(file, params):
    # Each known parameter is stored as a pair of 32-bit ints: (key index, value).
    headerKeys = {
        'version': 0,
        'arch_type': 1,
        'dim': 2,
        'hidden_dim': 3,
        'n_layers': 4,
        'n_heads': 5,
        'n_kv_heads': 6,
        'n_experts': 7,
        'n_active_experts': 8,
        'vocab_size': 9,
        'max_seq_len': 10,
        'hidden_act': 11,
        'rope_theta': 12,
        'weights_float_type': 13,
        'rope_scaling_factor': 14,
        'rope_scaling_low_freq_factor': 15,
        'rope_scaling_high_freq_factory': 16,
        'rope_scaling_orig_max_seq_len': 17,
        'rope_type': 18,
        'head_dim': 19,
        'norm_epsilon': 20,
        'moe_hidden_dim': 21,
    }
    header = struct.pack('i', 0xA00ABCD)  # magic number

    data = b''
    for key in params:
        if key in headerKeys:
            data += struct.pack('ii', headerKeys[key], params[key])
        else:
            print(f'Warning: Unknown header key: {key}')

    # Total header size: magic (4 bytes) + this size field (4 bytes) + the pairs.
    header += struct.pack('i', len(header) * 2 + len(data))
    file.write(header)
    file.write(data)
    for key in params:
        print(f'🎓 {key}: {params[key]}')
    print()


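# The function below is not part of the original file; it is a minimal sketch
# of how the header written above could be read back, shown only to document
# the layout. It returns values keyed by the integer key index, and its name
# and signature are assumptions.
def readHeader(file):
    magic, headerSize = struct.unpack('ii', file.read(8))
    assert magic == 0xA00ABCD
    nPairs = (headerSize - 8) // 8
    return dict(struct.unpack('ii', file.read(8)) for _ in range(nPairs))

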
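if __name__ == '__main__':
    # Hypothetical smoke test, not part of the original converter flow: writes
    # a small header plus one random tensor per supported float type. The file
    # name and header values below are arbitrary placeholders, not a real model.
    with open('writer-selftest.bin', 'wb') as testFile:
        writeHeader(testFile, {'version': 0, 'dim': 64, 'n_layers': 1, 'n_heads': 4})
        testTensor = torch.randn(64, 64)
        for name in floatTypeNames:
            writeTensor(testFile, testTensor, parseFloatType(name))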