import struct
import torch
import time
import numpy as np

class FloatType:
    F32 = 0
    F16 = 1
    Q40 = 2
    Q80 = 3

floatTypeMap = {
    'f32': FloatType.F32,
    'f16': FloatType.F16,
    'q40': FloatType.Q40,
    'q80': FloatType.Q80,
}
floatTypeNames = list(floatTypeMap.keys())

def parseFloatType(type):
    floatType = floatTypeMap.get(type)
    if floatType is not None:
        return floatType
    raise Exception(f'{type} is not supported')

def strFloatType(type):
    return floatTypeNames[type]

def writeQuantizedQ40Tensor(file, x):
    # Q40: each block of 32 values is stored as a float16 scale (delta)
    # followed by 16 bytes, each packing two 4-bit quantized values.
    x = x.to(torch.float32).numpy().astype(np.float32)
    blockSize = 32
    blockHalfSize = blockSize // 2
    assert(x.shape[0] % blockSize == 0)
    groups = x.reshape(-1, blockSize)
    gmax = np.max(groups, axis=1)
    gmin = np.min(groups, axis=1)
    # The scale is chosen so the value with the largest magnitude maps to -8.
    deltas = np.divide(np.where(-gmin > gmax, gmin, gmax), -8)
    deltas16 = deltas.astype(np.float16)
    ids = np.where(deltas != 0, 1.0 / deltas, 0)
    # Adding 8.5 before truncation rounds x/delta to the nearest integer
    # and shifts the quantized range into [0, 15].
    groups = np.add(groups * ids[:, np.newaxis], 8.5)
    groups = np.clip(groups, 0, 15).astype(int)
    # The first half of each block goes into the low nibbles,
    # the second half into the high nibbles.
    gLow = groups[:, :blockHalfSize] & 0xF
    gHigh = (groups[:, blockHalfSize:] & 0xF) << 4
    gCombined = gLow | gHigh
    nBytes = 0
    for groupIndex in range(0, len(groups)):
        delta16 = deltas16[groupIndex]
        buffer = struct.pack(f'e{blockHalfSize}B', delta16, *gCombined[groupIndex])
        file.write(buffer)
        nBytes += len(buffer)
    return nBytes

def writeQuantizedQ80Tensor(file, x):
    # Q80: each block of 32 values is stored as a float16 scale
    # followed by 32 signed bytes in [-127, 127].
    x = x.to(torch.float32).numpy().astype(np.float32)
    blockSize = 32
    assert(x.shape[0] % blockSize == 0)
    groups = x.reshape(-1, blockSize)
    gmax = np.max(groups, axis=1)
    gmin = np.min(groups, axis=1)
    gabsMax = np.where(-gmin > gmax, -gmin, gmax)
    deltas = gabsMax / ((1 << 7) - 1)
    deltas16 = deltas.astype(np.float16)
    ids = np.where(deltas != 0, 1.0 / deltas, 0)
    groups = groups * ids[:, np.newaxis]
    groups8 = np.round(groups).astype(np.int8)
    nBytes = 0
    for groupIndex in range(0, len(groups)):
        buffer = struct.pack(f'e{blockSize}b', deltas16[groupIndex], *groups8[groupIndex])
        file.write(buffer)
        nBytes += len(buffer)
    return nBytes

def writeF32Tensor(file, d):
    # Write in chunks to keep the struct format string and argument list bounded.
    chunkSize = 10000
    nBytes = 0
    for i in range(0, len(d), chunkSize):
        chunk = d[i:i+chunkSize].to(torch.float32).numpy().astype(np.float32)
        b = struct.pack(f'{len(chunk)}f', *chunk)
        nBytes += len(b)
        file.write(b)
    return nBytes

def writeF16Tensor(file, d):
    d = d.to(torch.float16).numpy().astype(np.float16)
    b = struct.pack(f'{len(d)}e', *d)
    file.write(b)
    return len(b)

def writeTensor(file, tensor, floatType):
    d = tensor.detach().cpu().view(-1)
    t0 = time.time()
    nBytes = 0
    if (floatType == FloatType.F16):
        nBytes = writeF16Tensor(file, d)
    elif (floatType == FloatType.F32):
        nBytes = writeF32Tensor(file, d)
    elif (floatType == FloatType.Q40):
        nBytes = writeQuantizedQ40Tensor(file, d)
    elif (floatType == FloatType.Q80):
        nBytes = writeQuantizedQ80Tensor(file, d)
    else:
        raise Exception(f'Unknown float type: {floatType}')
    t1 = time.time()
    print(f'Saved {strFloatType(floatType)} tensor in {t1 - t0:.2f}s, {nBytes} bytes')

def writeHeader(file, params):
    headerKeys = {
        'version': 0,
        'arch_type': 1,
        'dim': 2,
        'hidden_dim': 3,
        'n_layers': 4,
        'n_heads': 5,
        'n_kv_heads': 6,
        'n_experts': 7,
        'n_active_experts': 8,
        'vocab_size': 9,
        'max_seq_len': 10,
        'hidden_act': 11,
        'rope_theta': 12,
        'weights_float_type': 13,
        'rope_scaling_factor': 14,
        'rope_scaling_low_freq_factor': 15,
        'rope_scaling_high_freq_factory': 16,
        'rope_scaling_orig_max_seq_len': 17,
        'rope_type': 18,
        'head_dim': 19,
        'norm_epsilon': 20,
        'moe_hidden_dim': 21,
    }
    # Layout: int32 magic, int32 total header size, then (key id, value) int32 pairs.
    header = struct.pack('i', 0xA00ABCD)
    data = b''
    for key in params:
        if key in headerKeys:
            data += struct.pack('ii', headerKeys[key], params[key])
        else:
            print(f'Warning: Unknown header key: {key}')
    # len(header) * 2 accounts for the magic plus this size field itself.
    header += struct.pack('i', len(header) * 2 + len(data))
    file.write(header)
    file.write(data)
    for key in params:
        print(f'🎓 {key}: {params[key]}')
    print()
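# A minimal read-back sketch (not part of the original converter; readQ40Block
# is a hypothetical helper) to illustrate the Q40 block layout produced above:
# a float16 delta followed by 16 bytes, each holding two 4-bit values, where
# each value decodes as (q - 8) * delta.
def readQ40Block(file):
    blockHalfSize = 16
    raw = struct.unpack(f'e{blockHalfSize}B', file.read(2 + blockHalfSize))
    delta = raw[0]
    packed = np.array(raw[1:], dtype=np.uint8)
    # Low nibbles hold the first half of the block, high nibbles the second half.
    q = np.concatenate([packed & 0xF, packed >> 4]).astype(np.float32)
    return (q - 8) * delta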
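# Usage sketch, assuming a caller that already has model params and tensors;
# the file name and the params values below are illustrative only.
if __name__ == '__main__':
    params = {
        'version': 0,
        'dim': 64,
        'n_layers': 1,
        'weights_float_type': FloatType.Q40,
    }
    tensor = torch.randn(64 * 64)  # length must be a multiple of the 32-value block size
    with open('example.m', 'wb') as f:
        writeHeader(f, params)
        writeTensor(f, tensor, FloatType.Q40)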