256 lines
8.6 KiB
C++
256 lines
8.6 KiB
C++
#include "nn-quants.hpp"
|
|
#include <cassert>
|
|
#include <cstring>
|
|
#include <cmath>
|
|
#include <stdexcept>
|
|
#include <cstdio>
|
|
|
|
#if defined(CONVERT_F16_TO_F32_LOOKUP)
// One float per possible 16-bit pattern; filled in by initQuants().
float f16ToF32Lookup[65536];
#endif

/**
 * Prepares the quantization helpers for use.
 *
 * When CONVERT_F16_TO_F32_LOOKUP is defined, every entry of f16ToF32Lookup is
 * set to convertF16toF32Impl() of its own index. Otherwise this does nothing.
 */
void initQuants() {
#if defined(CONVERT_F16_TO_F32_LOOKUP)
    const NnUint tableSize = static_cast<NnUint>(sizeof(f16ToF32Lookup) / sizeof(f16ToF32Lookup[0]));
    for (NnUint bits = 0; bits != tableSize; ++bits) {
        f16ToF32Lookup[bits] = convertF16toF32Impl((NnFp16)bits);
    }
#endif
}
|
|
|
|
/**
 * Converts a 16-bit half-precision bit pattern to a 32-bit float.
 *
 * The 15 exponent/mantissa bits are shifted into the float's exponent/mantissa
 * position and the result is multiplied by a power-of-two constant, which
 * re-biases the exponent and also handles half subnormals in one step. Any
 * result at or above the threshold that only a half Inf/NaN can reach gets an
 * all-ones float exponent, so Inf stays Inf and NaN stays NaN. The sign bit is
 * copied over last.
 *
 * Bit patterns are moved between uint32_t and float with std::memcpy instead
 * of a union: reading the inactive member of a union is undefined behaviour
 * in C++, whereas memcpy is well defined and compiles to a plain move.
 *
 * @param value half-precision value as raw bits
 * @return the same value as a float
 */
float convertF16toF32Impl(const NnFp16 value) {
    static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");

    // Exponent field 254 - 15 = 239, i.e. the multiplier 2^112 (bias 127 vs 15).
    const uint32_t magicBits = (254U - 15U) << 23;
    // Exponent field 127 + 16, i.e. 2^16: what a half exponent of 31 scales to.
    const uint32_t infNanBits = (127U + 16U) << 23;

    float magic;
    float infNan;
    std::memcpy(&magic, &magicBits, sizeof(magic));
    std::memcpy(&infNan, &infNanBits, sizeof(infNan));

    // Exponent and mantissa, without the sign, placed in float position.
    uint32_t bits = (value & 0x7FFFU) << 13;

    float scaled;
    std::memcpy(&scaled, &bits, sizeof(scaled));
    scaled *= magic;
    std::memcpy(&bits, &scaled, sizeof(bits));

    if (scaled >= infNan) {
        // Source was Inf/NaN: force the float exponent to all ones.
        bits |= 255U << 23;
    }
    bits |= (value & 0x8000U) << 16; // sign bit

    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}
|
|
|
|
/**
 * Converts a 32-bit float to a 16-bit half-precision bit pattern, rounding to
 * nearest with ties to even.
 *
 * - Magnitudes too small for a half subnormal become a signed zero.
 * - Small magnitudes are encoded as half subnormals.
 * - Infinity stays infinity; NaN stays NaN (at least one mantissa bit is kept
 *   set so the result cannot collapse into infinity).
 * - Finite values that are too large for a half, including values that round
 *   up past the largest finite half, become signed infinity. Previously this
 *   case was only guarded by an assert; with assertions disabled the exponent
 *   spilled out of its 5-bit field and produced a corrupted result.
 *
 * The float's bits are read with std::memcpy rather than a pointer cast, which
 * would violate the strict-aliasing rule.
 *
 * @param x value to convert
 * @return half-precision bit pattern
 */
NnFp16 convertF32ToF16Impl(const float x) {
    static_assert(sizeof(int) == sizeof(float), "int and float must have the same size");

    int i;
    std::memcpy(&i, &x, sizeof(i));

    const int s = (i >> 16) & 0x00008000;                   // sign, already in half position
    int e = ((i >> 23) & 0x000000ff) - (127 - 15);          // exponent re-biased for half
    int m = i & 0x007fffff;                                 // 23-bit float mantissa

    if (e <= 0) {
        if (e < -10) {
            // Too small to round to a non-zero half: signed zero.
            return static_cast<NnFp16>(s);
        }
        // Half subnormal: restore the implicit leading one, then shift right
        // with round-to-nearest-even.
        m |= 0x00800000;
        const int t = 14 - e;
        const int a = (1 << (t - 1)) - 1;
        const int b = (m >> t) & 1;
        m = (m + a + b) >> t;
        return static_cast<NnFp16>(s | m);
    }

    if (e == 0xff - (127 - 15)) {
        if (m == 0) {
            return static_cast<NnFp16>(s | 0x7c00); // infinity
        }
        // NaN: keep the top mantissa bits, forcing one bit on if they are all zero.
        m >>= 13;
        return static_cast<NnFp16>(s | 0x7c00 | m | (m == 0));
    }

    // Normal number: round the mantissa to 10 bits, ties to even.
    m = m + 0x00000fff + ((m >> 13) & 1);
    if (m & 0x00800000) {
        // Rounding carried out of the mantissa: bump the exponent.
        m = 0;
        e += 1;
    }

    if (e > 30) {
        // Exponent no longer fits in 5 bits: overflow to signed infinity.
        return static_cast<NnFp16>(s | 0x7c00);
    }
    return static_cast<NnFp16>(s | (e << 10) | (m >> 13));
}
|
|
|
|
void quantizeF32toQ80(const float *input, NnBlockQ80 *output, const NnUint n, const NnUint nThreads, const NnUint threadIndex) {
|
|
assert(n % Q80_BLOCK_SIZE == 0);
|
|
const NnUint nBlocks = n / Q80_BLOCK_SIZE;
|
|
SPLIT_THREADS(start, end, nBlocks, nThreads, threadIndex);
|
|
|
|
#if defined(__ARM_NEON)
|
|
for (NnUint i = start; i < end; i++) {
|
|
const float *x = &input[i * Q80_BLOCK_SIZE];
|
|
NnBlockQ80 *y = &output[i];
|
|
|
|
float32x4_t amaxVec = vdupq_n_f32(0.0f);
|
|
for (NnUint j = 0; j < Q80_BLOCK_SIZE; j += 4) {
|
|
const float32x4_t vec = vld1q_f32(&x[j]);
|
|
const float32x4_t abs_vec = vabsq_f32(vec);
|
|
amaxVec = vmaxq_f32(amaxVec, abs_vec);
|
|
}
|
|
|
|
float amax = vmaxvq_f32(amaxVec);
|
|
|
|
const float d = amax / 127.0f;
|
|
const float id = d != 0.0f ? 1.0f / d : 0.0f;
|
|
|
|
y->d = CONVERT_F32_TO_F16(d);
|
|
|
|
const float32x4_t vid_vec = vdupq_n_f32(id);
|
|
|
|
for (NnUint j = 0; j < Q80_BLOCK_SIZE; j += 4) {
|
|
float32x4_t vec = vld1q_f32(&x[j]);
|
|
vec = vmulq_f32(vec, vid_vec);
|
|
|
|
const uint32x4_t sign_mask = vcgeq_f32(vec, vdupq_n_f32(0.0f));
|
|
const float32x4_t half = vbslq_f32(sign_mask, vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
|
|
vec = vaddq_f32(vec, half);
|
|
|
|
const int32x4_t vec_i32 = vcvtq_s32_f32(vec);
|
|
const int16x4_t vec_i16 = vqmovn_s32(vec_i32);
|
|
const int8x8_t vec_i8 = vqmovn_s16(vcombine_s16(vec_i16, vec_i16));
|
|
|
|
vst1_lane_s32((int32_t *)(y->qs + j), vreinterpret_s32_s8(vec_i8), 0);
|
|
}
|
|
}
|
|
#elif defined(__AVX2__)
|
|
for (NnUint i = start; i < end; ++i) {
|
|
const float *x = input + i * Q80_BLOCK_SIZE;
|
|
NnBlockQ80 *y = output + i;
|
|
|
|
__m256 max_abs = _mm256_setzero_ps();
|
|
for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) {
|
|
__m256 vec = _mm256_loadu_ps(x + j);
|
|
__m256 abs_vec = _mm256_and_ps(vec, _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)));
|
|
max_abs = _mm256_max_ps(max_abs, abs_vec);
|
|
}
|
|
__m128 max_hi = _mm256_extractf128_ps(max_abs, 1);
|
|
__m128 max_lo = _mm256_castps256_ps128(max_abs);
|
|
__m128 max_128 = _mm_max_ps(max_hi, max_lo);
|
|
max_128 = _mm_max_ps(max_128, _mm_movehl_ps(max_128, max_128));
|
|
max_128 = _mm_max_ss(max_128, _mm_shuffle_ps(max_128, max_128, _MM_SHUFFLE(1, 1, 1, 1)));
|
|
float amax = _mm_cvtss_f32(max_128);
|
|
|
|
const float d = amax / 127.0f;
|
|
const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
|
|
y->d = CONVERT_F32_TO_F16(d);
|
|
|
|
const __m256 id_vec = _mm256_set1_ps(id);
|
|
const __m128i shuffle_mask = _mm_set_epi8(
|
|
-1, -1, -1, -1, -1, -1, -1, -1,
|
|
-1, -1, -1, -1, 12, 8, 4, 0
|
|
);
|
|
|
|
for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) {
|
|
__m256 vec = _mm256_loadu_ps(x + j);
|
|
__m256 scaled = _mm256_mul_ps(vec, id_vec);
|
|
__m256 rounded = _mm256_round_ps(scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
__m256i integers = _mm256_cvtps_epi32(rounded);
|
|
|
|
__m128i low = _mm256_extracti128_si256(integers, 0);
|
|
__m128i high = _mm256_extracti128_si256(integers, 1);
|
|
|
|
__m128i low_bytes = _mm_shuffle_epi8(low, shuffle_mask);
|
|
__m128i high_bytes = _mm_shuffle_epi8(high, shuffle_mask);
|
|
|
|
uint32_t low_part = _mm_extract_epi32(low_bytes, 0);
|
|
uint32_t high_part = _mm_extract_epi32(high_bytes, 0);
|
|
uint64_t packed = (static_cast<uint64_t>(high_part) << 32) | low_part;
|
|
std::memcpy(y->qs + j, &packed, sizeof(packed));
|
|
}
|
|
}
|
|
#else
|
|
for (NnUint i = start; i < end; i++) {
|
|
const float *x = &input[i * Q80_BLOCK_SIZE];
|
|
NnBlockQ80 *y = &output[i];
|
|
|
|
float amax = 0.0f;
|
|
for (NnUint j = 0; j < Q80_BLOCK_SIZE; j++) {
|
|
const float v = fabsf(x[j]);
|
|
amax = amax > v ? amax : v;
|
|
}
|
|
|
|
const float d = amax / ((1 << 7) - 1);
|
|
const float id = d ? 1.0f / d : 0.0f;
|
|
y->d = CONVERT_F32_TO_F16(d);
|
|
for (NnUint j = 0; j < Q80_BLOCK_SIZE; ++j) {
|
|
y->qs[j] = roundf(x[j] * id);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void dequantizeQ80toF32(const NnBlockQ80 *input, float* output, const NnUint k, const NnUint nThreads, const NnUint threadIndex) {
|
|
assert(k % Q80_BLOCK_SIZE == 0);
|
|
const int nBlocks = k / Q80_BLOCK_SIZE;
|
|
const int blocksPerThread = nBlocks / nThreads;
|
|
const int sk = blocksPerThread * Q80_BLOCK_SIZE;
|
|
const int currentThreadBlocks = blocksPerThread + (threadIndex == nThreads - 1 ? nBlocks % nThreads : 0);
|
|
|
|
const NnBlockQ80 *x = &input[blocksPerThread * threadIndex];
|
|
float* y = &output[sk * threadIndex];
|
|
|
|
for (int i = 0; i < currentThreadBlocks; i++) {
|
|
const float d = CONVERT_F16_TO_F32(x[i].d);
|
|
for (int j = 0; j < Q80_BLOCK_SIZE; j++) {
|
|
y[i * Q80_BLOCK_SIZE + j] = x[i].qs[j] * d;
|
|
}
|
|
}
|
|
}
|
|
|
|
void quantizeF32toQ40(const float *x, NnBlockQ40 *output, const NnUint n, const NnUint nThreads, const NnUint threadIndex) {
|
|
assert(n % Q40_BLOCK_SIZE == 0);
|
|
const NnUint nBlocks = n / Q40_BLOCK_SIZE;
|
|
const NnUint halfSize = Q40_BLOCK_SIZE / 2;
|
|
SPLIT_THREADS(start, end, nBlocks, nThreads, threadIndex);
|
|
|
|
for (NnUint i = start; i < end; i++) {
|
|
float amax = 0.0f;
|
|
float max = 0.0f;
|
|
for (NnUint j = 0; j < Q40_BLOCK_SIZE; j++) {
|
|
float v = x[i * Q40_BLOCK_SIZE + j];
|
|
if (amax < fabsf(v)) {
|
|
amax = fabsf(v);
|
|
max = v;
|
|
}
|
|
}
|
|
|
|
const float d = max / -8.0f;
|
|
const float id = d ? 1.0f / d : 0.0f;
|
|
|
|
NnBlockQ40 *o = &output[i];
|
|
o->d = CONVERT_F32_TO_F16(d);
|
|
for (NnUint j = 0; j < halfSize; j++) {
|
|
const float x0 = x[i * Q40_BLOCK_SIZE + j] * id;
|
|
const float x1 = x[i * Q40_BLOCK_SIZE + halfSize + j] * id;
|
|
|
|
uint8_t xi0 = (int8_t)(x0 + 8.5f);
|
|
uint8_t xi1 = (int8_t)(x1 + 8.5f);
|
|
if (xi0 > 15) xi0 = 15;
|
|
if (xi1 > 15) xi1 = 15;
|
|
|
|
o->qs[j] = xi0 | (xi1 << 4);
|
|
}
|
|
}
|
|
}
|
|
|
|
void dequantizeQ40toF32(const NnBlockQ40 *x, float *output, const NnUint n, const NnUint nThreads, const NnUint threadIndex) {
|
|
assert(n % Q40_BLOCK_SIZE == 0);
|
|
const NnUint nBlocks = n / Q40_BLOCK_SIZE;
|
|
SPLIT_THREADS(start, end, nBlocks, nThreads, threadIndex);
|
|
|
|
for (NnUint i = start; i < end; i++) {
|
|
const NnBlockQ40 *b = &x[i];
|
|
const float d = CONVERT_F16_TO_F32(b->d);
|
|
|
|
for (int j = 0; j < Q40_BLOCK_SIZE / 2; ++j) {
|
|
const int x0 = (b->qs[j] & 0x0F) - 8;
|
|
const int x1 = (b->qs[j] >> 4) - 8;
|
|
|
|
output[i * Q40_BLOCK_SIZE + j] = x0 * d;
|
|
output[i * Q40_BLOCK_SIZE + j + Q40_BLOCK_SIZE / 2] = x1 * d;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Returns the name of a float type as a string literal.
 *
 * @param type one of F_UNK, F_32, F_16, F_Q40, F_Q80
 * @return the matching name, e.g. "F_Q40"
 * @throws std::invalid_argument if `type` is none of the values above
 */
const char *floatTypeToString(NnFloatType type) {
    struct TypeName {
        NnFloatType type;
        const char *name;
    };
    // Checked in order, first match wins.
    static const TypeName knownTypes[] = {
        {F_UNK, "F_UNK"},
        {F_32, "F_32"},
        {F_16, "F_16"},
        {F_Q40, "F_Q40"},
        {F_Q80, "F_Q80"},
    };

    for (const TypeName &entry : knownTypes) {
        if (entry.type == type) {
            return entry.name;
        }
    }
    throw std::invalid_argument("Unknown float type");
}
|