// NOTE(review): extraction artifact — original path/size metadata preserved as a comment.
// Source: sglang_v0.5.2/pytorch_2.8.0/third_party/fbgemm/test/QuantUtilsTest.cc
// (807 lines, 26 KiB, C++)
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <algorithm>
#include <climits>
#include <limits>
#include <random>
#include <sstream>
#include <type_traits>
#include "TestUtils.h"
#include "fbgemm/FloatConversion.h"
#include "fbgemm/QuantUtils.h"
#include "fbgemm/Types.h"
#include "fbgemm/Utils.h"
using namespace std;
using namespace fbgemm;
// tuple represents K, C, X, G, layout_t
// layout_t can be KCX or KXC
// K: output channels, C: input channels, X: spatial extent, G: number of
// quantization groups (each group gets its own scale/zero_point).
class QuantizeGroupwiseTest
    : public testing::TestWithParam<tuple<int, int, int, int, layout_t>> {};
// Parameter is the number of elements to quantize; values are chosen to hit
// both the vectorized and the scalar-remainder code paths.
class QuantizeTest : public testing::TestWithParam<int> {};
// Parameter is the number of elements to fused quantize-then-dequantize.
class FusedQuantizeDequantizeTest : public testing::TestWithParam<int> {};
// Parameter are bit_rate (i.e., the number of bits in quantized values),
// input rows, and input columns
class EmbeddingQuantizeTest
    : public testing::TestWithParam<tuple<int, int, int>> {};
// Parameter are input rows and input columns
// Scale and Bias are of type float (SBFloat)
class EmbeddingQuantizeSBFloatTest
    : public testing::TestWithParam<tuple<int, int>> {};
// NOTE(review): INSTANTIATE_TEST_CASE_P is deprecated in newer googletest in
// favor of INSTANTIATE_TEST_SUITE_P; kept as-is to stay compatible with the
// gtest version vendored by this project.
// Full sweep of shapes, group counts, and both memory layouts.
INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    QuantizeGroupwiseTest,
    ::testing::Combine(
        ::testing::ValuesIn({4, 12, 64}), // K
        ::testing::ValuesIn({12, 16, 32}), // C
        ::testing::ValuesIn({1, 10, 15, 30}), // X
        ::testing::ValuesIn({1, 4}), // G
        ::testing::ValuesIn({layout_t::KCX, layout_t::KXC})));
// Lengths straddle typical SIMD widths to cover vector + remainder paths.
INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    QuantizeTest,
    ::testing::Values(1, 2, 5, 8, 9, 16, 20, 28, 32, 33));
INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    FusedQuantizeDequantizeTest,
    ::testing::Values(1, 2, 5, 8, 9, 16, 20, 28, 32, 33));
// bit_rate x rows x cols for the half-precision scale/bias embedding tests.
INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    EmbeddingQuantizeTest,
    ::testing::Combine(
        ::testing::ValuesIn({2, 4, 8}),
        ::testing::ValuesIn({1, 2, 3}),
        ::testing::ValuesIn({4, 8, 16, 20, 28, 32, 64, 84})));
// rows x cols for the float scale/bias embedding tests.
INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    EmbeddingQuantizeSBFloatTest,
    ::testing::Combine(
        ::testing::ValuesIn({1, 2, 3}),
        ::testing::ValuesIn({1, 2, 5, 8, 9, 16, 20, 28, 32, 33, 64, 65})));
/**
 * Scalar reference implementation of groupwise quantization.
 *
 * Quantizes `src` (shape K x C x X in either KCX or KXC layout) into `dst`
 * using one (scale, zero_point) pair per group of C/G channels:
 *   q = clamp(round(zero_point + value / scale), T::min, T::max).
 *
 * Fix vs. original: the clamp is now performed in the wide int domain.
 * The original `max<T>(res, ...)` converted the int `res` to T *before*
 * clamping, so an out-of-range value (e.g. 300 for uint8_t) would wrap to
 * 44 instead of saturating to 255. Test inputs happened to stay in range,
 * but the reference was wrong for saturating cases.
 */
template <typename T, layout_t LT>
void ref_impl(
    const vector<float>& src,
    int K,
    int C,
    int X,
    int G,
    const vector<float>& scales,
    const vector<int>& zero_points,
    vector<T>& dst) {
  int C_per_G = C / G;
  for (int i = 0; i < K; ++i) {
    for (int g = 0; g < G; ++g) {
      for (int c = 0; c < C_per_G; ++c) {
        for (int x = 0; x < X; ++x) {
          // Linear index depends on layout: KCX is (k, c, x), KXC is (k, x, c).
          int idx = (LT == layout_t::KCX)
              ? (i * C + g * C_per_G + c) * X + x
              : (i * X + x) * C + g * C_per_G + c;
          float num = src[idx];
          int res = nearbyint(zero_points[g] + num / scales[g]);
          // Saturate in int, then narrow: clamping after conversion to T
          // would truncate out-of-range values first.
          int clamped = std::min<int>(
              std::max<int>(res, numeric_limits<T>::min()),
              numeric_limits<T>::max());
          dst[idx] = static_cast<T>(clamped);
        }
      }
    }
  }
}
/**
 * Runs the optimized QuantizeGroupwise kernel into `dst` and the scalar
 * reference implementation into `dst_ref`, so the caller can compare the two
 * with isNear().
 */
template <typename T, layout_t LT>
void runTests(
    const vector<float>& src,
    int K,
    int C,
    int X,
    int G,
    const vector<float>& scales,
    const vector<int>& zero_points,
    vector<T>& dst,
    vector<T>& dst_ref) {
  // Optimized (possibly vectorized) implementation under test.
  QuantizeGroupwise<T, LT>(
      src.data(), K, C, X, G, scales.data(), zero_points.data(), dst.data());
  // Scalar reference.
  ref_impl<T, LT>(src, K, C, X, G, scales, zero_points, dst_ref);
}
/**
 * There can be off-by-one error in quantized values due to how the mid-point
 * cases are rounded-off in vectorized vs scalar codes and due to adding of
 * zero_point before rounding vs after rounding. We ignore such differences
 * while comparing results.
 *
 * Fix vs. original: a size mismatch between the two vectors previously left
 * `match` at its initial value of true and the assertion silently PASSED.
 * Differently-sized results now fail with a diagnostic.
 */
template <typename T>
::testing::AssertionResult isNear(
    const vector<T>& res,
    const vector<T>& res_ref) {
  if (res.size() != res_ref.size()) {
    return ::testing::AssertionFailure()
        << " Quantized results do not match: size mismatch (" << res.size()
        << " vs " << res_ref.size() << ")";
  }
  for (size_t i = 0; i < res.size(); ++i) {
    // Allow +/-1 difference (mid-point rounding divergence); anything larger
    // is a real mismatch.
    if (!(res[i] == res_ref[i] || res[i] == res_ref[i] + 1 ||
          res[i] == res_ref[i] - 1)) {
      return ::testing::AssertionFailure() << " Quantized results do not match";
    }
  }
  return ::testing::AssertionSuccess();
}
/**
 * Compares two fused rowwise-quantized embedding buffers.
 *
 * Each row is laid out as [out_emb_cols quantized bytes][scale][bias], where
 * scale and bias are stored as T (float16 or float), so the row stride is
 * out_emb_cols + 2 * sizeof(T). Quantized payload bytes must match exactly;
 * scale and bias must agree to within float epsilon. On failure, the message
 * pinpoints the first differing element.
 */
template <typename T>
::testing::AssertionResult isQEmbeddingClose(
    const vector<uint8_t>& res_ref,
    const vector<uint8_t>& res,
    int out_rows,
    int out_emb_cols) {
  bool match = true;
  std::stringstream ss;
  // Row stride: quantized payload followed by scale and bias of type T.
  int ld = out_emb_cols + 2 * sizeof(T);
  if (res.size() == res_ref.size()) {
    for (int i = 0; i < out_rows; ++i) {
      if (!match) {
        break;
      }
      // compare embedding values
      for (int j = 0; j < out_emb_cols; ++j) {
        if (res[i * ld + j] != res_ref[i * ld + j]) {
          match = false;
          ss << " mismatch at (" << i << ", " << j << ") ";
          ss << "ref: " << static_cast<uint32_t>(res_ref[i * ld + j])
             << ", test: " << static_cast<uint32_t>(res[i * ld + j]) << "\n";
          break;
        }
      }
      // compare scale/bias
      // Scale/bias live immediately after the payload; reinterpret the raw
      // bytes as two consecutive T values and widen float16 to float for the
      // epsilon comparison.
      float scaleTest, scaleRef, biasTest, biasRef;
      if (is_same<T, float16>::value) {
        // half scale and bias
        scaleTest = cpu_half2float(reinterpret_cast<const float16*>(
            res.data() + i * ld + out_emb_cols)[0]);
        biasTest = cpu_half2float(reinterpret_cast<const float16*>(
            res.data() + i * ld + out_emb_cols)[1]);
        scaleRef = cpu_half2float(reinterpret_cast<const float16*>(
            res_ref.data() + i * ld + out_emb_cols)[0]);
        biasRef = cpu_half2float(reinterpret_cast<const float16*>(
            res_ref.data() + i * ld + out_emb_cols)[1]);
      } else {
        // float scale and bias
        scaleTest = reinterpret_cast<const float*>(
            res.data() + i * ld + out_emb_cols)[0];
        biasTest = reinterpret_cast<const float*>(
            res.data() + i * ld + out_emb_cols)[1];
        scaleRef = reinterpret_cast<const float*>(
            res_ref.data() + i * ld + out_emb_cols)[0];
        biasRef = reinterpret_cast<const float*>(
            res_ref.data() + i * ld + out_emb_cols)[1];
      }
      if (fabs(scaleTest - scaleRef) > std::numeric_limits<float>::epsilon()) {
        ss << " scale mismatch for row:" << i;
        ss << " ref: " << scaleRef << ", test: " << scaleTest << "\n";
        match = false;
      }
      if (fabs(biasTest - biasRef) > std::numeric_limits<float>::epsilon()) {
        ss << " bias mismatch for row:" << i;
        ss << " ref: " << biasRef << ", test: " << biasTest << "\n";
        match = false;
      }
    }
  } else {
    ss << " size mismatch ";
    match = false;
  }
  if (match)
    return ::testing::AssertionSuccess();
  else
    return ::testing::AssertionFailure()
        << " Quantized Embeddings do not match." << ss.str();
}
/**
 * Test for QuantizeGroupwise: checks the optimized groupwise quantization
 * kernel against the scalar reference for uint8, int8, and int32 outputs in
 * the parameterized layout, tolerating off-by-one rounding differences.
 */
TEST_P(QuantizeGroupwiseTest, quantizeGTest) {
  int K, C, X, G;
  layout_t layout;
  tie(K, C, X, G, layout) = GetParam();
  random_device rd;
  mt19937 rng(rd());
  // Inputs and per-group scales are drawn from [0.1, 1.1).
  uniform_real_distribution<float> realDist(0.1, 1.1);
  vector<float> inp(K * C * X);
  for (auto& v : inp) {
    v = realDist(rng);
  }
  vector<float> scales(G);
  for (auto& s : scales) {
    s = realDist(rng);
  }
  // Draws G zero points uniformly from [lo, hi] — one per group.
  auto drawZeroPoints = [&rng, G](int lo, int hi) {
    uniform_int_distribution<> dist(lo, hi);
    vector<int> zps(G);
    for (auto& z : zps) {
      z = dist(rng);
    }
    return zps;
  };
  vector<int> zero_points_uint8 = drawZeroPoints(0, 8);
  vector<int> zero_points_int8 = drawZeroPoints(-64, 63);
  vector<int> zero_points_int32 = drawZeroPoints(-512, 512);
  vector<uint8_t> dstuint8(K * C * X);
  vector<uint8_t> dstuint8_ref(K * C * X);
  vector<int8_t> dstint8(K * C * X);
  vector<int8_t> dstint8_ref(K * C * X);
  vector<int32_t> dstint32(K * C * X);
  vector<int32_t> dstint32_ref(K * C * X);
  // The layout is a template parameter of the kernel, so dispatch here.
  if (layout == layout_t::KCX) {
    runTests<uint8_t, layout_t::KCX>(
        inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref);
    runTests<int8_t, layout_t::KCX>(
        inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref);
    runTests<int32_t, layout_t::KCX>(
        inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref);
  } else {
    runTests<uint8_t, layout_t::KXC>(
        inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref);
    runTests<int8_t, layout_t::KXC>(
        inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref);
    runTests<int32_t, layout_t::KXC>(
        inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref);
  }
  EXPECT_TRUE(isNear(dstuint8, dstuint8_ref));
  EXPECT_TRUE(isNear(dstint8, dstint8_ref));
  EXPECT_TRUE(isNear(dstint32, dstint32_ref));
}
/**
 * Quantizes `src` element-wise with both the scalar Quantize<T> overload
 * (into `dst_ref`) and the array overload under test (into `dst`) so the
 * caller can compare them.
 */
template <typename T>
void runQuantizeTests(
    const vector<float>& src,
    float scale,
    int zero_point,
    vector<T>& dst,
    vector<T>& dst_ref) {
  const int precision = CHAR_BIT * sizeof(T);
  // Scalar reference path, one element at a time.
  transform(src.begin(), src.end(), dst_ref.begin(), [&](float v) {
    return Quantize<T>(v, zero_point, scale, precision);
  });
  // Vectorized array path under test.
  TensorQuantizationParams qparams;
  qparams.scale = scale;
  qparams.zero_point = zero_point;
  qparams.precision = precision;
  Quantize<T>(src.data(), dst.data(), src.size(), qparams);
}
/**
 * Test for Quantize: compares the vectorized array kernel against the scalar
 * reference for uint8 and int8 over random inputs of the parameterized
 * length, tolerating off-by-one rounding differences.
 */
TEST_P(QuantizeTest, quantizeTest) {
  const int len = GetParam();
  random_device rd;
  mt19937 rng(rd());
  uniform_real_distribution<float> realDist(-1.0e6, 1.0e6);
  vector<float> inp(len);
  for (auto& v : inp) {
    v = realDist(rng);
  }
  const float scale = realDist(rng);
  // Zero points span each target type's full representable range.
  uniform_int_distribution<> uint8Dist(0, 255);
  const int zero_point_uint8 = uint8Dist(rng);
  uniform_int_distribution<> int8Dist(-128, 127);
  const int zero_point_int8 = int8Dist(rng);
  vector<uint8_t> dstuint8(len);
  vector<uint8_t> dstuint8_ref(len);
  vector<int8_t> dstint8(len);
  vector<int8_t> dstint8_ref(len);
  runQuantizeTests<uint8_t>(
      inp, scale, zero_point_uint8, dstuint8, dstuint8_ref);
  runQuantizeTests<int8_t>(inp, scale, zero_point_int8, dstint8, dstint8_ref);
  EXPECT_TRUE(isNear(dstuint8, dstuint8_ref));
  EXPECT_TRUE(isNear(dstint8, dstint8_ref));
}
// vector and scalar code should have the same behavior
TEST(QuantizeTestSingle, vectorScalar) {
  // Length 33 exercises both the full vector pass and the scalar remainder.
  constexpr int kLen = 33;
  vector<float> src(kLen, -2.9483526e-05);
  vector<uint8_t> dst(kLen);
  TensorQuantizationParams qparams;
  qparams.scale = 2.3124334356729307e-07;
  qparams.zero_point = 128;
  qparams.precision = CHAR_BIT * sizeof(uint8_t);
  Quantize<uint8_t>(src.data(), dst.data(), kLen, qparams);
  // Identical inputs must produce identical outputs regardless of which
  // code path handled them.
  EXPECT_TRUE(all_of(dst.begin(), dst.end(), [&](uint8_t v) {
    return v == dst.front();
  }));
}
TEST(QuantizeTest, cornerCases) {
  // With a tiny scale, near-FLT_MAX inputs map far outside the representable
  // range and must saturate to the type's qmin/qmax rather than wrap.
  TensorQuantizationParams qparams;
  qparams.scale = 1.19209e-07;
  qparams.zero_point = 0;
  qparams.precision = 8;
  std::vector<float> src1 = {3.40282e+38, -2.16845e+38};
  std::vector<int8_t> dst_int8(src1.size());
  Quantize<int8_t>(src1.data(), dst_int8.data(), dst_int8.size(), qparams);
  // int8 saturation limits.
  EXPECT_EQ(dst_int8[0], 127);
  EXPECT_EQ(dst_int8[1], -128);
  // Tests vectorized and remainder paths
  // (9 elements: 8 for the vector pass plus 1 scalar remainder).
  std::vector<float> src2 = {
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38};
  std::vector<uint8_t> dst_uint8(src2.size());
  Quantize<uint8_t>(src2.data(), dst_uint8.data(), dst_uint8.size(), qparams);
  // uint8 saturation limits; index 8 checks the remainder path.
  EXPECT_EQ(dst_uint8[0], 255);
  EXPECT_EQ(dst_uint8[1], 0);
  EXPECT_EQ(dst_uint8[8], 255);
  // Same inputs at 16-bit precision saturate to int16 limits.
  qparams.precision = 16;
  std::vector<int16_t> dst_int16(src2.size());
  Quantize<int16_t>(src2.data(), dst_int16.data(), dst_int16.size(), qparams);
  EXPECT_EQ(dst_int16[0], 32767);
  EXPECT_EQ(dst_int16[1], -32768);
}
TEST(QuantizeTestQParams, chooseQParamsSymmetric) {
  // Test that symmetric quantization of weights set zero point exactly to 0.
  float min = -1.6165;
  float max = 0.5685;
  int32_t qmin = -128;
  int32_t qmax = 127;
  // preserve_sparsity forces a symmetric range so that real 0 maps to
  // quantized 0 exactly.
  bool preserve_sparsity = true;
  TensorQuantizationParams result =
      ChooseQuantizationParams(min, max, qmin, qmax, preserve_sparsity);
  // Expected scale matches 1.6165 / 128 ~= 0.012628906 — presumably
  // |min| / |qmin| under the symmetric scheme; verify against the
  // ChooseQuantizationParams implementation if this changes.
  EXPECT_FLOAT_EQ(result.scale, 0.012628906);
  EXPECT_EQ(result.zero_point, 0);
}
/**
 * Runs FusedQuantizeDequantize element-wise via the scalar overload (into
 * `dst_ref`) and via the array overload under test (into `dst`) with the
 * same quantization parameters, so the caller can compare the two.
 */
template <typename T>
void runFusedQuantizeDequantizeTests(
    const vector<float>& src,
    float scale,
    int zero_point,
    vector<float>& dst,
    vector<float>& dst_ref) {
  TensorQuantizationParams qparams;
  qparams.scale = scale;
  qparams.zero_point = zero_point;
  qparams.precision = CHAR_BIT * sizeof(T);
  // Scalar reference path, one element at a time.
  transform(src.begin(), src.end(), dst_ref.begin(), [&](float v) {
    return FusedQuantizeDequantize<T>(v, qparams);
  });
  // Vectorized array path under test.
  FusedQuantizeDequantize<T>(src.data(), dst.data(), src.size(), qparams);
}
/**
 * Compares the vectorized FusedQuantizeDequantize kernel against the scalar
 * reference for uint8 and int8 over random inputs of the parameterized
 * length.
 */
TEST_P(FusedQuantizeDequantizeTest, fusedQuantizeDequantizeTest) {
  const int len = GetParam();
  random_device rd;
  mt19937 rng(rd());
  uniform_real_distribution<float> realDist(-1.0e6, 1.0e6);
  vector<float> inp(len);
  for (auto& v : inp) {
    v = realDist(rng);
  }
  const float scale = realDist(rng);
  // Zero points span each target type's full representable range.
  uniform_int_distribution<> uint8Dist(0, 255);
  const int zero_point_uint8 = uint8Dist(rng);
  uniform_int_distribution<> int8Dist(-128, 127);
  const int zero_point_int8 = int8Dist(rng);
  vector<float> dstfloat(len);
  vector<float> dstfloat_ref(len);
  runFusedQuantizeDequantizeTests<uint8_t>(
      inp, scale, zero_point_uint8, dstfloat, dstfloat_ref);
  EXPECT_TRUE(floatCloseAll(dstfloat, dstfloat_ref));
  runFusedQuantizeDequantizeTests<int8_t>(
      inp, scale, zero_point_int8, dstfloat, dstfloat_ref);
  EXPECT_TRUE(floatCloseAll(dstfloat, dstfloat_ref));
}
// vector and scalar code should have the same behavior
TEST(FusedQuantizeDequantizeTestSingle, vectorScalar) {
  // Length 33 exercises both the full vector pass and the scalar remainder.
  constexpr int kLen = 33;
  vector<float> src(kLen, -2.9483526e-05);
  vector<float> dst(kLen);
  TensorQuantizationParams qparams;
  qparams.scale = 2.3124334356729307e-07;
  qparams.zero_point = 128;
  qparams.precision = CHAR_BIT * sizeof(uint8_t);
  FusedQuantizeDequantize<uint8_t>(src.data(), dst.data(), src.size(), qparams);
  // Identical inputs must dequantize to identical outputs regardless of
  // which code path handled them.
  EXPECT_TRUE(all_of(dst.begin(), dst.end(), [&](float v) {
    return v == dst.front();
  }));
}
TEST(FusedQuantizeDequantizeTest, cornerCases) {
  // Near-FLT_MAX inputs must saturate during quantization and then
  // dequantize back to scale * (q - zero_point); expected values below are
  // hand-computed from scale = 1.19209e-07 and the int8/uint8 limits.
  TensorQuantizationParams qparams;
  qparams.scale = 1.19209e-07;
  qparams.zero_point = 0;
  qparams.precision = 8;
  vector<float> src1 = {3.40282e+38, -2.16845e+38};
  // 127 * scale and -128 * scale.
  vector<float> ref = {1.5139543e-05, -1.5258752e-05};
  vector<float> dst_int8(src1.size());
  FusedQuantizeDequantize<int8_t>(
      src1.data(), dst_int8.data(), src1.size(), qparams);
  EXPECT_TRUE(floatCloseAll(dst_int8, ref));
  // Tests vectorized and remainder paths
  // (9 elements: 8 for the vector pass plus 1 scalar remainder).
  vector<float> src2 = {
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38,
      -2.16845e+38,
      3.40282e+38};
  // 255 * scale for saturated-high, 0 for saturated-low (uint8).
  vector<float> ref2 = {
      3.0398295e-05,
      0,
      3.0398295e-05,
      0,
      3.0398295e-05,
      0,
      3.0398295e-05,
      0,
      3.0398295e-05};
  std::vector<float> dst_uint8(src2.size(), 0);
  FusedQuantizeDequantize<uint8_t>(
      src2.data(), dst_uint8.data(), src2.size(), qparams);
  EXPECT_TRUE(floatCloseAll(dst_uint8, ref2));
}
// Parameter are bit_rate (i.e., the number of bits in quantized values).
// Fixture with a fixed 2x4 input and hand-calculated expected output bytes
// for each bit rate, used to pin the exact fused rowwise-quantized layout:
// per row, packed quantized values followed by scale and bias.
class EmbeddingQuantizeFixedNumberTest : public testing::TestWithParam<int> {
 protected:
  // clang-format off
  EmbeddingQuantizeFixedNumberTest() {
    float_test_input = {
      1, 1, 1, 1, // All the same. Range: 0, min: 1
      -64, -2.75, 61.625, 191, // Range: 255, min: -64. Picking 61.625 because it differs under FP16 (will become 61.5).
    };
    assert(float_test_input.size() == row * col);
    // Mirror the same values as float16 for the half-precision-input paths.
    float16_test_input.resize(float_test_input.size());
    std::transform(
        float_test_input.begin(),
        float_test_input.end(),
        float16_test_input.begin(),
        [](float input) { return cpu_float2half_rn(input); });
    // Results are hand calculated.
    // Scale/bias byte pairs are the little-endian float16 encoding, e.g.
    // {0x00, 0x3c} is 0x3c00 == 1.0, {0x00, 0xd4} is 0xd400 == -64.0.
    expected_output_half[8] = {
      0, 0, 0, 0, 0x00, 0x3c, 0x00, 0x3c, // Scale: 1, bias: 1
      0, 61, 126, 255, 0x00, 0x3c, 0x00, 0xd4, // Scale: 1, bias: -64
    };
    // At 4 bits, two values pack per byte; trailing zeros pad the buffer to
    // row * out_cols_half bytes.
    expected_output_half[4] = {
      0x00, 0x00, 0x00, 0x3c, 0x00, 0x3c, // 0, 0, 0, 0, Scale: 1, bias: 1
      0x40, 0xf7, 0x40, 0x4c, 0x00, 0xd4, // 0, 4, 7, 15, Scale: 17, bias: -64
      0, 0, 0, 0 // Padding
    };
    // At 2 bits, four values pack per byte.
    expected_output_half[2] = {
      0b00000000, 0x00, 0x3c, 0x00, 0x3c, // 0, 0, 0, 0, Scale: 1, bias: 1
      0b11010100, 0x50, 0x55, 0x00, 0xd4, // 0, 1, 1, 3, Scale: 85, bias: -64
      0, 0, 0, 0, 0, 0 // Padding
    };
    // 8-bit payload with little-endian float32 scale/bias per row.
    expected_output_float = {
      0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, // Scale: 0, bias: 1
      0, 61, 126, 255, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x80, 0xc2, // Scale: 1, bias: -64
    };
  }
  // clang-format on
  const int row = 2;
  const int col = 4;
  // Output row widths: payload plus scale/bias of the respective type.
  const int out_cols_half = col + 2 * sizeof(float16);
  const int out_cols_float = col + 2 * sizeof(float);
  std::vector<float> float_test_input;
  std::vector<float16> float16_test_input;
  std::map</*bit_rate*/ int, /*output*/ std::vector<uint8_t>>
      expected_output_half;
  std::vector<uint8_t> expected_output_float;
};
// Run the fixed-number test at every supported sub-byte/byte bit rate.
INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    EmbeddingQuantizeFixedNumberTest,
    ::testing::ValuesIn({2, 4, 8}));
// Checks the reference and optimized embedding-quantization kernels, from
// both float and float16 input, byte-for-byte against the fixture's
// hand-calculated expected buffers.
TEST_P(EmbeddingQuantizeFixedNumberTest, embeddingFloatToQuantizedSBHalfTest) {
  const int bit_rate = GetParam();
  vector<uint8_t> outVectHalfTest(row * out_cols_half);
  // float input -> half scale/bias, reference kernel.
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef<float>(
      bit_rate, float_test_input.data(), row, col, outVectHalfTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float16>(
      expected_output_half[bit_rate], outVectHalfTest, row, col));
  // float input -> half scale/bias, optimized kernel.
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
      bit_rate, float_test_input.data(), row, col, outVectHalfTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float16>(
      expected_output_half[bit_rate], outVectHalfTest, row, col));
  // float16 input -> half scale/bias, reference kernel.
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef<float16>(
      bit_rate, float16_test_input.data(), row, col, outVectHalfTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float16>(
      expected_output_half[bit_rate], outVectHalfTest, row, col));
  // float16 input -> half scale/bias, optimized kernel.
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float16>(
      bit_rate, float16_test_input.data(), row, col, outVectHalfTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float16>(
      expected_output_half[bit_rate], outVectHalfTest, row, col));
  // 8-bit payload with float scale/bias: all four input/kernel combinations.
  vector<uint8_t> outVecFloatTest(row * out_cols_float);
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef<float>(
      float_test_input.data(), row, col, outVecFloatTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float>(
      expected_output_float, outVecFloatTest, row, col));
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>(
      float_test_input.data(), row, col, outVecFloatTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float>(
      expected_output_float, outVecFloatTest, row, col));
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef<float16>(
      float16_test_input.data(), row, col, outVecFloatTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float>(
      expected_output_float, outVecFloatTest, row, col));
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float16>(
      float16_test_input.data(), row, col, outVecFloatTest.data());
  EXPECT_TRUE(isQEmbeddingClose<float>(
      expected_output_float, outVecFloatTest, row, col));
}
// Scale and bias are of type float16
// Round-trips random data through N-bit rowwise quantization: checks the
// optimized quantize/dequantize kernels against their references for both
// float and float16 inputs/outputs. Buffers are reused across phases, so the
// call order below matters.
TEST_P(EmbeddingQuantizeTest, embeddingHalfTest) {
  int bit_rate, rows, cols;
  tie(bit_rate, rows, cols) = GetParam();
  random_device rd;
  mt19937 gen(rd());
  uniform_real_distribution<float> disFP(-10.0f, 10.0f);
  vector<float> inpVec(rows * cols);
  vector<float> dequantOutRef(rows * cols);
  vector<float> dequantOutTest(rows * cols);
  generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable {
    return disFP(gen);
  });
  // Each output row packs cols values at bit_rate bits apiece, rounded up to
  // whole bytes, followed by a float16 scale and a float16 bias.
  int elements_per_byte = 8 / bit_rate;
  int out_emb_cols = (cols + elements_per_byte - 1) / elements_per_byte;
  int out_cols = out_emb_cols + 2 * sizeof(float16);
  int outVecSize = rows * out_cols;
  vector<uint8_t> outVecRef(outVecSize);
  vector<uint8_t> outVecTest(outVecSize);
  // Phase 1: float input — optimized quantize vs reference quantize.
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef<float>(
      bit_rate, inpVec.data(), rows, cols, outVecRef.data());
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
      bit_rate, inpVec.data(), rows, cols, outVecTest.data());
  EXPECT_TRUE(
      isQEmbeddingClose<float16>(outVecRef, outVecTest, rows, out_emb_cols));
  // Phase 2: dequantize the same quantized buffer with both kernels.
  FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef<float>(
      bit_rate, outVecTest.data(), rows, out_cols, dequantOutRef.data());
  FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf<float>(
      bit_rate, outVecTest.data(), rows, out_cols, dequantOutTest.data());
  EXPECT_TRUE(floatCloseAll(dequantOutRef, dequantOutTest, 1e-3));
  // Phase 3: regenerate input restricted to float16-representable values so
  // the float and float16 input paths see numerically identical data.
  generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable {
    return cpu_half2float(cpu_float2half_rn(disFP(gen)));
  });
  vector<float16> inpHalfVec(rows * cols);
  std::transform(
      inpVec.begin(), inpVec.end(), inpHalfVec.begin(), [](float input) {
        return cpu_float2half_rn(input);
      });
  vector<uint8_t> outVecRefFromHalf(outVecSize);
  vector<uint8_t> outVecTestFromHalf(outVecSize);
  // float-input reference (overwrites outVecRef) vs float16-input reference.
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef<float>(
      bit_rate, inpVec.data(), rows, cols, outVecRef.data());
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef<float16>(
      bit_rate, inpHalfVec.data(), rows, cols, outVecRefFromHalf.data());
  EXPECT_TRUE(isQEmbeddingClose<float16>(
      outVecRefFromHalf, outVecRef, rows, out_emb_cols));
  // float16-input optimized kernel vs float16-input reference.
  FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float16>(
      bit_rate, inpHalfVec.data(), rows, cols, outVecTestFromHalf.data());
  EXPECT_TRUE(isQEmbeddingClose<float16>(
      outVecRefFromHalf, outVecTestFromHalf, rows, out_emb_cols));
  // Phase 4: dequantize to float vs to float16; tolerance widened by the
  // float16 mantissa width since the half output carries less precision.
  vector<float16> dequantOutHalfRef(rows * cols);
  vector<float16> dequantOutHalfTest(rows * cols);
  FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef<float>(
      bit_rate, outVecRef.data(), rows, out_cols, dequantOutRef.data());
  FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef<float16>(
      bit_rate, outVecRef.data(), rows, out_cols, dequantOutHalfRef.data());
  constexpr int NumberOfFP16Matissa = 9;
  EXPECT_TRUE(floatCloseAll(
      dequantOutRef, dequantOutHalfRef, 1e-3, pow(2, NumberOfFP16Matissa)));
  FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf<float16>(
      bit_rate, outVecRef.data(), rows, out_cols, dequantOutHalfTest.data());
  EXPECT_TRUE(floatCloseAll(
      dequantOutHalfRef,
      dequantOutHalfTest,
      1e-3,
      pow(2, NumberOfFP16Matissa)));
}
// Scale and bias are of type float
// Same round-trip structure as embeddingHalfTest, but for the 8-bit rowwise
// format with float32 scale/bias. Buffers are reused across phases, so the
// call order below matters.
TEST_P(EmbeddingQuantizeSBFloatTest, embeddingFloatTest) {
  int rows, cols;
  tie(rows, cols) = GetParam();
  random_device rd;
  mt19937 gen(rd());
  uniform_real_distribution<float> disFP(-10.0f, 10.0f);
  vector<float> inpVec(rows * cols);
  vector<float> dequantOutTest(rows * cols);
  vector<float> dequantOutRef(rows * cols);
  generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable {
    return disFP(gen);
  });
  // Each output row is cols quantized bytes plus a float scale and bias.
  int out_cols = cols + 2 * sizeof(float);
  int outVecSize = rows * out_cols;
  vector<uint8_t> outVecRef(outVecSize);
  vector<uint8_t> outVecTest(outVecSize);
  // Phase 1: float input — optimized quantize vs reference quantize.
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef<float>(
      inpVec.data(), rows, cols, outVecRef.data());
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>(
      inpVec.data(), rows, cols, outVecTest.data());
  // The number of input columns is the same as the number of output columns
  EXPECT_TRUE(isQEmbeddingClose<float>(outVecRef, outVecTest, rows, cols));
  // Phase 2: dequantize the same quantized buffer with both kernels.
  Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef<float>(
      outVecTest.data(), rows, out_cols, dequantOutRef.data());
  Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf<float>(
      outVecTest.data(), rows, out_cols, dequantOutTest.data());
  EXPECT_TRUE(floatCloseAll(dequantOutRef, dequantOutTest, 1e-3));
  // Phase 3: regenerate input restricted to float16-representable values so
  // the float and float16 input paths see numerically identical data.
  generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable {
    return cpu_half2float(cpu_float2half_rn(disFP(gen)));
  });
  vector<float16> inpHalfVec(rows * cols);
  std::transform(
      inpVec.begin(), inpVec.end(), inpHalfVec.begin(), [](float input) {
        return cpu_float2half_rn(input);
      });
  vector<uint8_t> outVecRefFromHalf(outVecSize);
  vector<uint8_t> outVecTestFromHalf(outVecSize);
  // float-input reference (overwrites outVecRef) vs float16-input reference.
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef<float>(
      inpVec.data(), rows, cols, outVecRef.data());
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef<float16>(
      inpHalfVec.data(), rows, cols, outVecRefFromHalf.data());
  EXPECT_TRUE(
      isQEmbeddingClose<float16>(outVecRefFromHalf, outVecRef, rows, cols));
  // float16-input optimized kernel vs float16-input reference.
  FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float16>(
      inpHalfVec.data(), rows, cols, outVecTestFromHalf.data());
  EXPECT_TRUE(isQEmbeddingClose<float16>(
      outVecRefFromHalf, outVecTestFromHalf, rows, cols));
  // Phase 4: dequantize to float vs to float16; tolerance widened by the
  // float16 mantissa width since the half output carries less precision.
  vector<float16> dequantOutHalfRef(rows * cols);
  vector<float16> dequantOutHalfTest(rows * cols);
  Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef<float>(
      outVecRef.data(), rows, out_cols, dequantOutRef.data());
  Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef<float16>(
      outVecRef.data(), rows, out_cols, dequantOutHalfRef.data());
  constexpr int NumberOfFP16Matissa = 9;
  EXPECT_TRUE(floatCloseAll(
      dequantOutRef, dequantOutHalfRef, 1e-3, pow(2, NumberOfFP16Matissa)));
  Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf<float16>(
      outVecRef.data(), rows, out_cols, dequantOutHalfTest.data());
  EXPECT_TRUE(floatCloseAll(
      dequantOutHalfRef,
      dequantOutHalfTest,
      1e-3,
      pow(2, NumberOfFP16Matissa)));
}