/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include "TestUtils.h" #include "fbgemm/FloatConversion.h" #include "fbgemm/QuantUtils.h" #include "fbgemm/Types.h" #include "fbgemm/Utils.h" using namespace std; using namespace fbgemm; // tuple represents K, C, X, G, layout_t // layout_t can be KCX or KXC class QuantizeGroupwiseTest : public testing::TestWithParam> {}; class QuantizeTest : public testing::TestWithParam {}; class FusedQuantizeDequantizeTest : public testing::TestWithParam {}; // Parameter are bit_rate (i.e., the number of bits in quantized values), // input rows, and input columns class EmbeddingQuantizeTest : public testing::TestWithParam> {}; // Parameter are input rows and input columns // Scale and Bias are of type float (SBFloat) class EmbeddingQuantizeSBFloatTest : public testing::TestWithParam> {}; INSTANTIATE_TEST_CASE_P( InstantiationName, QuantizeGroupwiseTest, ::testing::Combine( ::testing::ValuesIn({4, 12, 64}), // K ::testing::ValuesIn({12, 16, 32}), // C ::testing::ValuesIn({1, 10, 15, 30}), // X ::testing::ValuesIn({1, 4}), // G ::testing::ValuesIn({layout_t::KCX, layout_t::KXC}))); INSTANTIATE_TEST_CASE_P( InstantiationName, QuantizeTest, ::testing::Values(1, 2, 5, 8, 9, 16, 20, 28, 32, 33)); INSTANTIATE_TEST_CASE_P( InstantiationName, FusedQuantizeDequantizeTest, ::testing::Values(1, 2, 5, 8, 9, 16, 20, 28, 32, 33)); INSTANTIATE_TEST_CASE_P( InstantiationName, EmbeddingQuantizeTest, ::testing::Combine( ::testing::ValuesIn({2, 4, 8}), ::testing::ValuesIn({1, 2, 3}), ::testing::ValuesIn({4, 8, 16, 20, 28, 32, 64, 84}))); INSTANTIATE_TEST_CASE_P( InstantiationName, EmbeddingQuantizeSBFloatTest, ::testing::Combine( ::testing::ValuesIn({1, 2, 3}), ::testing::ValuesIn({1, 2, 5, 8, 9, 16, 20, 28, 32, 33, 64, 65}))); template void ref_impl( const vector& src, int K, int C, int X, int G, const vector& scales, const vector& zero_points, vector& dst) { int C_per_G = C / G; for (int i = 0; i < K; ++i) { for (int g = 0; g < G; ++g) { for (int c = 0; c < C / G; ++c) { for (int x = 0; x < X; ++x) { float num; if (LT == layout_t::KCX) { num = src[(i * C + g * C_per_G + c) * X + x]; } else { num = src[(i * X + x) * C + g * C_per_G + c]; } int res = nearbyint(zero_points[g] + num / scales[g]); T final_res = min( max(res, numeric_limits::min()), numeric_limits::max()); if (LT == layout_t::KCX) { dst[(i * C + g * C_per_G + c) * X + x] = final_res; } else { dst[(i * X + x) * C + g * C_per_G + c] = final_res; } } } } } } template void runTests( const vector& src, int K, int C, int X, int G, const vector& scales, const vector& zero_points, vector& dst, vector& dst_ref) { QuantizeGroupwise( src.data(), K, C, X, G, scales.data(), zero_points.data(), dst.data()); ref_impl(src, K, C, X, G, scales, zero_points, dst_ref); } /** * There can be off-by-one error in quantized values due to how the mid-point * cases are rounded-off in vectorized vs scalar codes and due to adding of * zero_point before rounding vs after rounding. We ignore such differences * while comparing results. */ template ::testing::AssertionResult isNear( const vector& res, const vector& res_ref) { bool match = true; if (res.size() == res_ref.size()) { for (size_t i = 0; i < res.size(); ++i) { if (!(res[i] == res_ref[i] || res[i] == res_ref[i] + 1 || res[i] == res_ref[i] - 1)) { match = false; break; } } } if (match) return ::testing::AssertionSuccess(); else return ::testing::AssertionFailure() << " Quantized results do not match"; } template ::testing::AssertionResult isQEmbeddingClose( const vector& res_ref, const vector& res, int out_rows, int out_emb_cols) { bool match = true; std::stringstream ss; int ld = out_emb_cols + 2 * sizeof(T); if (res.size() == res_ref.size()) { for (int i = 0; i < out_rows; ++i) { if (!match) { break; } // compare embedding values for (int j = 0; j < out_emb_cols; ++j) { if (res[i * ld + j] != res_ref[i * ld + j]) { match = false; ss << " mismatch at (" << i << ", " << j << ") "; ss << "ref: " << static_cast(res_ref[i * ld + j]) << ", test: " << static_cast(res[i * ld + j]) << "\n"; break; } } // compare scale/bias float scaleTest, scaleRef, biasTest, biasRef; if (is_same::value) { // half scale and bias scaleTest = cpu_half2float(reinterpret_cast( res.data() + i * ld + out_emb_cols)[0]); biasTest = cpu_half2float(reinterpret_cast( res.data() + i * ld + out_emb_cols)[1]); scaleRef = cpu_half2float(reinterpret_cast( res_ref.data() + i * ld + out_emb_cols)[0]); biasRef = cpu_half2float(reinterpret_cast( res_ref.data() + i * ld + out_emb_cols)[1]); } else { // float scale and bias scaleTest = reinterpret_cast( res.data() + i * ld + out_emb_cols)[0]; biasTest = reinterpret_cast( res.data() + i * ld + out_emb_cols)[1]; scaleRef = reinterpret_cast( res_ref.data() + i * ld + out_emb_cols)[0]; biasRef = reinterpret_cast( res_ref.data() + i * ld + out_emb_cols)[1]; } if (fabs(scaleTest - scaleRef) > std::numeric_limits::epsilon()) { ss << " scale mismatch for row:" << i; ss << " ref: " << scaleRef << ", test: " << scaleTest << "\n"; match = false; } if (fabs(biasTest - biasRef) > std::numeric_limits::epsilon()) { ss << " bias mismatch for row:" << i; ss << " ref: " << biasRef << ", test: " << biasTest << "\n"; match = false; } } } else { ss << " size mismatch "; match = false; } if (match) return ::testing::AssertionSuccess(); else return ::testing::AssertionFailure() << " Quantized Embeddings do not match." << ss.str(); } /** * Test for QuantizeGroupwise */ TEST_P(QuantizeGroupwiseTest, quantizeGTest) { int K, C, X, G; layout_t layout; tie(K, C, X, G, layout) = GetParam(); random_device rd; mt19937 gen(rd()); uniform_real_distribution disFP(0.1, 1.1); vector inp(K * C * X); generate(inp.begin(), inp.end(), [&, disFP]() mutable { return disFP(gen); }); vector scales(G); generate(scales.begin(), scales.end(), [&, disFP]() mutable { return disFP(gen); }); uniform_int_distribution<> disUInt8(0, 8); vector zero_points_uint8(G); generate( zero_points_uint8.begin(), zero_points_uint8.end(), [&, disUInt8]() mutable { return disUInt8(gen); }); uniform_int_distribution<> disInt8(-64, 63); vector zero_points_int8(G); generate( zero_points_int8.begin(), zero_points_int8.end(), [&, disInt8]() mutable { return disInt8(gen); }); uniform_int_distribution<> disInt32(-512, 512); vector zero_points_int32(G); generate( zero_points_int32.begin(), zero_points_int32.end(), [&, disInt32]() mutable { return disInt32(gen); }); vector dstuint8(K * C * X); vector dstuint8_ref(K * C * X); vector dstint8(K * C * X); vector dstint8_ref(K * C * X); vector dstint32(K * C * X); vector dstint32_ref(K * C * X); if (layout == layout_t::KCX) { runTests( inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref); runTests( inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref); runTests( inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref); } else { runTests( inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref); runTests( inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref); runTests( inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref); } EXPECT_TRUE(isNear(dstuint8, dstuint8_ref)); EXPECT_TRUE(isNear(dstint8, dstint8_ref)); EXPECT_TRUE(isNear(dstint32, dstint32_ref)); } template void runQuantizeTests( const vector& src, float scale, int zero_point, vector& dst, vector& dst_ref) { // reference for (size_t i = 0; i < src.size(); ++i) { dst_ref[i] = Quantize(src[i], zero_point, scale, CHAR_BIT * sizeof(T)); } TensorQuantizationParams qparams; qparams.scale = scale; qparams.zero_point = zero_point; qparams.precision = CHAR_BIT * sizeof(T); Quantize(src.data(), dst.data(), src.size(), qparams); } /** * Test for QuantizeGroupwise */ TEST_P(QuantizeTest, quantizeTest) { int len; len = GetParam(); random_device rd; mt19937 gen(rd()); uniform_real_distribution disFP(-1.0e6, 1.0e6); vector inp(len); generate(inp.begin(), inp.end(), [&, disFP]() mutable { return disFP(gen); }); float scale = disFP(gen); // Generate a number between [0, 255] both inclusive uniform_int_distribution<> disUInt8(0, 255); int zero_point_uint8 = disUInt8(gen); uniform_int_distribution<> disInt8(-128, 127); int zero_point_int8 = disInt8(gen); vector dstuint8(len); vector dstuint8_ref(len); vector dstint8(len); vector dstint8_ref(len); runQuantizeTests( inp, scale, zero_point_uint8, dstuint8, dstuint8_ref); runQuantizeTests(inp, scale, zero_point_int8, dstint8, dstint8_ref); EXPECT_TRUE(isNear(dstuint8, dstuint8_ref)); EXPECT_TRUE(isNear(dstint8, dstint8_ref)); } // vector and scalar code should have the same behavior TEST(QuantizeTestSingle, vectorScalar) { // This length will exercise both the vector and scalar path int len = 33; vector src(len); vector dst(len); for (int i = 0; i < len; ++i) { src[i] = -2.9483526e-05; } float scale = 2.3124334356729307e-07; int zero_point = 128; TensorQuantizationParams qparams; qparams.scale = scale; qparams.zero_point = zero_point; qparams.precision = CHAR_BIT * sizeof(uint8_t); Quantize(src.data(), dst.data(), len, qparams); // Check if all elements are equal EXPECT_TRUE( adjacent_find(dst.begin(), dst.end(), not_equal_to()) == dst.end()); } TEST(QuantizeTest, cornerCases) { TensorQuantizationParams qparams; qparams.scale = 1.19209e-07; qparams.zero_point = 0; qparams.precision = 8; std::vector src1 = {3.40282e+38, -2.16845e+38}; std::vector dst_int8(src1.size()); Quantize(src1.data(), dst_int8.data(), dst_int8.size(), qparams); EXPECT_EQ(dst_int8[0], 127); EXPECT_EQ(dst_int8[1], -128); // Tests vectorized and remainder paths std::vector src2 = { 3.40282e+38, -2.16845e+38, 3.40282e+38, -2.16845e+38, 3.40282e+38, -2.16845e+38, 3.40282e+38, -2.16845e+38, 3.40282e+38}; std::vector dst_uint8(src2.size()); Quantize(src2.data(), dst_uint8.data(), dst_uint8.size(), qparams); EXPECT_EQ(dst_uint8[0], 255); EXPECT_EQ(dst_uint8[1], 0); EXPECT_EQ(dst_uint8[8], 255); qparams.precision = 16; std::vector dst_int16(src2.size()); Quantize(src2.data(), dst_int16.data(), dst_int16.size(), qparams); EXPECT_EQ(dst_int16[0], 32767); EXPECT_EQ(dst_int16[1], -32768); } TEST(QuantizeTestQParams, chooseQParamsSymmetric) { // Test that symmetric quantization of weights set zero point exactly to 0. float min = -1.6165; float max = 0.5685; int32_t qmin = -128; int32_t qmax = 127; bool preserve_sparsity = true; TensorQuantizationParams result = ChooseQuantizationParams(min, max, qmin, qmax, preserve_sparsity); EXPECT_FLOAT_EQ(result.scale, 0.012628906); EXPECT_EQ(result.zero_point, 0); } template void runFusedQuantizeDequantizeTests( const vector& src, float scale, int zero_point, vector& dst, vector& dst_ref) { TensorQuantizationParams qparams; qparams.scale = scale; qparams.zero_point = zero_point; qparams.precision = CHAR_BIT * sizeof(T); // reference for (size_t i = 0; i < src.size(); ++i) { dst_ref[i] = FusedQuantizeDequantize(src[i], qparams); } FusedQuantizeDequantize(src.data(), dst.data(), src.size(), qparams); } TEST_P(FusedQuantizeDequantizeTest, fusedQuantizeDequantizeTest) { int len; len = GetParam(); random_device rd; mt19937 gen(rd()); uniform_real_distribution disFP(-1.0e6, 1.0e6); vector inp(len); generate(inp.begin(), inp.end(), [&, disFP]() mutable { return disFP(gen); }); float scale = disFP(gen); // Generate a number between [0, 255] both inclusive uniform_int_distribution<> disUInt8(0, 255); int zero_point_uint8 = disUInt8(gen); uniform_int_distribution<> disInt8(-128, 127); int zero_point_int8 = disInt8(gen); vector dstfloat(len); vector dstfloat_ref(len); runFusedQuantizeDequantizeTests( inp, scale, zero_point_uint8, dstfloat, dstfloat_ref); EXPECT_TRUE(floatCloseAll(dstfloat, dstfloat_ref)); runFusedQuantizeDequantizeTests( inp, scale, zero_point_int8, dstfloat, dstfloat_ref); EXPECT_TRUE(floatCloseAll(dstfloat, dstfloat_ref)); } // vector and scalar code should have the same behavior TEST(FusedQuantizeDequantizeTestSingle, vectorScalar) { // This length will exercise both the vector and scalar path int len = 33; vector src(len); vector dst(len); for (int i = 0; i < len; ++i) { src[i] = -2.9483526e-05; } float scale = 2.3124334356729307e-07; int zero_point = 128; TensorQuantizationParams qparams; qparams.scale = scale; qparams.zero_point = zero_point; qparams.precision = CHAR_BIT * sizeof(uint8_t); FusedQuantizeDequantize(src.data(), dst.data(), src.size(), qparams); // Check if all elements are equal EXPECT_TRUE( adjacent_find(dst.begin(), dst.end(), not_equal_to()) == dst.end()); } TEST(FusedQuantizeDequantizeTest, cornerCases) { TensorQuantizationParams qparams; qparams.scale = 1.19209e-07; qparams.zero_point = 0; qparams.precision = 8; vector src1 = {3.40282e+38, -2.16845e+38}; vector ref = {1.5139543e-05, -1.5258752e-05}; vector dst_int8(src1.size()); FusedQuantizeDequantize( src1.data(), dst_int8.data(), src1.size(), qparams); EXPECT_TRUE(floatCloseAll(dst_int8, ref)); // Tests vectorized and remainder paths vector src2 = { 3.40282e+38, -2.16845e+38, 3.40282e+38, -2.16845e+38, 3.40282e+38, -2.16845e+38, 3.40282e+38, -2.16845e+38, 3.40282e+38}; vector ref2 = { 3.0398295e-05, 0, 3.0398295e-05, 0, 3.0398295e-05, 0, 3.0398295e-05, 0, 3.0398295e-05}; std::vector dst_uint8(src2.size(), 0); FusedQuantizeDequantize( src2.data(), dst_uint8.data(), src2.size(), qparams); EXPECT_TRUE(floatCloseAll(dst_uint8, ref2)); } // Parameter are bit_rate (i.e., the number of bits in quantized values). class EmbeddingQuantizeFixedNumberTest : public testing::TestWithParam { protected: // clang-format off EmbeddingQuantizeFixedNumberTest() { float_test_input = { 1, 1, 1, 1, // All the same. Range: 0, min: 1 -64, -2.75, 61.625, 191, // Range: 255, min: -64. Picking 61.625 because it differs under FP16 (will become 61.5). }; assert(float_test_input.size() == row * col); float16_test_input.resize(float_test_input.size()); std::transform( float_test_input.begin(), float_test_input.end(), float16_test_input.begin(), [](float input) { return cpu_float2half_rn(input); }); // Results are hand calculated. expected_output_half[8] = { 0, 0, 0, 0, 0x00, 0x3c, 0x00, 0x3c, // Scale: 1, bias: 1 0, 61, 126, 255, 0x00, 0x3c, 0x00, 0xd4, // Scale: 1, bias: -64 }; expected_output_half[4] = { 0x00, 0x00, 0x00, 0x3c, 0x00, 0x3c, // 0, 0, 0, 0, Scale: 1, bias: 1 0x40, 0xf7, 0x40, 0x4c, 0x00, 0xd4, // 0, 4, 7, 15, Scale: 17, bias: -64 0, 0, 0, 0 // Padding }; expected_output_half[2] = { 0b00000000, 0x00, 0x3c, 0x00, 0x3c, // 0, 0, 0, 0, Scale: 1, bias: 1 0b11010100, 0x50, 0x55, 0x00, 0xd4, // 0, 1, 1, 3, Scale: 85, bias: -64 0, 0, 0, 0, 0, 0 // Padding }; expected_output_float = { 0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, // Scale: 0, bias: 1 0, 61, 126, 255, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x80, 0xc2, // Scale: 1, bias: -64 }; } // clang-format on const int row = 2; const int col = 4; const int out_cols_half = col + 2 * sizeof(float16); const int out_cols_float = col + 2 * sizeof(float); std::vector float_test_input; std::vector float16_test_input; std::map> expected_output_half; std::vector expected_output_float; }; INSTANTIATE_TEST_CASE_P( InstantiationName, EmbeddingQuantizeFixedNumberTest, ::testing::ValuesIn({2, 4, 8})); TEST_P(EmbeddingQuantizeFixedNumberTest, embeddingFloatToQuantizedSBHalfTest) { const int bit_rate = GetParam(); vector outVectHalfTest(row * out_cols_half); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( bit_rate, float_test_input.data(), row, col, outVectHalfTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_half[bit_rate], outVectHalfTest, row, col)); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf( bit_rate, float_test_input.data(), row, col, outVectHalfTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_half[bit_rate], outVectHalfTest, row, col)); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( bit_rate, float16_test_input.data(), row, col, outVectHalfTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_half[bit_rate], outVectHalfTest, row, col)); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf( bit_rate, float16_test_input.data(), row, col, outVectHalfTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_half[bit_rate], outVectHalfTest, row, col)); vector outVecFloatTest(row * out_cols_float); FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef( float_test_input.data(), row, col, outVecFloatTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_float, outVecFloatTest, row, col)); FloatOrHalfToFused8BitRowwiseQuantizedSBFloat( float_test_input.data(), row, col, outVecFloatTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_float, outVecFloatTest, row, col)); FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef( float16_test_input.data(), row, col, outVecFloatTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_float, outVecFloatTest, row, col)); FloatOrHalfToFused8BitRowwiseQuantizedSBFloat( float16_test_input.data(), row, col, outVecFloatTest.data()); EXPECT_TRUE(isQEmbeddingClose( expected_output_float, outVecFloatTest, row, col)); } // Scale and bias are of type float16 TEST_P(EmbeddingQuantizeTest, embeddingHalfTest) { int bit_rate, rows, cols; tie(bit_rate, rows, cols) = GetParam(); random_device rd; mt19937 gen(rd()); uniform_real_distribution disFP(-10.0f, 10.0f); vector inpVec(rows * cols); vector dequantOutRef(rows * cols); vector dequantOutTest(rows * cols); generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable { return disFP(gen); }); int elements_per_byte = 8 / bit_rate; int out_emb_cols = (cols + elements_per_byte - 1) / elements_per_byte; int out_cols = out_emb_cols + 2 * sizeof(float16); int outVecSize = rows * out_cols; vector outVecRef(outVecSize); vector outVecTest(outVecSize); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( bit_rate, inpVec.data(), rows, cols, outVecRef.data()); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf( bit_rate, inpVec.data(), rows, cols, outVecTest.data()); EXPECT_TRUE( isQEmbeddingClose(outVecRef, outVecTest, rows, out_emb_cols)); FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef( bit_rate, outVecTest.data(), rows, out_cols, dequantOutRef.data()); FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf( bit_rate, outVecTest.data(), rows, out_cols, dequantOutTest.data()); EXPECT_TRUE(floatCloseAll(dequantOutRef, dequantOutTest, 1e-3)); generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable { return cpu_half2float(cpu_float2half_rn(disFP(gen))); }); vector inpHalfVec(rows * cols); std::transform( inpVec.begin(), inpVec.end(), inpHalfVec.begin(), [](float input) { return cpu_float2half_rn(input); }); vector outVecRefFromHalf(outVecSize); vector outVecTestFromHalf(outVecSize); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( bit_rate, inpVec.data(), rows, cols, outVecRef.data()); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( bit_rate, inpHalfVec.data(), rows, cols, outVecRefFromHalf.data()); EXPECT_TRUE(isQEmbeddingClose( outVecRefFromHalf, outVecRef, rows, out_emb_cols)); FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf( bit_rate, inpHalfVec.data(), rows, cols, outVecTestFromHalf.data()); EXPECT_TRUE(isQEmbeddingClose( outVecRefFromHalf, outVecTestFromHalf, rows, out_emb_cols)); vector dequantOutHalfRef(rows * cols); vector dequantOutHalfTest(rows * cols); FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef( bit_rate, outVecRef.data(), rows, out_cols, dequantOutRef.data()); FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef( bit_rate, outVecRef.data(), rows, out_cols, dequantOutHalfRef.data()); constexpr int NumberOfFP16Matissa = 9; EXPECT_TRUE(floatCloseAll( dequantOutRef, dequantOutHalfRef, 1e-3, pow(2, NumberOfFP16Matissa))); FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf( bit_rate, outVecRef.data(), rows, out_cols, dequantOutHalfTest.data()); EXPECT_TRUE(floatCloseAll( dequantOutHalfRef, dequantOutHalfTest, 1e-3, pow(2, NumberOfFP16Matissa))); } // Scale and bias are of type float TEST_P(EmbeddingQuantizeSBFloatTest, embeddingFloatTest) { int rows, cols; tie(rows, cols) = GetParam(); random_device rd; mt19937 gen(rd()); uniform_real_distribution disFP(-10.0f, 10.0f); vector inpVec(rows * cols); vector dequantOutTest(rows * cols); vector dequantOutRef(rows * cols); generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable { return disFP(gen); }); int out_cols = cols + 2 * sizeof(float); int outVecSize = rows * out_cols; vector outVecRef(outVecSize); vector outVecTest(outVecSize); FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef( inpVec.data(), rows, cols, outVecRef.data()); FloatOrHalfToFused8BitRowwiseQuantizedSBFloat( inpVec.data(), rows, cols, outVecTest.data()); // The number of input columns is the same as the number of output columns EXPECT_TRUE(isQEmbeddingClose(outVecRef, outVecTest, rows, cols)); Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef( outVecTest.data(), rows, out_cols, dequantOutRef.data()); Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf( outVecTest.data(), rows, out_cols, dequantOutTest.data()); EXPECT_TRUE(floatCloseAll(dequantOutRef, dequantOutTest, 1e-3)); generate(inpVec.begin(), inpVec.end(), [&, disFP]() mutable { return cpu_half2float(cpu_float2half_rn(disFP(gen))); }); vector inpHalfVec(rows * cols); std::transform( inpVec.begin(), inpVec.end(), inpHalfVec.begin(), [](float input) { return cpu_float2half_rn(input); }); vector outVecRefFromHalf(outVecSize); vector outVecTestFromHalf(outVecSize); FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef( inpVec.data(), rows, cols, outVecRef.data()); FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef( inpHalfVec.data(), rows, cols, outVecRefFromHalf.data()); EXPECT_TRUE( isQEmbeddingClose(outVecRefFromHalf, outVecRef, rows, cols)); FloatOrHalfToFused8BitRowwiseQuantizedSBFloat( inpHalfVec.data(), rows, cols, outVecTestFromHalf.data()); EXPECT_TRUE(isQEmbeddingClose( outVecRefFromHalf, outVecTestFromHalf, rows, cols)); vector dequantOutHalfRef(rows * cols); vector dequantOutHalfTest(rows * cols); Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef( outVecRef.data(), rows, out_cols, dequantOutRef.data()); Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef( outVecRef.data(), rows, out_cols, dequantOutHalfRef.data()); constexpr int NumberOfFP16Matissa = 9; EXPECT_TRUE(floatCloseAll( dequantOutRef, dequantOutHalfRef, 1e-3, pow(2, NumberOfFP16Matissa))); Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf( outVecRef.data(), rows, out_cols, dequantOutHalfTest.data()); EXPECT_TRUE(floatCloseAll( dequantOutHalfRef, dequantOutHalfTest, 1e-3, pow(2, NumberOfFP16Matissa))); }