/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#include <gtest/gtest.h>

#include "./QuantizationHelpers.h"
#include "./TestUtils.h"
#include "bench/BenchUtils.h"
#include "fbgemm/Fbgemm.h"
#include "src/RefImplementations.h"

using namespace std;
using namespace fbgemm;

vector<matrix_op_t> transposeVals{
    matrix_op_t::NoTranspose,
    matrix_op_t::Transpose};

vector<QuantizationGranularity> qGranularityVals{
    QuantizationGranularity::TENSOR,
    QuantizationGranularity::GROUP,
    QuantizationGranularity::OUT_CHANNEL};

namespace {
class fbgemmu8s8acc16WithQuantGranularityTest
    : public testing::TestWithParam<
          tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
class fbgemmu8s8acc16Test
    : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
class fbgemmPackUnpackAcc16Test
    : public testing::TestWithParam<tuple<matrix_op_t, bool>> {};
} // namespace

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    fbgemmu8s8acc16WithQuantGranularityTest,
    ::testing::Combine(
        ::testing::Values(matrix_op_t::NoTranspose),
        ::testing::ValuesIn(transposeVals),
        ::testing::Bool(),
        ::testing::ValuesIn(qGranularityVals)));

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    fbgemmu8s8acc16Test,
    ::testing::Combine(
        ::testing::Values(matrix_op_t::NoTranspose),
        ::testing::ValuesIn(transposeVals),
        ::testing::Bool()));

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    fbgemmPackUnpackAcc16Test,
    ::testing::Combine(::testing::ValuesIn(transposeVals), ::testing::Bool()));

/**
 * @brief Shapes for unit test.
 */
static vector<vector<int>> GetShapes_() {
  // clang-format off
  // NMT
  vector<vector<int>> shapes = {
      // {M, N, K}
      {1, 128, 512},
      {1, 1024, 256},
      {1, 2048, 512},
      {1, 2048, 513},
      {1, 2048, 514},
      {6, 512, 512},
      {6, 2048, 512},
      {6, 256, 1024},
      {6, 1024, 256},
      {6, 2048, 256},
      {6, 2048, 257},
      {6, 2048, 258},
      {102, 1024, 512},
      {102, 2323, 256},
      {102, 512, 256},
      {102, 512, 257},
      {102, 512, 258},
      {1024, 512, 258},
      {120, 4, 288},
  };
  // clang-format on
  return shapes;
}

/**
 * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
 * accumulation. Output processing: requantization -> nothing
 */
TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, Test) {
  cpuinfo_initialize();
  if (fbgemmHasAvx512VnniSupport()) {
    // No need to use acc16 if VNNI is available
    return;
  }

  vector<vector<int>> shapes(GetShapes_());
  matrix_op_t atrans, btrans;
  bool test_ld;
  QuantizationGranularity q_granularity;
  tie(atrans, btrans, test_ld, q_granularity) = GetParam();

  for (auto shape : shapes) {
    for (int groups : {1, 3, 4}) {
      int m = shape[0];
      int n = shape[1];
      int k = shape[2];
      if (k % groups != 0) {
        continue;
      }
      int k_per_group = k / groups;

      aligned_vector<uint8_t> Aint8(m * k);
      aligned_vector<int8_t> Bint8_ref(k * n);

      aligned_vector<int32_t> Cint32_ref(m * n * groups);
      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
      aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());

      randFill<uint8_t>(Aint8, 0, 255);
      int32_t Aint8_zero_point = 43;

      randFill<int8_t>(Bint8_ref, -128, 127);
      aligned_vector<int8_t> Bint8(Bint8_ref);

      if (btrans == matrix_op_t::Transpose) {
        aligned_vector<int8_t> Bint8_temp(Bint8.size());
        for (int g = 0; g < groups; ++g) {
          transpose_matrix(
              k_per_group,
              n,
              Bint8.data() + g * k_per_group * n,
              n,
              Bint8_temp.data() + g * k_per_group * n,
              k_per_group);
        }
        Bint8 = Bint8_temp;
      }

      // To test a leading dimension different from the number of columns,
      // we just halve n and keep using the original n as the leading
      // dimension.
      int n_adjusted = n;
      if (test_ld) {
        assert(
            atrans == matrix_op_t::NoTranspose &&
            "This case is not handled yet");
        if (btrans == matrix_op_t::NoTranspose) {
          n_adjusted = std::max(n / 2, 1);
        }
      }

      int ncols_per_quant_group = groups * n_adjusted;
      if (q_granularity == QuantizationGranularity::GROUP) {
        ncols_per_quant_group = n_adjusted;
      } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
        ncols_per_quant_group = 1;
      }

      aligned_vector<int32_t> Bint8_zero_point(
          groups * n_adjusted / ncols_per_quant_group);
      randFill<int32_t>(Bint8_zero_point, -60, 0);

      // computing column offset
      vector<int32_t> col_offsets(groups * n_adjusted);
      for (int g = 0; g < groups; ++g) {
        col_offsets_with_zero_pt_s8acc32_ref(
            k_per_group,
            n_adjusted,
            n,
            Bint8_ref.data() + g * k_per_group * n,
            Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
            col_offsets.data() + g * n_adjusted,
            ncols_per_quant_group);
      }

      vector<int32_t> row_offsets(m);

      aligned_vector<float> C_multiplier(Bint8_zero_point.size());
      randFill<float>(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
      int32_t C_zero_pt = 5;

      // brow = 256: the reference accumulates in 16 bits within blocks of
      // brow rows of B and spills to 32 bits between blocks, matching the
      // acc16 kernel's accumulation behavior.
      int brow = 256;
      for (int g = 0; g < groups; ++g) {
        matmul_u8i8acc16_ref(
            m,
            n_adjusted,
            k_per_group,
            k,
            n,
            groups * n,
            brow,
            Aint8.data() + g * k_per_group,
            Bint8_ref.data() + g * k_per_group * n,
            Cint32_ref.data() + g * n_adjusted);

        row_offsets_u8acc32_ref(
            m,
            k_per_group,
            k,
            Aint8.data() + g * k_per_group,
            row_offsets.data());

        requantize_u8acc32_ref(
            m,
            n_adjusted,
            groups * n,
            Cint32_ref.data() + g * n_adjusted,
            Cint8_ref.data() + g * n_adjusted,
            C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
            C_zero_pt,
            Aint8_zero_point,
            Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
            row_offsets.data(),
            col_offsets.data() + g * n_adjusted,
            nullptr,
            ncols_per_quant_group);
      }
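      // For reference, the row/col offsets above implement the usual
      // zero-point expansion of a quantized GEMM (a sketch of the math, not
      // FBGEMM's exact folding of the terms):
      //
      //   sum_k (A(i,k) - a_zp) * (B(k,j) - b_zp)
      //     = sum_k A(i,k) * B(k,j) - a_zp * colsum_j(B)
      //       - b_zp * rowsum_i(A) + K * a_zp * b_zp
      //
      // requantize_u8acc32_ref then roughly scales this corrected int32
      // value by C_multiplier, adds C_zero_pt, and clamps to the uint8
      // range.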
      PackBMatrix<int8_t, int16_t> packedBN(
          btrans,
          k,
          n_adjusted,
          Bint8.data(),
          (btrans == matrix_op_t::Transpose) ? k_per_group : n,
          nullptr,
          groups);

#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        vector<int32_t> row_offset_buf(
            PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());

        PackAWithRowOffset<uint8_t, int16_t> packAN(
            matrix_op_t::NoTranspose,
            m,
            k,
            Aint8.data(),
            k,
            nullptr,
            groups,
            row_offset_buf.data());

        int num_threads = fbgemm_get_num_threads();
        int tid = fbgemm_get_thread_num();

        DoNothing<> doNothingObj{};

        if (q_granularity == QuantizationGranularity::TENSOR) {
          ReQuantizeOutput<false> outputProcObj(
              doNothingObj,
              C_multiplier.data(),
              C_zero_pt,
              Aint8_zero_point,
              Bint8_zero_point.data(),
              packAN.getRowOffsetBuffer(),
              col_offsets.data(),
              nullptr,
              groups * n_adjusted,
              groups);

          fbgemmPacked(
              packAN,
              packedBN,
              Cint8_fb.data(),
              Cint32_buffer.data(),
              groups * n,
              outputProcObj,
              tid,
              num_threads);
        } else if (q_granularity == QuantizationGranularity::GROUP) {
          ReQuantizeOutput<false, QuantizationGranularity::GROUP> outputProcObj(
              doNothingObj,
              C_multiplier.data(),
              C_zero_pt,
              Aint8_zero_point,
              Bint8_zero_point.data(),
              packAN.getRowOffsetBuffer(),
              col_offsets.data(),
              nullptr,
              groups * n_adjusted,
              groups);

          fbgemmPacked(
              packAN,
              packedBN,
              Cint8_fb.data(),
              Cint32_buffer.data(),
              groups * n,
              outputProcObj,
              tid,
              num_threads);
        } else {
          ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
              outputProcObj(
                  doNothingObj,
                  C_multiplier.data(),
                  C_zero_pt,
                  Aint8_zero_point,
                  Bint8_zero_point.data(),
                  packAN.getRowOffsetBuffer(),
                  col_offsets.data(),
                  nullptr,
                  groups * n_adjusted,
                  groups);

          fbgemmPacked(
              packAN,
              packedBN,
              Cint8_fb.data(),
              Cint32_buffer.data(),
              groups * n,
              outputProcObj,
              tid,
              num_threads);
        }
      } // omp parallel

      compare_validate_buffers(
          Cint8_ref.data(),
          Cint8_fb.data(),
          m,
          groups * n_adjusted,
          groups * n,
          static_cast<uint8_t>(0));
    } // for each groups
  } // for each shape
}

/**
 * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
 * accumulation. Output processing: spmdm -> requantization -> nothing
 */
TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, SpMDMTest) {
  cpuinfo_initialize();
  if (fbgemmHasAvx512VnniSupport()) {
    // No need to use acc16 if VNNI is available
    return;
  }

  vector<vector<int>> shapes(GetShapes_());
  matrix_op_t atrans, btrans;
  bool test_ld;
  QuantizationGranularity q_granularity;
  tie(atrans, btrans, test_ld, q_granularity) = GetParam();

  for (auto shape : shapes) {
    for (int groups : {1, 3, 4}) {
      // very small density to test the hyper-sparsity case
      // moderate density to test the implementation using transpose
      for (float density : {0.0001f, 0.1f}) {
        int m = shape[0];
        int n = shape[1];
        int k = shape[2];
        if (k % groups != 0) {
          continue;
        }
        int k_per_group = k / groups;

        aligned_vector<uint8_t> Aint8(m * k);
        aligned_vector<int8_t> Bint8(k * n);

        aligned_vector<int32_t> Cint32_ref(m * n * groups);
        aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
        aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
        aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
        aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());

        randFill<uint8_t>(Aint8, 0, 255);
        int32_t Aint8_zero_point = 43;

        randFill<int8_t>(Bint8, -128, 127);

        // To test a leading dimension different from the number of columns,
        // we just halve n and keep using the original n as the leading
        // dimension.
        int n_adjusted = n;
        if (test_ld) {
          assert(
              atrans == matrix_op_t::NoTranspose &&
              "This case is not handled yet");
          if (btrans == matrix_op_t::NoTranspose) {
            n_adjusted = std::max(n / 2, 1);
          }
        }

        int ncols_per_quant_group = groups * n_adjusted;
        if (q_granularity == QuantizationGranularity::GROUP) {
          ncols_per_quant_group = n_adjusted;
        } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
          ncols_per_quant_group = 1;
        }

        aligned_vector<int32_t> Bint8_zero_point(
            groups * n_adjusted / ncols_per_quant_group);
        randFill<int32_t>(Bint8_zero_point, -50, -10);

        // computing column offset
        vector<int32_t> col_offsets(groups * n_adjusted);
        for (int g = 0; g < groups; ++g) {
          col_offsets_with_zero_pt_s8acc32_ref(
              k_per_group,
              n_adjusted,
              n,
              Bint8.data() + g * k_per_group * n,
              Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
              col_offsets.data() + g * n_adjusted,
              ncols_per_quant_group);
        }

        CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted);

        // Make sure density is big enough. Otherwise, we're not really
        // testing spmdm.

        // random number generator for the sparsity pattern
        random_device r;
        default_random_engine eng(r());
        binomial_distribution<> per_col_nnz_dist(k_per_group, density);

        vector<int> row_indices(k_per_group);
        int total_nnz = 0;
        for (int g = 0; g < groups; ++g) {
          for (int j = 0; j < n_adjusted; ++j) {
            B_csc.ColPtr()[g * n_adjusted + j] = total_nnz;

            int nnz_of_j = per_col_nnz_dist(eng);
            total_nnz += nnz_of_j;

            iota(row_indices.begin(), row_indices.end(), 0);
            shuffle(row_indices.begin(), row_indices.end(), eng);
            sort(row_indices.begin(), row_indices.begin() + nnz_of_j);

            for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
              int rowidx = row_indices[kidx];
              B_csc.RowIdx().push_back(rowidx);

              int8_t* bptr = &Bint8[(g * k_per_group + rowidx) * n + j];
              int b_remainder = 0;
              if (kidx % 2 == 1) {
                // Make sure abs(b_prev + *bptr - b_remainder) <= 128
                int b_prev = B_csc.Values().back();
                b_remainder = std::max(b_prev + *bptr - 128, b_remainder);
                b_remainder = std::min(b_prev + *bptr + 128, b_remainder);
              }
              // put the portion of the current B value that won't saturate
              // _mm256_maddubs_epi16.
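              // Why this matters: the acc16 kernel sums pairs of
              // uint8 * int8 products with _mm256_maddubs_epi16, which
              // saturates at int16. Unconstrained, 255 * 127 + 255 * 127 =
              // 64770 > 32767 would overflow; keeping the paired sum within
              // [-128, 128] bounds it by 255 * 128 = 32640.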
              B_csc.Values().push_back(*bptr - b_remainder);
              // put the remainder
              *bptr = b_remainder;
            }
          }
        }
        B_csc.ColPtr()[groups * n_adjusted] = total_nnz;

        aligned_vector<int8_t> Bint8_ref(Bint8);

        if (btrans == matrix_op_t::Transpose) {
          aligned_vector<int8_t> Bint8_temp(Bint8.size());
          for (int g = 0; g < groups; ++g) {
            transpose_matrix(
                k_per_group,
                n,
                Bint8.data() + g * k_per_group * n,
                n,
                Bint8_temp.data() + g * k_per_group * n,
                k_per_group);
          }
          Bint8 = Bint8_temp;
        }

        vector<int32_t> row_offsets(m);

        aligned_vector<float> C_multiplier(Bint8_zero_point.size());
        randFill<float>(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
        int32_t C_zero_pt = 5;

        int brow = 256;
        for (int g = 0; g < groups; ++g) {
          matmul_u8i8acc16_ref(
              m,
              n_adjusted,
              k_per_group,
              k,
              n,
              groups * n,
              brow,
              Aint8.data() + g * k_per_group,
              Bint8_ref.data() + g * k_per_group * n,
              Cint32_ref.data() + g * n_adjusted);
        }

        bool accumulation = true;
        spmdm_ref(
            m,
            Aint8.data(),
            k,
            B_csc,
            accumulation,
            Cint32_ref.data(),
            groups * n,
            groups);

        for (int g = 0; g < groups; ++g) {
          row_offsets_u8acc32_ref(
              m,
              k_per_group,
              k,
              Aint8.data() + g * k_per_group,
              row_offsets.data());

          requantize_u8acc32_ref(
              m,
              n_adjusted,
              groups * n,
              Cint32_ref.data() + g * n_adjusted,
              Cint8_ref.data() + g * n_adjusted,
              C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
              C_zero_pt,
              Aint8_zero_point,
              Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
              row_offsets.data(),
              col_offsets.data() + g * n_adjusted,
              nullptr,
              ncols_per_quant_group);
        }

        PackBMatrix<int8_t, int16_t> packedB(
            btrans,
            k,
            n_adjusted,
            Bint8.data(),
            (btrans == matrix_op_t::Transpose) ? k_per_group : n,
            nullptr,
            groups);

#ifdef _OPENMP
#pragma omp parallel
#endif
        {
          vector<int32_t> row_offset_buf(
              PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());

          PackAWithRowOffset<uint8_t, int16_t> packAN(
              matrix_op_t::NoTranspose,
              m,
              k,
              Aint8.data(),
              k,
              nullptr,
              groups,
              row_offset_buf.data());

          int num_threads = fbgemm_get_num_threads();
          int tid = fbgemm_get_thread_num();

          // spmdm -> requantization -> nothing
          // construct an output processing pipeline in reverse order,
          // i.e. last output operation first.
          // The last operation should always be DoNothing with
          // the correct input and output type.
          DoNothing<> doNothingObj{};
          if (q_granularity == QuantizationGranularity::TENSOR) {
            // The second to last operation is requantization back to int8
            ReQuantizeOutput<false> reqObj(
                doNothingObj,
                C_multiplier.data(),
                C_zero_pt,
                Aint8_zero_point,
                Bint8_zero_point.data(),
                packAN.getRowOffsetBuffer(),
                col_offsets.data(),
                nullptr,
                groups * n_adjusted,
                groups);

            // the top most (first) operation in the output processing
            // pipeline is spmdm.
            // outType = final output type after fully processing through the
            // pipeline; inType = initial input type at the first call to the
            // whole pipeline
            DoSpmdmOnInpBuffer<
                ReQuantizeOutput<false>::outType,
                int32_t,
                ReQuantizeOutput<false>>
                spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);

            fbgemmPacked(
                packAN,
                packedB,
                Cint8_fb.data(),
                Cint32_fb.data(),
                groups * n,
                spmdmObj,
                tid,
                num_threads);
          } else if (q_granularity == QuantizationGranularity::GROUP) {
            ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj(
                doNothingObj,
                C_multiplier.data(),
                C_zero_pt,
                Aint8_zero_point,
                Bint8_zero_point.data(),
                packAN.getRowOffsetBuffer(),
                col_offsets.data(),
                nullptr,
                groups * n_adjusted,
                groups);

            DoSpmdmOnInpBuffer<
                ReQuantizeOutput<false, QuantizationGranularity::GROUP>::
                    outType,
                int32_t,
                ReQuantizeOutput<false, QuantizationGranularity::GROUP>>
                spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);

            fbgemmPacked(
                packAN,
                packedB,
                Cint8_fb.data(),
                Cint32_fb.data(),
                groups * n,
                spmdmObj,
                tid,
                num_threads);
          } else {
            ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
                reqObj(
                    doNothingObj,
                    C_multiplier.data(),
                    C_zero_pt,
                    Aint8_zero_point,
                    Bint8_zero_point.data(),
                    packAN.getRowOffsetBuffer(),
                    col_offsets.data(),
                    nullptr,
                    groups * n_adjusted,
                    groups);

            DoSpmdmOnInpBuffer<
                ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>::
                    outType,
                int32_t,
                ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>>
                spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);

            fbgemmPacked(
                packAN,
                packedB,
                Cint8_fb.data(),
                Cint32_fb.data(),
                groups * n,
                spmdmObj,
                tid,
                num_threads);
          }
        }

        compare_validate_buffers(
            Cint8_ref.data(),
            Cint8_fb.data(),
            m,
            groups * n_adjusted,
            groups * n,
            static_cast<uint8_t>(0));
      } // for each density
    } // for each groups
  } // for each shape
}

/**
 * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
 * accumulation. Output processing: nothing
 */
TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) {
  cpuinfo_initialize();
  if (fbgemmHasAvx512VnniSupport()) {
    // No need to use acc16 if VNNI is available
    return;
  }

  vector<vector<int>> shapes(GetShapes_());
  matrix_op_t atrans, btrans;
  bool test_ld;
  tie(atrans, btrans, test_ld) = GetParam();

  for (auto shape : shapes) {
    for (int groups : {1, 3, 4}) {
      int m = shape[0];
      int n = shape[1];
      int k = shape[2];
      if (k % groups != 0) {
        continue;
      }
      int k_per_group = k / groups;

      aligned_vector<uint8_t> Aint8(m * k);
      aligned_vector<int8_t> Bint8_ref(k * n);

      aligned_vector<int32_t> Cint32_ref(m * n * groups);
      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
      aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());

      randFill<uint8_t>(Aint8, 0, 255);
      randFill<int8_t>(Bint8_ref, -128, 127);
      aligned_vector<int8_t> Bint8(Bint8_ref);

      if (btrans == matrix_op_t::Transpose) {
        aligned_vector<int8_t> Bint8_temp(Bint8.size());
        for (int g = 0; g < groups; ++g) {
          transpose_matrix(
              k_per_group,
              n,
              Bint8.data() + g * k_per_group * n,
              n,
              Bint8_temp.data() + g * k_per_group * n,
              k_per_group);
        }
        Bint8 = Bint8_temp;
      }

      int32_t Bint8_zero_point = -30;

      // To test a leading dimension different from the number of columns,
      // we just halve n and keep using the original n as the leading
      // dimension.
      int n_adjusted = n;
      if (test_ld) {
        assert(
            atrans == matrix_op_t::NoTranspose &&
            "This case is not handled yet");
        if (btrans == matrix_op_t::NoTranspose) {
          n_adjusted = std::max(n / 2, 1);
        }
      }

      // computing column offset
      vector<int32_t> col_offsets(groups * n_adjusted);
      for (int g = 0; g < groups; ++g) {
        col_offsets_with_zero_pt_s8acc32_ref(
            k_per_group,
            n_adjusted,
            n,
            Bint8_ref.data() + g * k_per_group * n,
            &Bint8_zero_point,
            col_offsets.data() + g * n_adjusted,
            n_adjusted);
      }

      vector<int32_t> row_offsets(m);

      int brow = 256;
      for (int g = 0; g < groups; ++g) {
        matmul_u8i8acc16_ref(
            m,
            n_adjusted,
            k_per_group,
            k,
            n,
            groups * n,
            brow,
            Aint8.data() + g * k_per_group,
            Bint8_ref.data() + g * k_per_group * n,
            Cint32_ref.data() + g * n_adjusted);

        row_offsets_u8acc32_ref(
            m,
            k_per_group,
            k,
            Aint8.data() + g * k_per_group,
            row_offsets.data());
      }

      PackBMatrix<int8_t, int16_t> packedBN(
          btrans,
          k,
          n_adjusted,
          Bint8.data(),
          (btrans == matrix_op_t::Transpose) ? k_per_group : n,
          nullptr,
          groups);

#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        vector<int32_t> row_offset_buf(
            PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());

        PackAWithRowOffset<uint8_t, int16_t> packAN(
            matrix_op_t::NoTranspose,
            m,
            k,
            Aint8.data(),
            k,
            nullptr,
            groups,
            row_offset_buf.data());

        // Output is not requantized: just copy the int32 accumulators out.
        DoNothing<int32_t, int32_t> doNothingObj{};
        memCopy<> outputProcObj(doNothingObj);

        int num_threads = fbgemm_get_num_threads();
        int tid = fbgemm_get_thread_num();

        fbgemmPacked(
            packAN,
            packedBN,
            Cint32_fb.data(),
            Cint32_buffer.data(),
            groups * n,
            outputProcObj,
            tid,
            num_threads);
      }

      compare_validate_buffers(
          Cint32_ref.data(),
          Cint32_fb.data(),
          m,
          groups * n_adjusted,
          groups * n,
          static_cast<int32_t>(0));
    } // for each groups
  } // for each shape
}

/**
 * @brief Unit test for packing and unpacking the weight tensor.
 */
TEST_P(fbgemmPackUnpackAcc16Test, TestPackUnpack) {
  vector<vector<int>> shapes(GetShapes_());
  matrix_op_t btrans;
  bool test_ld;
  tie(btrans, test_ld) = GetParam();

  BlockingFactors params;
  params.MCB = 48;
  params.NCB = 16;
  params.KCB = 256;
  params.MR = 1;
  params.NR = 16;
  params.ROW_INTERLEAVE = 4;
  params.NR_MIN = 16;
  vector<BlockingFactors*> vec_params_ptr = {&params, nullptr};

  for (auto shape : shapes) {
    for (int groups : {1, 3, 4}) {
      for (auto params_ptr : vec_params_ptr) {
        int n = shape[1];
        int k = shape[2];

        if (k % groups != 0) {
          continue;
        }
        int k_per_group = k / groups;

        // kxn matrix
        aligned_vector<int8_t> Bint8(k * n);
        randFill<int8_t>(Bint8, -128, 127);

        // To test a leading dimension different from the number of columns,
        // we just halve n and keep using the original n as the leading
        // dimension.
        int n_adjusted = n;
        if (test_ld) {
          if (btrans == matrix_op_t::NoTranspose) {
            n_adjusted = std::max(n / 2, 1);
          }
        }

        // Note that packing for weight is performed during the constructor
        // stage.
        PackBMatrix<int8_t, int16_t> packedWeights(
            btrans,
            k,
            n_adjusted,
            Bint8.data(),
            (btrans == matrix_op_t::Transpose) ? k_per_group : n,
            nullptr,
            groups,
            params_ptr);

        // Set up a buffer to hold the pack -> unpack results
        aligned_vector<int8_t> unpack_buf(k * n, 0);

        // Perform unpacking
        packedWeights.unpack(unpack_buf.data(), params_ptr);

        // Sanity check
        for (int i = 0; i < k; i++) {
          for (int j = 0; j < n_adjusted; j++) {
            EXPECT_EQ(unpack_buf.data()[i * n + j], Bint8.data()[i * n + j])
                << "Pack/Unpack results differ at index (" << i << ", " << j
                << "), Reference: "
                << static_cast<int>(Bint8.data()[i * n + j])
                << ", Pack-Unpacked: "
                << static_cast<int>(unpack_buf.data()[i * n + j]);
          }
        }
      }
    }
  }
}