sglang_v0.5.2/pytorch_2.8.0/third_party/fbgemm/test/Im2ColFusedRequantizeTest.cc

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <random>
#include <tuple>
#include <type_traits>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#include <gtest/gtest.h>

#include "./TestUtils.h"
#include "bench/AlignedVec.h"
#include "bench/BenchUtils.h"
#include "fbgemm/Fbgemm.h"
#include "src/RefImplementations.h"

using namespace std;
using namespace fbgemm;

// Quantization granularities fed to the parameterized tests in this file.
vector<QuantizationGranularity> qGranularityVals = {
    QuantizationGranularity::TENSOR,
    QuantizationGranularity::GROUP,
    QuantizationGranularity::OUT_CHANNEL,
};

namespace {
// Value-parameterized fixture used by every TEST_P in this file. The parameter
// is a (quantization granularity, b_symmetric flag) tuple; the fixture itself
// holds no state.
class fbgemmIm2colTest
    : public testing::TestWithParam<tuple<QuantizationGranularity, bool>> {};
} // namespace

// Instantiates every fbgemmIm2colTest TEST_P for each combination of a
// granularity from qGranularityVals and a bool (used as b_symmetric).
// NOTE(review): newer GoogleTest releases spell this macro
// INSTANTIATE_TEST_SUITE_P — confirm the bundled gtest version before
// switching.
INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    fbgemmIm2colTest,
    ::testing::Combine(
        ::testing::ValuesIn(qGranularityVals),
        ::testing::Bool()));

// clang-format off
// 2D convolution shapes exercised by Im2colTest and SConvTest. Both tests
// overwrite the group count (G) listed here with 1 and then 4, skipping a
// group count that does not divide both IC and OC.
// From Faster-RCNN with ShuffleNet
static vector<conv_param_t<>> shapes = {
  // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
  // NOTE(review): the pad list has four entries; presumably
  // {top, left, bottom, right} — confirm against conv_param_t.
  conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0, 0, 0}),
  conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
  conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0, 0, 0}),
  conv_param_t<>(2, 32, 32, {28, 14}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
  conv_param_t<>(1, 32, 16, {12, 14}, 4, {3, 3}, {1, 1}, {0, 0, 0, 0}),
  conv_param_t<>(2, 32, 16, {16, 14}, 4, {3, 3}, {1, 1}, {0, 0, 0, 0}),
  conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
  conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
  // first layer of resnet50
  conv_param_t<>(1, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}),
};
// clang-format on

template <typename ACC_T, QuantizationGranularity Q_GRAN>
static void Im2colTest(bool b_symmetric) {
  for (auto conv_p : shapes) {
    for (int groups : {1, 4}) {
      if (conv_p.IC % groups != 0 || conv_p.OC % groups != 0) {
        continue;
      }
      conv_p.G = groups;
      aligned_vector<uint8_t> Aint8(
          conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
      aligned_vector<int8_t> Bint8(
          conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
      aligned_vector<int32_t> Cint32_ref(
          conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());

      int ncols_per_quant_group = conv_p.OC;
      if (Q_GRAN == QuantizationGranularity::GROUP) {
        ncols_per_quant_group = conv_p.OC / conv_p.G;
      } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
        ncols_per_quant_group = 1;
      }
      int32_t Aint8_zero_point;
      aligned_vector<int32_t> Bint8_zero_point(
          conv_p.OC / ncols_per_quant_group);
      if (is_same<ACC_T, int32_t>::value) {
        randFill<uint8_t>(Aint8, 0, 80);
        Aint8_zero_point = 43;
        randFill<int8_t>(Bint8, -16, 16);
        randFill(Bint8_zero_point, -50, -10);
      } else {
        randFill<uint8_t>(Aint8, 0, 5);
        Aint8_zero_point = 4;
        randFill<int8_t>(Bint8, -4, 4);
        randFill(Bint8_zero_point, -3, -1);
      }
      if (b_symmetric) {
        randFill(Bint8_zero_point, 0, 0);
      }

      aligned_vector<float> C_multiplier(Bint8_zero_point.size());
      randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
      int32_t C_zero_pt = 5;

      int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
      int NDim = conv_p.OC / conv_p.G;
      int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
      int KDimPerGroup = KDim / conv_p.G;

      // computing row offset
      vector<int32_t> row_offsets(MDim);
      vector<uint8_t> Aint8_im2col(MDim * KDim);
      im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());

      // computing column offset
      vector<int32_t> col_offsets(conv_p.G * NDim);
      for (int g = 0; g < conv_p.G; ++g) {
        col_offsets_with_zero_pt_s8acc32_ref(
            KDimPerGroup,
            NDim,
            NDim,
            Bint8.data() + g * KDimPerGroup * NDim,
            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
            col_offsets.data() + g * NDim,
            ncols_per_quant_group);
      }

      conv_ref(
          conv_p,
          Aint8.data(),
          Aint8_zero_point,
          Bint8.data(),
          Cint32_ref.data());

      for (int g = 0; g < conv_p.G; ++g) {
        row_offsets_u8acc32_ref(
            MDim,
            KDimPerGroup,
            KDim,
            Aint8_im2col.data() + g * KDimPerGroup,
            row_offsets.data());

        requantize_u8acc32_ref(
            MDim,
            NDim,
            conv_p.G * NDim,
            Cint32_ref.data() + g * NDim,
            Cint8_ref.data() + g * NDim,
            C_multiplier.data() + g * NDim / ncols_per_quant_group,
            C_zero_pt,
            Aint8_zero_point,
            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
            row_offsets.data(),
            col_offsets.data() + g * NDim,
            nullptr,
            ncols_per_quant_group);
      }

      PackBMatrix<int8_t, ACC_T> packedB(
          matrix_op_t::NoTranspose,
          KDim,
          NDim,
          Bint8.data(),
          NDim,
          nullptr,
          conv_p.G);

#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        vector<int32_t> row_offset_buf(
            PackAWithIm2Col<uint8_t, ACC_T>::rowOffsetBufferSize());

        PackAWithIm2Col<uint8_t, ACC_T> packA(
            conv_p,
            Aint8.data(),
            nullptr,
            Aint8_zero_point,
            row_offset_buf.data(),
            b_symmetric);

        DoNothing<> doNothingObj{};
        ReQuantizeOutput<false, Q_GRAN> outputProcObj(
            doNothingObj,
            C_multiplier.data(),
            C_zero_pt,
            Aint8_zero_point,
            Bint8_zero_point.data(),
            packA.getRowOffsetBuffer(),
            col_offsets.data(),
            nullptr,
            conv_p.G * NDim,
            conv_p.G);

        int num_threads = fbgemm_get_num_threads();
        int tid = fbgemm_get_thread_num();

        fbgemmPacked(
            packA,
            packedB,
            Cint8_fb.data(),
            Cint32_fb.data(),
            conv_p.G * NDim,
            outputProcObj,
            tid,
            num_threads);
      } // omp parallel

      // correctness check
      for (int n = 0; n < conv_p.MB; ++n) {
        for (int h = 0; h < conv_p.OUT_DIM[0]; ++h) {
          for (int w = 0; w < conv_p.OUT_DIM[1]; ++w) {
            for (int k = 0; k < conv_p.OC; ++k) {
              int32_t expected = Cint8_ref
                  [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                       conv_p.OC +
                   k];
              int32_t actual = Cint8_fb
                  [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                       conv_p.OC +
                   k];
              EXPECT_EQ(actual, expected)
                  << "Im2Col fused results differ at (" << n << ", " << h
                  << ", " << w << ", " << k << ").";
            }
          }
        }
      }
    } // for each groups
  } // for each shape
}

// 2D im2col-fused test with int32_t accumulation. The runtime granularity
// parameter selects the matching compile-time instantiation.
TEST_P(fbgemmIm2colTest, Acc32Test) {
  const QuantizationGranularity granularity = std::get<0>(GetParam());
  const bool symmetric = std::get<1>(GetParam());
  switch (granularity) {
    case QuantizationGranularity::TENSOR:
      Im2colTest<int32_t, QuantizationGranularity::TENSOR>(symmetric);
      break;
    case QuantizationGranularity::GROUP:
      Im2colTest<int32_t, QuantizationGranularity::GROUP>(symmetric);
      break;
    default:
      Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(symmetric);
      break;
  }
}

// 2D im2col-fused test with int16_t accumulation. The runtime granularity
// parameter selects the matching compile-time instantiation.
TEST_P(fbgemmIm2colTest, Acc16Test) {
  const QuantizationGranularity granularity = std::get<0>(GetParam());
  const bool symmetric = std::get<1>(GetParam());
  switch (granularity) {
    case QuantizationGranularity::TENSOR:
      Im2colTest<int16_t, QuantizationGranularity::TENSOR>(symmetric);
      break;
    case QuantizationGranularity::GROUP:
      Im2colTest<int16_t, QuantizationGranularity::GROUP>(symmetric);
      break;
    default:
      Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(symmetric);
      break;
  }
}

template <QuantizationGranularity Q_GRAN>
void SConvTest() {
  for (auto conv_p : shapes) {
    for (int groups : {1, 4}) {
      if (conv_p.IC % groups != 0 || conv_p.OC % groups != 0) {
        continue;
      }
      conv_p.G = groups;
      aligned_vector<uint8_t> Aint8(
          conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
      aligned_vector<int8_t> Bint8(
          conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
      aligned_vector<int32_t> Cint32_ref(
          conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());

      int ncols_per_quant_group = conv_p.OC;
      if (Q_GRAN == QuantizationGranularity::GROUP) {
        ncols_per_quant_group = conv_p.OC / conv_p.G;
      } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
        ncols_per_quant_group = 1;
      }
      int32_t Aint8_zero_point;
      aligned_vector<int32_t> Bint8_zero_point(
          conv_p.OC / ncols_per_quant_group);
      randFill<uint8_t>(Aint8, 0, 5);
      Aint8_zero_point = 4;
      randFill<int8_t>(Bint8, -4, 4);
      randFill(Bint8_zero_point, -3, -1);

      aligned_vector<float> C_multiplier(Bint8_zero_point.size());
      randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
      int32_t C_zero_pt = 5;

      int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
      int NDim = conv_p.OC / conv_p.G;
      int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
      int KDimPerGroup = KDim / conv_p.G;

      // computing row offset
      vector<int32_t> row_offsets(MDim);
      vector<uint8_t> Aint8_im2col(MDim * KDim);
      im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());

      // computing column offset
      vector<int32_t> col_offsets(conv_p.G * NDim);
      for (int g = 0; g < conv_p.G; ++g) {
        col_offsets_with_zero_pt_s8acc32_ref(
            KDimPerGroup,
            NDim,
            NDim,
            Bint8.data() + g * KDimPerGroup * NDim,
            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
            col_offsets.data() + g * NDim,
            ncols_per_quant_group);
      }

      conv_ref(
          conv_p,
          Aint8.data(),
          Aint8_zero_point,
          Bint8.data(),
          Cint32_ref.data());

      for (int g = 0; g < conv_p.G; ++g) {
        row_offsets_u8acc32_ref(
            MDim,
            KDimPerGroup,
            KDim,
            Aint8_im2col.data() + g * KDimPerGroup,
            row_offsets.data());

        requantize_u8acc32_ref(
            MDim,
            NDim,
            conv_p.G * NDim,
            Cint32_ref.data() + g * NDim,
            Cint8_ref.data() + g * NDim,
            C_multiplier.data() + g * NDim / ncols_per_quant_group,
            C_zero_pt,
            Aint8_zero_point,
            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
            row_offsets.data(),
            col_offsets.data() + g * NDim,
            nullptr,
            ncols_per_quant_group);
      }

      float density = 0.0001f;
      CompressedSparseColumn B_csc(KDimPerGroup, conv_p.G * NDim);
      random_device r;
      default_random_engine eng(r());
      binomial_distribution<> per_col_nnz_dist(KDimPerGroup, density);

      // TODO: refactor CSC construction as a reusable function
      vector<int> row_indices(KDimPerGroup);
      int total_nnz = 0;
      int ic_per_group = conv_p.IC / conv_p.G;
      for (int g = 0; g < conv_p.G; ++g) {
        for (int j = 0; j < NDim; ++j) {
          B_csc.ColPtr()[g * NDim + j] = total_nnz;

          int nnz_of_j = per_col_nnz_dist(eng);
          total_nnz += nnz_of_j;

          iota(row_indices.begin(), row_indices.end(), 0);
          shuffle(row_indices.begin(), row_indices.end(), eng);
          sort(row_indices.begin(), row_indices.begin() + nnz_of_j);

          for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
            int rowidx = row_indices[kidx];
            int ic = g * ic_per_group + rowidx % ic_per_group;
            int kw = rowidx / ic_per_group % conv_p.K[1];
            int kh = rowidx / ic_per_group / conv_p.K[1];
            assert(kh < conv_p.K[0]);

            B_csc.KHs().push_back(kh);
            B_csc.KWs().push_back(kw);
            B_csc.ICs().push_back(ic);

            int8_t* bptr = &Bint8[(g * KDimPerGroup + rowidx) * NDim + j];
            B_csc.Values().push_back(*bptr);
            *bptr = 0;
          }
        }
      }
      B_csc.ColPtr()[conv_p.G * NDim] = total_nnz;

      PackBMatrix<int8_t, int16_t> packedB(
          matrix_op_t::NoTranspose,
          KDim,
          NDim,
          Bint8.data(),
          NDim,
          nullptr,
          conv_p.G);

#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        vector<int32_t> row_offset_buf(
            PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());

        PackAWithIm2Col<uint8_t, int16_t> packA(
            conv_p,
            Aint8.data(),
            nullptr,
            Aint8_zero_point,
            row_offset_buf.data());

        DoNothing<> doNothingObj{};
        ReQuantizeOutput<false, Q_GRAN> reqObj(
            doNothingObj,
            C_multiplier.data(),
            C_zero_pt,
            Aint8_zero_point,
            Bint8_zero_point.data(),
            packA.getRowOffsetBuffer(),
            col_offsets.data(),
            nullptr,
            conv_p.G * NDim,
            conv_p.G);
        DoSConvOnInpBuffer<
            ReQuantizeOutput<false>::outType,
            int32_t,
            ReQuantizeOutput<false, Q_GRAN>>
            sconvObj(reqObj, Aint8.data(), conv_p, Aint8_zero_point, B_csc);

        int num_threads = fbgemm_get_num_threads();
        int tid = fbgemm_get_thread_num();

        fbgemmPacked(
            packA,
            packedB,
            Cint8_fb.data(),
            Cint32_fb.data(),
            conv_p.G * NDim,
            sconvObj,
            tid,
            num_threads);
      } // omp parallel

      // correctness check
      for (int n = 0; n < conv_p.MB; ++n) {
        for (int h = 0; h < conv_p.OUT_DIM[0]; ++h) {
          for (int w = 0; w < conv_p.OUT_DIM[1]; ++w) {
            for (int k = 0; k < conv_p.OC; ++k) {
              int32_t expected = Cint8_ref
                  [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                       conv_p.OC +
                   k];
              int32_t actual = Cint8_fb
                  [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                       conv_p.OC +
                   k];
              EXPECT_EQ(actual, expected)
                  << "Im2Col fused results differ at (" << n << ", " << h
                  << ", " << w << ", " << k << ").";
            }
          }
        }
      }
    } // for each groups
  } // for each shape
}

// Sparse-convolution test. Only the granularity half of the parameter is
// used to pick the instantiation.
TEST_P(fbgemmIm2colTest, SConvTest) {
  // b_symmetric ignored for now
  const QuantizationGranularity granularity = std::get<0>(GetParam());
  switch (granularity) {
    case QuantizationGranularity::TENSOR:
      SConvTest<QuantizationGranularity::TENSOR>();
      break;
    case QuantizationGranularity::GROUP:
      SConvTest<QuantizationGranularity::GROUP>();
      break;
    default:
      SConvTest<QuantizationGranularity::OUT_CHANNEL>();
      break;
  }
}

// 3D convolution shapes exercised by Im2col3DTest. The test overwrites the
// group count (G) listed here with 1 and then 4, skipping a group count that
// does not divide both IC and OC. The commented-out entries are disabled;
// only the four shapes at the end of the list are run.
static vector<conv_param_t<3>> shapes_3d = {
    // MB, IC, OC, IT, IH, IW, G, KT, KH, KW, stride_t, stride_h, stride_w,
    // pad_t, pad_h, pad_w
    // conv_param_t<
    //     3>(1, 3, 64, {32, 112, 112}, 1, {3, 7, 7}, {1, 2, 2}, {1, 3, 3, 1, 3,
    //     3}),
    // conv_param_t<
    //     3>(1, 64, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
    //     0}),
    // conv_param_t<
    //     3>(1, 64, 256, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
    //     0}),
    // conv_param_t<
    //     3>(1, 256, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
    //     0}),
    // conv_param_t<
    //     3>(1, 256, 128, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 256, 512, {32, 56, 56}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 128, 512, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 512, 128, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 512, 256, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 512, 1024, {16, 28, 28}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 256, 1024, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 1024, 256, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 1024, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 1024, 2048, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 2048, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
    //     0, 0}),
    // conv_param_t<
    //     3>(1, 512, 2048, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
    //     0}),
    conv_param_t<3>(
        1,
        3,
        4,
        {32, 112, 112},
        1,
        {3, 7, 7},
        {1, 2, 2},
        {1, 3, 3, 1, 3, 3}),
    conv_param_t<3>(
        1,
        3,
        4,
        {32, 112, 112},
        1,
        {3, 7, 7},
        {1, 2, 2},
        {1, 3, 3, 1, 1, 0}),
    conv_param_t<
        3>(1, 8, 16, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}),
    conv_param_t<
        3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}),
};

template <typename ACC_T, QuantizationGranularity Q_GRAN>
static void Im2col3DTest(bool b_symmetric) {
  for (auto conv_p : shapes_3d) {
    for (int groups : {1, 4}) {
      if (conv_p.IC % groups != 0 || conv_p.OC % groups != 0) {
        continue;
      }
      conv_p.G = groups;
      aligned_vector<uint8_t> Aint8(
          conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IN_DIM[2] *
          conv_p.IC);
      aligned_vector<int8_t> Bint8(
          conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC);
      aligned_vector<int32_t> Cint32_ref(
          conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] *
          conv_p.OUT_DIM[2] * conv_p.OC);
      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());

      int ncols_per_quant_group = conv_p.OC;
      if (Q_GRAN == QuantizationGranularity::GROUP) {
        ncols_per_quant_group = conv_p.OC / conv_p.G;
      } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
        ncols_per_quant_group = 1;
      }
      int32_t Aint8_zero_point;
      aligned_vector<int32_t> Bint8_zero_point(
          conv_p.OC / ncols_per_quant_group);
      if (is_same<ACC_T, int32_t>::value) {
        randFill<uint8_t>(Aint8, 0, 80);
        Aint8_zero_point = 43;
        randFill<int8_t>(Bint8, -16, 16);
        randFill(Bint8_zero_point, -50, -10);
      } else {
        randFill<uint8_t>(Aint8, 0, 5);
        Aint8_zero_point = 4;
        randFill<int8_t>(Bint8, -4, 4);
        randFill(Bint8_zero_point, -3, -1);
      }
      if (b_symmetric) {
        randFill(Bint8_zero_point, 0, 0);
      }

      aligned_vector<float> C_multiplier(Bint8_zero_point.size());
      randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
      int32_t C_zero_pt = 5;

      int MDim =
          conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2];
      int NDim = conv_p.OC / conv_p.G;
      int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC;
      int KDimPerGroup = KDim / conv_p.G;

      // computing row offset
      vector<int32_t> row_offsets(MDim);
      vector<uint8_t> Aint8_im2col(MDim * KDim);
      im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());

      // computing column offset
      vector<int32_t> col_offsets(conv_p.G * NDim);
      for (int g = 0; g < conv_p.G; ++g) {
        col_offsets_with_zero_pt_s8acc32_ref(
            KDimPerGroup,
            NDim,
            NDim,
            Bint8.data() + g * KDimPerGroup * NDim,
            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
            col_offsets.data() + g * NDim,
            ncols_per_quant_group);
      }

      conv_ref(
          conv_p,
          Aint8.data(),
          Aint8_zero_point,
          Bint8.data(),
          Cint32_ref.data());

      for (int g = 0; g < conv_p.G; ++g) {
        row_offsets_u8acc32_ref(
            MDim,
            KDimPerGroup,
            KDim,
            Aint8_im2col.data() + g * KDimPerGroup,
            row_offsets.data());

        requantize_u8acc32_ref(
            MDim,
            NDim,
            conv_p.G * NDim,
            Cint32_ref.data() + g * NDim,
            Cint8_ref.data() + g * NDim,
            C_multiplier.data() + g * NDim / ncols_per_quant_group,
            C_zero_pt,
            Aint8_zero_point,
            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
            row_offsets.data(),
            col_offsets.data() + g * NDim,
            nullptr,
            ncols_per_quant_group);
      }

      PackBMatrix<int8_t, ACC_T> packedB(
          matrix_op_t::NoTranspose,
          KDim,
          NDim,
          Bint8.data(),
          NDim,
          nullptr,
          conv_p.G);

#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        vector<int32_t> row_offset_buf(
            PackAWithIm2Col<uint8_t, ACC_T, 3>::rowOffsetBufferSize());

        PackAWithIm2Col<uint8_t, ACC_T, 3> packA(
            conv_p,
            Aint8.data(),
            nullptr,
            Aint8_zero_point,
            row_offset_buf.data(),
            b_symmetric);

        DoNothing<> doNothingObj{};
        ReQuantizeOutput<false, Q_GRAN> outputProcObj(
            doNothingObj,
            C_multiplier.data(),
            C_zero_pt,
            Aint8_zero_point,
            Bint8_zero_point.data(),
            packA.getRowOffsetBuffer(),
            col_offsets.data(),
            nullptr,
            conv_p.G * NDim,
            conv_p.G);

        int num_threads = fbgemm_get_num_threads();
        int tid = fbgemm_get_thread_num();

        fbgemmPacked(
            packA,
            packedB,
            Cint8_fb.data(),
            Cint32_fb.data(),
            conv_p.G * NDim,
            outputProcObj,
            tid,
            num_threads);
      } // omp parallel

      // correctness check
      for (int n = 0; n < conv_p.MB; ++n) {
        for (int t = 0; t < conv_p.OUT_DIM[0]; ++t) {
          for (int h = 0; h < conv_p.OUT_DIM[1]; ++h) {
            for (int w = 0; w < conv_p.OUT_DIM[2]; ++w) {
              for (int k = 0; k < conv_p.OC; ++k) {
                int32_t expected = Cint8_ref
                    [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) *
                          conv_p.OUT_DIM[2] +
                      w) *
                         conv_p.OC +
                     k];
                int32_t actual = Cint8_fb
                    [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) *
                          conv_p.OUT_DIM[2] +
                      w) *
                         conv_p.OC +
                     k];
                EXPECT_EQ(actual, expected)
                    << "Im2Col fused results differ at (" << n << ", " << t
                    << ", " << h << ", " << w << ", " << k << ").";
              }
            }
          }
        }
      }
    } // for each groups
  } // for each shape
}

// 3D im2col-fused test with int32_t accumulation. The runtime granularity
// parameter selects the matching compile-time instantiation.
TEST_P(fbgemmIm2colTest, 3DAcc32Test) {
  const QuantizationGranularity granularity = std::get<0>(GetParam());
  const bool symmetric = std::get<1>(GetParam());
  switch (granularity) {
    case QuantizationGranularity::TENSOR:
      Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>(symmetric);
      break;
    case QuantizationGranularity::GROUP:
      Im2col3DTest<int32_t, QuantizationGranularity::GROUP>(symmetric);
      break;
    default:
      Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(symmetric);
      break;
  }
}

// 3D im2col-fused test with int16_t accumulation. The runtime granularity
// parameter selects the matching compile-time instantiation.
TEST_P(fbgemmIm2colTest, 3DAcc16Test) {
  const QuantizationGranularity granularity = std::get<0>(GetParam());
  const bool symmetric = std::get<1>(GetParam());
  switch (granularity) {
    case QuantizationGranularity::TENSOR:
      Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>(symmetric);
      break;
    case QuantizationGranularity::GROUP:
      Im2col3DTest<int16_t, QuantizationGranularity::GROUP>(symmetric);
      break;
    default:
      Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(symmetric);
      break;
  }
}