/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstdio>

#include <gtest/gtest.h>

#include "bench/AlignedVec.h"
#include "bench/BenchUtils.h"
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
#include "src/RefImplementations.h"

using namespace std;

namespace fbgemm {

// From Xray OCR
// clang-format off
static vector<vector<int>> shapes = {
  // NOTE: clang-format wants to use a different formatting but the current
  // formatting should be easier to read.
  // N, G, H_in, W_in, stride, kernel
  {   1,  272,  47, 125, 1, 3 },
  {   1,  272,  47, 125, 1, 5 },
  // {   1,  272,  64, 125, 1, 3 },
  // {   1,  272,  66, 125, 1, 3 },
  // {   1,  272,  67, 100, 1, 3 },
  // {   1,  272,  75,  75, 1, 3 },
  // {   1,  272,  75,  76, 1, 3 },
  // {   1,  272,  75, 100, 1, 3 },
  // {   1,  272,  94,  75, 1, 3 },
  // {   1,  272, 109,  75, 1, 3 },
  {   1,  544,  24,  63, 1, 3 },
  // {   1,  544,  33,  63, 1, 3 },
  // {   1,  544,  34,  50, 1, 3 },
  // {   1,  544,  36,  63, 1, 3 },
  // {   1,  544,  38,  38, 1, 3 },
  // {   1,  544,  38,  40, 1, 3 },
  {   1,  544,  47,  38, 1, 3 },
  {   1, 1088,   7,   7, 1, 3 },
  {   2, 1088,   7,   7, 1, 3 },
  {   2, 1088,   7,   7, 1, 5 },
  // { 100, 1088,   7,   7, 1, 3 },

  {   1,  248,  93, 250, 2, 3 },
  {   1,  248,  93, 250, 2, 5 },
  // {   1,  248, 128, 250, 2, 3 },
  // {   1,  248, 133, 200, 2, 3 },
  // {   1,  248, 150, 150, 2, 3 },
  {   1,  248, 150, 151, 2, 3 },
  // {   1,  248, 150, 158, 2, 3 },
  // {   1,  248, 188, 150, 2, 3 },
  // {   1,  248, 225, 150, 2, 3 },
  {   1,  272,  47, 125, 2, 3 },
  // {   1,  272,  64, 125, 2, 3 },
  // {   1,  272,  66, 125, 2, 3 },
  // {   1,  272,  67, 100, 2, 3 },
  // {   1,  272,  75,  75, 2, 3 },
  // {   1,  272,  75,  76, 2, 3 },
  {   1,  272,  94,  75, 2, 3 },
  {   1,  544,  14,  14, 2, 3 },
  // {  51,  544,  14,  14, 2, 3 },
  // { 100,  544,  14,  14, 2, 3 },
  {   1,  544,  14,  14, 2, 7 },

  {   1,    8,   4,   4, 1, 3 },
  // Tests for the shapes when OH/OW is less than padding
  {   1,   72,   1,   1, 2, 5 },
  {   1,   72,   7,   1, 2, 5 },
  {   1,   72,   1,   7, 2, 5 },
};

static vector<vector<int>> shapes_3d = {
  // NOTE: clang-format wants to use a different formatting but the current
  // formatting should be easier to read.
  // N, K, T_in, H_in, W_in, stride_t, stride_h, stride_w, K_T, K_H, K_W
  {  1,  32, 16, 28, 28, 1, 1, 1, 3, 3, 3, },
  {  1, 128,  8, 14, 14, 2, 2, 2, 3, 3, 3, },
  {  5,  16, 32, 56, 56, 1, 1, 1, 3, 3, 3, },
  {  1,   8,  4,  4,  4, 1, 1, 1, 3, 3, 3, },
  {  1,  32, 16, 28, 28, 1, 1, 1, 3, 5, 5, },
  {  1,  32, 16, 28, 28, 1, 2, 2, 3, 5, 5, },
  {  1,  32, 16, 28, 28, 1, 1, 1, 5, 5, 5, },
};
// clang-format on

namespace {

class FBGemmDepthWiseTest
    : public testing::TestWithParam<tuple<bool, bool, int>> {};

class FBGemmDepthWisePerChannelQuantizationTest
    : public testing::TestWithParam<int> {};

// The two parameters are K (== Groups) and kernel_prod, i.e., the weight
// matrix is (output_channels) x (kernel_prod) with output_channels == Groups.
// For example, kernel_prod for a 3x3 convolution is 9.
class FBGemmDepthWisePackUnpackTest
    : public testing::TestWithParam<tuple<int, int>> {};

} // namespace

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    FBGemmDepthWiseTest,
    ::testing::Combine(
        ::testing::Bool(), // a_symmetric
        ::testing::Bool(), // b_symmetric
        ::testing::Values(1, 2))); // oc_per_g

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    FBGemmDepthWisePerChannelQuantizationTest,
    ::testing::Values(1, 2));

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    FBGemmDepthWisePackUnpackTest,
    ::testing::Combine(
        ::testing::Values(8, 16, 24, 32, 40, 64, 72),
        ::testing::Values(1, 2, 3, 4, 5, 9, 10, 11, 27)));

TEST_P(FBGemmDepthWiseTest, Test2D) {
  bool a_symmetric, b_symmetric;
  int oc_per_g;
  tie(a_symmetric, b_symmetric, oc_per_g) = GetParam();

  for (auto shape : shapes) {
    int N = shape[0];
    int G = shape[1];
    int H = shape[2];
    int W = shape[3];
    int stride_h = shape[4];
    int stride_w = stride_h;
    int R = shape[5];
    int S = R;
    int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
        PAD_R = (S - 1) / 2;
    int OC = G * oc_per_g;

    conv_param_t<2> conv_p(
        N,
        G,
        OC,
        {H, W},
        G,
        {R, S},
        {stride_h, stride_w},
        {PAD_T, PAD_L, PAD_B, PAD_R});
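    // conv_param_t derives the output spatial sizes; with this "same"-style
    // padding of (kernel - 1) / 2 on each side, OUT_DIM should work out to
    // (IN_DIM + 2 * pad - kernel) / stride + 1 per spatial dimension.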
    int H_OUT = conv_p.OUT_DIM[0];
    int W_OUT = conv_p.OUT_DIM[1];

    int MDim = N * H_OUT * W_OUT;
    int KDim = R * S * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = a_symmetric ? 0 : 43;

    randFill<int8_t>(B, -16, 16);
    int32_t B_zero_point = b_symmetric ? 0 : 5;

    aligned_vector<float> C_multiplier(1);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    vector<int32_t> row_offsets(MDim);
    // im2col to compute row offset later
    vector<uint8_t> A_im2col;
    if (!b_symmetric) {
      A_im2col.resize(MDim * KDim);
      im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
    }

    // reference implementation conv_ref expects weights to be in G (R S C/G)
    // K/G
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());

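    // Build the reference uint8 output group by group. Each row offset is the
    // sum of the activation values feeding one output pixel of a group; it is
    // only needed when B_zero_point != 0. The reference requantization is
    // roughly
    //   out = clamp(round(C_multiplier * (acc - A_zero_point * col_offset
    //       - B_zero_point * row_offset + bias)) + C_zero_point, 0, 255).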
    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      if (!b_symmetric) {
        row_offsets_u8acc32_ref(
            MDim,
            KDimPerGroup,
            KDim,
            A_im2col.data() + g * KDimPerGroup,
            row_offsets.data());
      }

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data(),
          C_zero_point,
          A_zero_point,
          &B_zero_point,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          OC);
    }

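    // Pack the weights and run the optimized depthwise kernel. With symmetric
    // activations (A_zero_point == 0) the column-offset correction drops out,
    // so nullptr is passed for col_offsets to exercise that path.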
    PackedDepthWiseConvMatrix Bp(OC, R * S, B.data());
    depthwise_2d_same_pad<QuantizationGranularity::TENSOR>(
        N,
        H,
        W,
        G,
        OC,
        stride_h,
        stride_w,
        A_zero_point,
        A.data(),
        &B_zero_point,
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        a_symmetric ? nullptr : col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0,
        1);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int h = 0; h < H_OUT; ++h) {
        for (int w = 0; w < W_OUT; ++w) {
          for (int k = 0; k < OC; ++k) {
            int32_t expected =
                C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * OC + k];
            int32_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * OC + k];
            EXPECT_EQ(actual, expected)
                << "Depthwise " << R << "x" << S << " results differ at (" << n
                << ", " << h << ", " << w << ", " << k << ").";
          }
        }
      }
    }
  } // for each shape
} // Test2D

TEST_P(FBGemmDepthWiseTest, Test3D) {
  bool a_symmetric, b_symmetric;
  int oc_per_g;
  tie(a_symmetric, b_symmetric, oc_per_g) = GetParam();

  // 3D tests take a long time, so shapes_3d is limited to a handful of shapes.
  for (auto shape : shapes_3d) {
    int N = shape[0];
    int G = shape[1];
    int T = shape[2];
    int H = shape[3];
    int W = shape[4];
    int stride_t = shape[5];
    int stride_h = shape[6];
    int stride_w = shape[7];
    int K_T = shape[8];
    int K_H = shape[9];
    int K_W = shape[10];
    int PAD_P = (K_T - 1) / 2, PAD_N = PAD_P, PAD_T = (K_H - 1) / 2,
        PAD_B = PAD_T, PAD_L = (K_W - 1) / 2, PAD_R = PAD_L;
    int OC = G * oc_per_g;

    conv_param_t<3> conv_p(
        N,
        G,
        OC,
        {T, H, W},
        G,
        {K_T, K_H, K_W},
        {stride_t, stride_h, stride_w},
        {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R});
    int T_OUT = conv_p.OUT_DIM[0];
    int H_OUT = conv_p.OUT_DIM[1];
    int W_OUT = conv_p.OUT_DIM[2];

    int MDim = N * T_OUT * H_OUT * W_OUT;
    int KDim = K_T * K_H * K_W * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * T * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = a_symmetric ? 0 : 43;

    randFill<int8_t>(B, -16, 16);
    int32_t B_zero_point = b_symmetric ? 0 : 5;

    aligned_vector<float> C_multiplier(1);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    vector<int32_t> row_offsets(MDim);
    // im2col to compute row offset later
    vector<uint8_t> A_im2col;
    if (!b_symmetric) {
      A_im2col.resize(MDim * KDim);
      im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
    }

    // reference implementation conv_ref expects weights to be in G (T R S C/G)
    // K/G
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());

    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      if (!b_symmetric) {
        row_offsets_u8acc32_ref(
            MDim,
            KDimPerGroup,
            KDim,
            A_im2col.data() + g * KDimPerGroup,
            row_offsets.data());
      }

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data(),
          C_zero_point,
          A_zero_point,
          &B_zero_point,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          OC);
    }

    PackedDepthWiseConvMatrix Bp(OC, K_T * K_H * K_W, B.data());

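    // Unlike the 2D entry point above, the 3D API takes conv_p directly
    // instead of the individual dimensions.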
    depthwise_3d_same_pad<QuantizationGranularity::TENSOR>(
        conv_p,
        A_zero_point,
        A.data(),
        &B_zero_point,
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        a_symmetric ? nullptr : col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0,
        1);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int t = 0; t < T_OUT; ++t) {
        for (int h = 0; h < H_OUT; ++h) {
          for (int w = 0; w < W_OUT; ++w) {
            for (int k = 0; k < OC; ++k) {
              int32_t expected = C_uint8_ref
                  [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              int32_t actual =
                  C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              EXPECT_EQ(actual, expected)
                  << "Depthwise 3D results differ at (" << n << ", " << t
                  << ", " << h << ", " << w << ", " << k << ").";
            }
          } // w
        } // h
      } // t
    } // n
  } // for each shape
} // Test3D

TEST_P(
    FBGemmDepthWisePerChannelQuantizationTest,
    Test2DPerChannelQuantization) {
  int oc_per_g = GetParam();

  for (auto shape : shapes) {
    int N = shape[0];
    int G = shape[1];
    int H = shape[2];
    int W = shape[3];
    int stride_h = shape[4];
    int stride_w = stride_h;
    int R = shape[5];
    int S = R;
    int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
        PAD_R = (S - 1) / 2;
    int OC = G * oc_per_g;

    conv_param_t<2> conv_p(
        N,
        G,
        OC,
        {H, W},
        G,
        {R, S},
        {stride_h, stride_w},
        {PAD_T, PAD_L, PAD_B, PAD_R});
    int H_OUT = conv_p.OUT_DIM[0];
    int W_OUT = conv_p.OUT_DIM[1];

    int MDim = N * H_OUT * W_OUT;
    int KDim = R * S * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = 43;

    // Give each output channel a different weight range and zero point so
    // that per-channel quantization is really exercised.
    vector<int32_t> B_zero_point(OC);
    for (auto k = 0; k < OC; ++k) {
      aligned_vector<int8_t> Bk(R * S);
      // limit min, max to int8_t range
      randFill<int8_t>(Bk, -16 + k % 112, 16 + k % 112);
      copy(Bk.begin(), Bk.end(), B.begin() + k * R * S);

      B_zero_point[k] = 5 + k;
    }

    aligned_vector<float> C_multiplier(OC);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    // im2col to compute row offset later
    vector<int32_t> row_offsets(MDim);
    vector<uint8_t> A_im2col(MDim * KDim);
    im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());

    // reference implementation conv_ref expects weights to be in G (R S C/G)
    // K/G
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());

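    // Per-channel reference path: C_multiplier and B_zero_point are indexed
    // per output channel, and the trailing quantization-group-width argument
    // of requantize_u8acc32_ref is 1 (one group per output channel) instead
    // of OC as in the per-tensor tests above.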
    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      row_offsets_u8acc32_ref(
          MDim,
          KDimPerGroup,
          KDim,
          A_im2col.data() + g * KDimPerGroup,
          row_offsets.data());

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data() + g * oc_per_g,
          C_zero_point,
          A_zero_point,
          B_zero_point.data() + g * oc_per_g,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          1);
    }

    PackedDepthWiseConvMatrix Bp(OC, R * S, B.data());
    depthwise_2d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
        N,
        H,
        W,
        G,
        OC,
        stride_h,
        stride_w,
        A_zero_point,
        A.data(),
        B_zero_point.data(),
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0,
        1);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int h = 0; h < H_OUT; ++h) {
        for (int w = 0; w < W_OUT; ++w) {
          for (int k = 0; k < OC; ++k) {
            int32_t expected =
                C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * OC + k];
            int32_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * OC + k];
            EXPECT_EQ(actual, expected)
                << "Depthwise " << R << "x" << S << " results differ at (" << n
                << ", " << h << ", " << w << ", " << k << ").";
          }
        }
      }
    }
  } // for each shape
} // Test2DPerChannelQuantization

TEST_P(
    FBGemmDepthWisePerChannelQuantizationTest,
    Test3DPerChannelQuantization) {
  int oc_per_g = GetParam();

  for (auto shape : shapes_3d) {
    int N = shape[0];
    int G = shape[1];
    int T = shape[2];
    int H = shape[3];
    int W = shape[4];
    int stride_t = shape[5];
    int stride_h = shape[6];
    int stride_w = shape[7];
    int K_T = shape[8];
    int K_H = shape[9];
    int K_W = shape[10];
    int PAD_P = (K_T - 1) / 2, PAD_N = PAD_P, PAD_T = (K_H - 1) / 2,
        PAD_B = PAD_T, PAD_L = (K_W - 1) / 2, PAD_R = PAD_L;
    int OC = G * oc_per_g;

    conv_param_t<3> conv_p(
        N,
        G,
        OC,
        {T, H, W},
        G,
        {K_T, K_H, K_W},
        {stride_t, stride_h, stride_w},
        {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R});
    int T_OUT = conv_p.OUT_DIM[0];
    int H_OUT = conv_p.OUT_DIM[1];
    int W_OUT = conv_p.OUT_DIM[2];

    int MDim = N * T_OUT * H_OUT * W_OUT;
    int KDim = K_T * K_H * K_W * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * T * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = 43;

    // Give each output channel a different weight range and zero point so
    // that per-channel quantization is really exercised.
    vector<int32_t> B_zero_point(OC);
    for (auto k = 0; k < OC; ++k) {
      aligned_vector<int8_t> Bk(K_T * K_H * K_W);
      // limit min, max to int8_t range
      randFill<int8_t>(Bk, -16 + k % 112, 16 + k % 112);
      copy(Bk.begin(), Bk.end(), B.begin() + k * K_T * K_H * K_W);

      B_zero_point[k] = 5 + k;
    }

    aligned_vector<float> C_multiplier(OC);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    vector<int32_t> row_offsets(MDim);
    // im2col to compute row offset later
    vector<uint8_t> A_im2col(MDim * KDim);
    im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());

    // reference implementation conv_ref expects weights to be in G (T R S C/G)
    // K/G
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());

    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      row_offsets_u8acc32_ref(
          MDim,
          KDimPerGroup,
          KDim,
          A_im2col.data() + g * KDimPerGroup,
          row_offsets.data());

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data() + g * oc_per_g,
          C_zero_point,
          A_zero_point,
          B_zero_point.data() + g * oc_per_g,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          1);
    }

    PackedDepthWiseConvMatrix Bp(OC, K_T * K_H * K_W, B.data());

    depthwise_3d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
        conv_p,
        A_zero_point,
        A.data(),
        B_zero_point.data(),
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0,
        1);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int t = 0; t < T_OUT; ++t) {
        for (int h = 0; h < H_OUT; ++h) {
          for (int w = 0; w < W_OUT; ++w) {
            for (int k = 0; k < OC; ++k) {
              int32_t expected = C_uint8_ref
                  [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              int32_t actual =
                  C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              ASSERT_EQ(actual, expected)
                  << "Depthwise 3D results differ at (" << n << ", " << t
                  << ", " << h << ", " << w << ", " << k << ").";
            }
          } // w
        } // h
      } // t
    } // n
  } // for each shape
} // Test3DPerChannelQuantization

TEST_P(FBGemmDepthWisePackUnpackTest, TestPackUnpack) {
  int K, kernel_prod;
  tie(K, kernel_prod) = GetParam();

  ASSERT_EQ(K % 8, 0)
      << "output channels (== groups) should be a multiple of 8";
  aligned_vector<int8_t> B(K * kernel_prod);
  randFill<int8_t>(B, -16, 16);

  aligned_vector<int8_t> BUnpacked(K * kernel_prod);

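  // Packing rearranges the K x kernel_prod row-major weights into the blocked
  // layout the depthwise kernels consume; unpack() should recover the original
  // row-major data exactly.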
  PackedDepthWiseConvMatrix BPacked(K, kernel_prod, B.data());
  BPacked.unpack(BUnpacked.data());

  ASSERT_EQ(BUnpacked, B)
      << "Original and unpacked data elements are not the same";
} // TestPackUnpack

} // namespace fbgemm