/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstdio>

#include <gtest/gtest.h>

#include "bench/AlignedVec.h"
#include "bench/BenchUtils.h"
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
#include "src/RefImplementations.h"

using namespace std;

namespace fbgemm {

// From Xray OCR
// clang-format off
static vector<vector<int>> shapes = {
    // NOTE: clang-format wants to use a different formatting but the current
    // formatting should be easier to read.
    // N, G, H_in, W_in, stride, kernel
    {   1,  272,  47, 125, 1, 3 },
    {   1,  272,  47, 125, 1, 5 },
//  {   1,  272,  64, 125, 1, 3 },
//  {   1,  272,  66, 125, 1, 3 },
//  {   1,  272,  67, 100, 1, 3 },
//  {   1,  272,  75,  75, 1, 3 },
//  {   1,  272,  75,  76, 1, 3 },
//  {   1,  272,  75, 100, 1, 3 },
//  {   1,  272,  94,  75, 1, 3 },
//  {   1,  272, 109,  75, 1, 3 },
    {   1,  544,  24,  63, 1, 3 },
//  {   1,  544,  33,  63, 1, 3 },
//  {   1,  544,  34,  50, 1, 3 },
//  {   1,  544,  36,  63, 1, 3 },
//  {   1,  544,  38,  38, 1, 3 },
//  {   1,  544,  38,  40, 1, 3 },
    {   1,  544,  47,  38, 1, 3 },
    {   1, 1088,   7,   7, 1, 3 },
    {   2, 1088,   7,   7, 1, 3 },
    {   2, 1088,   7,   7, 1, 5 },
//  { 100, 1088,   7,   7, 1, 3 },
    {   1,  248,  93, 250, 2, 3 },
    {   1,  248,  93, 250, 2, 5 },
//  {   1,  248, 128, 250, 2, 3 },
//  {   1,  248, 133, 200, 2, 3 },
//  {   1,  248, 150, 150, 2, 3 },
    {   1,  248, 150, 151, 2, 3 },
//  {   1,  248, 150, 158, 2, 3 },
//  {   1,  248, 188, 150, 2, 3 },
//  {   1,  248, 225, 150, 2, 3 },
    {   1,  272,  47, 125, 2, 3 },
//  {   1,  272,  64, 125, 2, 3 },
//  {   1,  272,  66, 125, 2, 3 },
//  {   1,  272,  67, 100, 2, 3 },
//  {   1,  272,  75,  75, 2, 3 },
//  {   1,  272,  75,  76, 2, 3 },
    {   1,  272,  94,  75, 2, 3 },
    {   1,  544,  14,  14, 2, 3 },
//  {  51,  544,  14,  14, 2, 3 },
//  { 100,  544,  14,  14, 2, 3 },
    {   1,  544,  14,  14, 2, 7 },
    {   1,    8,   4,   4, 1, 3 },
    // Tests for the shapes when OH/OW is less than padding
    {   1,   72,   1,   1, 2, 5 },
    {   1,   72,   7,   1, 2, 5 },
    {   1,   72,   1,   7, 2, 5 },
};

static vector<vector<int>> shapes_3d = {
    // NOTE: clang-format wants to use a different formatting but the current
    // formatting should be easier to read.
    // N, K, T_in, H_in, W_in, stride_t, stride_h, stride_w, K_T, K_H, K_W
    {   1,  32,  16,  28, 28, 1, 1, 1, 3, 3, 3, },
    {   1, 128,   8,  14, 14, 2, 2, 2, 3, 3, 3, },
    {   5,  16,  32,  56, 56, 1, 1, 1, 3, 3, 3, },
    {   1,   8,   4,   4,  4, 1, 1, 1, 3, 3, 3, },
    {   1,  32,  16,  28, 28, 1, 1, 1, 3, 5, 5, },
    {   1,  32,  16,  28, 28, 1, 2, 2, 3, 5, 5, },
    {   1,  32,  16,  28, 28, 1, 1, 1, 5, 5, 5, },
};
// clang-format on
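// With "same" padding (PAD = (K - 1) / 2 on each side) and the default
// dilation of 1, conv_param_t computes each output dimension as
//   OUT_DIM = (IN_DIM + 2 * PAD - K) / stride + 1 = ceil(IN_DIM / stride),
// so the stride-1 shapes above keep their spatial size and the stride-2
// shapes roughly halve it. The last three 2D shapes are chosen so that the
// output is smaller than the padding itself, e.g. { 1, 72, 1, 1, 2, 5 }
// gives OH = OW = 1 with padding 2.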
namespace {

class FBGemmDepthWiseTest
    : public testing::TestWithParam<tuple<bool, bool, int>> {};

class FBGemmDepthWisePerChannelQuantizationTest
    : public testing::TestWithParam<int> {};

// Two parameters are K (or Groups) and kernel_prod, i.e.,
// (output_channels)(kernel_prod)
// output_channels == Groups.
// For example, kernel_prod for 3x3 convolution is 9
class FBGemmDepthWisePackUnpackTest
    : public testing::TestWithParam<tuple<int, int>> {};

} // namespace

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    FBGemmDepthWiseTest,
    ::testing::Combine(
        ::testing::Bool(), // a_symmetric
        ::testing::Bool(), // b_symmetric
        ::testing::Values(1, 2))); // oc_per_g

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    FBGemmDepthWisePerChannelQuantizationTest,
    ::testing::Values(1, 2)); // oc_per_g

INSTANTIATE_TEST_CASE_P(
    InstantiationName,
    FBGemmDepthWisePackUnpackTest,
    ::testing::Values(8, 16, 24, 32, 40, 64, 72), // K
    ::testing::Values(1, 2, 3, 4, 5, 9, 10, 11, 27)); // kernel_prod

TEST_P(FBGemmDepthWiseTest, Test2D) {
  bool a_symmetric, b_symmetric;
  int oc_per_g;
  tie(a_symmetric, b_symmetric, oc_per_g) = GetParam();

  for (auto shape : shapes) {
    int N = shape[0];
    int G = shape[1];
    int H = shape[2];
    int W = shape[3];
    int stride_h = shape[4];
    int stride_w = stride_h;
    int R = shape[5];
    int S = R;
    int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
        PAD_R = (S - 1) / 2;

    int OC = G * oc_per_g;

    conv_param_t<2> conv_p(
        N,
        G,
        OC,
        {H, W},
        G,
        {R, S},
        {stride_h, stride_w},
        {PAD_T, PAD_L, PAD_B, PAD_R});
    int H_OUT = conv_p.OUT_DIM[0];
    int W_OUT = conv_p.OUT_DIM[1];

    int MDim = N * H_OUT * W_OUT;
    int KDim = R * S * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = a_symmetric ? 0 : 43;

    randFill<int8_t>(B, -16, 16);
    int32_t B_zero_point = b_symmetric ? 0 : 5;

    aligned_vector<float> C_multiplier(1);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    vector<int32_t> row_offsets(MDim);
    // im2col to compute row offset later
    vector<uint8_t> A_im2col;
    if (!b_symmetric) {
      A_im2col.resize(MDim * KDim);
      im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
    }

    // reference implementation conv_ref expects weights to be in
    // G (R S C/G) K/G layout
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());
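    // Requantize the int32 reference output group by group. Conceptually
    // (a sketch of standard uint8 requantization; requantize_u8acc32_ref is
    // the authoritative reference semantics):
    //   C_uint8 = clamp(C_zero_point + round(C_multiplier *
    //       (C_int32 - B_zero_point * row_offset
    //                - A_zero_point * col_offset + bias)), 0, 255)
    // col_offsets and bias are random here: the test only checks that the
    // optimized path and the reference path agree bit for bit, not that the
    // offsets are physically meaningful.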
    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      if (!b_symmetric) {
        row_offsets_u8acc32_ref(
            MDim,
            KDimPerGroup,
            KDim,
            A_im2col.data() + g * KDimPerGroup,
            row_offsets.data());
      }

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data(),
          C_zero_point,
          A_zero_point,
          &B_zero_point,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          OC);
    }

    PackedDepthWiseConvMatrix Bp(OC, R * S, B.data());

    depthwise_2d_same_pad<QuantizationGranularity::TENSOR>(
        N,
        H,
        W,
        G,
        OC,
        stride_h,
        stride_w,
        A_zero_point,
        A.data(),
        &B_zero_point,
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        a_symmetric ? nullptr : col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0, /* thread_id */
        1 /* num_threads */);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int h = 0; h < H_OUT; ++h) {
        for (int w = 0; w < W_OUT; ++w) {
          for (int k = 0; k < OC; ++k) {
            int32_t expected =
                C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * OC + k];
            int32_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * OC + k];
            EXPECT_EQ(actual, expected)
                << "Depthwise " << R << "x" << S << " results differ at ("
                << n << ", " << h << ", " << w << ", " << k << ").";
          }
        }
      }
    }
  } // for each shape
} // Test2D

TEST_P(FBGemmDepthWiseTest, Test3D) {
  bool a_symmetric, b_symmetric;
  int oc_per_g;
  tie(a_symmetric, b_symmetric, oc_per_g) = GetParam();

  // 3D tests take a long time, so for symmetric quantization we only test
  // with 2 shapes.
  for (auto shape : shapes_3d) {
    int N = shape[0];
    int G = shape[1];
    int T = shape[2];
    int H = shape[3];
    int W = shape[4];
    int stride_t = shape[5];
    int stride_h = shape[6];
    int stride_w = shape[7];
    int K_T = shape[8];
    int K_H = shape[9];
    int K_W = shape[10];
    int PAD_P = (K_T - 1) / 2, PAD_N = PAD_P, PAD_T = (K_H - 1) / 2,
        PAD_B = PAD_T, PAD_L = (K_W - 1) / 2, PAD_R = PAD_L;
    int OC = G * oc_per_g;

    conv_param_t<3> conv_p(
        N,
        G,
        OC,
        {T, H, W},
        G,
        {K_T, K_H, K_W},
        {stride_t, stride_h, stride_w},
        {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R});
    int T_OUT = conv_p.OUT_DIM[0];
    int H_OUT = conv_p.OUT_DIM[1];
    int W_OUT = conv_p.OUT_DIM[2];

    int MDim = N * T_OUT * H_OUT * W_OUT;
    int KDim = K_T * K_H * K_W * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * T * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = a_symmetric ? 0 : 43;

    randFill<int8_t>(B, -16, 16);
    int32_t B_zero_point = b_symmetric ? 0 : 5;

    aligned_vector<float> C_multiplier(1);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    vector<int32_t> row_offsets(MDim);
    // im2col to compute row offset later
    vector<uint8_t> A_im2col;
    if (!b_symmetric) {
      A_im2col.resize(MDim * KDim);
      im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
    }

    // reference implementation conv_ref expects weights to be in
    // G (T R S C/G) K/G layout
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());

    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      if (!b_symmetric) {
        row_offsets_u8acc32_ref(
            MDim,
            KDimPerGroup,
            KDim,
            A_im2col.data() + g * KDimPerGroup,
            row_offsets.data());
      }

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data(),
          C_zero_point,
          A_zero_point,
          &B_zero_point,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          OC);
    }

    PackedDepthWiseConvMatrix Bp(OC, K_T * K_H * K_W, B.data());
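    // As in the 2D test, when the activation is symmetric
    // (A_zero_point == 0) the A_zero_point * col_offset correction term
    // vanishes, so the optimized kernel may be given a null col_offsets
    // pointer; the ternary below exercises both branches.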
    depthwise_3d_same_pad<QuantizationGranularity::TENSOR>(
        conv_p,
        A_zero_point,
        A.data(),
        &B_zero_point,
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        a_symmetric ? nullptr : col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0, /* thread_id */
        1 /* num_threads */);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int t = 0; t < T_OUT; ++t) {
        for (int h = 0; h < H_OUT; ++h) {
          for (int w = 0; w < W_OUT; ++w) {
            for (int k = 0; k < OC; ++k) {
              int32_t expected = C_uint8_ref
                  [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              int32_t actual =
                  C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              EXPECT_EQ(actual, expected)
                  << "Depthwise 3D results differ at (" << n << ", " << t
                  << ", " << h << ", " << w << ", " << k << ").";
            }
          } // w
        } // h
      } // t
    } // n
  } // for each shape
} // Test3D

TEST_P(
    FBGemmDepthWisePerChannelQuantizationTest,
    Test2DPerChannelQuantization) {
  int oc_per_g = GetParam();

  for (auto shape : shapes) {
    int N = shape[0];
    int G = shape[1];
    int H = shape[2];
    int W = shape[3];
    int stride_h = shape[4];
    int stride_w = stride_h;
    int R = shape[5];
    int S = R;
    int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
        PAD_R = (S - 1) / 2;

    int OC = G * oc_per_g;

    conv_param_t<2> conv_p(
        N,
        G,
        OC,
        {H, W},
        G,
        {R, S},
        {stride_h, stride_w},
        {PAD_T, PAD_L, PAD_B, PAD_R});
    int H_OUT = conv_p.OUT_DIM[0];
    int W_OUT = conv_p.OUT_DIM[1];

    int MDim = N * H_OUT * W_OUT;
    int KDim = R * S * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = 43;

    // Each row of G has a different range to really test per-channel
    // quantization.
    vector<int32_t> B_zero_point(OC);
    for (auto k = 0; k < OC; ++k) {
      aligned_vector<int8_t> Bk(R * S);
      // limit min and max to the int8_t range
      randFill<int8_t>(Bk, -16 + k % 112, 16 + k % 112);
      copy(Bk.begin(), Bk.end(), B.begin() + k * R * S);

      B_zero_point[k] = 5 + k;
    }

    aligned_vector<float> C_multiplier(OC);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    // im2col to compute row offset later
    vector<int32_t> row_offsets(MDim);
    vector<uint8_t> A_im2col(MDim * KDim);
    im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());

    // reference implementation conv_ref expects weights to be in
    // G (R S C/G) K/G layout
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());
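    // Note on the trailing argument of requantize_u8acc32_ref: it is the
    // number of columns sharing one quantization group. In the
    // tensor-granularity tests above it is OC (one B_zero_point and one
    // C_multiplier for the whole tensor); here it is 1, so every output
    // channel gets its own zero point and multiplier.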
    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      row_offsets_u8acc32_ref(
          MDim,
          KDimPerGroup,
          KDim,
          A_im2col.data() + g * KDimPerGroup,
          row_offsets.data());

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data() + g * oc_per_g,
          C_zero_point,
          A_zero_point,
          B_zero_point.data() + g * oc_per_g,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          1);
    }

    PackedDepthWiseConvMatrix Bp(OC, R * S, B.data());

    depthwise_2d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
        N,
        H,
        W,
        G,
        OC,
        stride_h,
        stride_w,
        A_zero_point,
        A.data(),
        B_zero_point.data(),
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0, /* thread_id */
        1 /* num_threads */);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int h = 0; h < H_OUT; ++h) {
        for (int w = 0; w < W_OUT; ++w) {
          for (int k = 0; k < OC; ++k) {
            int32_t expected =
                C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * OC + k];
            int32_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * OC + k];
            EXPECT_EQ(actual, expected)
                << "Depthwise " << R << "x" << S << " results differ at ("
                << n << ", " << h << ", " << w << ", " << k << ").";
          }
        }
      }
    }
  } // for each shape
} // Test2DPerChannelQuantization

TEST_P(
    FBGemmDepthWisePerChannelQuantizationTest,
    Test3DPerChannelQuantization) {
  int oc_per_g = GetParam();

  for (auto shape : shapes_3d) {
    int N = shape[0];
    int G = shape[1];
    int T = shape[2];
    int H = shape[3];
    int W = shape[4];
    int stride_t = shape[5];
    int stride_h = shape[6];
    int stride_w = shape[7];
    int K_T = shape[8];
    int K_H = shape[9];
    int K_W = shape[10];
    int PAD_P = (K_T - 1) / 2, PAD_N = PAD_P, PAD_T = (K_H - 1) / 2,
        PAD_B = PAD_T, PAD_L = (K_W - 1) / 2, PAD_R = PAD_L;
    int OC = G * oc_per_g;

    conv_param_t<3> conv_p(
        N,
        G,
        OC,
        {T, H, W},
        G,
        {K_T, K_H, K_W},
        {stride_t, stride_h, stride_w},
        {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R});
    int T_OUT = conv_p.OUT_DIM[0];
    int H_OUT = conv_p.OUT_DIM[1];
    int W_OUT = conv_p.OUT_DIM[2];

    int MDim = N * T_OUT * H_OUT * W_OUT;
    int KDim = K_T * K_H * K_W * G;
    int KDimPerGroup = KDim / G;

    aligned_vector<uint8_t> A(N * T * H * W * G);
    aligned_vector<int8_t> B(KDim * oc_per_g);
    aligned_vector<int8_t> B_tr(B.size());
    aligned_vector<int32_t> C_ref(MDim * OC), C(C_ref.size());
    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());

    randFill<uint8_t>(A, 0, 86);
    int32_t A_zero_point = 43;

    // Each row of G has a different range to really test per-channel
    // quantization.
    vector<int32_t> B_zero_point(OC);
    for (auto k = 0; k < OC; ++k) {
      aligned_vector<int8_t> Bk(K_T * K_H * K_W);
      // limit min and max to the int8_t range
      randFill<int8_t>(Bk, -16 + k % 112, 16 + k % 112);
      copy(Bk.begin(), Bk.end(), B.begin() + k * K_T * K_H * K_W);

      B_zero_point[k] = 5 + k;
    }

    aligned_vector<float> C_multiplier(OC);
    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
    int32_t C_zero_point = 5;

    aligned_vector<int32_t> col_offsets(OC);
    aligned_vector<int32_t> bias(OC);
    randFill(col_offsets, -100, 100);
    randFill(bias, -40, 40);

    vector<int32_t> row_offsets(MDim);
    // im2col to compute row offset later
    vector<uint8_t> A_im2col(MDim * KDim);
    im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());

    // reference implementation conv_ref expects weights to be in
    // G (T R S C/G) K/G layout
    transposeConvWeights(conv_p, B.data(), B_tr.data());
    conv_ref(conv_p, A.data(), A_zero_point, B_tr.data(), C_ref.data());

    for (int g = 0; g < conv_p.G; ++g) {
      // Compute row offset
      row_offsets_u8acc32_ref(
          MDim,
          KDimPerGroup,
          KDim,
          A_im2col.data() + g * KDimPerGroup,
          row_offsets.data());

      // Requantization
      requantize_u8acc32_ref(
          MDim,
          oc_per_g,
          OC,
          C_ref.data() + g * oc_per_g,
          C_uint8_ref.data() + g * oc_per_g,
          C_multiplier.data() + g * oc_per_g,
          C_zero_point,
          A_zero_point,
          B_zero_point.data() + g * oc_per_g,
          row_offsets.data(),
          col_offsets.data() + g * oc_per_g,
          bias.data() + g * oc_per_g,
          1);
    }

    PackedDepthWiseConvMatrix Bp(OC, K_T * K_H * K_W, B.data());

    depthwise_3d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
        conv_p,
        A_zero_point,
        A.data(),
        B_zero_point.data(),
        Bp,
        C_multiplier.data(),
        C_zero_point,
        C_uint8.data(),
        col_offsets.data(),
        bias.data(),
        false, /* fuse_relu */
        nullptr, /* act_scale * w_scale */
        0, /* thread_id */
        1 /* num_threads */);

    // correctness check
    for (int n = 0; n < N; ++n) {
      for (int t = 0; t < T_OUT; ++t) {
        for (int h = 0; h < H_OUT; ++h) {
          for (int w = 0; w < W_OUT; ++w) {
            for (int k = 0; k < OC; ++k) {
              int32_t expected = C_uint8_ref
                  [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              int32_t actual =
                  C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * OC + k];
              ASSERT_EQ(actual, expected)
                  << "Depthwise 3D results differ at (" << n << ", " << t
                  << ", " << h << ", " << w << ", " << k << ").";
            }
          } // w
        } // h
      } // t
    } // n
  } // for each shape
} // Test3DPerChannelQuantization
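// The pack/unpack round trip below must be lossless: unpack() is expected to
// invert the interleaved layout that PackedDepthWiseConvMatrix produces, so
// the unpacked bytes are compared for exact equality with the original B.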
TEST_P(FBGemmDepthWisePackUnpackTest, TestPackUnpack) {
  int K, kernel_prod;
  tie(K, kernel_prod) = GetParam();

  ASSERT_EQ(K % 8, 0)
      << "output channels (== groups) should be a multiple of 8";
  aligned_vector<int8_t> B(K * kernel_prod);
  randFill<int8_t>(B, -16, 16);

  aligned_vector<int8_t> BUnpacked(K * kernel_prod);

  PackedDepthWiseConvMatrix BPacked(K, kernel_prod, B.data());
  BPacked.unpack(BUnpacked.data());

  ASSERT_EQ(BUnpacked, B)
      << "Original and unpacked data elements are not the same";
} // TestPackUnpack

} // namespace fbgemm