/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#define FBGEMM_EXPORTS
#include <algorithm>
#include <functional>
#include <numeric>
#include <stdexcept> // for logic_error
#include <vector>
#include "fbgemm/Fbgemm.h"

namespace fbgemm {

template <int SPATIAL_DIM, typename ACC_T>
bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
  // Note: Depthwise convolutions (both 2D and 3D) are optimized for the most
  // common cases:
  //   2D: 3x3, 5x5, or 7x7 filters
  //   3D: (3, 5, or 7)x(3x3, 5x5, or 7x7) filters
  bool ret = std::is_same<ACC_T, std::int32_t>::value &&
      conv_p.G == conv_p.IC &&
      (conv_p.G == conv_p.OC || conv_p.G * 2 == conv_p.OC) &&
      conv_p.G % 8 == 0 &&
      std::all_of(
          conv_p.stride.begin(),
          conv_p.stride.end(),
          [](int i) { return i == 1 || i == 2; }) &&
      SPATIAL_DIM >= 2 &&
      conv_p.K[SPATIAL_DIM - 2] == conv_p.K[SPATIAL_DIM - 1] &&
      std::all_of(
          conv_p.K.begin(),
          conv_p.K.end(),
          [](int i) { return i == 3 || i == 5 || i == 7; }) &&
      std::all_of(
          conv_p.dilation.begin(),
          conv_p.dilation.end(),
          [](int i) { return i == 1; }) &&
      !conv_p.transposed;

  // Check that the pads yield the same input and output spatial dimensions.
  for (int i = 0; i < SPATIAL_DIM; ++i) {
    if (conv_p.pad[i] != (conv_p.K[i] - 1) / 2 ||
        conv_p.pad[i] != conv_p.pad[SPATIAL_DIM + i]) {
      ret = false;
    }
  }

  return ret;
}
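// Illustrative example (not part of the original source): a 3x3, stride-1,
// pad-1 depthwise convolution with G == IC == OC == 64 satisfies every check
// above and takes the depthwise fast path:
//
//   conv_param_t<2> p(
//       1, 64, 64, {56, 56}, 64, {3, 3}, {1, 1}, {1, 1, 1, 1});
//   assert((takeDepthWiseFastPath<2, std::int32_t>(p)));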
template <int SPATIAL_DIM>
bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
  // A pointwise convolution is a 1x1(x1) filter with unit stride and
  // dilation and no padding; each sum below equals SPATIAL_DIM exactly when
  // every per-dimension entry is 1.
  return std::accumulate(conv_p.K.begin(), conv_p.K.end(), 0) == SPATIAL_DIM &&
      std::accumulate(conv_p.stride.begin(), conv_p.stride.end(), 0) ==
      SPATIAL_DIM &&
      std::accumulate(conv_p.dilation.begin(), conv_p.dilation.end(), 0) ==
      SPATIAL_DIM &&
      std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0 &&
      !conv_p.transposed;
}

template <int SPATIAL_DIM>
bool take1DFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
  // The 1D fast path is not implemented yet; this always returns false.
  return false && !conv_p.transposed;
}

template <int SPATIAL_DIM, typename ACC_T>
bool takeDirectConvPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
  // Note: Direct convolutions (2D) are optimized for
  //   filter size: 2 x 1 to 2 x 6, transposed conv,
  //   in_channel % 8 == 0, out_channel % 8 == 0,
  //   stride = 1 or 2,
  //   padding = 0 (non-zero padding will be supported soon).
  bool ret = std::is_same<ACC_T, std::int32_t>::value && conv_p.transposed &&
      conv_p.G == 1 && conv_p.IC % 8 == 0 && conv_p.OC % 8 == 0 &&
      std::all_of(
          conv_p.stride.begin(),
          conv_p.stride.end(),
          [](int i) { return i == 1 || i == 2; }) &&
      SPATIAL_DIM == 2 && conv_p.K[SPATIAL_DIM - 2] == 2 &&
      conv_p.K[SPATIAL_DIM - 1] <= 6 &&
      std::all_of(conv_p.dilation.begin(), conv_p.dilation.end(), [](int i) {
        return i == 1;
      });

  // Check pads: only zero padding is supported.
  for (int i = 0; i < SPATIAL_DIM; ++i) {
    if (conv_p.pad[i] != 0) {
      ret = false;
    }
  }
  // The direct convolution path is currently disabled unconditionally.
  ret = false;
  return ret;
}

template <int SPATIAL_DIM, typename ACC_T>
optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
  if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
    return optimized_conv_t::depthwise;
  } else if (fbgemmOptimizedGConv(conv_p)) {
    return optimized_conv_t::groupwise;
  } else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) {
    return optimized_conv_t::pointwise;
  } else if (takeDirectConvPath<SPATIAL_DIM, ACC_T>(conv_p)) {
    return optimized_conv_t::directconv;
  } else if (take1DFastPath<SPATIAL_DIM>(conv_p)) {
    return optimized_conv_t::fastpath1d;
  } else {
    return optimized_conv_t::im2col;
  }
}
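// Typical call sequence (illustrative sketch, not part of the original
// source; constructor arguments elided where they depend on the caller's
// quantization parameters): weights are packed once with the same
// conv_param_t that is later passed to fbgemmConv, and the packed object
// routes to the kernel chosen by ConvFastPath:
//
//   PackWeightsForConv<2> packed_w(conv_p, weights_int8);
//   ReQuantizeOutput<false> req_op(/* ... requantization params ... */);
//   fbgemmConv(
//       conv_p, activations_uint8, packed_w, out_uint8, out_buf_int32,
//       req_op, thread_id, num_threads);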
template <typename processOutputType, int SPATIAL_DIM, typename ACC_T>
int fbgemmConv(
    const conv_param_t<SPATIAL_DIM>& conv_p,
    const std::uint8_t* activations,
    PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights,
    typename processOutputType::outType* out,
    std::int32_t* outBuffer,
    processOutputType& outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors* blocking_params) {
  if (!packed_weights.isPackingCompliant(conv_p)) {
    std::string msg =
        "[FBGEMM_CONV_ERROR] Convolution parameters "
        "mismatch between pre-packed weights and conv invocation! ";
    msg += packed_weights.mismatchingParams(conv_p);
    msg += std::string(
        " Please pack weights using the same parameters "
        "with which the convolution operation is invoked!");
    throw std::logic_error(msg);
  }

  // Dispatch to the kernel selected by ConvFastPath; im2col is the generic
  // fallback.
  switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
    case optimized_conv_t::depthwise: {
      // 2D and 3D depthwise fast path
      // std::cout << "Depthwise fast path" << std::endl;
      const std::int32_t* B_zero_point = outProcess.getBZeroPoint();
      const float* C_multiplier = outProcess.getCMultiplier();
      const float* act_times_w_scale = outProcess.getActWScale();
      if (SPATIAL_DIM == 3) {
        static_assert(
            std::is_same<typename processOutputType::outType, std::uint8_t>::
                value,
            "For depthwise, only requantized output is supported");

        if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
          depthwise_3d_same_pad<QuantizationGranularity::TENSOR>(
              *reinterpret_cast<const conv_param_t<3>*>(&conv_p),
              outProcess.getAZeroPoint(),
              activations,
              B_zero_point,
              *(packed_weights.getPackedWForDepthwise()),
              C_multiplier,
              outProcess.getCZeroPoint(),
              out,
              outProcess.getColOffsets(),
              outProcess.getBias(),
              outProcess.RELU_FUSED, // fuse_relu
              act_times_w_scale,
              thread_id,
              num_threads);
        } else if (
            processOutputType::QGRANType == QuantizationGranularity::GROUP) {
          depthwise_3d_same_pad<QuantizationGranularity::GROUP>(
              *reinterpret_cast<const conv_param_t<3>*>(&conv_p),
              outProcess.getAZeroPoint(),
              activations,
              B_zero_point,
              *(packed_weights.getPackedWForDepthwise()),
              C_multiplier,
              outProcess.getCZeroPoint(),
              out,
              outProcess.getColOffsets(),
              outProcess.getBias(),
              outProcess.RELU_FUSED, // fuse_relu
              act_times_w_scale, // act_scale * weight_scale
              thread_id,
              num_threads);
        } else if (
            processOutputType::QGRANType ==
            QuantizationGranularity::OUT_CHANNEL) {
          depthwise_3d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
              *reinterpret_cast<const conv_param_t<3>*>(&conv_p),
              outProcess.getAZeroPoint(),
              activations,
              B_zero_point,
              *(packed_weights.getPackedWForDepthwise()),
              C_multiplier,
              outProcess.getCZeroPoint(),
              out,
              outProcess.getColOffsets(),
              outProcess.getBias(),
              outProcess.RELU_FUSED, // fuse_relu
              act_times_w_scale, // act_scale * weight_scale
              thread_id,
              num_threads);
        } else {
          std::string msg =
              "[FBGEMM_CONV_ERROR] This quantization granularity is "
              "not supported";
          throw std::runtime_error(msg);
        }
      } else if (SPATIAL_DIM == 2) {
        if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
          depthwise_2d_same_pad<QuantizationGranularity::TENSOR>(
              conv_p.MB, // mini batch
              conv_p.IN_DIM[0], // H
              conv_p.IN_DIM[1], // W
              conv_p.IC, // input channels
              conv_p.OC, // output channels
              conv_p.stride[0], // stride_h
              conv_p.stride[1], // stride_w
              outProcess.getAZeroPoint(),
              activations,
              B_zero_point,
              *(packed_weights.getPackedWForDepthwise()),
              C_multiplier,
              outProcess.getCZeroPoint(),
              out,
              outProcess.getColOffsets(),
              outProcess.getBias(),
              outProcess.RELU_FUSED, // fuse_relu
              act_times_w_scale,
              thread_id,
              num_threads);
        } else if (
            processOutputType::QGRANType == QuantizationGranularity::GROUP) {
          depthwise_2d_same_pad<QuantizationGranularity::GROUP>(
              conv_p.MB, // mini batch
              conv_p.IN_DIM[0], // H
              conv_p.IN_DIM[1], // W
              conv_p.IC, // input channels
              conv_p.OC, // output channels
              conv_p.stride[0], // stride_h
              conv_p.stride[1], // stride_w
              outProcess.getAZeroPoint(),
              activations,
              B_zero_point,
              *(packed_weights.getPackedWForDepthwise()),
              C_multiplier,
              outProcess.getCZeroPoint(),
              out,
              outProcess.getColOffsets(),
              outProcess.getBias(),
              outProcess.RELU_FUSED, // fuse_relu
              act_times_w_scale, // act_scale * weight_scale
              thread_id,
              num_threads);
        } else if (
            processOutputType::QGRANType ==
            QuantizationGranularity::OUT_CHANNEL) {
          // The number of input channels == groups for depthwise convolutions
          depthwise_2d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
              conv_p.MB, // mini batch
              conv_p.IN_DIM[0], // H
              conv_p.IN_DIM[1], // W
              conv_p.IC, // input channels
              conv_p.OC, // output channels
              conv_p.stride[0], // stride_h
              conv_p.stride[1], // stride_w
              outProcess.getAZeroPoint(),
              activations,
              B_zero_point,
              *(packed_weights.getPackedWForDepthwise()),
              C_multiplier,
              outProcess.getCZeroPoint(),
              out,
              outProcess.getColOffsets(),
              outProcess.getBias(),
              outProcess.RELU_FUSED, // fuse_relu
              act_times_w_scale, // act_scale * weight_scale
              thread_id,
              num_threads);
        } else {
          std::string msg =
              "[FBGEMM_CONV_ERROR] This quantization granularity is "
              "not supported";
          throw std::runtime_error(msg);
        }
      } else {
        std::string msg =
            "[FBGEMM_CONV_ERROR] This spatial dim is not supported";
        throw std::runtime_error(msg);
      }
      break;
    }
    case optimized_conv_t::groupwise: {
      // optimized groupwise convolution
      // std::cout << "Groupwise fast path" << std::endl;
      std::vector<std::int32_t> row_offset_buf(
          rowOffsetBufferSizeGConv<SPATIAL_DIM>(conv_p));
      outProcess.setRowOffsets(row_offset_buf.data());
      fbgemmGroupwiseConv(
          conv_p,
          activations,
          outProcess.getAZeroPoint(),
          row_offset_buf.data(),
          *(packed_weights.getPackedWForGroupwise()),
          out,
          outBuffer,
          outProcess,
          thread_id,
          num_threads);
      break;
    }
    case optimized_conv_t::pointwise: {
      std::vector<std::int32_t> row_offset_buf(
          PackAWithRowOffset<std::uint8_t, ACC_T>::rowOffsetBufferSize(
              blocking_params));
      int image_dim = std::accumulate(
          conv_p.IN_DIM.begin(),
          conv_p.IN_DIM.end(),
          1,
          std::multiplies<int>());
      PackAWithRowOffset<std::uint8_t, ACC_T> packA(
          matrix_op_t::NoTranspose,
          conv_p.MB * image_dim,
          conv_p.IC,
          activations,
          conv_p.IC,
          nullptr,
          conv_p.G,
          row_offset_buf.data(),
          blocking_params);

      outProcess.setRowOffsets(row_offset_buf.data());
      fbgemmPacked(
          packA,
          *(packed_weights.getPackedWForPointwise()),
          out,
          outBuffer,
          conv_p.OC,
          outProcess,
          thread_id,
          num_threads,
          blocking_params);
      break;
    }
    case optimized_conv_t::directconv: {
      // specialized direct convolution path
      // std::cout << "Directconv fast path" << std::endl;
      fbgemmDirectConv(
          conv_p,
          // Aint8,
          activations,
          *(packed_weights.getPackedWForDirectconv()),
          out,
          outBuffer,
          outProcess,
          outProcess.getBias(),
          thread_id,
          num_threads);
      break;
    }
    case optimized_conv_t::fastpath1d: {
      break;
    }
    case optimized_conv_t::im2col: {
      // All other convolutions go through the im2col-based implementation
      // std::cout << "Im2col path" << std::endl;
      std::vector<std::int32_t> row_offset_buf(
          PackAWithIm2Col<std::uint8_t, ACC_T, SPATIAL_DIM>::
              rowOffsetBufferSize(blocking_params));

      const std::int32_t* b_zero_point = outProcess.getBZeroPoint();
      bool b_symmetric = false;
      if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
        b_symmetric = b_zero_point[0] == 0;
      } else if (
          processOutputType::QGRANType == QuantizationGranularity::GROUP) {
        b_symmetric =
            std::all_of(b_zero_point, b_zero_point + conv_p.G, [](int i) {
              return i == 0;
            });
      } else if (
          processOutputType::QGRANType ==
          QuantizationGranularity::OUT_CHANNEL) {
        b_symmetric =
            std::all_of(b_zero_point, b_zero_point + conv_p.OC, [](int i) {
              return i == 0;
            });
      } else {
        std::string msg =
            "[FBGEMM_CONV_ERROR] This quantization granularity is "
            "not supported";
        throw std::runtime_error(msg);
      }
      PackAWithIm2Col<std::uint8_t, ACC_T, SPATIAL_DIM> packA(
          conv_p,
          activations,
          nullptr, /* buffer for packed matrix */
          outProcess.getAZeroPoint(),
          row_offset_buf.data(),
          b_symmetric,
          blocking_params);

      outProcess.setRowOffsets(row_offset_buf.data());
      fbgemmPacked(
          packA,
          *(packed_weights.getPackedWForIm2col()),
          out,
          outBuffer,
          conv_p.OC,
          outProcess,
          thread_id,
          num_threads,
          blocking_params);
      break;
    }
  } // switch

  return 0;
}
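// Explicit instantiations. The nested macros below expand to one fbgemmConv
// instantiation per combination of accumulation type, quantization
// granularity (TENSOR / GROUP / OUT_CHANNEL), ReLU fusion flag, spatial
// dimension (1-3), and bias type (float or int32_t).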
#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE)       \
  template FBGEMM_API int fbgemmConv(                                       \
      const conv_param_t<SPATIAL_DIM>& conv_p,                              \
      const std::uint8_t* activations,                                      \
      PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights,  \
      std::uint8_t* out,                                                    \
      std::int32_t* outBuffer,                                              \
      ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess,                \
      int thread_id,                                                        \
      int num_threads,                                                      \
      const BlockingFactors* blocking_params);

#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \
  INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float)  \
  INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t)

#define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \
  INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 1)         \
  INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2)         \
  INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3)

#define INSTANTIATE_RELU(ACC_T, Q_GRAN)        \
  INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true) \
  INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, false)

#define INSTANTIATE_Q_GRANS(ACC_T)                         \
  INSTANTIATE_RELU(ACC_T, QuantizationGranularity::TENSOR) \
  INSTANTIATE_RELU(ACC_T, QuantizationGranularity::GROUP)  \
  INSTANTIATE_RELU(ACC_T, QuantizationGranularity::OUT_CHANNEL)

INSTANTIATE_Q_GRANS(std::int32_t)

#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_RELU
#undef INSTANTIATE_SPATIAL_DIM
#undef INSTANTIATE_BIAS_T
#undef INSTANTIATE_BASE

template bool takeDepthWiseFastPath<2, std::int32_t>(
    const conv_param_t<2>& conv_p);
template bool takeDepthWiseFastPath<3, std::int32_t>(
    const conv_param_t<3>& conv_p);
template bool takeDepthWiseFastPath<2, std::int16_t>(
    const conv_param_t<2>& conv_p);
template bool takeDepthWiseFastPath<3, std::int16_t>(
    const conv_param_t<3>& conv_p);

template bool takeDirectConvPath<2, std::int32_t>(
    const conv_param_t<2>& conv_p);
template bool takeDirectConvPath<3, std::int32_t>(
    const conv_param_t<3>& conv_p);
template bool takeDirectConvPath<2, std::int16_t>(
    const conv_param_t<2>& conv_p);
template bool takeDirectConvPath<3, std::int16_t>(
    const conv_param_t<3>& conv_p);

template FBGEMM_API optimized_conv_t
ConvFastPath<1, std::int32_t>(const conv_param_t<1>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<2, std::int32_t>(const conv_param_t<2>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<3, std::int32_t>(const conv_param_t<3>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<1, std::int16_t>(const conv_param_t<1>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<2, std::int16_t>(const conv_param_t<2>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<3, std::int16_t>(const conv_param_t<3>& conv_p);

} // namespace fbgemm