// Copyright 2024 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. #include "xnnpack/vunary.h" #include #include #include #include #include #include #include #include #include "utils.h" #include "xnnpack.h" #include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" #include "xnnpack/vcvt.h" #include "xnnpack/vunary.h" #include template struct UniformDistribution { std::uniform_real_distribution dist{-10.0f, 10.0f}; template T operator()(Generator& g) { return dist(g); } }; template <> struct UniformDistribution { std::uniform_real_distribution dist{-10.0f, 10.0f}; template xnn_float16 operator()(Generator& g) { return static_cast(dist(g)); } }; template <> struct UniformDistribution { std::uniform_int_distribution dist{ std::numeric_limits::lowest(), std::numeric_limits::max()}; template int8_t operator()(Generator& g) { return dist(g); } }; template <> struct UniformDistribution { std::uniform_int_distribution dist{ std::numeric_limits::lowest(), std::numeric_limits::max()}; template uint8_t operator()(Generator& g) { return dist(g); } }; template <> struct UniformDistribution { std::uniform_int_distribution dist{ std::numeric_limits::lowest(), std::numeric_limits::max()}; template int16_t operator()(Generator& g) { return dist(g); } }; template <> struct UniformDistribution { std::uniform_int_distribution dist{ std::numeric_limits::lowest(), std::numeric_limits::max()}; template int32_t operator()(Generator& g) { return dist(g); } }; template <> struct UniformDistribution { std::uniform_int_distribution dist{ std::numeric_limits::lowest(), std::numeric_limits::max()}; template uint32_t operator()(Generator& g) { return dist(g); } }; template xnn_quantization_params InputQuantization(T) { return {0, 1.0f}; } template xnn_quantization_params OutputQuantization(T) { return {0, 1.0f}; } xnn_quantization_params InputQuantization(int8_t) { return {1, 0.5f}; } xnn_quantization_params OutputQuantization(int8_t) { return {-1, 0.7f}; } xnn_quantization_params InputQuantization(uint8_t) { return {129, 0.5f}; } xnn_quantization_params OutputQuantization(uint8_t) { return {127, 0.7f}; } // Microkernel function, templated on the `params` type. template using UKernelFn = void (*)(size_t, const TIn*, TOut*, const UKernelParams* params); template void vunary(benchmark::State& state, uint64_t arch_flags, UKernelFn ukernel, xnn_init_unary_uparams_fn init_params, const xnn_unary_params* params = nullptr, const xnn_quantization_params& input_quantization = InputQuantization(TIn()), const xnn_quantization_params& output_quantization = OutputQuantization(TOut())) { if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { return; } const size_t num_elements = state.range(0); xnn_unary_uparams uparams; if (init_params) { init_params(&uparams, params, &input_quantization, &output_quantization); } std::random_device random_device; auto rng = std::mt19937(random_device()); UniformDistribution dist; xnnpack::Buffer x( num_elements + XNN_EXTRA_BYTES / sizeof(TIn)); xnnpack::Buffer y(num_elements); std::generate(x.begin(), x.end(), [&]() { return dist(rng); }); for (auto _ : state) { ukernel(num_elements * sizeof(TIn), x.data(), y.data(), (UKernelParams*)&uparams); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } const size_t elements_per_iteration = num_elements; state.counters["elements"] = benchmark::Counter( static_cast(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); const size_t bytes_per_iteration = num_elements * (sizeof(TIn) + sizeof(TOut)); state.counters["bytes"] = benchmark::Counter( static_cast(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ datatype, params_type, init_params) \ BENCHMARK_CAPTURE(vunary, ukernel, arch_flags, ukernel, init_params) \ ->Apply( \ benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); #include "f16-vabs/f16-vabs.h" #include "f16-vhswish/f16-vhswish.h" #include "f16-vneg/f16-vneg.h" #include "f16-vrnd/f16-vrndd.h" #include "f16-vrnd/f16-vrndne.h" #include "f16-vrnd/f16-vrndu.h" #include "f16-vrnd/f16-vrndz.h" #include "f16-vrsqrt/f16-vrsqrt.h" #include "f16-vsigmoid/f16-vsigmoid.h" #include "f16-vsqr/f16-vsqr.h" #include "f16-vsqrt/f16-vsqrt.h" #include "f16-vtanh/f16-vtanh.h" #include "f32-vabs/f32-vabs.h" #include "f32-vgelu/f32-vgelu.h" #include "f32-vhswish/f32-vhswish.h" #include "f32-vlog/f32-vlog.h" #include "f32-vneg/f32-vneg.h" #include "f32-vrelu/f32-vrelu.h" #include "f32-vrnd/f32-vrndd.h" #include "f32-vrnd/f32-vrndne.h" #include "f32-vrnd/f32-vrndu.h" #include "f32-vrnd/f32-vrndz.h" #include "f32-vrsqrt/f32-vrsqrt.h" #include "f32-vsigmoid/f32-vsigmoid.h" #include "f32-vsqr/f32-vsqr.h" #include "f32-vsqrt/f32-vsqrt.h" #include "f32-vtanh/f32-vtanh.h" #undef XNN_UKERNEL_WITH_PARAMS template void velu(benchmark::State& state, uint64_t arch_flags, UKernelFn ukernel, xnn_init_unary_uparams_fn init_params) { xnn_unary_params params; params.elu.alpha = 1.0f; vunary(state, arch_flags, ukernel, init_params, ¶ms); } #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ datatype, params_type, init_params) \ BENCHMARK_CAPTURE(velu, ukernel, arch_flags, ukernel, init_params) \ ->Apply( \ benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); #include "f16-velu/f16-velu.h" #include "f32-velu/f32-velu.h" #undef XNN_UKERNEL_WITH_PARAMS template void vclamp(benchmark::State& state, uint64_t arch_flags, UKernelFn ukernel, xnn_init_unary_uparams_fn init_params) { xnn_unary_params params; params.clamp.min = -1.0f; params.clamp.max = 1.0f; // These kernels cannot handle changing quantization parameters. xnn_quantization_params input_quantization = {0, 1.0f}; xnn_quantization_params output_quantization = {0, 1.0f}; vunary(state, arch_flags, ukernel, init_params, ¶ms, input_quantization, output_quantization); } #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ datatype, params_type, init_params) \ BENCHMARK_CAPTURE(vclamp, ukernel, arch_flags, ukernel, init_params) \ ->Apply( \ benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); #include "f16-vclamp/f16-vclamp.h" #include "f32-vclamp/f32-vclamp.h" #include "s8-vclamp/s8-vclamp.h" #include "u8-vclamp/u8-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS template void vlrelu(benchmark::State& state, uint64_t arch_flags, UKernelFn ukernel, xnn_init_unary_uparams_fn init_params) { xnn_unary_params params; params.leaky_relu.negative_slope = 0.5f; vunary(state, arch_flags, ukernel, init_params, ¶ms); } #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ datatype, params_type, init_params) \ BENCHMARK_CAPTURE(vlrelu, ukernel, arch_flags, ukernel, init_params) \ ->Apply( \ benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); #include "f16-vlrelu/f16-vlrelu.h" #include "f32-vlrelu/f32-vlrelu.h" #include "qs8-vlrelu/qs8-vlrelu.h" #include "qu8-vlrelu/qu8-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, \ vector_tile, datatype_in, datatype_out, \ params_type, init_params) \ BENCHMARK_CAPTURE(vunary, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); #include "f16-f32-vcvt/f16-f32-vcvt.h" #include "f16-qs8-vcvt/f16-qs8-vcvt.h" #include "f32-f16-vcvt/f32-f16-vcvt.h" #include "f32-qs8-vcvt/f32-qs8-vcvt.h" #include "f32-qu8-vcvt/f32-qu8-vcvt.h" #include "qs8-f16-vcvt/qs8-f16-vcvt.h" #include "qs8-f32-vcvt/qs8-f32-vcvt.h" #include "qs8-vcvt/qs8-vcvt.h" #include "qu8-f32-vcvt/qu8-f32-vcvt.h" #include "qu8-vcvt/qu8-vcvt.h" #include "s32-f32-vcvt/s32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN BENCHMARK_MAIN(); #endif