// Copyright 2024 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. #pragma once #include #include #include #include #include #include #include "rsum-benchmark.h" #include "utils.h" #include "xnnpack.h" #include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/reduce.h" #include namespace { void f16_rsum( benchmark::State& state, xnn_f16_rsum_ukernel_fn rsum, xnn_init_f16_scale_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t channels = state.range(0); const size_t rows = state.range(1); xnnpack::Buffer input( rows * channels + XNN_EXTRA_BYTES / sizeof(xnn_float16)); xnnpack::Buffer output(rows); std::iota(input.begin(), input.end(), 1); // Prepare parameters. xnn_f16_scale_params params; init_params(¶ms, /*scale=*/1); for (auto _ : state) { for (int i = 0; i < rows; ++i) { rsum(channels * sizeof(xnn_float16), &input[i * channels], &output[i], ¶ms); } } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void f16_f32acc_rsum( benchmark::State& state, xnn_f16_f32acc_rsum_ukernel_fn rsum, xnn_init_f16_f32acc_scale_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t channels = state.range(0); const size_t rows = state.range(1); xnnpack::Buffer input( rows * channels + XNN_EXTRA_BYTES / sizeof(xnn_float16)); xnnpack::Buffer output(rows); std::iota(input.begin(), input.end(), 1); // Prepare parameters. xnn_f16_f32acc_scale_params params; init_params(¶ms, /*scale=*/1.0f); for (auto _ : state) { for (int i = 0; i < rows; ++i) { rsum(channels * sizeof(xnn_float16), &input[i * channels], &output[i], ¶ms); } } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void f32_rsum( benchmark::State& state, xnn_f32_rsum_ukernel_fn rsum, xnn_init_f32_scale_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t channels = state.range(0); const size_t rows = state.range(1); xnnpack::Buffer input( rows * channels + XNN_EXTRA_BYTES / sizeof(float)); xnnpack::Buffer output(rows); std::iota(input.begin(), input.end(), 1); // Prepare parameters. xnn_f32_scale_params params; init_params(¶ms, /*scale=*/1.0f); for (auto _ : state) { for (int i = 0; i < rows; ++i) { rsum(channels * sizeof(float), &input[i * channels], &output[i], ¶ms); } } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void qs8_rsum( benchmark::State& state, xnn_qs8_rsum_ukernel_fn rsum, xnn_init_qs8_rsum_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t channels = state.range(0); const size_t rows = state.range(1); xnnpack::Buffer input(rows * channels + XNN_EXTRA_BYTES); xnnpack::Buffer output(rows); std::iota(input.begin(), input.end(), 1); // Prepare parameters. struct xnn_qs8_rsum_params params; if (init_params) { init_params(¶ms); } for (auto _ : state) { for (int i = 0; i < rows; ++i) { rsum(channels, &input[i * channels], &output[i], ¶ms); } } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void qu8_rsum( benchmark::State& state, xnn_qu8_rsum_ukernel_fn rsum, xnn_init_qs8_rsum_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t channels = state.range(0); const size_t rows = state.range(1); xnnpack::Buffer input(rows * channels + XNN_EXTRA_BYTES); xnnpack::Buffer output(rows); std::iota(input.begin(), input.end(), 1); // Prepare parameters. struct xnn_qs8_rsum_params params; if (init_params) { init_params(¶ms); } for (auto _ : state) { for (int i = 0; i < rows; ++i) { rsum(channels, &input[i * channels], &output[i], ¶ms); } } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void f32_rdsum( benchmark::State& state, xnn_f32_rdsum_ukernel_fn rdsum, xnn_init_f32_scale_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t rows = state.range(0); const size_t channels = state.range(1); xnnpack::Buffer input( rows * channels + XNN_EXTRA_BYTES / sizeof(float)); xnnpack::Buffer output(channels); xnnpack::Buffer zero(channels + XNN_EXTRA_BYTES / sizeof(float), 0.f); std::iota(input.begin(), input.end(), 0.0f); // Prepare parameters. struct xnn_f32_scale_params params; init_params(¶ms, /*scale=*/1.0f / rows); for (auto _ : state) { rdsum(rows, channels, input.data(), channels * sizeof(float), zero.data(), output.data(), ¶ms); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void qs8_rdsum( benchmark::State& state, xnn_qs8_rdsum_ukernel_fn rdsum, xnn_init_qs8_rsum_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t rows = state.range(0); const size_t channels = state.range(1); xnnpack::Buffer input(rows * channels + XNN_EXTRA_BYTES); xnnpack::Buffer output(channels); xnnpack::Buffer zero(channels + XNN_EXTRA_BYTES, 0); std::fill(input.begin(), input.end(), 0); // Prepare parameters. struct xnn_qs8_rsum_params params; if (init_params) { init_params(¶ms); } for (auto _ : state) { rdsum(rows, channels, input.data(), channels * sizeof(int8_t), zero.data(), output.data(), ¶ms); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void qu8_rdsum( benchmark::State& state, xnn_qu8_rdsum_ukernel_fn rdsum, xnn_init_qs8_rsum_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t rows = state.range(0); const size_t channels = state.range(1); xnnpack::Buffer input(rows * channels + XNN_EXTRA_BYTES); xnnpack::Buffer output(channels); xnnpack::Buffer zero(channels + XNN_EXTRA_BYTES, 0); std::fill(input.begin(), input.end(), 0); // Prepare parameters. struct xnn_qs8_rsum_params params; if (init_params) { init_params(¶ms); } for (auto _ : state) { rdsum(rows, channels, input.data(), channels * sizeof(int8_t), zero.data(), output.data(), ¶ms); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } void f16_f32acc_rdsum( benchmark::State& state, xnn_f16_f32acc_rdsum_ukernel_fn rdsum, xnn_init_f16_f32acc_scale_params_fn init_params, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } const size_t rows = state.range(0); const size_t channels = state.range(1); xnnpack::Buffer input( rows * channels + XNN_EXTRA_BYTES / sizeof(xnn_float16)); xnnpack::Buffer output(channels); xnnpack::Buffer zero(channels + XNN_EXTRA_BYTES / sizeof(xnn_float16), 0); std::iota(input.begin(), input.end(), 0.0f); // Prepare parameters. struct xnn_f16_f32acc_scale_params params; init_params(¶ms, /*scale=*/1.0f / rows); for (auto _ : state) { rdsum(rows, channels, input.data(), channels * sizeof(xnn_float16), zero.data(), output.data(), ¶ms); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { state.counters["cpufreq"] = cpu_frequency; } } static void BenchmarkRSUM(benchmark::internal::Benchmark* b) { b->ArgNames({"channels","rows"}); b->Args({1, 512}); b->Args({1, 1024}); b->Args({1, 8000}); b->Args({512, 512}); b->Args({512, 1024}); b->Args({512, 8000}); b->Args({1024, 64}); b->Args({32768, 1}); } static void BenchmarkRDSUM(benchmark::internal::Benchmark* b) { b->ArgNames({"rows", "channels"}); b->Args({8, 1024}); b->Args({16, 1024}); b->Args({10240, 1024}); } } // namespace