sglang_v0.5.2/pytorch_2.8.0/third_party/XNNPACK/bench/vunary.cc

303 lines
10 KiB
C++

// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include "xnnpack/vunary.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>
#include "utils.h"
#include "xnnpack.h"
#include "xnnpack/buffer.h"
#include "xnnpack/common.h"
#include "xnnpack/hardware-config.h"
#include "xnnpack/math.h"
#include "xnnpack/microfnptr.h"
#include "xnnpack/microparams-init.h"
#include "xnnpack/microparams.h"
#include "xnnpack/vcvt.h"
#include "xnnpack/vunary.h"
#include <benchmark/benchmark.h>
template <typename T>
struct UniformDistribution {
std::uniform_real_distribution<T> dist{-10.0f, 10.0f};
template <class Generator>
T operator()(Generator& g) {
return dist(g);
}
};
template <>
struct UniformDistribution<xnn_float16> {
std::uniform_real_distribution<float> dist{-10.0f, 10.0f};
template <class Generator>
xnn_float16 operator()(Generator& g) {
return static_cast<xnn_float16>(dist(g));
}
};
template <>
struct UniformDistribution<int8_t> {
std::uniform_int_distribution<int32_t> dist{
std::numeric_limits<int8_t>::lowest(),
std::numeric_limits<int8_t>::max()};
template <class Generator>
int8_t operator()(Generator& g) {
return dist(g);
}
};
template <>
struct UniformDistribution<uint8_t> {
std::uniform_int_distribution<int32_t> dist{
std::numeric_limits<uint8_t>::lowest(),
std::numeric_limits<uint8_t>::max()};
template <class Generator>
uint8_t operator()(Generator& g) {
return dist(g);
}
};
template <>
struct UniformDistribution<int16_t> {
std::uniform_int_distribution<int32_t> dist{
std::numeric_limits<int16_t>::lowest(),
std::numeric_limits<int16_t>::max()};
template <class Generator>
int16_t operator()(Generator& g) {
return dist(g);
}
};
template <>
struct UniformDistribution<int32_t> {
std::uniform_int_distribution<int32_t> dist{
std::numeric_limits<int32_t>::lowest(),
std::numeric_limits<int32_t>::max()};
template <class Generator>
int32_t operator()(Generator& g) {
return dist(g);
}
};
template <>
struct UniformDistribution<uint32_t> {
std::uniform_int_distribution<uint32_t> dist{
std::numeric_limits<uint32_t>::lowest(),
std::numeric_limits<uint32_t>::max()};
template <class Generator>
uint32_t operator()(Generator& g) {
return dist(g);
}
};
template <typename T>
xnn_quantization_params InputQuantization(T) {
return {0, 1.0f};
}
template <typename T>
xnn_quantization_params OutputQuantization(T) {
return {0, 1.0f};
}
xnn_quantization_params InputQuantization(int8_t) { return {1, 0.5f}; }
xnn_quantization_params OutputQuantization(int8_t) { return {-1, 0.7f}; }
xnn_quantization_params InputQuantization(uint8_t) { return {129, 0.5f}; }
xnn_quantization_params OutputQuantization(uint8_t) { return {127, 0.7f}; }
// Microkernel function, templated on the `params` type.
template <typename TIn, typename TOut, typename UKernelParams>
using UKernelFn = void (*)(size_t, const TIn*, TOut*,
const UKernelParams* params);
template <typename TIn, typename TOut, typename UKernelParams>
void vunary(benchmark::State& state, uint64_t arch_flags,
UKernelFn<TIn, TOut, UKernelParams> ukernel,
xnn_init_unary_uparams_fn init_params,
const xnn_unary_params* params = nullptr,
const xnn_quantization_params& input_quantization =
InputQuantization(TIn()),
const xnn_quantization_params& output_quantization =
OutputQuantization(TOut())) {
if (!benchmark::utils::CheckArchFlags(state, arch_flags)) {
return;
}
const size_t num_elements = state.range(0);
xnn_unary_uparams uparams;
if (init_params) {
init_params(&uparams, params, &input_quantization, &output_quantization);
}
std::random_device random_device;
auto rng = std::mt19937(random_device());
UniformDistribution<TIn> dist;
xnnpack::Buffer<TIn, XNN_ALLOCATION_ALIGNMENT> x(
num_elements + XNN_EXTRA_BYTES / sizeof(TIn));
xnnpack::Buffer<TOut, XNN_ALLOCATION_ALIGNMENT> y(num_elements);
std::generate(x.begin(), x.end(), [&]() { return dist(rng); });
for (auto _ : state) {
ukernel(num_elements * sizeof(TIn), x.data(), y.data(),
(UKernelParams*)&uparams);
}
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
if (cpu_frequency != 0) {
state.counters["cpufreq"] = cpu_frequency;
}
const size_t elements_per_iteration = num_elements;
state.counters["elements"] = benchmark::Counter(
static_cast<uint64_t>(state.iterations()) * elements_per_iteration,
benchmark::Counter::kIsRate);
const size_t bytes_per_iteration =
num_elements * (sizeof(TIn) + sizeof(TOut));
state.counters["bytes"] = benchmark::Counter(
static_cast<uint64_t>(state.iterations()) * bytes_per_iteration,
benchmark::Counter::kIsRate);
}
#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
datatype, params_type, init_params) \
BENCHMARK_CAPTURE(vunary, ukernel, arch_flags, ukernel, init_params) \
->Apply( \
benchmark::utils::UnaryElementwiseParameters<datatype, datatype>) \
->UseRealTime();
#include "f16-vabs/f16-vabs.h"
#include "f16-vhswish/f16-vhswish.h"
#include "f16-vneg/f16-vneg.h"
#include "f16-vrnd/f16-vrndd.h"
#include "f16-vrnd/f16-vrndne.h"
#include "f16-vrnd/f16-vrndu.h"
#include "f16-vrnd/f16-vrndz.h"
#include "f16-vrsqrt/f16-vrsqrt.h"
#include "f16-vsigmoid/f16-vsigmoid.h"
#include "f16-vsqr/f16-vsqr.h"
#include "f16-vsqrt/f16-vsqrt.h"
#include "f16-vtanh/f16-vtanh.h"
#include "f32-vabs/f32-vabs.h"
#include "f32-vgelu/f32-vgelu.h"
#include "f32-vhswish/f32-vhswish.h"
#include "f32-vlog/f32-vlog.h"
#include "f32-vneg/f32-vneg.h"
#include "f32-vrelu/f32-vrelu.h"
#include "f32-vrnd/f32-vrndd.h"
#include "f32-vrnd/f32-vrndne.h"
#include "f32-vrnd/f32-vrndu.h"
#include "f32-vrnd/f32-vrndz.h"
#include "f32-vrsqrt/f32-vrsqrt.h"
#include "f32-vsigmoid/f32-vsigmoid.h"
#include "f32-vsqr/f32-vsqr.h"
#include "f32-vsqrt/f32-vsqrt.h"
#include "f32-vtanh/f32-vtanh.h"
#undef XNN_UKERNEL_WITH_PARAMS
template <typename TIn, typename TOut, typename UKernelParams>
void velu(benchmark::State& state, uint64_t arch_flags,
UKernelFn<TIn, TOut, UKernelParams> ukernel,
xnn_init_unary_uparams_fn init_params) {
xnn_unary_params params;
params.elu.alpha = 1.0f;
vunary(state, arch_flags, ukernel, init_params, &params);
}
#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
datatype, params_type, init_params) \
BENCHMARK_CAPTURE(velu, ukernel, arch_flags, ukernel, init_params) \
->Apply( \
benchmark::utils::UnaryElementwiseParameters<datatype, datatype>) \
->UseRealTime();
#include "f16-velu/f16-velu.h"
#include "f32-velu/f32-velu.h"
#undef XNN_UKERNEL_WITH_PARAMS
template <typename TIn, typename TOut, typename UKernelParams>
void vclamp(benchmark::State& state, uint64_t arch_flags,
UKernelFn<TIn, TOut, UKernelParams> ukernel,
xnn_init_unary_uparams_fn init_params) {
xnn_unary_params params;
params.clamp.min = -1.0f;
params.clamp.max = 1.0f;
// These kernels cannot handle changing quantization parameters.
xnn_quantization_params input_quantization = {0, 1.0f};
xnn_quantization_params output_quantization = {0, 1.0f};
vunary(state, arch_flags, ukernel, init_params, &params, input_quantization,
output_quantization);
}
#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
datatype, params_type, init_params) \
BENCHMARK_CAPTURE(vclamp, ukernel, arch_flags, ukernel, init_params) \
->Apply( \
benchmark::utils::UnaryElementwiseParameters<datatype, datatype>) \
->UseRealTime();
#include "f16-vclamp/f16-vclamp.h"
#include "f32-vclamp/f32-vclamp.h"
#include "s8-vclamp/s8-vclamp.h"
#include "u8-vclamp/u8-vclamp.h"
#undef XNN_UKERNEL_WITH_PARAMS
template <typename TIn, typename TOut, typename UKernelParams>
void vlrelu(benchmark::State& state, uint64_t arch_flags,
UKernelFn<TIn, TOut, UKernelParams> ukernel,
xnn_init_unary_uparams_fn init_params) {
xnn_unary_params params;
params.leaky_relu.negative_slope = 0.5f;
vunary(state, arch_flags, ukernel, init_params, &params);
}
#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
datatype, params_type, init_params) \
BENCHMARK_CAPTURE(vlrelu, ukernel, arch_flags, ukernel, init_params) \
->Apply( \
benchmark::utils::UnaryElementwiseParameters<datatype, datatype>) \
->UseRealTime();
#include "f16-vlrelu/f16-vlrelu.h"
#include "f32-vlrelu/f32-vlrelu.h"
#include "qs8-vlrelu/qs8-vlrelu.h"
#include "qu8-vlrelu/qu8-vlrelu.h"
#undef XNN_UKERNEL_WITH_PARAMS
#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, \
vector_tile, datatype_in, datatype_out, \
params_type, init_params) \
BENCHMARK_CAPTURE(vunary, ukernel, arch_flags, ukernel, init_params) \
->Apply(benchmark::utils::UnaryElementwiseParameters<datatype_in, \
datatype_out>) \
->UseRealTime();
#include "f16-f32-vcvt/f16-f32-vcvt.h"
#include "f16-qs8-vcvt/f16-qs8-vcvt.h"
#include "f32-f16-vcvt/f32-f16-vcvt.h"
#include "f32-qs8-vcvt/f32-qs8-vcvt.h"
#include "f32-qu8-vcvt/f32-qu8-vcvt.h"
#include "qs8-f16-vcvt/qs8-f16-vcvt.h"
#include "qs8-f32-vcvt/qs8-f32-vcvt.h"
#include "qs8-vcvt/qs8-vcvt.h"
#include "qu8-f32-vcvt/qu8-f32-vcvt.h"
#include "qu8-vcvt/qu8-vcvt.h"
#include "s32-f32-vcvt/s32-f32-vcvt.h"
#undef XNN_CVT_UKERNEL_WITH_PARAMS
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif