sglang_v0.5.2/pytorch_2.8.0/third_party/XNNPACK/bench/packq-benchmark.cc

84 lines
2.8 KiB
C++

// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include "packq-benchmark.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>
#include "utils.h"
#include "xnnpack/buffer.h"
#include "xnnpack/common.h"
#include "xnnpack/microfnptr.h"
#include "xnnpack/packq.h"
#include <benchmark/benchmark.h>
// Benchmarks an x8 packq (f32 -> qp8) packing microkernel.
//
// Parameters:
//   state     - Google Benchmark state. range(0) is read as the batch size,
//               range(2) as M and range(3) as K; range(1) is not read here.
//   packq     - microkernel under test.
//   mr/kr/sr  - packing parameters forwarded unchanged to the microkernel and
//               to xnn_x8_packq_f32qp8_packed_size().
//   isa_check - optional predicate; when non-null and it returns false the
//               benchmark returns immediately without running.
//
// Reported counters: "cpufreq" (only when non-zero), plus "elements" and
// "bytes", both expressed as rates.
void x8_packq(benchmark::State& state, xnn_x8_packq_f32qp8_ukernel_fn packq,
              size_t mr, size_t kr, size_t sr,
              benchmark::utils::IsaCheckFunction isa_check) {
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t batch = state.range(0);
  const size_t dim_m = state.range(2);
  const size_t dim_k = state.range(3);

  // Despite the name, rounded_n is M rounded up to a multiple of mr.
  const size_t rounded_n = benchmark::utils::RoundUp(dim_m, mr);
  const size_t rounded_k = benchmark::utils::RoundUp(dim_k, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = [&]() {
    return std::uniform_real_distribution<float>(-10, 10)(rng);
  };

  // Number of input buffers to rotate through so that successive iterations
  // read different source data. The per-buffer footprint is estimated at one
  // byte per element, which underestimates the float input and therefore errs
  // on the side of allocating more buffers rather than fewer.
  const size_t num_buffers =
      1 + benchmark::utils::DivideRoundUp<size_t>(
              benchmark::utils::GetMaxCacheSize(),
              sizeof(int8_t) * batch *
                  (dim_m * dim_k + rounded_n * rounded_k + rounded_n));

  xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> input(
      num_buffers * batch * dim_m * dim_k);
  std::generate(input.begin(), input.end(), f32rng);

  const size_t packed_size =
      xnn_x8_packq_f32qp8_packed_size(batch * dim_m, dim_k, mr, kr, sr);
  xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_weights(
      num_buffers * packed_size);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Advance to the next input buffer, wrapping around at the end.
    if (++buffer_index == num_buffers) {
      buffer_index = 0;
    }

    // NOTE(review): the destination pointer is always the start of
    // packed_weights even though num_buffers * packed_size bytes were
    // allocated; only the input (and m_idx_start) rotate. Confirm whether the
    // output was meant to rotate as well before changing it.
    packq(batch * dim_m, dim_k, mr, kr, sr,
          /*m_idx_start=*/buffer_index * dim_m,
          input.data() + buffer_index * batch * dim_m * dim_k,
          dim_k * sizeof(float), packed_weights.data());
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  const size_t elements_per_iteration = batch * dim_m * dim_k;
  state.counters["elements"] = benchmark::Counter(
      static_cast<uint64_t>(state.iterations()) * elements_per_iteration,
      benchmark::Counter::kIsRate);

  // Bytes moved per iteration: the float input that is read (4 bytes per
  // element, matching the dim_k * sizeof(float) row stride passed above) plus
  // the packed output size reported by xnn_x8_packq_f32qp8_packed_size().
  const size_t bytes_per_iteration =
      elements_per_iteration * sizeof(float) + packed_size * sizeof(int8_t);
  state.counters["bytes"] = benchmark::Counter(
      static_cast<uint64_t>(state.iterations()) * bytes_per_iteration,
      benchmark::Counter::kIsRate);
}