// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <numeric>
#include <vector>

#include <gtest/gtest.h>

#include "xnnpack/buffer.h"
#include "xnnpack/math.h"
#include "xnnpack/microfnptr.h"

// Reference bias packing function for f32. Note that `weights` is never read:
// the reference only writes bias values and advances past the weight slots.
static void f32_packb_reference(
    size_t groups,
    size_t channels,
    size_t kernel_tile,
    size_t channel_tile,
    size_t channel_subtile,
    size_t channel_round,
    const float* weights,
    const float* bias,
    float* out,
    size_t per_tile_extra_bytes,
    size_t per_subtile_extra_bytes) {
  assert(groups > 0);

  // Group loop.
  do {
    // Rounded channel extent. The loops below assume every block start stays
    // below `channels` (i.e. channel_round does not exceed the tile sizes),
    // so that `channels - cr_block_start` never underflows.
    const size_t c = round_up_po2(channels, channel_round);
    const size_t tiled_c = round_down_po2(c, channel_tile);

    // Channel tile loop.
    size_t cr_block_start = 0;
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
      const size_t cr_block_size = min(channels - cr_block_start, channel_tile);
      if (bias != nullptr) {
        for (size_t i = 0; i < cr_block_size; i++) {
          *out++ = bias[cr_block_start + i];
        }
      } else {
        size_t i = cr_block_size;
        do {
          *out++ = 0.0f;
        } while (--i != 0);
      }
      // Skip the padding slots and the weight slots of this tile.
      out += channel_tile - cr_block_size;
      out += kernel_tile * channel_tile;
      out += per_tile_extra_bytes;  // Counted in elements; the tester passes 0.
    }

    // Channel subtile loop.
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
      const size_t cr_block_size = min(channels - cr_block_start, channel_subtile);
      if (bias != nullptr) {
        for (size_t i = 0; i < cr_block_size; i++) {
          *out++ = bias[cr_block_start + i];
        }
      } else {
        size_t i = cr_block_size;
        do {
          *out++ = 0.0f;
        } while (--i != 0);
      }
      // Skip the padding slots and the weight slots of this subtile.
      out += channel_subtile - cr_block_size;
      out += kernel_tile * channel_subtile;
      out += per_subtile_extra_bytes;  // Counted in elements; the tester passes 0.
    }

    if (bias != nullptr) {
      bias += channels;
    }
  } while (--groups > 0);
}
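
// Worked example (illustrative values, not from any specific test): with
// channels = 5, kernel_tile = 3, channel_tile = 4, channel_subtile = 2 and
// channel_round = 2, we get c = round_up_po2(5, 2) = 6 and
// tiled_c = round_down_po2(6, 4) = 4. The tile loop runs once, writing 4 bias
// values and skipping 3 * 4 weight slots; the subtile loop runs once
// (cr_block_start = 4), writing 1 bias value and skipping 1 padding slot plus
// 3 * 2 weight slots. That is 24 elements per group, matching
// packed_channels() * (kernel_tile() + 1) = 6 * 4 in the tester below.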

class PackBMicrokernelTester {
 public:
  PackBMicrokernelTester& groups(size_t groups) {
    this->groups_ = groups;
    return *this;
  }

  size_t groups() const {
    return this->groups_;
  }

  PackBMicrokernelTester& channel_tile(size_t channel_tile) {
    this->channel_tile_ = channel_tile;
    return *this;
  }

  size_t channel_tile() const {
    return this->channel_tile_;
  }

  PackBMicrokernelTester& channel_subtile(size_t channel_subtile) {
    this->channel_subtile_ = channel_subtile;
    return *this;
  }

  size_t channel_subtile() const {
    return this->channel_subtile_;
  }

  PackBMicrokernelTester& channel_round(size_t channel_round) {
    this->channel_round_ = channel_round;
    return *this;
  }

  size_t channel_round() const {
    return this->channel_round_;
  }

  PackBMicrokernelTester& channels(size_t channels) {
    assert(channels != 0);
    this->channels_ = channels;
    return *this;
  }

  size_t channels() const {
    return this->channels_;
  }

  size_t packed_channels() const {
    return round_up(channels(), channel_subtile());
  }

  PackBMicrokernelTester& kernel_tile(size_t kernel_tile) {
    this->kernel_tile_ = kernel_tile;
    return *this;
  }

  size_t kernel_tile() const {
    return this->kernel_tile_;
  }
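
  // Sizing note (illustrative numbers, not from any specific test): each group
  // of the packed buffer holds packed_channels() bias slots plus kernel_tile()
  // rows of packed_channels() weight slots. For example, with groups() = 2,
  // channels() = 5, channel_subtile() = 4 and kernel_tile() = 3,
  // packed_channels() = round_up(5, 4) = 8, so each group needs
  // 8 + 3 * 8 = 32 uint32_t slots and the whole buffer 64, which is exactly
  // what the Test() overloads below allocate.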

  void Test(xnn_x32_packb_gemm_ukernel_fn packb) const {
    xnnpack::Buffer<uint32_t> weights(groups() * channels() * kernel_tile());
    xnnpack::Buffer<uint32_t> bias(groups() * channels());
    xnnpack::Buffer<uint32_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
        groups() * (packed_channels() * kernel_tile() + packed_channels()));
    xnnpack::Buffer<uint32_t> packed_w_ref(
        groups() * (packed_channels() * kernel_tile() + packed_channels()));

    std::fill(weights.begin(), weights.end(), UINT32_C(0xDEADBEEF));
    std::iota(bias.begin(), bias.end(), UINT32_C(0x80000000));
    std::fill(packed_w.begin(), packed_w.end(), UINT32_C(0x12345678));
    std::fill(packed_w_ref.begin(), packed_w_ref.end(), UINT32_C(0xDEADBEEF));

    // Compute reference results.
    f32_packb_reference(
        groups(), channels(), kernel_tile(), channel_tile(), channel_subtile(), channel_round(),
        reinterpret_cast<const float*>(weights.data()), reinterpret_cast<const float*>(bias.data()),
        reinterpret_cast<float*>(packed_w_ref.data()),
        /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0);

    // Call optimized micro-kernel.
    packb(
        groups(), channels(), bias.data(), packed_w.data(),
        /*channel_tile_stride=*/sizeof(float) * (kernel_tile() * channel_tile() + channel_tile()),
        /*channel_subtile_stride=*/sizeof(float) * (kernel_tile() * channel_subtile() + channel_subtile()),
        nullptr);

    // Verify results.
    for (size_t i = 0; i < packed_w.size(); i++) {
      if (packed_w_ref[i] != UINT32_C(0xDEADBEEF)) {
        // Bias slots written by the reference must match exactly.
        EXPECT_EQ(packed_w[i], packed_w_ref[i])
            << "at position " << i << " / " << packed_w.size() << ", channels " << channels()
            << ", kernel tile " << kernel_tile() << ", groups " << groups();
      } else {
        // Weight and padding slots must be left unmodified by the kernel.
        EXPECT_EQ(packed_w[i], UINT32_C(0x12345678))
            << "at position " << i << " / " << packed_w.size() << ", channels " << channels()
            << ", kernel tile " << kernel_tile() << ", groups " << groups();
      }
    }
  }
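
  // Stride arithmetic (illustrative check): each channel-tile block of the
  // packed buffer is one bias row of channel_tile() elements followed by
  // kernel_tile() weight rows of channel_tile() elements, so the
  // channel_tile_stride above is sizeof(float) * (kernel_tile() + 1) *
  // channel_tile() bytes. With kernel_tile() = 3 and channel_tile() = 4 that
  // is 4 * (3 + 1) * 4 = 64 bytes; the subtile stride follows the same
  // formula with channel_subtile().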

  void Test(xnn_x32_zerob_gemm_ukernel_fn zerob) const {
    xnnpack::Buffer<uint32_t> weights(groups() * channels() * kernel_tile());
    xnnpack::Buffer<uint32_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
        groups() * (packed_channels() * kernel_tile() + packed_channels()));
    xnnpack::Buffer<uint32_t> packed_w_ref(
        groups() * (packed_channels() * kernel_tile() + packed_channels()));

    std::fill(weights.begin(), weights.end(), UINT32_C(0xDEADBEEF));
    std::fill(packed_w.begin(), packed_w.end(), UINT32_C(0x12345678));
    std::fill(packed_w_ref.begin(), packed_w_ref.end(), UINT32_C(0xDEADBEEF));

    // Compute reference results; a null bias zero-fills the bias slots.
    f32_packb_reference(
        groups(), channels(), kernel_tile(), channel_tile(), channel_subtile(), channel_round(),
        reinterpret_cast<const float*>(weights.data()), /*bias=*/nullptr,
        reinterpret_cast<float*>(packed_w_ref.data()),
        /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0);

    // Call optimized micro-kernel.
    zerob(
        groups(), channels(), packed_w.data(),
        /*channel_tile_stride=*/sizeof(float) * (kernel_tile() * channel_tile() + channel_tile()),
        /*channel_subtile_stride=*/sizeof(float) * (kernel_tile() * channel_subtile() + channel_subtile()),
        nullptr);

    // Verify results.
    for (size_t i = 0; i < packed_w.size(); i++) {
      if (packed_w_ref[i] != UINT32_C(0xDEADBEEF)) {
        // Bias slots must match the reference, i.e. must be zeroed.
        EXPECT_EQ(packed_w[i], packed_w_ref[i])
            << "at position " << i << " / " << packed_w.size() << ", channels " << channels()
            << ", kernel tile " << kernel_tile();
        EXPECT_EQ(packed_w[i], UINT32_C(0))
            << "at position " << i << " / " << packed_w.size() << ", channels " << channels()
            << ", kernel tile " << kernel_tile();
      } else {
        // Weight and padding slots must be left unmodified by the kernel.
        EXPECT_EQ(packed_w[i], UINT32_C(0x12345678))
            << "at position " << i << " / " << packed_w.size() << ", channels " << channels()
            << ", kernel tile " << kernel_tile();
      }
    }
  }

  // Type-erased wrapper so tests can hold either micro-kernel flavor and
  // dispatch to the matching Test() overload.
  struct Kernel {
    explicit Kernel(xnn_x32_packb_gemm_ukernel_fn packb) {
      dispatch = [packb](const PackBMicrokernelTester& tester) { tester.Test(packb); };
    }
    explicit Kernel(xnn_x32_zerob_gemm_ukernel_fn zerob) {
      dispatch = [zerob](const PackBMicrokernelTester& tester) { tester.Test(zerob); };
    }
    // Note the reference parameter: passing the tester by value would copy it
    // on every dispatch.
    std::function<void(const PackBMicrokernelTester&)> dispatch;
  };

  void Test(const Kernel& kernel) const {
    kernel.dispatch(*this);
  }

 private:
  size_t groups_{1};
  size_t channels_{1};
  size_t channel_tile_{1};
  size_t channel_subtile_{1};
  size_t channel_round_{1};
  size_t kernel_tile_{1};
};
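
// Usage sketch (hypothetical test and kernel names, shown only to illustrate
// the builder-style API; real kernel symbols come from the generated test
// lists):
//
//   TEST(X32_PACKB_GEMM, channels_gt_tile) {
//     PackBMicrokernelTester()
//         .groups(2)
//         .channels(17)
//         .kernel_tile(4)
//         .channel_tile(8)
//         .channel_subtile(4)
//         .channel_round(4)
//         .Test(xnn_x32_packb_gemm_ukernel_example);  // hypothetical symbol
//   }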