sglang_v0.5.2/pytorch_2.8.0/third_party/XNNPACK/test/gemm-microkernel-tester.cc

3721 lines
172 KiB
C++

#include "gemm-microkernel-tester.h"
#include <stdint.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>
#include <gtest/gtest.h>
#include "xnnpack.h"
#include "xnnpack/allocator.h"
#include "xnnpack/common.h"
#include "xnnpack/config-types.h"
#include "xnnpack/gemm.h"
#include "xnnpack/math.h"
#include "xnnpack/memory.h"
#include "xnnpack/microfnptr.h"
#include "xnnpack/microparams-init.h"
#include "xnnpack/microparams.h"
#include "xnnpack/pack.h"
#include "xnnpack/packq.h"
#include "xnnpack/quantization.h"
#include "xnnpack/requantization.h"
#include "xnnpack/buffer.h"
#include "replicable_random_device.h"
TEST_P(GemmTest, Test) {
const GemmTestParams& params = GetParam();
GemmMicrokernelTester tester = params.tester;
// Make sure that we can execute this test.
if (params.isa_check) {
params.isa_check();
if (IsSkipped()) {
return;
}
}
// Loop over the `k`, `m`, and `n` values, if required.
for (size_t k = params.loop_k_.from; k <= params.loop_k_.to;
k = params.loop_k_.next(k)) {
if (params.loop_k_.is_set) {
tester.k(k);
}
for (size_t m = params.loop_m_.from; m <= params.loop_m_.to;
m = params.loop_m_.next(m)) {
if (params.loop_m_.is_set) {
tester.m(m);
}
for (size_t n = params.loop_n_.from; n <= params.loop_n_.to;
n = params.loop_n_.next(n)) {
if (params.loop_n_.is_set) {
tester.n(n);
}
for (size_t zi = params.loop_zi_.from; zi <= params.loop_zi_.to;
zi = params.loop_zi_.next(zi)) {
if (params.loop_zi_.is_set) {
tester.zero_index(zi);
}
for (size_t bzp = params.loop_bzp_.from; bzp <= params.loop_bzp_.to;
bzp = params.loop_bzp_.next(bzp)) {
if (params.loop_bzp_.is_set) {
tester.b_zero_point(bzp);
}
for (size_t bl = params.loop_bl_.from; bl <= tester.k() / 2;
bl = params.loop_bl_.next(bl)) {
if (params.loop_bl_.is_set) {
// Require block size to divide (padded) column size.
if (round_up_po2(k, params.loop_bl_.step) % bl != 0) {
continue;
}
tester.bl(bl);
}
// Call the test function.
params.test_func(tester);
}
}
}
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f16_qc8w_igemm_ukernel_fn igemm,
xnn_init_f16_minmax_params_fn init_params,
xnn_pack_qs8_igemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<float> input(mr() * k());
xnnpack::Buffer<int8_t> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(1 + XNN_EXTRA_QUANTIZATION_PARAMS);
xnnpack::Buffer<int8_t> b(n() * ks() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> kernel_scale(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_n() * packed_k() +
packed_n() * (sizeof(int32_t) + sizeof(float) * 2));
xnnpack::Buffer<xnn_float16> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n(), 0);
xnnpack::Buffer<int8_t> junk(k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<const int8_t*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
const auto minmax = std::minmax_element(input.begin(), input.begin() + mr() * k());
float inv_scale;
quantization_params[0] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t i = 0; i < mr(); ++i) {
const float* input_ptr = &input[i * k()];
for (size_t j = 0; j < k(); ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[0].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[0].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[0].zero_point));
}
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng));
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_packing_params packing_params = { /*input_zero_point=*/1 };
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), /*bias=*/nullptr, /*scale=*/nullptr, packed_w.data(), 2 * sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
kernel_scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(float))));
// Fill in packed bias
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
bias.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + 2 * sizeof(float))));
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
const size_t k_stride = round_up_po2(k(), kr() * sr());
int32_t zp = quantization_params[0].zero_point;
if (unsigned_inputs()) {
zp += 128;
}
xnnpack::Buffer<int8_t> zero_points(k_stride + XNN_EXTRA_BYTES, zp);
const int8_t* zero_sentinel = (const int8_t*) &packing_params;
const int8_t* zero_data = zero_points.data();
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = zero_sentinel;
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
if (im2col[ks_index * mr() + m_index] != zero_sentinel) {
c_ref[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index + a_offset()]) - quantization_params[0].zero_point) *
int32_t(b[(n_index * ks() + ks_index) * k() + k_index]);
}
}
}
c_ref[m_index * n() + n_index] *= quantization_params[0].inv_scale * kernel_scale[n_index];
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f16_minmax_params params;
init_params(&params, static_cast<xnn_float16>(c_min), static_cast<xnn_float16>(c_max));
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
if (unsigned_inputs()) {
// Some architectures require that the input be unsigned.
// Adjust the zero point and flip the sign of the input to mimic adding
// 128 to the input with correct overflow behaviour.
for (int i = 0; i < quantization_params.size(); ++i) {
quantization_params[i].zero_point += 128;
}
for (int i = 0; i < a.size(); ++i) {
a[i] ^= 0x80;
}
}
igemm(m(), n(), k(), ks() * mr() * sizeof(void*),
im2col.data(), static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(xnn_float16), cn_stride() * sizeof(xnn_float16),
a_offset() * sizeof(uint8_t), zero_sentinel, zero_data,
&params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance = std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< "), optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f32_qc8w_igemm_ukernel_fn igemm,
xnn_init_f32_minmax_params_fn init_params,
xnn_pack_qs8_igemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<float> input(mr() * k());
xnnpack::Buffer<int8_t> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(1 + XNN_EXTRA_QUANTIZATION_PARAMS);
xnnpack::Buffer<int8_t> b(n() * ks() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> kernel_scale(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_n() * packed_k() +
packed_n() * (sizeof(int32_t) + sizeof(float) * 2));
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n(), 0);
xnnpack::Buffer<int8_t> junk(k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<const int8_t*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
const auto minmax = std::minmax_element(input.begin(), input.begin() + mr() * k());
float inv_scale;
quantization_params[0] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t i = 0; i < mr(); ++i) {
const float* input_ptr = &input[i * k()];
for (size_t j = 0; j < k(); ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[0].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[0].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[0].zero_point));
}
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng));
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_packing_params packing_params = { /*input_zero_point=*/1 };
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), /*bias=*/nullptr, /*scale=*/nullptr, packed_w.data(), 2 * sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
kernel_scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(float))));
// Fill in packed bias
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
bias.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + 2 * sizeof(float))));
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
const size_t k_stride = round_up_po2(k(), kr() * sr());
int32_t zp = quantization_params[0].zero_point;
if (unsigned_inputs()) {
zp += 128;
}
xnnpack::Buffer<int8_t> zero_points(k_stride + XNN_EXTRA_BYTES, zp);
const int8_t* zero_sentinel = (const int8_t*) &packing_params;
const int8_t* zero_data = zero_points.data();
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = zero_sentinel;
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
if (im2col[ks_index * mr() + m_index] != zero_sentinel) {
c_ref[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index + a_offset()]) - quantization_params[0].zero_point) *
int32_t(b[(n_index * ks() + ks_index) * k() + k_index]);
}
}
}
c_ref[m_index * n() + n_index] *= quantization_params[0].inv_scale * kernel_scale[n_index];
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
if (unsigned_inputs()) {
// Some architectures require that the input be unsigned.
// Adjust the zero point and flip the sign of the input to mimic adding
// 128 to the input with correct overflow behaviour.
for (int i = 0; i < quantization_params.size(); ++i) {
quantization_params[i].zero_point += 128;
}
for (int i = 0; i < a.size(); ++i) {
a[i] ^= 0x80;
}
}
igemm(m(), n(), k(), ks() * mr() * sizeof(void*),
im2col.data(), static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
a_offset() * sizeof(uint8_t), zero_sentinel, zero_data,
&params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance = std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< "), optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qu8_gemm_minmax_ukernel_fn gemm,
xnn_init_qu8_conv_minmax_params_fn init_params,
xnn_pack_qu8_gemm_fn pack,
xnn_qu8_requantize_fn requantize) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
xnnpack::Buffer<uint8_t> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(uint8_t));
xnnpack::Buffer<uint8_t> b(n() * k());
xnnpack::Buffer<int32_t> bias(n());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() + packed_n() * sizeof(int32_t) / sizeof(uint8_t));
xnnpack::Buffer<uint8_t> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<uint8_t> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
xnnpack::fill_uniform_random_bits(a.data(), a.size(), rng);
xnnpack::fill_uniform_random_bits(b.data(), b.size(), rng);
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
std::fill(packed_w.begin(), packed_w.end(), b_zero_point());
const xnn_qu8_packing_params packing_params = { a_zero_point(), b_zero_point() };
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, &packing_params);
// Compute 32-bit results and output quantization arguments.
std::fill(acc.begin(), acc.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
acc[m_index * n() + n_index] +=
(int32_t(a[m_index * a_stride() + k_index]) - int32_t(a_zero_point())) *
(int32_t(b[n_index * k() + k_index]) - int32_t(b_zero_point()));
}
acc[m_index * n() + n_index] += bias[n_index];
}
}
const int32_t accumulated_min = *std::min_element(acc.cbegin(), acc.cend());
const int32_t accumulated_max = *std::max_element(acc.cbegin(), acc.cend());
const double c_scale = uint32_t(accumulated_max - accumulated_min) >= 256 ? double(uint32_t(accumulated_max - accumulated_min)) / 255.0 : 1.00001;
const uint8_t c_zero_point = uint8_t(std::max(std::min(
lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / c_scale),
long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
const float requantization_scale = 1.0f / float(c_scale);
union xnn_qu8_conv_minmax_params quantization_params;
init_params(&quantization_params,
b_zero_point(), requantization_scale, c_zero_point, qmin(), qmax());
gemm(
m(), n(), k(),
a.data(), a_stride() * sizeof(uint8_t),
packed_w.data(),
c.data(), cm_stride() * sizeof(uint8_t), cn_stride() * sizeof(uint8_t),
&quantization_params);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = requantize(
acc[m_index * n() + n_index], requantization_scale, c_zero_point, qmin(), qmax());
}
}
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(uint32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), uint32_t(qmax()));
EXPECT_GE(uint32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), uint32_t(qmin()));
EXPECT_EQ(uint32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), uint32_t(c_ref[i * n() + j]))
<< "at " << i << ", " << j << ": reference = " << (uint32_t) c_ref[i * n() + j]
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << (uint32_t) c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
<< ", requantization scale = " << requantization_scale << ", output zero point = " << int32_t(c_zero_point);
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qu8_igemm_minmax_ukernel_fn igemm,
xnn_init_qu8_conv_minmax_params_fn init_params,
xnn_pack_qu8_igemm_fn pack,
xnn_qu8_requantize_fn requantize)
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
xnnpack::Buffer<uint8_t> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(uint8_t));
xnnpack::Buffer<uint8_t> b(n() * ks() * k());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_n() * packed_k() +
packed_n() * sizeof(int32_t) / sizeof(uint8_t));
xnnpack::Buffer<int32_t> bias(n());
xnnpack::Buffer<uint8_t> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<uint8_t> c_ref(m() * n());
xnnpack::Buffer<uint8_t> junk(k() + XNN_EXTRA_BYTES / sizeof(uint8_t));
xnnpack::Buffer<const uint8_t*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
xnnpack::fill_uniform_random_bits(a.data(), a.size(), rng);
xnnpack::fill_uniform_random_bits(b.data(), b.size(), rng);
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
std::fill(packed_w.begin(), packed_w.end(), b_zero_point());
const xnn_qu8_packing_params packing_params = { a_zero_point(), b_zero_point() };
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, &packing_params);
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = a.data();
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
// Compute 32-bit results and output quantization arguments.
std::fill(acc.begin(), acc.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
if (im2col[ks_index * mr() + m_index] == a.data()) {
acc[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index]) - int32_t(a_zero_point())) *
(int32_t(b[(n_index * ks() + ks_index) * k() + k_index]) - int32_t(b_zero_point()));
} else {
acc[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index + a_offset()]) - int32_t(a_zero_point())) *
(int32_t(b[(n_index * ks() + ks_index) * k() + k_index]) - int32_t(b_zero_point()));
}
}
}
acc[m_index * n() + n_index] += bias[n_index];
}
}
const int32_t accumulated_min = *std::min_element(acc.cbegin(), acc.cend());
const int32_t accumulated_max = *std::max_element(acc.cbegin(), acc.cend());
const double c_scale = uint32_t(accumulated_max - accumulated_min) >= 256 ? double(uint32_t(accumulated_max - accumulated_min)) / 255.0 : 1.00001;
const uint8_t c_zero_point = uint8_t(std::max(std::min(
lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / c_scale),
long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
const float requantization_scale = 1.0f / float(c_scale);
union xnn_qu8_conv_minmax_params quantization_params;
init_params(&quantization_params,
b_zero_point(), requantization_scale, c_zero_point, qmin(), qmax());
const uint8_t* zero_pointer = (zero_index() != SIZE_MAX) ? a.data() : nullptr;
igemm(
m(), n(), k(), ks() * mr() * sizeof(void*),
im2col.data(), packed_w.data(),
c.data(), cm_stride() * sizeof(uint8_t), cn_stride() * sizeof(uint8_t),
a_offset() * sizeof(uint8_t), zero_pointer,
&quantization_params);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = requantize(
acc[m_index * n() + n_index], requantization_scale, c_zero_point, qmin(), qmax());
}
}
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(uint32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), uint32_t(qmax()));
EXPECT_GE(uint32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), uint32_t(qmin()));
EXPECT_EQ(uint32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), uint32_t(c_ref[i * n() + j]))
<< "at " << i << ", " << j << ": reference = " << uint32_t(c_ref[i * n() + j])
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << (uint32_t) c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
<< ", requantization scale = " << requantization_scale << ", output zero point = " << int32_t(c_zero_point);
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qs8_qc8w_gemm_minmax_ukernel_fn gemm,
xnn_init_qs8_qc8w_conv_minmax_params_fn init_params,
xnn_pack_qs8_gemm_fn pack,
xnn_qs8_requantize_fn requantize) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<int8_t> b(n() * k());
xnnpack::Buffer<int32_t> bias(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() +
packed_n() * (sizeof(int32_t) + sizeof(float)) / sizeof(int8_t));
xnnpack::Buffer<int8_t> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<float> scale(n());
xnnpack::Buffer<int8_t> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
xnnpack::fill_uniform_random_bits(a.data(), a.size(), rng);
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
std::fill(packed_w.begin(), packed_w.end(), 0);
const xnn_qs8_packing_params packing_params = { int8_t(a_zero_point() - 0x80) };
void* const packed_data = packed_w.data();
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_data, nr() * sizeof(float), &packing_params);
// Compute 32-bit results and output quantization arguments.
std::fill(acc.begin(), acc.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
acc[m_index * n() + n_index] +=
(int32_t(a[m_index * a_stride() + k_index]) - int32_t(a_zero_point() - 0x80)) *
int32_t(b[n_index * k() + k_index]);
}
acc[m_index * n() + n_index] += bias[n_index];
}
}
const int8_t c_zero_point = -1;
for (size_t n_index = 0; n_index < n(); n_index++) {
int32_t accumulated_min = acc[n_index];
int32_t accumulated_max = acc[n_index];
for (size_t m_index = 0; m_index < m(); m_index++) {
accumulated_min = std::min(accumulated_min, acc[m_index * n() + n_index]);
accumulated_max = std::max(accumulated_max, acc[m_index * n() + n_index]);
}
const uint32_t accumulated_range = uint32_t(accumulated_max - accumulated_min);
const float c_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
scale[n_index] = 1.0f / c_scale;
}
const size_t type_size = sizeof(int8_t);
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (packed_k() * type_size + (sizeof(int32_t) + sizeof(float))),
nr() * (packed_k() * type_size + (sizeof(int32_t) + sizeof(float))),
0,
scale.data(),
(void*) ((uintptr_t) packed_data + nr() * (packed_k() * type_size + sizeof(int32_t))));
union xnn_qs8_qc8w_conv_minmax_params minmax_params;
init_params(&minmax_params,
c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
gemm(
m(), n(), k(),
a.data(), a_stride() * sizeof(int8_t),
packed_data,
c.data(), cm_stride() * sizeof(int8_t), cn_stride() * sizeof(int8_t),
&minmax_params);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = requantize(
acc[m_index * n() + n_index], scale[n_index], c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
}
}
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmax()) - 0x80);
EXPECT_GE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmin()) - 0x80);
EXPECT_EQ(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(c_ref[i * n() + j]))
<< "at " << i << ", " << j << ": reference = " << int32_t(c_ref[i * n() + j])
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]) << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
<< ", requantization scale = " << scale[j] << ", output zero point = " << int32_t(c_zero_point);
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qs8_qc8w_igemm_minmax_ukernel_fn igemm,
xnn_init_qs8_qc8w_conv_minmax_params_fn init_params,
xnn_pack_qs8_igemm_fn pack,
xnn_qs8_requantize_fn requantize) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<int8_t> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(uint8_t));
xnnpack::Buffer<int8_t> b(n() * ks() * k());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_n() * packed_k() +
packed_n() * (sizeof(int32_t) + sizeof(float)) / sizeof(int8_t));
xnnpack::Buffer<int32_t> bias(n());
xnnpack::Buffer<int8_t> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<float> scale(n());
xnnpack::Buffer<int8_t> c_ref(m() * n());
xnnpack::Buffer<int8_t> junk(k() + XNN_EXTRA_BYTES / sizeof(uint8_t));
xnnpack::Buffer<const int8_t*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
xnnpack::fill_uniform_random_bits(a.data(), a.size(), rng);
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
std::fill(packed_w.begin(), packed_w.end(), 0);
const xnn_qs8_packing_params packing_params = { int8_t(a_zero_point() - 0x80) };
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), nr() * sizeof(float), &packing_params);
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = a.data();
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
// Compute 32-bit results and output quantization arguments.
std::fill(acc.begin(), acc.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
if (im2col[ks_index * mr() + m_index] == a.data()) {
acc[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index]) - int32_t(a_zero_point() - 0x80)) *
int32_t(b[(n_index * ks() + ks_index) * k() + k_index]);
} else {
acc[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index + a_offset()]) - int32_t(a_zero_point() - 0x80)) *
int32_t(b[(n_index * ks() + ks_index) * k() + k_index]);
}
}
}
acc[m_index * n() + n_index] += bias[n_index];
}
}
const int8_t c_zero_point = -1;
for (size_t n_index = 0; n_index < n(); n_index++) {
int32_t accumulated_min = acc[n_index];
int32_t accumulated_max = acc[n_index];
for (size_t m_index = 0; m_index < m(); m_index++) {
accumulated_min = std::min(accumulated_min, acc[m_index * n() + n_index]);
accumulated_max = std::max(accumulated_max, acc[m_index * n() + n_index]);
}
const uint32_t accumulated_range = uint32_t(accumulated_max - accumulated_min);
const float c_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
scale[n_index] = 1.0f / c_scale;
}
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(int32_t) + sizeof(float))),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(int32_t) + sizeof(float))),
0,
scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(int32_t))));
union xnn_qs8_qc8w_conv_minmax_params minmax_params;
init_params(&minmax_params,
c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
const int8_t* zero_pointer = (zero_index() != SIZE_MAX) ? a.data() : nullptr;
igemm(
m(), n(), k(), ks() * mr() * sizeof(void*),
im2col.data(), packed_w.data(),
c.data(), cm_stride() * sizeof(int8_t), cn_stride() * sizeof(int8_t),
a_offset() * sizeof(uint8_t), zero_pointer,
&minmax_params);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = requantize(
acc[m_index * n() + n_index], scale[n_index], c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
}
}
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmax()) - 0x80);
EXPECT_GE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmin()) - 0x80);
EXPECT_EQ(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(c_ref[i * n() + j]))
<< "at " << i << ", " << j << ": reference = " << uint32_t(c_ref[i * n() + j])
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << (uint32_t) c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
<< ", requantization scale = " << scale[j] << ", output zero point = " << int32_t(c_zero_point);
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f16_qc8w_gemm_ukernel_fn gemm,
xnn_init_f16_minmax_params_fn init_params,
xnn_pack_qs8_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<float> input(m() * k());
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(mr());
xnnpack::Buffer<int8_t> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> kernel_scale(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() +
packed_n() * (sizeof(int32_t) + sizeof(float) * 2));
xnnpack::Buffer<xnn_float16> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n(), 0);
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
for (size_t i = 0; i < m(); ++i) {
const float* input_ptr = &input[i * k()];
const auto minmax = std::minmax_element(input_ptr, input_ptr + k());
float inv_scale;
quantization_params[i] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t j = 0; j < k(); ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[i].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[i].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[i].zero_point));
}
}
for (size_t i = m(); i < mr(); ++i) {
quantization_params[i].zero_point = quantization_params[m() - 1].zero_point;
quantization_params[i].inv_scale = quantization_params[m() - 1].inv_scale;
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng));
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_packing_params packing_params = { /*input_zero_point=*/1 };
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), /*bias=*/nullptr, /*scale=*/nullptr, packed_w.data(), 2 * sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
kernel_scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(float))));
// Fill in packed bias
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
bias.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + 2 * sizeof(float))));
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
int32_t ksum = 0;
for (size_t k_index = 0; k_index < k(); k_index++) {
ksum += b[n_index * k() + k_index];
c_ref[m_index * n() + n_index] +=
int32_t(a[m_index * a_stride() + k_index]) *
int32_t(b[n_index * k() + k_index]);
}
c_ref[m_index * n() + n_index] -= (quantization_params[m_index].zero_point * ksum);
c_ref[m_index * n() + n_index] *= quantization_params[m_index].inv_scale * kernel_scale[n_index];
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f16_minmax_params params;
init_params(&params, static_cast<xnn_float16>(c_min), static_cast<xnn_float16>(c_max));
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
if (unsigned_inputs()) {
// Some architectures require that the input be unsigned.
// Adjust the zero point and flip the sign of the input to mimic adding
// 128 to the input with correct overflow behaviour.
for (int i = 0; i < quantization_params.size(); ++i) {
quantization_params[i].zero_point += 128;
}
for (int i = 0; i < a.size(); ++i) {
a[i] ^= 0x80;
}
}
gemm(m(), n(), k(),
a.data(), a_stride() * sizeof(int8_t),
static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(xnn_float16), cn_stride() * sizeof(xnn_float16), &params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance = std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< "), optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f32_qc8w_gemm_ukernel_fn gemm,
xnn_init_f32_minmax_params_fn init_params,
xnn_pack_qs8_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<float> input(m() * k());
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(mr());
xnnpack::Buffer<int8_t> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> kernel_scale(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() +
packed_n() * (sizeof(int32_t) + sizeof(float) * 2));
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<float> c_ref(m() * n(), 0);
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
for (size_t i = 0; i < m(); ++i) {
const float* input_ptr = &input[i * k()];
const auto minmax = std::minmax_element(input_ptr, input_ptr + k());
float inv_scale;
quantization_params[i] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t j = 0; j < k(); ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[i].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[i].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[i].zero_point));
}
}
for (size_t i = m(); i < mr(); ++i) {
quantization_params[i].zero_point = quantization_params[m() - 1].zero_point;
quantization_params[i].inv_scale = quantization_params[m() - 1].inv_scale;
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng));
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_packing_params packing_params = { /*input_zero_point=*/1 };
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), /*bias=*/nullptr, /*scale=*/nullptr, packed_w.data(), 2 * sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
kernel_scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(float))));
// Fill in packed bias
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
nr() * (ks() * packed_k() * sizeof(int8_t) + 3 * sizeof(float)),
0,
bias.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + 2 * sizeof(float))));
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
int32_t ksum = 0;
for (size_t k_index = 0; k_index < k(); k_index++) {
ksum += b[n_index * k() + k_index];
c_ref[m_index * n() + n_index] +=
int32_t(a[m_index * a_stride() + k_index]) *
int32_t(b[n_index * k() + k_index]);
}
c_ref[m_index * n() + n_index] -= (quantization_params[m_index].zero_point * ksum);
c_ref[m_index * n() + n_index] *= quantization_params[m_index].inv_scale * kernel_scale[n_index];
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
if (unsigned_inputs()) {
// Some architectures require that the input be unsigned.
// Adjust the zero point and flip the sign of the input to mimic adding
// 128 to the input with correct overflow behaviour.
for (int i = 0; i < quantization_params.size(); ++i) {
quantization_params[i].zero_point += 128;
}
for (int i = 0; i < a.size(); ++i) {
a[i] ^= 0x80;
}
}
gemm(m(), n(), k(),
a.data(), a_stride() * sizeof(int8_t),
static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float), &params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance = std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f16_qc4w_gemm_ukernel_fn gemm,
xnn_init_f16_qc4w_minmax_params_fn init_params,
xnn_pack_qs8_qc4w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(0, std::numeric_limits<uint8_t>::max()),
std::ref(rng));
const size_t planes = 2; // 4 bit is 2 planes - low nibbles and high nibbles
const size_t k2 = round_up_po2(k(), 2); // tester assumes byte aligned rows
const size_t packed_k2 = round_up_po2(k(), kr() * sr() * planes); // 2 blocks for nibbles
const size_t packed_k_bytes = (packed_k2 + 1)/ 2;
xnnpack::Buffer<float> input(m() * k2);
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k2 + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(mr());
xnnpack::Buffer<uint8_t> b(n() * k2 / 2);
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> kernel_scale(n());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k_bytes +
packed_n() * (sizeof(int32_t) + sizeof(float) * 2));
xnnpack::Buffer<xnn_float16> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<float> c_ref(m() * n(), 0.0f);
{//for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
for (size_t i = 0; i < m(); ++i) {
const float* input_ptr = &input[i * k2];
const auto minmax = std::minmax_element(input_ptr, input_ptr + k2);
float inv_scale;
quantization_params[i] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t j = 0; j < k2; ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[i].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[i].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[i].zero_point));
}
}
for (size_t i = m(); i < mr(); ++i) {
quantization_params[i].zero_point = quantization_params[m() - 1].zero_point;
quantization_params[i].inv_scale = quantization_params[m() - 1].inv_scale;
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng));
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_qc4w_packing_params packing_params = { /*input_zero_point=*/1, b_zero_point()};
pack(/*g=*/1, n(), k2, nr(), kr(), sr(),
b.data(), /*bias=*/nullptr, /*scale=*/nullptr,
packed_w.data(), 2 * sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
0,
kernel_scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k_bytes + sizeof(float))));
// Fill in packed bias
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
0,
bias.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k_bytes + 2 * sizeof(float))));
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
int32_t ksum = 0;
for (size_t k_index = 0; k_index < k2; k_index++) {
const size_t nb_index = (n_index * k2 + k_index) / 2;
const int32_t bv = int32_t((k_index % 2 == 0) ? (b[nb_index] & UINT8_C(0xF)) : (b[nb_index] >> 4)) - b_zero_point();
ksum += bv;
c_ref[m_index * n() + n_index] += int32_t(a[m_index * a_stride() + k_index]) * int32_t(bv);
}
c_ref[m_index * n() + n_index] -= (quantization_params[m_index].zero_point * ksum);
c_ref[m_index * n() + n_index] *= quantization_params[m_index].inv_scale * kernel_scale[n_index];
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f16_qc4w_minmax_params params;
init_params(&params, static_cast<xnn_float16>(c_min), static_cast<xnn_float16>(c_max), 8);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
if (unsigned_inputs()) {
// Some architectures require that the input be unsigned.
// Adjust the zero point and flip the sign of the input to mimic adding
// 128 to the input with correct overflow behaviour.
for (int i = 0; i < quantization_params.size(); ++i) {
quantization_params[i].zero_point += 128;
}
for (int i = 0; i < a.size(); ++i) {
a[i] ^= 0x80;
}
}
gemm(m(), n(), k2,
a.data(), a_stride() * sizeof(int8_t),
static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(xnn_float16), cn_stride() * sizeof(xnn_float16), &params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance = std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f16_qb4w_gemm_ukernel_fn gemm,
xnn_init_f16_qb4w_minmax_params_fn init_params,
xnn_pack_qs8_qb4w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(0, std::numeric_limits<uint8_t>::max()),
std::ref(rng));
const size_t planes = 2; // 4 bit is 2 planes - low nibbles and high nibbles
const size_t k2 = round_up_po2(k(), 2); // tester assumes byte aligned rows
const size_t packed_k2 = round_up_po2(k(), kr() * sr() * planes); // 2 blocks for nibbles
const size_t packed_k_bytes = (packed_k2 + 1)/ 2;
const size_t num_blocks = packed_k2 / bl();
xnnpack::Buffer<float> input(m() * k2);
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k2 + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(mr());
xnnpack::Buffer<uint8_t> b(n() * k2 / 2);
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<xnn_bfloat16> kernel_scale2d(n() * k2 / bl());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k_bytes +
/* vksum */ packed_n() * sizeof(float) +
/* scales */ packed_n() * num_blocks * sizeof(xnn_bfloat16) +
/* bias */ packed_n() * sizeof(float));
xnnpack::Buffer<xnn_float16> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n(), 0);
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
for (size_t i = 0; i < m(); ++i) {
const float* input_ptr = &input[i * k2];
const auto minmax = std::minmax_element(input_ptr, input_ptr + k2);
float inv_scale;
quantization_params[i] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t j = 0; j < k2; ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[i].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[i].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[i].zero_point));
}
}
for (size_t i = m(); i < mr(); ++i) {
quantization_params[i].zero_point = quantization_params[m() - 1].zero_point;
quantization_params[i].inv_scale = quantization_params[m() - 1].inv_scale;
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale2d.begin(), kernel_scale2d.end(), [&]() { return scalerng(); });
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_qc4w_packing_params packing_params = { /*input_zero_point=*/1, b_zero_point()};
pack(/*g=*/1, n(), k2, nr(), kr(), sr(), bl(),
b.data(), /*bias=*/nullptr, /*scale=*/kernel_scale2d.data(),
packed_w.data(), sizeof(xnn_float16) * nr(), sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
size_t stride = nr() * (packed_k_bytes + /* scales= */ num_blocks * sizeof(xnn_float16) + /* ksum= */ sizeof(float) + /* bias= */ sizeof(float));
size_t block_stride = (bl() / 2 + sizeof(xnn_float16)) * nr();
size_t start_offset = nr() * (packed_k_bytes / num_blocks + sizeof(float));
uintptr_t start = (uintptr_t) packed_w.data() + start_offset;
xnn_init_blockwise_scale_bf16_params(
n(), nr(), nr(),
stride,
stride,
/*num_blocks=*/ num_blocks,
/*block_stride=*/ block_stride,
0,
kernel_scale2d.data(),
(void*) start);
start = (uintptr_t) packed_w.data() + stride - sizeof(float) * nr();
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
stride,
stride,
0,
bias.data(),
(void*) start);
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
float kfsum = 0.0;
for (size_t bl_index=0; bl_index < num_blocks; ++bl_index) {
int32_t ksum = 0;
int32_t c_ref_acc = 0;
for (size_t kr_index = 0; kr_index < bl(); kr_index++) {
const size_t k_index = bl_index * bl() + kr_index;
const size_t nb_index = (n_index * k2 + k_index) / 2;
const int32_t bv = int32_t((k_index % 2 == 0) ? (b[nb_index] & UINT8_C(0xF)) : (b[nb_index] >> 4)) - b_zero_point();
ksum += bv;
c_ref_acc += int32_t(a[m_index * a_stride() + k_index]) * int32_t(bv);
}
size_t scale_index = n_index * num_blocks + bl_index;
float scale = kernel_scale2d[scale_index];
c_ref[m_index * n() + n_index] += c_ref_acc * scale;
kfsum += scale * ksum;
}
c_ref[m_index * n() + n_index] -= (quantization_params[m_index].zero_point * kfsum);
c_ref[m_index * n() + n_index] *= quantization_params[m_index].inv_scale;
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f16_qb4w_minmax_params params;
init_params(&params,
static_cast<xnn_float16>(c_min),
static_cast<xnn_float16>(c_max),
8,
bl());
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
gemm(m(), n(), k2,
a.data(), a_stride() * sizeof(int8_t),
static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(xnn_float16), cn_stride() * sizeof(xnn_float16), &params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance = std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-3f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k2;
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f32_qc4w_gemm_ukernel_fn gemm,
xnn_init_f32_qc4w_minmax_params_fn init_params,
xnn_pack_qs8_qc4w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(0, std::numeric_limits<uint8_t>::max()),
std::ref(rng));
const size_t planes = 2; // 4 bit is 2 planes - low nibbles and high nibbles
const size_t k2 = round_up_po2(k(), 2); // tester assumes byte aligned rows
const size_t packed_k2 = round_up_po2(k(), kr() * sr() * planes); // 2 blocks for nibbles
const size_t packed_k_bytes = (packed_k2 + 1)/ 2;
xnnpack::Buffer<float> input(m() * k2);
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k2 + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(mr());
xnnpack::Buffer<uint8_t> b(n() * k2 / 2);
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> kernel_scale(n());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k_bytes +
packed_n() * (sizeof(int32_t) + sizeof(float) * 2));
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<float> c_ref(m() * n(), 0);
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
for (size_t i = 0; i < m(); ++i) {
const float* input_ptr = &input[i * k2];
const auto minmax = std::minmax_element(input_ptr, input_ptr + k2);
float inv_scale;
quantization_params[i] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t j = 0; j < k2; ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[i].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[i].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[i].zero_point));
}
}
for (size_t i = m(); i < mr(); ++i) {
quantization_params[i].zero_point = quantization_params[m() - 1].zero_point;
quantization_params[i].inv_scale = quantization_params[m() - 1].inv_scale;
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng));
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_qc4w_packing_params packing_params = { /*input_zero_point=*/1, b_zero_point()};
pack(/*g=*/1, n(), k2, nr(), kr(), sr(),
b.data(), /*bias=*/nullptr, /*scale=*/nullptr,
packed_w.data(), 2 * sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
0,
kernel_scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k_bytes + sizeof(float))));
// Fill in packed bias
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
nr() * (ks() * packed_k_bytes + 3 * sizeof(float)),
0,
bias.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k_bytes + 2 * sizeof(float))));
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
int32_t ksum = 0;
for (size_t k_index = 0; k_index < k2; k_index++) {
const size_t nb_index = (n_index * k2 + k_index) / 2;
const int32_t bv = int32_t((k_index % 2 == 0) ? (b[nb_index] & UINT8_C(0xF)) : (b[nb_index] >> 4)) - b_zero_point();
ksum += bv;
c_ref[m_index * n() + n_index] += int32_t(a[m_index * a_stride() + k_index]) * int32_t(bv);
}
c_ref[m_index * n() + n_index] -= (quantization_params[m_index].zero_point * ksum);
c_ref[m_index * n() + n_index] *= quantization_params[m_index].inv_scale * kernel_scale[n_index];
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_qc4w_minmax_params params;
init_params(&params, c_min, c_max, 8);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
if (unsigned_inputs()) {
// Some architectures require that the input be unsigned.
// Adjust the zero point and flip the sign of the input to mimic adding
// 128 to the input with correct overflow behaviour.
for (int i = 0; i < quantization_params.size(); ++i) {
quantization_params[i].zero_point += 128;
}
for (int i = 0; i < a.size(); ++i) {
a[i] ^= 0x80;
}
}
gemm(m(), n(), k2,
a.data(), a_stride() * sizeof(int8_t),
static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float), &params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance = std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k2;
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qd8_f32_qb4w_gemm_ukernel_fn gemm,
xnn_init_f32_qb4w_minmax_params_fn init_params,
xnn_pack_qs8_qb4w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f), std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(0, std::numeric_limits<uint8_t>::max()),
std::ref(rng));
const size_t planes = 2; // 4 bit is 2 planes - low nibbles and high nibbles
const size_t k2 = round_up_po2(k(), 2); // tester assumes byte aligned rows
const size_t packed_k2 = round_up_po2(k(), kr() * sr() * planes); // 2 blocks for nibbles
const size_t packed_k_bytes = (packed_k2 + 1)/ 2;
const size_t num_blocks = packed_k2 / bl();
xnnpack::Buffer<float> input(m() * k2);
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k2 + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<xnn_qd8_quantization_params> quantization_params(mr());
xnnpack::Buffer<uint8_t> b(n() * k2 / 2);
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<xnn_bfloat16> kernel_scale2d(n() * k2 / bl());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k_bytes +
/* vksum */ packed_n() * sizeof(float) +
/* scales */ packed_n() * num_blocks * sizeof(float) +
/* bias */ packed_n() * sizeof(float));
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n(), 0);
for (size_t iteration = 0; iteration < 1 /* iterations() */; iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
for (size_t i = 0; i < m(); ++i) {
const float* input_ptr = &input[i * k2];
const auto minmax = std::minmax_element(input_ptr, input_ptr + k2);
float inv_scale;
quantization_params[i] = xnn_f32_qd8_asymmetric_quantization_params(*minmax.first, *minmax.second, &inv_scale);
for (size_t j = 0; j < k2; ++j) {
float scaled_input = input_ptr[j] * inv_scale;
scaled_input = std::min<float>(scaled_input, float(std::numeric_limits<int8_t>::max()
- quantization_params[i].zero_point));
scaled_input = std::max<float>(scaled_input, float(std::numeric_limits<int8_t>::min()
- quantization_params[i].zero_point));
a[i * a_stride() + j] = int8_t(std::lrintf(scaled_input) + long(quantization_params[i].zero_point));
}
}
for (size_t i = m(); i < mr(); ++i) {
quantization_params[i].zero_point = quantization_params[m() - 1].zero_point;
quantization_params[i].inv_scale = quantization_params[m() - 1].inv_scale;
}
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale2d.begin(), kernel_scale2d.end(), [&]() { return scalerng(); });
std::fill(packed_w.begin(), packed_w.end(), 0);
// Row sums are multiplied by input zero point, since we don't know it
// until runtime, set it to 1.
const xnn_qs8_qc4w_packing_params packing_params = { /*input_zero_point=*/1, b_zero_point()};
pack(/*g=*/1, n(), k2, nr(), kr(), sr(), bl(),
b.data(), /*bias=*/nullptr, /*scale=*/kernel_scale2d.data(),
packed_w.data(), sizeof(xnn_float16) * nr(), sizeof(float) * nr(), &packing_params);
// Fill in packed kernel scale
size_t stride = nr() * (packed_k_bytes + /* scales= */ num_blocks * sizeof(xnn_float16) + /* ksum= */ sizeof(float) + /* bias= */ sizeof(float));
size_t block_stride = (bl() / 2 + sizeof(xnn_float16)) * nr();
size_t start_offset = nr() * (packed_k_bytes / num_blocks + sizeof(float));
uintptr_t start = (uintptr_t) packed_w.data() + start_offset;
xnn_init_blockwise_scale_bf16_params(
n(), nr(), nr(),
stride,
stride,
/*num_blocks=*/ num_blocks,
/*block_stride=*/ block_stride,
0,
kernel_scale2d.data(),
(void*) start);
start = (uintptr_t) packed_w.data() + stride - sizeof(float) * nr();
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
stride,
stride,
0,
bias.data(),
(void*) start);
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
float kfsum = 0.0;
for (size_t bl_index=0; bl_index < num_blocks; ++bl_index) {
int32_t ksum = 0;
int32_t c_ref_acc = 0;
for (size_t kr_index = 0; kr_index < bl(); kr_index++) {
const size_t k_index = bl_index * bl() + kr_index;
const size_t nb_index = (n_index * k2 + k_index) / 2;
const int32_t bv = int32_t((k_index % 2 == 0) ? (b[nb_index] & UINT8_C(0xF)) : (b[nb_index] >> 4)) - b_zero_point();
ksum += bv;
c_ref_acc += int32_t(a[m_index * a_stride() + k_index]) * int32_t(bv);
}
size_t scale_index = n_index * num_blocks + bl_index;
float scale = kernel_scale2d[scale_index];
c_ref[m_index * n() + n_index] += c_ref_acc * scale;
kfsum += scale * ksum;
}
c_ref[m_index * n() + n_index] -= (quantization_params[m_index].zero_point * kfsum);
c_ref[m_index * n() + n_index] *= quantization_params[m_index].inv_scale;
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_qb4w_minmax_params params;
init_params(&params, c_min, c_max, 8, bl());
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
if (unsigned_inputs()) {
// Some architectures require that the input be unsigned.
// Adjust the zero point and flip the sign of the input to mimic adding
// 128 to the input with correct overflow behaviour.
for (int i = 0; i < quantization_params.size(); ++i) {
quantization_params[i].zero_point += 128;
}
for (int i = 0; i < a.size(); ++i) {
a[i] ^= 0x80;
}
}
gemm(m(), n(), k2,
a.data(), a_stride() * sizeof(int8_t),
static_cast<const void*>(packed_w.data()),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float), &params, quantization_params.data());
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux AArch64.
const float tolerance =
std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-5f);
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k2;
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qp8_f32_qc4w_gemm_minmax_ukernel_fn gemm,
xnn_init_f32_minmax_params_fn init_minmax_params,
xnn_pack_weights_and_biases_fn pack,
xnn_packed_stride_weights_and_biases_fn packed_stride) {
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f),
std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f),
std::ref(rng));
auto w8rng = std::bind(std::uniform_int_distribution<int32_t>(
0, std::numeric_limits<uint8_t>::max()),
std::ref(rng));
const size_t k2 = round_up_po2(k(), 2); // tester assumes byte aligned rows
xnnpack::Buffer<float> input_f32(m() * k2);
xnnpack::Buffer<uint8_t> b(n() * k2 / 2);
xnnpack::Buffer<float> bias(n(), 0.0f);
xnnpack::Buffer<float> kernel_scale(n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() +
((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<float> c_ref(m() * n(), 0);
// Create a fake `gemm_config` for the packing functions.
struct xnn_gemm_config gemm_config;
gemm_config.mr = static_cast<uint8_t>(mr());
gemm_config.mr_packed = static_cast<uint8_t>(mr_packed());
gemm_config.nr = static_cast<uint8_t>(nr());
gemm_config.log2_kr = static_cast<uint8_t>(31 - math_clz_nonzero_u32(kr()));
gemm_config.log2_sr = static_cast<uint8_t>(31 - math_clz_nonzero_u32(sr()));
const size_t packed_w_stride =
packed_stride(&gemm_config, k2, /*k_stride=*/k2, /*extra_bytes=*/0);
const size_t packed_w_size = packed_w_stride * round_up(n(), nr());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(packed_w_size);
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input_f32.begin(), input_f32.end(), std::ref(f32rng));
// Quantize the left-hand operand.
const size_t input_packed_size =
xnn_x8_packq_f32qp8_packed_size(m(), k2, mr_packed(), kr(), sr());
xnnpack::Buffer<int8_t> input_qp8(input_packed_size);
xnn_x8_packq_f32qp8_ukernel__scalar_u1(m(), k2, mr_packed(), kr(), sr(),
/*m_idx_start=*/0, input_f32.data(),
/*lhs_stride=*/k2 * sizeof(float),
input_qp8.data());
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng));
std::fill(packed_w.begin(), packed_w.end(), 0);
// RHS packing.
struct xnn_qs8_qc4w_packing_params params;
params.input_zero_point = 1;
params.kernel_zero_point = b_zero_point();
pack(/*flags=*/0, &gemm_config, k2, n(),
/*groups=*/1, /*k_stride=*/k2,
/*accumulator_init=*/nullptr,
/*weights=*/b.data(),
/*int_extra_data0_fn=*/nullptr,
/*extra_data0=*/bias.data(),
/*extra_data0_size=*/sizeof(float),
/*init_extra_data1_fn=*/
nullptr,
/*extra_data1=*/kernel_scale.data(),
/*extra_data1_size=*/sizeof(float),
/*packed_weights_ptr=*/packed_w.data(), &params);
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k2; k_index++) {
const size_t nb_index = (n_index * k2 + k_index) / 2;
const int32_t bv =
static_cast<int32_t>((k_index % 2 == 0)
? (b[nb_index] & UINT8_C(0xF))
: (b[nb_index] >> 4)) -
b_zero_point();
c_ref[m_index * n() + n_index] +=
xnn_x8_packq_f32qp8_get_dequantized(m_index, k_index,
input_qp8.data(), k2,
mr_packed(), kr(), sr()) *
static_cast<int32_t>(bv);
}
c_ref[m_index * n() + n_index] *= kernel_scale[n_index];
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min =
*std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max =
*std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min()
? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f *
static_cast<float>(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max()
? std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f *
static_cast<float>(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params minmax_params;
init_minmax_params(&minmax_params, c_min, c_max);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] =
std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
gemm(m(), n(), k2, input_qp8.data(), packed_w.data(), c.data(),
cm_stride() * sizeof(float), sizeof(float), &minmax_params);
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux
// AArch64.
const float tolerance =
std::max(1.1e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f);
ASSERT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< " (accumulator = " << acc[i * n() + j] << "), optimized = "
<< c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]
<< ", Mr x Nr x Kr = " << mr() << " x " << nr() << " x " << kr()
<< ", M x N x K = " << m() << " x " << n() << " x " << k2
<< ", cn_stride = " << cn_stride()
<< ", cm_stride = " << cm_stride();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qp8_f32_qb4w_gemm_minmax_ukernel_fn gemm,
xnn_init_f32_qb4w_minmax_params_fn init_minmax_params,
xnn_pack_weights_and_biases_fn pack,
xnn_packed_stride_weights_and_biases_fn packed_stride){
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(-5.f, 5.f),
std::ref(rng));
auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f),
std::ref(rng));
auto w8rng = std::bind(std::uniform_int_distribution<int32_t>(
0, std::numeric_limits<uint8_t>::max()),
std::ref(rng));
const size_t k2 = round_up_po2(k(), 2); // tester assumes byte aligned rows
const size_t packed_k2 = round_up_po2(k(), kr() * sr()); // 2 blocks for nibbles
const size_t packed_k_bytes = (packed_k2 + 1)/ 2;
const size_t num_blocks = packed_k2 / bl();
xnnpack::Buffer<float> input_f32(m() * k2);
xnnpack::Buffer<uint8_t> b(n() * k2 / 2);
xnnpack::Buffer<float> bias(n(), 0.0f);
xnnpack::Buffer<uint16_t> kernel_scale2d(n() * packed_k2 / bl());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() +
((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<float> c_ref(m() * n(), 0);
// Create a fake `gemm_config` for the packing functions.
struct xnn_gemm_config gemm_config;
gemm_config.mr = static_cast<uint8_t>(mr());
gemm_config.mr_packed = static_cast<uint8_t>(mr_packed());
gemm_config.nr = static_cast<uint8_t>(nr());
gemm_config.log2_kr = static_cast<uint8_t>(31 - math_clz_nonzero_u32(kr()));
gemm_config.log2_sr = static_cast<uint8_t>(31 - math_clz_nonzero_u32(sr()));
const size_t packed_w_stride =
packed_stride(&gemm_config, k2, /*k_stride=*/bl(), /*extra_bytes=*/0);
const size_t packed_w_size = packed_w_stride * round_up(n(), nr());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(packed_w_size);
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input_f32.begin(), input_f32.end(), std::ref(f32rng));
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
std::generate(kernel_scale2d.begin(), kernel_scale2d.end(), [&]() { return math_cvt_bf16_fp32(scalerng()); });
std::fill(packed_w.begin(), packed_w.end(), 0);
// Quantize the left-hand operand.
const size_t input_packed_size =
xnn_x8_packq_f32qp8_packed_size(m(), k2, mr_packed(), kr(), sr());
xnnpack::Buffer<int8_t> input_qp8(input_packed_size);
xnn_x8_packq_f32qp8_ukernel__scalar_u1(
m(), k2, mr_packed(), kr(), sr(),
/*m_idx_start=*/0, reinterpret_cast<const float*>(input_f32.data()),
/*lhs_stride=*/k2 * sizeof(float),
input_qp8.data()
);
// RHS packing.
struct xnn_qs8_qc4w_packing_params params;
params.input_zero_point = 1;
params.kernel_zero_point = b_zero_point();
pack(/*flags=*/0, &gemm_config, k2, n(),
/*groups=*/1, /*k_stride=*/bl(),
/*accumulator_init=*/nullptr,
/*weights=*/b.data(),
/*int_extra_data0_fn=*/nullptr,
/*extra_data0=*/nullptr,
/*extra_data0_size=*/0,
/*init_extra_data1_fn=*/
nullptr,
/*extra_data1=*/kernel_scale2d.data(),
/*extra_data1_size=*/sizeof(float),
/*packed_weights_ptr=*/packed_w.data(), &params);
size_t stride = nr() * (packed_k_bytes + /* scales= */ num_blocks * sizeof(uint16_t) + /* ksum= */ sizeof(float) + /* bias= */ sizeof(float));
uintptr_t start = (uintptr_t) packed_w.data() + stride - sizeof(float) * nr();
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
stride,
stride,
0,
bias.data(),
(void*) start);
// Compute 32-bit results and output quantization arguments.
std::fill(c_ref.begin(), c_ref.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
float kfsum = 0.0;
for (size_t bl_index=0; bl_index < num_blocks; ++bl_index) {
int32_t ksum = 0;
int32_t c_ref_acc = 0;
for (size_t kr_index = 0; kr_index < bl(); kr_index++) {
const size_t k_index = bl_index * bl() + kr_index;
const size_t nb_index = (n_index * k2 + k_index) / 2;
const int32_t bv = int32_t((k_index % 2 == 0) ? (b[nb_index] & UINT8_C(0xF)) : (b[nb_index] >> 4)) - b_zero_point();
ksum += bv;
c_ref_acc += int32_t(xnn_x8_packq_f32qp8_get_quantized(m_index, k_index, input_qp8.data(),
k2, mr_packed(), kr(), sr())) * int32_t(bv);
}
size_t scale_index = n_index * num_blocks + bl_index;
float scale = math_cvt_fp32_bf16(kernel_scale2d[scale_index]);
c_ref[m_index * n() + n_index] += c_ref_acc * scale;
kfsum += scale * ksum;
}
float inv_scale = xnn_x8_packq_f32qp8_get_recip_scale(m_index, input_qp8.data(), k2, mr_packed(), kr(), sr());
int32_t neg_nudged_zero_point= xnn_x8_packq_f32qp8_get_neg_nudged_zp(m_index, input_qp8.data(), k2, mr_packed(), kr(), sr());
c_ref[m_index * n() + n_index] += (neg_nudged_zero_point * kfsum);
c_ref[m_index * n() + n_index] *= inv_scale;
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min =
*std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max =
*std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min()
? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f *
static_cast<float>(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max()
? std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f *
static_cast<float>(255 - qmax());
// Prepare parameters.
xnn_f32_qb4w_minmax_params minmax_params;
init_minmax_params(&minmax_params, c_min, c_max, 8, bl());
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] =
std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
gemm(m(), n(), k2, input_qp8.data(), packed_w.data(), c.data(),
cm_stride() * sizeof(float), sizeof(float), &minmax_params);
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
// Extract tolerance into variable to workaround test failures on Linux
// AArch64.
const float tolerance =
std::max(1.1e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f);
ASSERT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j], tolerance)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< " (accumulator = " << acc[i * n() + j] << "), optimized = "
<< c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]
<< ", Mr x Nr x Kr = " << mr() << " x " << nr() << " x " << kr()
<< ", M x N x K = " << m() << " x " << n() << " x " << k2
<< ", cn_stride = " << cn_stride()
<< ", cm_stride = " << cm_stride();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qs8_gemm_minmax_ukernel_fn gemm,
xnn_init_qs8_conv_minmax_params_fn init_params,
xnn_pack_qs8_gemm_fn pack,
xnn_qs8_requantize_fn requantize) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<int8_t> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<int8_t> b(n() * k());
xnnpack::Buffer<int32_t> bias(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() + packed_n() * sizeof(int32_t) / sizeof(int8_t));
xnnpack::Buffer<int8_t> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<int8_t> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
xnnpack::fill_uniform_random_bits(a.data(), a.size(), rng);
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
std::fill(packed_w.begin(), packed_w.end(), 0);
const xnn_qs8_packing_params packing_params = { int8_t(a_zero_point() - 0x80) };
void* const packed_data = packed_w.data();
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_data, /*extra_bytes=*/0, &packing_params);
// Compute 32-bit results and output quantization arguments.
std::fill(acc.begin(), acc.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
acc[m_index * n() + n_index] +=
(int32_t(a[m_index * a_stride() + k_index]) - int32_t(a_zero_point() - 0x80)) *
int32_t(b[n_index * k() + k_index]);
}
acc[m_index * n() + n_index] += bias[n_index];
}
}
const int32_t accumulated_min = *std::min_element(acc.cbegin(), acc.cend());
const int32_t accumulated_max = *std::max_element(acc.cbegin(), acc.cend());
const double c_scale = uint32_t(accumulated_max - accumulated_min) >= 256 ? double(uint32_t(accumulated_max - accumulated_min)) / 255.0 : 1.00001;
const int8_t c_zero_point = int8_t(std::max(std::min(
lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / c_scale),
long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
const float requantization_scale = 1.0f / float(c_scale);
union xnn_qs8_conv_minmax_params quantization_params;
init_params(&quantization_params,
requantization_scale, c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
gemm(
m(), n(), k(),
a.data(), a_stride() * sizeof(int8_t),
packed_data,
c.data(), cm_stride() * sizeof(int8_t), cn_stride() * sizeof(int8_t),
&quantization_params);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = requantize(
acc[m_index * n() + n_index], requantization_scale, c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
}
}
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmax()) - 0x80);
EXPECT_GE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmin()) - 0x80);
EXPECT_EQ(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(c_ref[i * n() + j]))
<< "at " << i << ", " << j << ": reference = " << int32_t(c_ref[i * n() + j])
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]) << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
<< ", requantization scale = " << requantization_scale << ", output zero point = " << int32_t(c_zero_point);
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_qs8_igemm_minmax_ukernel_fn igemm,
xnn_init_qs8_conv_minmax_params_fn init_params,
xnn_pack_qs8_igemm_fn pack,
xnn_qs8_requantize_fn requantize) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
auto w8rng = std::bind(
std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
std::ref(rng));
xnnpack::Buffer<int8_t> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<int8_t> b(n() * ks() * k());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_n() * packed_k() +
packed_n() * sizeof(int32_t) / sizeof(int8_t));
xnnpack::Buffer<int32_t> bias(n());
xnnpack::Buffer<int8_t> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<int32_t> acc(m() * n());
xnnpack::Buffer<int8_t> c_ref(m() * n());
xnnpack::Buffer<int8_t> junk(k() + XNN_EXTRA_BYTES / sizeof(int8_t));
xnnpack::Buffer<const int8_t*> im2col(mr() * ks());
{//for (size_t iteration = 0; iteration < iterations(); iteration++) {
xnnpack::fill_uniform_random_bits(a.data(), a.size(), rng);
std::generate(b.begin(), b.end(), std::ref(w8rng));
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
std::fill(packed_w.begin(), packed_w.end(), 0);
const xnn_qs8_packing_params packing_params = { int8_t(a_zero_point() - 0x80) };
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, &packing_params);
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = a.data();
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
// Compute 32-bit results and output quantization arguments.
std::fill(acc.begin(), acc.end(), 0);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
if (im2col[ks_index * mr() + m_index] == a.data()) {
acc[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index]) - int32_t(a_zero_point() - 0x80)) *
int32_t(b[(n_index * ks() + ks_index) * k() + k_index]);
} else {
acc[m_index * n() + n_index] +=
(int32_t(im2col[ks_index * mr() + m_index][k_index + a_offset()]) - int32_t(a_zero_point() - 0x80)) *
int32_t(b[(n_index * ks() + ks_index) * k() + k_index]);
}
}
}
acc[m_index * n() + n_index] += bias[n_index];
}
}
const int32_t accumulated_min = *std::min_element(acc.cbegin(), acc.cend());
const int32_t accumulated_max = *std::max_element(acc.cbegin(), acc.cend());
const double c_scale = uint32_t(accumulated_max - accumulated_min) >= 256 ? double(uint32_t(accumulated_max - accumulated_min)) / 255.0 : 1.00001;
const uint8_t c_zero_point = uint8_t(std::max(std::min(
lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / c_scale),
long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
const float requantization_scale = 1.0f / float(c_scale);
union xnn_qs8_conv_minmax_params quantization_params;
init_params(&quantization_params,
requantization_scale, c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
const int8_t* zero_pointer = (zero_index() != SIZE_MAX) ? a.data() : nullptr;
igemm(
m(), n(), k(), ks() * mr() * sizeof(void*),
im2col.data(), packed_w.data(),
c.data(), cm_stride() * sizeof(int8_t), cn_stride() * sizeof(int8_t),
a_offset() * sizeof(uint8_t), zero_pointer,
&quantization_params);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = requantize(
acc[m_index * n() + n_index], requantization_scale, c_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
}
}
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmax()) - 0x80);
EXPECT_GE(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(qmin()) - 0x80);
EXPECT_EQ(int32_t(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]), int32_t(c_ref[i * n() + j]))
<< "at " << i << ", " << j << ": reference = " << uint32_t(c_ref[i * n() + j])
<< " (accumulator = " << acc[i * n() + j]
<< "), optimized = " << (uint32_t) c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
<< ", requantization scale = " << requantization_scale << ", output zero point = " << int32_t(c_zero_point);
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_bf16_gemm_minmax_ukernel_fn gemm_minmax,
xnn_init_bf16_minmax_params_fn init_params,
xnn_pack_f16_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.5f, 1.0f), std::ref(rng));
xnnpack::Buffer<xnn_bfloat16> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(xnn_bfloat16));
xnnpack::Buffer<xnn_bfloat16> b(n() * k());
xnnpack::Buffer<xnn_bfloat16, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() + packed_n());
xnnpack::Buffer<xnn_bfloat16> bias(n());
xnnpack::Buffer<xnn_bfloat16> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&] { return f32rng(rng); });
std::generate(b.begin(), b.end(), [&] { return f32rng(rng); });
std::generate(bias.begin(), bias.end(), [&] { return f32rng(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
reinterpret_cast<const uint16_t*>(b.data()),
reinterpret_cast<const uint16_t*>(bias.data()), /*scale=*/nullptr,
reinterpret_cast<uint16_t*>(packed_w.data()),
/*extra_bytes=*/0, /*params=*/nullptr);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = bias[n_index];
for (size_t k_index = 0; k_index < k(); k_index++) {
EXPECT_LE(n(), packed_n());
EXPECT_LT(m_index * n() + n_index, c_ref.size());
EXPECT_LT(m_index * k() + k_index, a.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
b[n_index * k() + k_index];
}
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min = xnn_bfloat16(accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin()));
const float c_max = xnn_bfloat16(accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax()));
// Prepare parameters.
xnn_bf16_minmax_params params;
init_params(&params, c_min, c_max);
for (float& c_value : c_ref) {
c_value = std::max(std::min(c_value, c_max), c_min);
}
gemm_minmax(m(), n(), k() * sizeof(xnn_bfloat16),
a.data(), a_stride() * sizeof(xnn_bfloat16),
packed_w.data(),
c.data(), cm_stride() * sizeof(xnn_bfloat16), cn_stride() * sizeof(xnn_bfloat16),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 3.0e-2f))
<< "at " << i << ", " << j << ": Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f16_gemm_minmax_ukernel_fn gemm_minmax,
xnn_init_f16_minmax_params_fn init_params,
xnn_pack_f16_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
xnnpack::Buffer<xnn_float16> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(xnn_float16));
xnnpack::Buffer<xnn_float16> b(n() * k());
xnnpack::Buffer<xnn_float16, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() + packed_n());
xnnpack::Buffer<xnn_float16> bias(n());
xnnpack::Buffer<xnn_float16> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), f32rng);
std::generate(b.begin(), b.end(), f32rng);
std::generate(bias.begin(), bias.end(), f32rng);
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
reinterpret_cast<const uint16_t*>(b.data()),
reinterpret_cast<const uint16_t*>(bias.data()),
/*scale=*/nullptr,
reinterpret_cast<uint16_t*>(packed_w.data()),
/*extra_bytes=*/0, /*params=*/nullptr);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
EXPECT_LE(n(), packed_n());
EXPECT_LT(m_index * n() + n_index, c_ref.size());
EXPECT_LT(m_index * k() + k_index, a.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
b[n_index * k() + k_index];
}
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min = xnn_float16(accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin()));
const float c_max = xnn_float16(accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax()));
// Prepare parameters.
xnn_f16_minmax_params params;
init_params(&params, static_cast<xnn_float16>(c_min), static_cast<xnn_float16>(c_max));
for (float& c_value : c_ref) {
c_value = std::max(std::min(c_value, c_max), c_min);
}
gemm_minmax(m(), n(), k() * sizeof(xnn_float16),
a.data(), a_stride() * sizeof(xnn_float16),
packed_w.data(),
c.data(), cm_stride() * sizeof(xnn_float16), cn_stride() * sizeof(xnn_float16),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f16_igemm_minmax_ukernel_fn igemm_minmax,
xnn_init_f16_minmax_params_fn init_params,
xnn_pack_f16_igemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
xnnpack::Buffer<xnn_float16> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(xnn_float16));
xnnpack::Buffer<xnn_float16> b(n() * ks() * k());
xnnpack::Buffer<xnn_float16, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_k() * packed_n() + packed_n());
xnnpack::Buffer<xnn_float16> bias(n());
xnnpack::Buffer<xnn_float16> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
xnnpack::Buffer<xnn_float16> junk(k() + XNN_EXTRA_BYTES / sizeof(xnn_float16));
xnnpack::Buffer<const xnn_float16*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), f32rng);
std::generate(b.begin(), b.end(), f32rng);
std::generate(bias.begin(), bias.end(), f32rng);
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0);
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
reinterpret_cast<const uint16_t*>(b.data()),
reinterpret_cast<const uint16_t*>(bias.data()), /*scale=*/nullptr,
reinterpret_cast<uint16_t*>(packed_w.data()),
/*extra_bytes=*/0, /*params=*/nullptr);
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = a.data();
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
EXPECT_LT(ks_index * mr() + m_index, im2col.size());
EXPECT_LT(k_index, k());
EXPECT_LT(k_index, a_stride());
if (im2col[ks_index * mr() + m_index] == a.data()) {
c_ref[m_index * n() + n_index] +=
im2col[ks_index * mr() + m_index][k_index] *
b[(n_index * ks() + ks_index) * k() + k_index];
} else {
c_ref[m_index * n() + n_index] +=
im2col[ks_index * mr() + m_index][k_index + a_offset()] *
b[(n_index * ks() + ks_index) * k() + k_index];
}
}
}
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min = xnn_float16(accumulated_min + (accumulated_max - accumulated_min) / 255.0f * xnn_float16(qmin()));
const float c_max = xnn_float16(accumulated_max - (accumulated_max - accumulated_min) / 255.0f * xnn_float16(255 - qmax()));
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::min(c_ref[m_index * n() + n_index], c_max);
c_ref[m_index * n() + n_index] = std::max(c_ref[m_index * n() + n_index], c_min);
}
}
// Prepare parameters.
xnn_f16_minmax_params params;
init_params(&params, static_cast<xnn_float16>(c_min), static_cast<xnn_float16>(c_max));
for (float& c_value : c_ref) {
c_value = std::max(std::min(c_value, c_max), c_min);
}
const xnn_float16* zero_pointer = (zero_index() != SIZE_MAX) ? a.data() : nullptr;
igemm_minmax(
m(), n(), k() * sizeof(xnn_float16), ks() * mr() * sizeof(void*),
reinterpret_cast<const xnn_float16**>(im2col.data()), packed_w.data(),
c.data(), cm_stride() * sizeof(xnn_float16), cn_stride() * sizeof(xnn_float16),
a_offset() * sizeof(xnn_float16), zero_pointer,
&params);
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f))
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_ppmm_minmax_ukernel_fn ppmm_minmax,
xnn_init_f32_minmax_params_fn init_params,
xnn_pack_f32_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a(packed_k() * mr());
xnnpack::Buffer<float> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(packed_n() * packed_k() +
packed_n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, /*params=*/nullptr);
for (size_t i = m(); i < mr(); i++) {
for (size_t l = 0; l < k(); l++) {
a[l * mr() + i] = a[l * mr() + m() - 1];
}
}
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
for (size_t l = 0; l < k(); l++) {
c_ref[i * n() + j] +=
a[l * mr() + i] *
b[j * k() + l];
}
c_ref[i * n() + j] += bias[j];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
for (float& c_value : c_ref) {
c_value = std::max(std::min(c_value, c_max), c_min);
}
ppmm_minmax(m(), n(), k() * sizeof(float),
a.data(), packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_gemm_ukernel_fn gemm,
xnn_pack_f32_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(packed_n() * packed_k() +
packed_n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, /*params=*/nullptr);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
b[n_index * k() + k_index];
}
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
gemm(m(), n(), k() * sizeof(float),
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
nullptr);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_gemm_relu_ukernel_fn gemm_relu,
xnn_pack_f32_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(packed_n() * packed_k() +
packed_n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, /*params=*/nullptr);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
b[n_index * k() + k_index];
}
c_ref[m_index * n() + n_index] = std::max(0.0f, c_ref[m_index * n() + n_index] + bias[n_index]);
}
}
gemm_relu(m(), n(), k() * sizeof(float),
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
nullptr);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], 0.0f)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_gemm_minmax_ukernel_fn gemm_minmax,
xnn_init_f32_minmax_params_fn init_params,
xnn_pack_f32_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(packed_n() * packed_k() +
packed_n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, /*params=*/nullptr);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
b[n_index * k() + k_index];
}
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
gemm_minmax(m(), n(), k() * sizeof(float),
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_gemm_goi_minmax_ukernel_fn gemm_minmax,
xnn_init_f32_minmax_params_fn init_params) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * k());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
b[n_index * k() + k_index];
}
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
gemm_minmax(m(), n(), k() * sizeof(float),
a.data(), a_stride() * sizeof(float),
b.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]
<< ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]
<< ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()]
<< ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_qc4w_gemm_minmax_ukernel_fn gemm_minmax,
xnn_init_f32_qc4w_minmax_params_fn init_params,
xnn_pack_f32_qc4w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
std::uniform_int_distribution<int32_t> i8dist(-1, std::numeric_limits<uint8_t>::max());
const size_t k_stride = (k() + 1) / 2;
const size_t packed_k_bytes = (packed_k() + 1) / 2;
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<uint8_t> b(n() * k_stride);
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> scale(n());
xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k_bytes + packed_n() * sizeof(float) * 2);
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<double> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::generate(scale.begin(), scale.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0);
std::generate(b.begin(), b.end(), [&]() { return i8dist(rng); });
// For odd k ensure last nibble is padded with 0
if (k() & 1) {
for (size_t n_index = 0; n_index < n(); n_index++) {
const size_t nb_index = n_index * k_stride + k_stride - 1;
b[nb_index] &= 0xF;
}
}
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), nr() * sizeof(float), /*params=*/nullptr);
// Fill in packed scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k_bytes + (sizeof(float) + sizeof(float))),
nr() * (ks() * packed_k_bytes + (sizeof(float) + sizeof(float))),
0,
scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k_bytes + sizeof(float))));
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = double(bias[n_index]);
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
const size_t nb_index = n_index * k_stride + k_index / 2;
const int16_t bv = int16_t((k_index % 2 == 0) ? (b[nb_index] & UINT8_C(0xF)) : (b[nb_index] >> 4u)) - b_zero_point();
c_ref[m_index * n() + n_index] +=
double(a[m_index * a_stride() + k_index]) *
double(bv);
}
c_ref[m_index * n() + n_index] *= double(scale[n_index]);
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_qc4w_minmax_params params;
init_params(&params, c_min, c_max, b_zero_point());
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], double(c_max)), double(c_min));
}
}
gemm_minmax(m(), n(), k() * sizeof(float), // Note KC measured in bytes of input
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5, std::abs(c_ref[i * n() + j]) * 1.0e-6))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_qc8w_gemm_ukernel_fn gemm,
xnn_pack_f32_qs8w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
std::uniform_int_distribution<int32_t> i8dist(-1, std::numeric_limits<int8_t>::max());
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<int8_t> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> scale(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() + packed_n() * sizeof(float) * 2);
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return i8dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::generate(scale.begin(), scale.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), nr() * sizeof(float), /*params=*/nullptr);
// Fill in packed scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(float) + sizeof(float))),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(float) + sizeof(float))),
0,
scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(float))));
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
(float) b[n_index * k() + k_index];
}
c_ref[m_index * n() + n_index] += bias[n_index];
c_ref[m_index * n() + n_index] *= scale[n_index];
}
}
gemm(m(), n(), k() * sizeof(float), // Note KC measured in bytes of input
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
nullptr);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
0.1f);
}
}
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_qc8w_gemm_relu_ukernel_fn gemm_relu,
xnn_pack_f32_qs8w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
std::uniform_int_distribution<int32_t> i8dist(-1, std::numeric_limits<int8_t>::max());
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<int8_t> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> scale(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() + packed_n() * sizeof(float) * 2);
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return i8dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::generate(scale.begin(), scale.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), nr() * sizeof(float), /*params=*/nullptr);
// Fill in packed scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(float) + sizeof(float))),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(float) + sizeof(float))),
0,
scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(float))));
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
(float) b[n_index * k() + k_index];
}
c_ref[m_index * n() + n_index] = std::max(0.0f, c_ref[m_index * n() + n_index] + bias[n_index]);
c_ref[m_index * n() + n_index] *= scale[n_index];
}
}
gemm_relu(m(), n(), k() * sizeof(float), // Note KC measured in bytes of input
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
nullptr);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], 0.0f)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_qc8w_gemm_minmax_ukernel_fn gemm_minmax,
xnn_init_f32_minmax_params_fn init_params,
xnn_pack_f32_qs8w_gemm_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
std::uniform_int_distribution<int32_t> i8dist(-1, std::numeric_limits<int8_t>::max());
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<int8_t> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> scale(n());
xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k() + packed_n() * sizeof(float) * 2);
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<double> c_ref(m() * n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return i8dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::generate(scale.begin(), scale.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), nr() * sizeof(float), /*params=*/nullptr);
// Fill in packed scale
xnn_init_qs8_qc8w_scale_fp32_params(
n(), nr(), nr(),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(float) + sizeof(float))),
nr() * (ks() * packed_k() * sizeof(int8_t) + (sizeof(float) + sizeof(float))),
0,
scale.data(),
(void*) ((uintptr_t) packed_w.data() + nr() * (ks() * packed_k() * sizeof(int8_t) + sizeof(float))));
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = double(bias[n_index]);
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
double(a[m_index * a_stride() + k_index]) *
double(b[n_index * k() + k_index]);
}
c_ref[m_index * n() + n_index] *= double(scale[n_index]);
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min =
qmin() == std::numeric_limits<uint8_t>::min() ? -std::numeric_limits<float>::infinity()
: accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max =
qmax() == std::numeric_limits<uint8_t>::max() ? +std::numeric_limits<float>::infinity()
: accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], double(c_max)), double(c_min));
}
}
gemm_minmax(m(), n(), k() * sizeof(float), // Note KC measured in bytes of input
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5, std::abs(c_ref[i * n() + j]) * 1.0e-6))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_gemminc_minmax_ukernel_fn gemminc,
xnn_init_f32_minmax_params_fn init_params,
xnn_pack_f32_gemminc_fn pack) const
{
ASSERT_LE(m(), mr());
ASSERT_GE(a_stride(), k());
ASSERT_GE(cm_stride(), n());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * k());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(
packed_n() * packed_k()); // no packed_n()
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> acc(mr() * packed_n());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::generate(acc.begin(), acc.end(), [&]() { return f32dist(rng); });
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
b.data(), packed_w.data(), /*params=*/nullptr);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LE(n(), packed_n());
ASSERT_LT(m_index * n() + n_index, c_ref.size());
c_ref[m_index * n() + n_index] +=
a[m_index * a_stride() + k_index] *
b[n_index * k() + k_index];
}
c_ref[m_index * n() + n_index] += acc[n_index / nr() * nr() * mr() + m_index % mr() * nr() + n_index % nr()];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::max(std::min(c_ref[m_index * n() + n_index], c_max), c_min);
}
}
gemminc(m(), n(), k() * sizeof(float),
a.data(), a_stride() * sizeof(float),
packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
acc.data(),
&params);
// Validate micro-kernel outputs.
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_igemm_ukernel_fn igemm,
xnn_pack_f32_igemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * ks() * k());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_k() * packed_n() + packed_n());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
xnnpack::Buffer<float> junk(k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<const float*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, /*params=*/nullptr);
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = a.data();
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LT(ks_index * mr() + m_index, im2col.size());
ASSERT_LT(k_index, k());
ASSERT_LT(k_index, a_stride());
if (im2col[ks_index * mr() + m_index] == a.data()) {
c_ref[m_index * n() + n_index] +=
(im2col[ks_index * mr() + m_index][k_index]) *
(b[(n_index * ks() + ks_index) * k() + k_index]);
} else {
c_ref[m_index * n() + n_index] +=
(im2col[ks_index * mr() + m_index][k_index + a_offset()]) *
(b[(n_index * ks() + ks_index) * k() + k_index]);
}
}
}
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float* zero_pointer = (zero_index() != SIZE_MAX) ? a.data() : nullptr;
igemm(
m(), n(), k() * sizeof(float), ks() * mr() * sizeof(void*),
im2col.data(), packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
a_offset() * sizeof(float), zero_pointer,
nullptr);
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_igemm_relu_ukernel_fn igemm_relu,
xnn_pack_f32_igemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * ks() * k());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_k() * packed_n() + packed_n());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
xnnpack::Buffer<float> junk(k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<const float*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, /*params=*/nullptr);
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = a.data();
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LT(ks_index * mr() + m_index, im2col.size());
ASSERT_LT(k_index, k());
ASSERT_LT(k_index, a_stride());
if (im2col[ks_index * mr() + m_index] == a.data()) {
c_ref[m_index * n() + n_index] +=
(im2col[ks_index * mr() + m_index][k_index]) *
(b[(n_index * ks() + ks_index) * k() + k_index]);
} else {
c_ref[m_index * n() + n_index] +=
(im2col[ks_index * mr() + m_index][k_index + a_offset()]) *
(b[(n_index * ks() + ks_index) * k() + k_index]);
}
}
}
c_ref[m_index * n() + n_index] = std::max(0.0f, bias[n_index] + c_ref[m_index * n() + n_index]);
}
}
const float* zero_pointer = (zero_index() != SIZE_MAX) ? a.data() : nullptr;
igemm_relu(
m(), n(), k() * sizeof(float), ks() * mr() * sizeof(void*),
im2col.data(), packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
a_offset() * sizeof(float), zero_pointer,
nullptr);
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], 0.0f)
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
}
}
}
}
void GemmMicrokernelTester::Test(
xnn_f32_igemm_minmax_ukernel_fn igemm_minmax,
xnn_init_f32_minmax_params_fn init_params,
xnn_pack_f32_igemm_fn pack) const
{
ASSERT_LE(m(), mr());
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> f32dist;
xnnpack::Buffer<float> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<float> b(n() * ks() * k());
xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> packed_w(
ks() * packed_k() * packed_n() + packed_n());
xnnpack::Buffer<float> bias(n());
xnnpack::Buffer<float> c((mr() - 1) * cm_stride() + ((n() - 1) / nr()) * cn_stride() + (n() - 1) % nr() + 1);
xnnpack::Buffer<float> c_ref(m() * n());
xnnpack::Buffer<float> junk(k() + XNN_EXTRA_BYTES / sizeof(float));
xnnpack::Buffer<const float*> im2col(mr() * ks());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
std::fill(packed_w.begin(), packed_w.end(), 0.0f);
pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
b.data(), bias.data(), /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, /*params=*/nullptr);
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = 0; m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = a.data() + a_stride() * m_index - a_offset();
}
}
std::shuffle(im2col.begin(), im2col.end(), rng);
if (zero_index() != SIZE_MAX) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
im2col[ks_index * mr() + zero_index()] = a.data();
}
}
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t m_index = m(); m_index < mr(); m_index++) {
im2col[ks_index * mr() + m_index] = junk.data();
}
}
std::fill(c_ref.begin(), c_ref.end(), 0.0f);
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
for (size_t k_index = 0; k_index < k(); k_index++) {
ASSERT_LT(ks_index * mr() + m_index, im2col.size());
ASSERT_LT(k_index, k());
ASSERT_LT(k_index, a_stride());
if (im2col[ks_index * mr() + m_index] == a.data()) {
c_ref[m_index * n() + n_index] +=
(im2col[ks_index * mr() + m_index][k_index]) *
(b[(n_index * ks() + ks_index) * k() + k_index]);
} else {
c_ref[m_index * n() + n_index] +=
(im2col[ks_index * mr() + m_index][k_index + a_offset()]) *
(b[(n_index * ks() + ks_index) * k() + k_index]);
}
}
}
c_ref[m_index * n() + n_index] += bias[n_index];
}
}
const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
const float c_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
const float c_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
for (size_t m_index = 0; m_index < m(); m_index++) {
for (size_t n_index = 0; n_index < n(); n_index++) {
c_ref[m_index * n() + n_index] = std::min(c_ref[m_index * n() + n_index], c_max);
c_ref[m_index * n() + n_index] = std::max(c_ref[m_index * n() + n_index], c_min);
}
}
// Prepare parameters.
xnn_f32_minmax_params params;
init_params(&params, c_min, c_max);
const float* zero_pointer = (zero_index() != SIZE_MAX) ? a.data() : nullptr;
igemm_minmax(
m(), n(), k() * sizeof(float), ks() * mr() * sizeof(void*),
im2col.data(), packed_w.data(),
c.data(), cm_stride() * sizeof(float), cn_stride() * sizeof(float),
a_offset() * sizeof(float), zero_pointer,
&params);
for (size_t i = 0; i < m(); i++) {
for (size_t j = 0; j < n(); j++) {
EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
EXPECT_NEAR(
c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()],
c_ref[i * n() + j],
std::max(1.0e-5f, std::abs(c_ref[i * n() + j]) * 1.0e-6f))
<< "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
<< ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
<< " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
}
}
}
}