// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "vbinary-microkernel-tester.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <random>

#include <gtest/gtest.h>
#include "xnnpack.h"
#include "xnnpack/buffer.h"
#include "xnnpack/math.h"
#include "xnnpack/microfnptr.h"
#include "xnnpack/microparams-init.h"
#include "xnnpack/microparams.h"
#include "xnnpack/requantization.h"
#include "replicable_random_device.h"

void VBinaryMicrokernelTester::Test(xnn_f16_vbinary_ukernel_fn vbinary,
                                    OpType op_type,
                                    xnn_init_f16_default_params_fn) const {
  xnnpack::ReplicableRandomDevice rng;
  std::uniform_real_distribution<float> f32dist(0.01f, 1.0f);

  xnnpack::Buffer<xnn_float16> a(batch_size() +
                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
  xnnpack::Buffer<xnn_float16> b(
      broadcast_b() ? 1
                    : batch_size() + XNN_EXTRA_BYTES / sizeof(xnn_float16));
  xnnpack::Buffer<xnn_float16> y(
      batch_size() + (inplace_a() || inplace_b()
                          ? XNN_EXTRA_BYTES / sizeof(xnn_float16)
                          : 0));
  xnnpack::Buffer<float> y_ref(batch_size());
  for (size_t iteration = 0; iteration < iterations(); iteration++) {
    if (!inplace_a()) {
      std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
    }
    if (!inplace_b()) {
      std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
    }
    if (inplace_a() || inplace_b()) {
      std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
    }
    const xnn_float16* a_data = inplace_a() ? y.data() : a.data();
    const xnn_float16* b_data = inplace_b() ? y.data() : b.data();

    reference_op_impl(a_data, b_data, y_ref.data(), batch_size(), op_type);

    // Call optimized micro-kernel.
    vbinary(batch_size() * sizeof(xnn_float16), a_data, b_data, y.data(),
            nullptr);

    // Verify results.
    for (size_t i = 0; i < batch_size(); i++) {
      EXPECT_NEAR(y[i], y_ref[i],
                  std::max(1.0e-4f, std::abs(y_ref[i]) * 1.0e-2f))
          << "at " << i << " / " << batch_size();
    }
  }
}

void VBinaryMicrokernelTester::Test(xnn_f32_vbinary_ukernel_fn vbinary,
                                    OpType op_type,
                                    xnn_init_f32_default_params_fn) const {
  xnnpack::ReplicableRandomDevice rng;
  std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

  xnnpack::Buffer<float> a(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
  xnnpack::Buffer<float> b(
      broadcast_b() ? 1 : batch_size() + XNN_EXTRA_BYTES / sizeof(float));
  xnnpack::Buffer<float> y(batch_size() +
                           (inplace_a() || inplace_b()
                                ? XNN_EXTRA_BYTES / sizeof(float)
                                : 0));
  xnnpack::Buffer<float> y_ref(batch_size());
  for (size_t iteration = 0; iteration < iterations(); iteration++) {
    if (!inplace_a()) {
      std::generate(a.begin(), a.end(), [&]() { return f32dist(rng); });
    }
    if (!inplace_b()) {
      std::generate(b.begin(), b.end(), [&]() { return f32dist(rng); });
    }
    if (inplace_a() || inplace_b()) {
      std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
    }
    const float* a_data = inplace_a() ? y.data() : a.data();
    const float* b_data = inplace_b() ? y.data() : b.data();

    reference_op_impl(a_data, b_data, y_ref.data(), batch_size(), op_type);

    // Call optimized micro-kernel.
    vbinary(batch_size() * sizeof(float), a_data, b_data, y.data(), nullptr);

    // Verify results.
    for (size_t i = 0; i < batch_size(); i++) {
      EXPECT_NEAR(y[i], y_ref[i], std::abs(y_ref[i]) * 1.0e-6f)
          << "at " << i << " / " << batch_size();
    }
  }
}

void VBinaryMicrokernelTester::Test(
    xnn_qu8_vadd_minmax_ukernel_fn vadd_minmax,
    xnn_init_qu8_add_minmax_params_fn init_params) const {
  xnnpack::ReplicableRandomDevice rng;
  auto u8rng = [&rng]() {
    return std::uniform_int_distribution<int32_t>(
        0, std::numeric_limits<uint8_t>::max())(rng);
  };

  xnnpack::Buffer<uint8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
  xnnpack::Buffer<uint8_t> b(
      broadcast_b() ? 1 : batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
  xnnpack::Buffer<uint8_t> y(
      batch_size() +
      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
  xnnpack::Buffer<float> y_fp(batch_size());
  xnnpack::Buffer<uint8_t> y_ref(batch_size());
  for (size_t iteration = 0; iteration < iterations(); iteration++) {
    if (!inplace_a()) {
      std::generate(a.begin(), a.end(), [&]() { return u8rng(); });
    }
    if (!inplace_b()) {
      std::generate(b.begin(), b.end(), [&]() { return u8rng(); });
    }
    if (inplace_a() || inplace_b()) {
      std::generate(y.begin(), y.end(), [&]() { return u8rng(); });
    }
    const uint8_t* a_data = inplace_a() ? y.data() : a.data();
    const uint8_t* b_data = inplace_b() ? y.data() : b.data();
    const size_t stride_b = broadcast_b() ? 0 : 1;

    // Prepare parameters.
    xnn_qu8_add_minmax_params params;
    struct xnn_quantization_params a_quantization = {a_zero_point(),
                                                     a_scale()};
    struct xnn_quantization_params b_quantization = {b_zero_point(),
                                                     b_scale()};
    struct xnn_quantization_params y_quantization = {y_zero_point(),
                                                     y_scale()};
    init_params(&params, &a_quantization, &b_quantization, &y_quantization);

    // Compute reference results.
    for (size_t i = 0; i < batch_size(); i++) {
      y_fp[i] = static_cast<float>(y_zero_point()) +
                static_cast<float>(static_cast<int32_t>(a_data[i]) -
                                   static_cast<int32_t>(a_zero_point())) *
                    (a_scale() / y_scale()) +
                static_cast<float>(static_cast<int32_t>(b_data[i * stride_b]) -
                                   static_cast<int32_t>(b_zero_point())) *
                    (b_scale() / y_scale());
      y_fp[i] = std::min(y_fp[i], static_cast<float>(UINT8_MAX));
      y_fp[i] = std::max(y_fp[i], static_cast<float>(0));
      y_ref[i] = xnn_qu8_quantize_add(a_data[i], b_data[i * stride_b], params);
    }

    // Call optimized micro-kernel.
    vadd_minmax(batch_size(), a_data, b_data, y.data(), &params);

    // Verify results.
    for (size_t i = 0; i < batch_size(); i++) {
      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i],
                  1.0f)
          << "at element " << i << " / " << batch_size();
      EXPECT_EQ(static_cast<int32_t>(y_ref[i]), static_cast<int32_t>(y[i]))
          << "at element " << i << " / " << batch_size();
    }
  }
}

void VBinaryMicrokernelTester::Test(
    xnn_qu8_vmul_minmax_ukernel_fn vmul_minmax,
    xnn_init_qu8_mul_minmax_params_fn init_params) const {
  xnnpack::ReplicableRandomDevice rng;
  auto u8rng = [&rng]() {
    return std::uniform_int_distribution<int32_t>(
        0, std::numeric_limits<uint8_t>::max())(rng);
  };

  xnnpack::Buffer<uint8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
  xnnpack::Buffer<uint8_t> b(
      broadcast_b() ? 1 : batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
  xnnpack::Buffer<uint8_t> y(
      batch_size() +
      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
  xnnpack::Buffer<float> y_fp(batch_size());
  xnnpack::Buffer<uint8_t> y_ref(batch_size());
  for (size_t iteration = 0; iteration < iterations(); iteration++) {
    if (!inplace_a()) {
      std::generate(a.begin(), a.end(), [&]() { return u8rng(); });
    }
    if (!inplace_b()) {
      std::generate(b.begin(), b.end(), [&]() { return u8rng(); });
    }
    if (inplace_a() || inplace_b()) {
      std::generate(y.begin(), y.end(), [&]() { return u8rng(); });
    }
    const uint8_t* a_data = inplace_a() ? y.data() : a.data();
    const uint8_t* b_data = inplace_b() ? y.data() : b.data();
    const size_t stride_b = broadcast_b() ? 0 : 1;

    // Prepare parameters.
    const float product_scale = a_scale() * b_scale();
    const float product_output_scale = product_scale / y_scale();
    xnn_qu8_mul_minmax_params params;
    struct xnn_quantization_params a_quantization = {a_zero_point(),
                                                     a_scale()};
    struct xnn_quantization_params b_quantization = {b_zero_point(),
                                                     b_scale()};
    struct xnn_quantization_params y_quantization = {y_zero_point(),
                                                     y_scale()};
    init_params(&params, &a_quantization, &b_quantization, &y_quantization);

    // Compute reference results.
    for (size_t i = 0; i < batch_size(); i++) {
      const int32_t acc = (static_cast<int32_t>(a_data[i]) -
                           static_cast<int32_t>(a_zero_point())) *
                          (static_cast<int32_t>(b_data[i * stride_b]) -
                           static_cast<int32_t>(b_zero_point()));
      y_fp[i] = static_cast<float>(y_zero_point()) +
                product_output_scale * static_cast<float>(acc);
      y_fp[i] = std::min(y_fp[i], static_cast<float>(UINT8_MAX));
      y_fp[i] = std::max(y_fp[i], static_cast<float>(0));
      y_ref[i] = xnn_qu8_requantize_fp32(acc, product_output_scale,
                                         y_zero_point(), 0, UINT8_MAX);
    }

    // Call optimized micro-kernel.
    vmul_minmax(batch_size(), a_data, b_data, y.data(), &params);

    // Verify results.
    for (size_t i = 0; i < batch_size(); i++) {
      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i],
                  1.0f)
          << "at element " << i << " / " << batch_size();
      EXPECT_NEAR(static_cast<int32_t>(y[i]), static_cast<int32_t>(y_ref[i]),
                  1)
          << "at element " << i << " / " << batch_size();
    }
  }
}

void VBinaryMicrokernelTester::Test(
    xnn_qs8_vadd_minmax_ukernel_fn vadd_minmax,
    xnn_init_qs8_add_minmax_params_fn init_params) const {
  xnnpack::ReplicableRandomDevice rng;
  auto i8rng = [&rng]() {
    return std::uniform_int_distribution<int32_t>(
        std::numeric_limits<int8_t>::min(),
        std::numeric_limits<int8_t>::max())(rng);
  };

  xnnpack::Buffer<int8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
  xnnpack::Buffer<int8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
  xnnpack::Buffer<int8_t> y(
      batch_size() +
      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
  xnnpack::Buffer<float> y_fp(batch_size());
  xnnpack::Buffer<int8_t> y_ref(batch_size());
  for (size_t iteration = 0; iteration < iterations(); iteration++) {
    if (!inplace_a()) {
      std::generate(a.begin(), a.end(), [&]() { return i8rng(); });
    }
    if (!inplace_b()) {
      std::generate(b.begin(), b.end(), [&]() { return i8rng(); });
    }
    if (inplace_a() || inplace_b()) {
      std::generate(y.begin(), y.end(), [&]() { return i8rng(); });
    }
    const int8_t* a_data = inplace_a() ? y.data() : a.data();
    const int8_t* b_data = inplace_b() ? y.data() : b.data();
    const size_t stride_b = broadcast_b() ? 0 : 1;

    // Prepare parameters.
    xnn_qs8_add_minmax_params params;
    struct xnn_quantization_params a_quantization = {a_zero_point() - 0x80,
                                                     a_scale()};
    struct xnn_quantization_params b_quantization = {b_zero_point() - 0x80,
                                                     b_scale()};
    struct xnn_quantization_params y_quantization = {y_zero_point() - 0x80,
                                                     y_scale()};
    init_params(&params, &a_quantization, &b_quantization, &y_quantization);

    // Compute reference results.
    for (size_t i = 0; i < batch_size(); i++) {
      y_fp[i] =
          static_cast<float>(static_cast<int32_t>(y_zero_point() - 0x80)) +
          static_cast<float>(static_cast<int32_t>(a_data[i]) -
                             static_cast<int32_t>(a_zero_point() - 0x80)) *
              (a_scale() / y_scale()) +
          static_cast<float>(static_cast<int32_t>(b_data[i * stride_b]) -
                             static_cast<int32_t>(b_zero_point() - 0x80)) *
              (b_scale() / y_scale());
      y_fp[i] = std::min(y_fp[i], static_cast<float>(INT8_MAX));
      y_fp[i] = std::max(y_fp[i], static_cast<float>(INT8_MIN));
      y_ref[i] = xnn_qs8_quantize_add(a_data[i], b_data[i * stride_b], params);
    }

    // Call optimized micro-kernel.
    vadd_minmax(batch_size(), a_data, b_data, y.data(), &params);

    // Verify results.
    for (size_t i = 0; i < batch_size(); i++) {
      EXPECT_EQ(static_cast<int32_t>(y_ref[i]), static_cast<int32_t>(y[i]))
          << "at element " << i << " / " << batch_size();
      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i],
                  1.0f)
          << "at element " << i << " / " << batch_size();
    }
  }
}

void VBinaryMicrokernelTester::Test(
    xnn_qs8_vmul_minmax_ukernel_fn vmul_minmax,
    xnn_init_qs8_mul_minmax_params_fn init_params) const {
  xnnpack::ReplicableRandomDevice rng;
  auto i8rng = [&rng]() {
    return std::uniform_int_distribution<int32_t>(
        std::numeric_limits<int8_t>::min(),
        std::numeric_limits<int8_t>::max())(rng);
  };

  xnnpack::Buffer<int8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
  xnnpack::Buffer<int8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
  xnnpack::Buffer<int8_t> y(
      batch_size() +
      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
  xnnpack::Buffer<float> y_fp(batch_size());
  xnnpack::Buffer<int8_t> y_ref(batch_size());
  for (size_t iteration = 0; iteration < iterations(); iteration++) {
    if (!inplace_a()) {
      std::generate(a.begin(), a.end(), [&]() { return i8rng(); });
    }
    if (!inplace_b()) {
      std::generate(b.begin(), b.end(), [&]() { return i8rng(); });
    }
    if (inplace_a() || inplace_b()) {
      std::generate(y.begin(), y.end(), [&]() { return i8rng(); });
    }
    const int8_t* a_data = inplace_a() ? y.data() : a.data();
    const int8_t* b_data = inplace_b() ? y.data() : b.data();
    const size_t stride_b = broadcast_b() ? 0 : 1;

    // Prepare parameters.
    xnn_qs8_mul_minmax_params params;
    struct xnn_quantization_params a_quantization = {a_zero_point() - 0x80,
                                                     a_scale()};
    struct xnn_quantization_params b_quantization = {b_zero_point() - 0x80,
                                                     b_scale()};
    struct xnn_quantization_params y_quantization = {y_zero_point() - 0x80,
                                                     y_scale()};
    init_params(&params, &a_quantization, &b_quantization, &y_quantization);

    // Compute reference results.
    const float product_scale = a_scale() * b_scale();
    const float product_output_scale = product_scale / y_scale();
    EXPECT_GE(product_output_scale, 0x1.0p-32f);
    for (size_t i = 0; i < batch_size(); i++) {
      const int32_t acc = (static_cast<int32_t>(a_data[i]) -
                           static_cast<int32_t>(a_zero_point() - 0x80)) *
                          (static_cast<int32_t>(b_data[i * stride_b]) -
                           static_cast<int32_t>(b_zero_point() - 0x80));
      y_fp[i] = static_cast<float>(y_zero_point() - 0x80) +
                product_output_scale * static_cast<float>(acc);
      y_fp[i] = std::min(y_fp[i], static_cast<float>(INT8_MAX));
      y_fp[i] = std::max(y_fp[i], static_cast<float>(INT8_MIN));
      y_ref[i] = xnn_qs8_requantize_fp32(
          acc, product_output_scale,
          static_cast<int8_t>(y_zero_point() - 0x80), INT8_MIN, INT8_MAX);
    }

    // Call optimized micro-kernel.
    vmul_minmax(batch_size(), a_data, b_data, y.data(), &params);

    // Verify results.
    for (size_t i = 0; i < batch_size(); i++) {
      EXPECT_NEAR(static_cast<int32_t>(y_ref[i]), static_cast<int32_t>(y[i]),
                  1)
          << "at element " << i << " / " << batch_size();
      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i],
                  1.0f)
          << "at element " << i << " / " << batch_size();
    }
  }
}
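
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original tester source):
// the tester is normally driven from generated unit tests that configure it
// with its fluent setters and then pass a concrete micro-kernel together with
// the matching parameter-initialization function. The test names, the
// `xnn_f32_vadd_ukernel__scalar_u1` symbol, the `OpType::Add` enumerator, and
// the `xnn_init_f32_default_params` argument below are placeholders chosen
// for the sketch; substitute the kernels and init functions available for
// your target architecture.
//
//   TEST(F32_VADD__SCALAR_U1, batch_eq_1) {
//     VBinaryMicrokernelTester()
//         .batch_size(1)
//         .Test(xnn_f32_vadd_ukernel__scalar_u1,
//               VBinaryMicrokernelTester::OpType::Add,
//               xnn_init_f32_default_params);
//   }
//
//   TEST(F32_VADD__SCALAR_U1, inplace_a) {
//     VBinaryMicrokernelTester()
//         .batch_size(7)
//         .inplace_a(true)
//         .Test(xnn_f32_vadd_ukernel__scalar_u1,
//               VBinaryMicrokernelTester::OpType::Add,
//               xnn_init_f32_default_params);
//   }
// ---------------------------------------------------------------------------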