sglang_v0.5.2/pytorch_2.8.0/third_party/XNNPACK/test/static-reduce.cc

// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "xnnpack.h"
#include "xnnpack/buffer.h"
#include "xnnpack/common.h"
#include "xnnpack/datatype.h"
#include "xnnpack/log.h"
#include "xnnpack/math.h"
#include "xnnpack/operator.h"
#include "xnnpack/subgraph.h"
#include "replicable_random_device.h"
struct Param {
using TupleT = std::tuple<xnn_datatype, xnn_reduce_operator, bool, bool>;
explicit Param(TupleT p)
: datatype(std::get<0>(p)),
reduce_operator(std::get<1>(p)),
keep_dims(std::get<2>(p)),
use_neg_axes(std::get<3>(p)) {}
std::string Name() const {
std::stringstream sstr;
switch (reduce_operator) {
case xnn_reduce_mean:
sstr << "mean";
break;
case xnn_reduce_sum:
sstr << "sum";
break;
case xnn_reduce_invalid:
sstr << "invalid";
break;
}
sstr << "_" << xnn_datatype_to_string(datatype);
if (keep_dims) {
sstr << "_keep_dims";
}
if (use_neg_axes) {
sstr << "_use_neg_axes";
}
return sstr.str();
}
xnn_datatype datatype;
xnn_reduce_operator reduce_operator;
bool keep_dims;
bool use_neg_axes;
};
namespace xnnpack {
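// Common fixture for the static reduce tests. SetUp() draws a random input
// shape and a random set of reduction axes, then sizes the input and output
// buffers accordingly.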
template <class T>
class ReduceTestBase : public ::testing::TestWithParam<Param> {
protected:
void SetUp() override {
const Param p = GetParam();
auto num_input_dim_dist =
std::uniform_int_distribution<size_t>(2, XNN_MAX_TENSOR_DIMS);
const size_t num_input_dims = num_input_dim_dist(rng);
auto num_reduction_axes_dist =
std::uniform_int_distribution<size_t>(1, num_input_dims);
const size_t num_reduction_axes = num_reduction_axes_dist(rng);
auto axes_dist =
std::uniform_int_distribution<size_t>(0, num_input_dims - 1);
reduction_axes.resize(num_reduction_axes);
std::generate(reduction_axes.begin(), reduction_axes.end(),
[&]() { return axes_dist(rng); });
std::sort(reduction_axes.begin(), reduction_axes.end());
auto end = std::unique(reduction_axes.begin(), reduction_axes.end());
reduction_axes.erase(end, reduction_axes.end());
auto shape_dist = std::uniform_int_distribution<size_t>(2, 15);
input_shape.resize(num_input_dims);
std::generate(input_shape.begin(), input_shape.end(),
[&]() { return shape_dist(rng); });
num_input_elements =
std::accumulate(input_shape.cbegin(), input_shape.cend(), size_t(1),
std::multiplies<size_t>());
output_shape = input_shape;
for (size_t axis : reduction_axes) {
output_shape[axis] = 1;
}
num_output_elements =
std::accumulate(output_shape.cbegin(), output_shape.cend(), size_t(1),
std::multiplies<size_t>());
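// Optionally express the reduction axes as negative indices, counting back
// from the last dimension.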
if (p.use_neg_axes) {
for (int i = 0; i < reduction_axes.size(); i++) {
reduction_axes[i] = reduction_axes[i] - num_input_dims;
}
}
input = xnnpack::Buffer<char>(XNN_EXTRA_BYTES / sizeof(char) +
num_input_elements * xnn_datatype_size_bytes(p.datatype));
operator_output =
xnnpack::Buffer<char>(num_output_elements * xnn_datatype_size_bytes(p.datatype));
subgraph_output =
xnnpack::Buffer<char>(num_output_elements * xnn_datatype_size_bytes(p.datatype));
}
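// Input and output quantization parameters; a zero input scale marks the
// non-quantized (float) case.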
struct QuantizationParams {
xnn_quantization_params input;
xnn_quantization_params output;
constexpr bool IsQuantized() const { return input.scale != 0; }
};
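// Draws random scales and zero points for quantized datatypes; returns
// all-zero parameters for float datatypes.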
QuantizationParams RandomQuantizationParams(xnn_datatype t) {
QuantizationParams qp;
switch (t) {
case xnn_datatype_qint8:
qp.input.scale = scale_dist(rng);
qp.output.scale = scale_dist(rng);
qp.input.zero_point = i8dist(rng);
qp.output.zero_point = i8dist(rng);
break;
case xnn_datatype_quint8:
qp.input.scale = scale_dist(rng);
qp.output.scale = scale_dist(rng);
qp.input.zero_point = u8dist(rng);
qp.output.zero_point = u8dist(rng);
break;
default:
qp.input.scale = 0;
qp.output.scale = 0;
qp.input.zero_point = 0;
qp.output.zero_point = 0;
}
return qp;
}
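// Defines the external input and output tensor values on the subgraph, using
// the quantized variants when quantization parameters are present.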
void SetUpInputOutput(xnn_subgraph_t subgraph, const QuantizationParams& qp,
uint32_t& input_id, uint32_t& output_id) {
const Param p = GetParam();
if (qp.IsQuantized()) {
ASSERT_EQ(xnn_status_success,
xnn_define_quantized_tensor_value(
subgraph, p.datatype, qp.input.zero_point, qp.input.scale,
input_shape.size(), input_shape.data(), nullptr,
/*external_id=*/0,
/*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
ASSERT_EQ(xnn_status_success,
xnn_define_quantized_tensor_value(
subgraph, p.datatype, qp.output.zero_point, qp.output.scale,
output_shape.size(), output_shape.data(), nullptr,
/*external_id=*/1,
/*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
} else {
ASSERT_EQ(xnn_status_success,
xnn_define_tensor_value(
subgraph, p.datatype, input_shape.size(),
input_shape.data(), nullptr,
/*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
&input_id));
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
ASSERT_EQ(xnn_status_success,
xnn_define_tensor_value(
subgraph, p.datatype, output_shape.size(),
output_shape.data(), nullptr, /*external_id=*/1,
/*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
}
}
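// Fills the input buffer with random values drawn from the given
// distribution, reinterpreted as the requested datatype.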
template <class Datatype, class Dist>
void GenerateRandomInput(Dist& dist) {
Datatype* beg = reinterpret_cast<Datatype*>(input.data());
Datatype* end = reinterpret_cast<Datatype*>(input.data() + input.size());
std::generate(beg, end, [&]() { return dist(rng); });
}
void GenerateRandomInput(xnn_datatype t) {
switch (t) {
case xnn_datatype_fp16:
GenerateRandomInput<xnn_float16>(f32dist);
break;
case xnn_datatype_fp32:
GenerateRandomInput<float>(f32dist);
break;
case xnn_datatype_qint8:
GenerateRandomInput<int8_t>(i8dist);
break;
case xnn_datatype_quint8:
GenerateRandomInput<uint8_t>(u8dist);
break;
default:
XNN_UNREACHABLE;
}
}
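// Compares the subgraph and operator outputs element-wise, allowing a
// tolerance of 5% of the smaller of the two values.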
template <class Datatype>
void CompareOutputsImpl() {
const Datatype* subgraph_out_ptr =
reinterpret_cast<const Datatype*>(subgraph_output.data());
const Datatype* operator_out_ptr =
reinterpret_cast<const Datatype*>(operator_output.data());
const size_t output_size = subgraph_output.size() / sizeof(Datatype);
for (size_t i = 0; i < output_size;
i++, ++subgraph_out_ptr, ++operator_out_ptr) {
const Datatype sub_out = *subgraph_out_ptr;
const Datatype op_out = *operator_out_ptr;
ASSERT_NEAR(sub_out, op_out, std::abs(0.05f * std::min(sub_out, op_out)));
}
}
void CompareOutputs(xnn_datatype t) {
switch (t) {
case xnn_datatype_fp16:
CompareOutputsImpl<xnn_float16>();
break;
case xnn_datatype_fp32:
CompareOutputsImpl<float>();
break;
case xnn_datatype_qint8:
CompareOutputsImpl<int8_t>();
break;
case xnn_datatype_quint8:
CompareOutputsImpl<uint8_t>();
break;
default:
XNN_UNREACHABLE;
}
}
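// Random device and distributions used to generate shapes, axes,
// quantization parameters, and input data.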
xnnpack::ReplicableRandomDevice rng;
std::uniform_real_distribution<float> scale_dist =
std::uniform_real_distribution<float>(0.0f, 1.0f);
std::uniform_real_distribution<float> f32dist =
std::uniform_real_distribution<float>(-1.0f, 1.0f);
std::uniform_int_distribution<int32_t> i8dist =
std::uniform_int_distribution<int32_t>(
std::numeric_limits<int8_t>::min(),
std::numeric_limits<int8_t>::max());
std::uniform_int_distribution<int32_t> u8dist =
std::uniform_int_distribution<int32_t>(
std::numeric_limits<uint8_t>::min(),
std::numeric_limits<uint8_t>::max());
std::vector<int64_t> reduction_axes;
std::vector<size_t> input_shape;
size_t num_input_elements;
std::vector<size_t> output_shape;
size_t num_output_elements;
xnnpack::Buffer<char> input;
xnnpack::Buffer<char> operator_output;
xnnpack::Buffer<char> subgraph_output;
};
using ReduceTest = ReduceTestBase<void>;
using ReduceTestF16 = ReduceTestBase<xnn_float16>;
using ReduceTestF32 = ReduceTestBase<float>;
using ReduceTestQS8 = ReduceTestBase<int8_t>;
using ReduceTestQU8 = ReduceTestBase<uint8_t>;
using ::testing::Bool;
using ::testing::Combine;
using ::testing::Values;
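// Instantiate the suite over all datatype/operator combinations, with and
// without kept dimensions and negative axes.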
INSTANTIATE_TEST_SUITE_P(ReduceTest, ReduceTest,
testing::ConvertGenerator<Param::TupleT>(Combine(
Values(xnn_datatype_fp16, xnn_datatype_fp32,
xnn_datatype_qint8, xnn_datatype_quint8),
Values(xnn_reduce_sum, xnn_reduce_mean), Bool(),
Bool())),
[](auto p) { return p.param.Name(); });
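// Checks that xnn_define_static_reduce_v2 records the expected node type,
// reduction axes, inputs, outputs, and flags on the subgraph.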
TEST_P(ReduceTest, define) {
const Param p = GetParam();
ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
xnn_subgraph_t subgraph = nullptr;
ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph));
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
subgraph, xnn_delete_subgraph);
uint32_t input_id = XNN_INVALID_NODE_ID;
uint32_t output_id = XNN_INVALID_NODE_ID;
SetUpInputOutput(subgraph, RandomQuantizationParams(p.datatype), input_id,
output_id);
ASSERT_EQ(xnn_status_success,
xnn_define_static_reduce_v2(
subgraph, p.reduce_operator, reduction_axes.size(),
reduction_axes.data(), input_id, output_id,
/*flags=*/p.keep_dims ? XNN_FLAG_KEEP_DIMS : 0));
ASSERT_EQ(subgraph->num_nodes, 1);
const struct xnn_node* node = &subgraph->nodes[0];
ASSERT_EQ(node->type, xnn_reduce_operator_to_node_type(p.reduce_operator));
ASSERT_EQ(node->params.reduce.num_reduction_axes, reduction_axes.size());
for (size_t i = 0; i < reduction_axes.size(); i++) {
ASSERT_EQ(node->params.reduce.reduction_axes[i], reduction_axes[i]);
}
ASSERT_EQ(node->num_inputs, 1);
ASSERT_EQ(node->inputs[0], input_id);
ASSERT_EQ(node->num_outputs, 1);
ASSERT_EQ(node->outputs[0], output_id);
ASSERT_EQ(node->flags, p.keep_dims ? XNN_FLAG_KEEP_DIMS : 0);
}
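// Runs the same reduction through the operator API and the subgraph API and
// checks that the two outputs agree.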
TEST_P(ReduceTest, matches_operator_api) {
const Param p = GetParam();
ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
xnn_operator_t op = nullptr;
GenerateRandomInput(p.datatype);
const uint32_t flags = p.keep_dims ? XNN_FLAG_KEEP_DIMS : 0;
const QuantizationParams qp = RandomQuantizationParams(p.datatype);
// Call operator API.
const xnn_status status = xnn_create_reduce_nd(
p.reduce_operator, p.datatype, &qp.input, &qp.output, flags, &op);
if (status == xnn_status_unsupported_hardware) {
GTEST_SKIP();
}
ASSERT_EQ(xnn_status_success, status);
ASSERT_NE(nullptr, op);
std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(
op, xnn_delete_operator);
size_t workspace_size = SIZE_MAX;
size_t workspace_alignment = SIZE_MAX;
ASSERT_EQ(xnn_status_success,
xnn_reshape_reduce_nd(op, reduction_axes.size(),
reduction_axes.data(), input_shape.size(),
input_shape.data(), &workspace_size,
&workspace_alignment,
/*threadpool=*/nullptr));
ASSERT_NE(workspace_size, SIZE_MAX);
ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT);
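// Only non-fp32 datatypes allocate a workspace here; for fp32 a null
// workspace pointer is passed to setup.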
xnnpack::Buffer<char, XNN_ALLOCATION_ALIGNMENT> workspace;
void* workspace_ptr = nullptr;
if (p.datatype != xnn_datatype_fp32) {
workspace = xnnpack::Buffer<char, XNN_ALLOCATION_ALIGNMENT>(workspace_size);
workspace_ptr = workspace.data();
}
ASSERT_EQ(xnn_status_success,
xnn_setup_reduce_nd(op, workspace_ptr, input.data(),
operator_output.data()));
ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
// Call subgraph API.
xnn_subgraph_t subgraph = nullptr;
ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph));
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
subgraph, xnn_delete_subgraph);
uint32_t input_id = XNN_INVALID_NODE_ID;
uint32_t output_id = XNN_INVALID_NODE_ID;
int output_num_dims = input_shape.size();
if (!p.keep_dims) {
output_num_dims -= reduction_axes.size();
}
if (qp.IsQuantized()) {
ASSERT_EQ(xnn_status_success,
xnn_define_quantized_tensor_value(
subgraph, p.datatype, qp.input.zero_point, qp.input.scale,
input_shape.size(), input_shape.data(), nullptr,
/*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
ASSERT_EQ(
xnn_status_success,
xnn_define_quantized_tensor_value(
subgraph, p.datatype, qp.output.zero_point, qp.output.scale,
output_shape.size(), output_shape.data(), nullptr,
/*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
} else {
ASSERT_EQ(
xnn_status_success,
xnn_define_tensor_value(subgraph, p.datatype, input_shape.size(),
input_shape.data(), nullptr, /*external_id=*/0,
XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
ASSERT_EQ(
xnn_status_success,
xnn_define_tensor_value(subgraph, p.datatype, output_num_dims,
output_shape.data(), nullptr, /*external_id=*/1,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
}
ASSERT_EQ(xnn_status_success,
xnn_define_static_reduce_v2(
subgraph, p.reduce_operator, reduction_axes.size(),
reduction_axes.data(), input_id, output_id, flags));
xnn_runtime_t runtime = nullptr;
ASSERT_EQ(
xnn_status_success,
xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
ASSERT_NE(nullptr, runtime);
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime, xnn_delete_runtime);
const std::array<xnn_external_value, 2> external = {
xnn_external_value{input_id, input.data()},
xnn_external_value{output_id, subgraph_output.data()}};
ASSERT_EQ(xnn_status_success,
xnn_setup_runtime(runtime, external.size(), external.data()));
ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
// Check outputs match.
CompareOutputs(p.datatype);
}
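// Builds a runtime, then reshapes the external input (growing and shrinking
// it) and checks that the node's output shape is recomputed correctly.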
TEST_P(ReduceTest, reshape) {
const Param p = GetParam();
ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
GenerateRandomInput(p.datatype);
// Call subgraph API.
xnn_subgraph_t subgraph = nullptr;
ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph));
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
subgraph, xnn_delete_subgraph);
uint32_t input_id = XNN_INVALID_NODE_ID;
uint32_t output_id = XNN_INVALID_NODE_ID;
QuantizationParams qp = RandomQuantizationParams(p.datatype);
const int output_num_dims = p.keep_dims
? output_shape.size()
: input_shape.size() - reduction_axes.size();
if (qp.IsQuantized()) {
ASSERT_EQ(xnn_status_success,
xnn_define_quantized_tensor_value(
subgraph, p.datatype, qp.input.zero_point, qp.input.scale,
input_shape.size(), input_shape.data(), nullptr,
/*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
ASSERT_EQ(
xnn_status_success,
xnn_define_quantized_tensor_value(
subgraph, p.datatype, qp.output.zero_point, qp.output.scale,
output_num_dims, output_shape.data(), nullptr, /*external_id=*/1,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
} else {
ASSERT_EQ(
xnn_status_success,
xnn_define_tensor_value(subgraph, p.datatype, input_shape.size(),
input_shape.data(), nullptr, /*external_id=*/0,
XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
ASSERT_EQ(
xnn_status_success,
xnn_define_tensor_value(subgraph, p.datatype, output_num_dims,
output_shape.data(), nullptr, /*external_id=*/1,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
}
ASSERT_EQ(xnn_define_static_reduce_v2(
subgraph, p.reduce_operator, reduction_axes.size(),
reduction_axes.data(), input_id, output_id,
/*flags=*/p.keep_dims ? XNN_FLAG_KEEP_DIMS : 0),
xnn_status_success);
xnn_runtime_t runtime = nullptr;
xnn_status status =
xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime);
if (status == xnn_status_unsupported_hardware) {
GTEST_SKIP();
}
ASSERT_EQ(status, xnn_status_success);
ASSERT_NE(nullptr, runtime);
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime, xnn_delete_runtime);
const std::array<xnn_external_value, 2> external = {
xnn_external_value{input_id, input.data()},
xnn_external_value{output_id, subgraph_output.data()}};
ASSERT_EQ(xnn_status_success,
xnn_setup_runtime(runtime, external.size(), external.data()));
ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
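// Grow the leading input dimensions, reshape the external input, and verify
// that the node's reshape propagates the new shape to the output value.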
input_shape[0] += 2;
input_shape[1] += 4;
ASSERT_EQ(xnn_status_success,
xnn_reshape_external_value(runtime, input_id, input_shape.size(),
input_shape.data()));
const struct xnn_node* node = &subgraph->nodes[0];
std::vector<int64_t> unique_reduction_axes = reduction_axes;
for (int i = 0; i < unique_reduction_axes.size(); i++) {
if (unique_reduction_axes[i] < 0) {
unique_reduction_axes[i] = input_shape.size() + unique_reduction_axes[i];
}
}
std::sort(unique_reduction_axes.begin(), unique_reduction_axes.end());
auto end =
std::unique(unique_reduction_axes.begin(), unique_reduction_axes.end());
unique_reduction_axes.erase(end, unique_reduction_axes.end());
// Too many parameters influence the workspace size, so it is hard to know
// here whether reallocation is required.
node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values,
/*threadpool=*/nullptr);
const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape;
size_t current_axes = 0;
size_t current_dim = 0;
for (size_t i = 0; i < input_shape.size(); ++i) {
if (unique_reduction_axes[current_axes] == i) {
if (p.keep_dims) {
ASSERT_EQ(output_shape->dim[current_dim], 1);
++current_dim;
}
++current_axes;
if (current_axes == unique_reduction_axes.size()) {
break;
}
} else {
ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]);
++current_dim;
}
}
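// Shrink the leading dimension and reshape again; the output shape should
// track the smaller input.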
input_shape[0] -= 1;
ASSERT_EQ(xnn_status_success,
xnn_reshape_external_value(runtime, input_id, input_shape.size(),
input_shape.data()));
ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values,
runtime->num_values, /*threadpool=*/nullptr),
xnn_status_success);
current_axes = 0;
current_dim = 0;
for (size_t i = 0; i < input_shape.size(); ++i) {
if (unique_reduction_axes[current_axes] == i) {
if (p.keep_dims) {
ASSERT_EQ(output_shape->dim[current_dim], 1);
++current_dim;
}
++current_axes;
if (current_axes == unique_reduction_axes.size()) {
break;
}
} else {
ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]);
++current_dim;
}
}
}
} // namespace xnnpack