# sglang_v0.5.2/pytorch_2.8.0/third_party/XNNPACK/bench/BUILD.bazel
#
# 647 lines
# 14 KiB
# Python

# Copyright 2023 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
load(
"//:build_defs.bzl",
"xnnpack_benchmark",
"xnnpack_cxx_library",
"xnnpack_if_kleidiai_enabled",
"xnnpack_kleidiai_defines",
"xnnpack_optional_dnnl_copts",
"xnnpack_optional_dnnl_deps",
"xnnpack_optional_gemmlowp_copts",
"xnnpack_optional_gemmlowp_deps",
"xnnpack_optional_ruy_copts",
"xnnpack_optional_ruy_deps",
"xnnpack_optional_tflite_copts",
"xnnpack_optional_tflite_deps",
"xnnpack_slow_benchmark_tags",
"xnnpack_visibility",
)
load(
"//:build_params.bzl",
"xnnpack_select_if",
)
# Dependencies shared by the micro-kernel benchmark targets in this package.
# Targets concatenate their own extra deps onto this list.
MICROKERNEL_BENCHMARK_DEPS = [
    ":bench_utils",
    "//:aligned_allocator",
    "//:all_microkernels",
    "//:buffer",
    "//:common",
    "//:datatype",
    "//:hardware_config",
    "//:math",
    "//:microkernels_h",
    "//:microparams_init",
    "//:microparams",
    "//:packing",
    "//:params",
    "//:xnnpack_h",
]
# Dependencies shared by the operator-level benchmark targets in this package.
OPERATOR_BENCHMARK_DEPS = [
    ":bench_utils",
    "//:XNNPACK",
    "//:aligned_allocator",
    "//:buffer",
    "//:cache",
    "//:common",
    "//:datatype",
    "//:math",
]
############################### Helper libraries ###############################
# Helper library (utils.cc / utils.h) that both shared dependency lists
# reference as ":bench_utils".
xnnpack_cxx_library(
    name = "bench_utils",
    srcs = ["utils.cc"],
    hdrs = ["utils.h"],
    # XNN_ENABLE_CPUINFO is always defined: 1 when //:cpuinfo_enabled matches,
    # 0 in the default case.
    copts = select({
        "//:cpuinfo_enabled": ["-DXNN_ENABLE_CPUINFO=1"],
        "//conditions:default": ["-DXNN_ENABLE_CPUINFO=0"],
    }),
    visibility = xnnpack_visibility(),
    deps = [
        "//:allocator",
        "//:common",
        "//:hardware_config",
        "//:memory",
        "//:params",
        "//:xnnpack_h",
        "@com_google_benchmark//:benchmark",
    ] + xnnpack_select_if(
        # Presumably contributes @cpuinfo only when //:cpuinfo_enabled holds;
        # confirm against build_params.bzl.
        "//:cpuinfo_enabled",
        ["@cpuinfo"],
    ),
)
# Header-only library exposing conv.h.
cc_library(
    name = "conv",
    hdrs = ["conv.h"],
    deps = ["@com_google_benchmark//:benchmark"],
)
# Header-only library exposing dwconv.h.
cc_library(
    name = "dwconv",
    hdrs = ["dwconv.h"],
    deps = ["@com_google_benchmark//:benchmark"],
)
# Header-only library exposing spmm.h.
cc_library(
    name = "spmm",
    hdrs = ["spmm.h"],
    deps = ["@com_google_benchmark//:benchmark"],
)
# Library built from gemm-benchmark.cc; exports gemm.h and gemm-benchmark.h
# to the GEMM benchmark targets that depend on ":gemm_benchmark".
xnnpack_cxx_library(
    name = "gemm_benchmark",
    srcs = ["gemm-benchmark.cc"],
    hdrs = [
        "gemm.h",
        "gemm-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        "//:config_hdrs",
        "@com_google_benchmark//:benchmark",
    ],
)
# Header-only library exposing packw-benchmark.h; it depends on ":bgemm".
xnnpack_cxx_library(
    name = "packw_benchmark",
    hdrs = ["packw-benchmark.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "@com_google_benchmark//:benchmark",
    ],
)
# Header-only library exposing bgemm.h.
xnnpack_cxx_library(
    name = "bgemm",
    hdrs = ["bgemm.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + ["@com_google_benchmark//:benchmark"],
)
######################### Benchmarks for micro-kernels #########################
# GEMM micro-kernel benchmarks. One target named "<kernel>_bench" is declared
# per list entry; its source file is the kernel name with "_" replaced by "-"
# plus the ".cc" extension.
[xnnpack_benchmark(
    name = ukernel + "_bench",
    srcs = [ukernel.replace("_", "-") + ".cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:allocator",
        "//:isa_checks",
    ],
) for ukernel in [
    "bf16_gemm",
    "qd8_f16_qb4w_gemm",
    "qd8_f32_qb4w_gemm",
    "qd8_f16_qc8w_gemm",
    "qd8_f32_qc8w_gemm",
    "qd8_f16_qc4w_gemm",
    "qd8_f32_qc4w_gemm",
    "qs8_qc8w_gemm_fp32",
    "qu8_gemm_fp32",
    "qu8_gemm_rndnu",
    "f16_f32acc_gemm",
    "f16_gemm",
    "f32_qc4w_gemm",
    "f32_qc8w_gemm",
]]
# GEMM micro-kernel benchmarks that additionally take the optional ruy and
# gemmlowp copts/deps. Target and source names follow the same
# "<kernel>_bench" / "<kernel with dashes>.cc" scheme.
[xnnpack_benchmark(
    name = ukernel + "_bench",
    srcs = [ukernel.replace("_", "-") + ".cc"],
    copts = xnnpack_optional_ruy_copts() + xnnpack_optional_gemmlowp_copts(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:allocator",
        "//:isa_checks",
    ] + xnnpack_optional_ruy_deps() + xnnpack_optional_gemmlowp_deps(),
) for ukernel in [
    "qs8_gemm",
    "qu8_gemm",
    "f16_gemm_minmax",
    "f32_gemm",
    "f32_gemm_minmax",
    "f32_gemm_goi_minmax",
]]
# Benchmark built from f32-bgemm.cc, with the optional ruy copts/deps.
xnnpack_benchmark(
    name = "f32_bgemm_bench",
    srcs = ["f32-bgemm.cc"],
    copts = xnnpack_optional_ruy_copts(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "//:allocator",
    ] + xnnpack_optional_ruy_deps(),
)
# Benchmark built from qp8-f32-qc4w-gemm.cc. The KleidiAI matmul dependency
# is passed through xnnpack_if_kleidiai_enabled(), and the defines come from
# xnnpack_kleidiai_defines().
xnnpack_benchmark(
    name = "qp8_f32_qc4w_gemm_bench",
    srcs = ["qp8-f32-qc4w-gemm.cc"],
    defines = xnnpack_kleidiai_defines(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:isa_checks",
    ] + xnnpack_if_kleidiai_enabled(["@KleidiAI//kai/ukernels/matmul"]),
)
# Benchmark built from qp8-f32-qb4w-gemm.cc, configured like
# qp8_f32_qc4w_gemm_bench (KleidiAI defines plus conditional matmul dep).
# NOTE(review): unlike the other benchmark targets in this file, this label
# has no "_bench" suffix. It is kept as-is because renaming would break any
# existing reference to //bench:qp8_f32_qb4w_gemm.
xnnpack_benchmark(
    name = "qp8_f32_qb4w_gemm",
    srcs = [
        "qp8-f32-qb4w-gemm.cc",
    ],
    defines = xnnpack_kleidiai_defines(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:isa_checks",
    ] + xnnpack_if_kleidiai_enabled(["@KleidiAI//kai/ukernels/matmul"]),
)
# rsum / rdsum micro-kernel benchmarks. Every target compiles its own
# "<kernel with dashes>.cc" together with the shared rsum-benchmark.h header.
[xnnpack_benchmark(
    name = ukernel + "_bench",
    srcs = [
        ukernel.replace("_", "-") + ".cc",
        "rsum-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for ukernel in [
    "qs8_rdsum",
    "qu8_rdsum",
    "qs8_rsum",
    "qu8_rsum",
    "f16_rsum",
    "f16_f32acc_rsum",
    "f32_rsum",
    "f16_f32acc_rdsum",
    "f32_rdsum",
]]
# Single-source micro-kernel benchmarks that need nothing beyond
# MICROKERNEL_BENCHMARK_DEPS. Each entry yields "<kernel>_bench" built from
# "<kernel with dashes>.cc".
[xnnpack_benchmark(
    name = ukernel + "_bench",
    srcs = [ukernel.replace("_", "-") + ".cc"],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for ukernel in [
    "f16_raddstoreexpminusmax",
    "f16_rmax",
    "f16_rminmax",
    "f16_rmin",
    "f32_raddexpminusmax",
    "f32_raddextexp",
    "f32_raddstoreexpminusmax",
    "f32_rmax",
    "f32_rminmax",
    "f32_rmin",
    "f32_vscaleexpminusmax",
    "f32_vscaleextexp",
    "f16_vcmul",
    "f32_vcmul",
    "x8_lut",
]]
# Transpose micro-kernel benchmarks: "xx_transposev_bench" from
# xx-transposev.cc and "xN_transposec_bench" from xN-transposec.cc.
[xnnpack_benchmark(
    name = ukernel + "_bench",
    srcs = [ukernel.replace("_", "-") + ".cc"],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for ukernel in [
    "xx_transposev",
    "xN_transposec",
]]
# Benchmark built from qs8-dwconv.cc.
xnnpack_benchmark(
    name = "qs8_dwconv_bench",
    srcs = ["qs8-dwconv.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
        "//:microkernel_configs",
        "//:microkernel_utils",
    ],
)
# Benchmark built from f16-f32acc-igemm.cc.
xnnpack_benchmark(
    name = "f16_f32acc_igemm_bench",
    srcs = ["f16-f32acc-igemm.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:indirection",
    ],
)
# Benchmark built from f16-igemm.cc.
xnnpack_benchmark(
    name = "f16_igemm_bench",
    srcs = ["f16-igemm.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:indirection",
    ],
)
# Benchmark built from vunary.cc; needs only the shared micro-kernel deps.
xnnpack_benchmark(
    name = "vunary_bench",
    srcs = [
        "vunary.cc",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
)
# Benchmark built from vbinary.cc; needs only the shared micro-kernel deps.
xnnpack_benchmark(
    name = "vbinary_bench",
    srcs = [
        "vbinary.cc",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
)
# Benchmark built from f32-igemm.cc.
xnnpack_benchmark(
    name = "f32_igemm_bench",
    srcs = ["f32-igemm.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:indirection",
    ],
)
# conv-hwc / conv-hwc2chw micro-kernel benchmarks. The three targets differ
# only in name and source file, so they are generated with the same
# comprehension pattern used for the GEMM benchmarks in this file. Each entry
# yields "<kernel>_bench", compiled from dconv.h plus "<kernel with dashes>.cc".
# The generated labels (f32_conv_hwc_bench, f16_conv_hwc2chw_bench,
# f32_conv_hwc2chw_bench) and attributes match the previous explicit targets.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "dconv.h",
        "%s.cc" % kernel.replace("_", "-"),
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for kernel in [
    "f32_conv_hwc",
    "f16_conv_hwc2chw",
    "f32_conv_hwc2chw",
]]
# Benchmark built from f16-dwconv.cc.
xnnpack_benchmark(
    name = "f16_dwconv_bench",
    srcs = ["f16-dwconv.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
        "//:microkernel_utils",
    ],
)
# Benchmark built from f32-dwconv.cc.
xnnpack_benchmark(
    name = "f32_dwconv_bench",
    srcs = ["f32-dwconv.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
        "//:microkernel_utils",
    ],
)
# Benchmark built from f32-dwconv2d-chw.cc.
xnnpack_benchmark(
    name = "f32_dwconv2d_chw_bench",
    srcs = ["f32-dwconv2d-chw.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
    ],
)
# Benchmark built from f16-dwconv2d-chw.cc.
xnnpack_benchmark(
    name = "f16_dwconv2d_chw_bench",
    srcs = ["f16-dwconv2d-chw.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
    ],
)
# Benchmark built from f32-spmm.cc together with spmm-benchmark.h.
xnnpack_benchmark(
    name = "f32_spmm_bench",
    srcs = [
        "f32-spmm.cc",
        "spmm-benchmark.h",
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":spmm",
        "//:isa_checks",
    ],
)
# Benchmark built from f16-spmm.cc.
# NOTE(review): unlike f32_spmm_bench, this target carries no
# xnnpack_slow_benchmark_tags(), does not list spmm-benchmark.h, and does not
# depend on //:isa_checks. Confirm the asymmetry is intentional.
xnnpack_benchmark(
    name = "f16_spmm_bench",
    srcs = ["f16-spmm.cc"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":spmm",
    ],
)
# Benchmark built from f32-softmax.cc, with the optional DNNL copts/deps.
xnnpack_benchmark(
    name = "f32_softmax_bench",
    srcs = ["f32-softmax.cc"],
    copts = xnnpack_optional_dnnl_copts(),
    deps = MICROKERNEL_BENCHMARK_DEPS + xnnpack_optional_dnnl_deps(),
)
# Benchmark built from f32-im2col-gemm.cc.
xnnpack_benchmark(
    name = "f32_im2col_gemm_bench",
    srcs = ["f32-im2col-gemm.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:im2col",
    ],
)
# Library built from packq-benchmark.cc; exports packq-benchmark.h to the
# packq benchmark target that depends on ":packq_benchmark".
xnnpack_cxx_library(
    name = "packq_benchmark",
    srcs = ["packq-benchmark.cc"],
    hdrs = [
        "packq-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "@com_google_benchmark//:benchmark",
    ],
)
# Benchmark built from x8-packq.cc.
xnnpack_benchmark(
    name = "x8_packq_bench",
    srcs = ["x8-packq.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        ":packq_benchmark",
        "//:allocator",
    ],
)
# packw micro-kernel benchmarks. The five targets were copy-pasted
# declarations that differed only in name and source file; they are now
# generated with the comprehension pattern already used for the GEMM
# benchmarks in this file, so shared tags/deps are edited in one place. Each
# entry yields "<kernel>_bench" built from "<kernel with dashes>.cc". The
# generated labels (x8_packw_bench, qs8_packw_bench, qs8_qc4w_packw_bench,
# x16_packw_bench, x32_packw_bench) and attributes match the previous
# explicit targets.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        ":packw_benchmark",
        "//:allocator",
    ],
) for kernel in [
    "x8_packw",
    "qs8_packw",
    "qs8_qc4w_packw",
    "x16_packw",
    "x32_packw",
]]
########################### Benchmarks for operators ###########################
# Operator benchmarks that share one configuration: optional TFLite
# copts/deps and the "nowin32" tag. The three targets differed only in name
# and source file, so they are generated with the comprehension pattern used
# elsewhere in this file. Each entry yields "<op>_bench" built from
# "<op with dashes>.cc". The generated labels (unary_bench, binary_bench,
# average_pooling_bench) and attributes match the previous explicit targets.
[xnnpack_benchmark(
    name = "%s_bench" % op,
    srcs = ["%s.cc" % op.replace("_", "-")],
    copts = xnnpack_optional_tflite_copts(),
    tags = ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
) for op in [
    "unary",
    "binary",
    "average_pooling",
]]
# Operator benchmark built from batch-matrix-multiply.cc.
# NOTE(review): "@cpuinfo" is an unconditional dependency here, whereas
# bench_utils adds it only through xnnpack_select_if("//:cpuinfo_enabled", ...).
# Confirm this target is expected to build when cpuinfo is disabled.
xnnpack_benchmark(
    name = "batch_matrix_multiply_bench",
    srcs = [
        "batch-matrix-multiply.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps() + [
        "//test:next_prime",
        "@cpuinfo",
        "@pthreadpool",
    ],
)
# Operator benchmark built from channel-shuffle.cc; no TFLite copts/deps.
xnnpack_benchmark(
    name = "channel_shuffle_bench",
    srcs = [
        "channel-shuffle.cc",
    ],
    deps = OPERATOR_BENCHMARK_DEPS,
)
# Operator benchmark built from convolution.cc, with optional TFLite
# copts/deps; tagged with the slow-benchmark tags plus "nowin32".
xnnpack_benchmark(
    name = "convolution_bench",
    srcs = [
        "convolution.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
# Operator benchmark built from deconvolution.cc, with optional TFLite
# copts/deps; tagged with the slow-benchmark tags plus "nowin32".
xnnpack_benchmark(
    name = "deconvolution_bench",
    srcs = [
        "deconvolution.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
# Operator benchmark built from fully-connected.cc, with optional TFLite
# copts/deps; tagged "nowin32".
xnnpack_benchmark(
    name = "fully_connected_bench",
    srcs = [
        "fully-connected.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
# Operator benchmark built from max-pooling.cc; no TFLite copts/deps.
xnnpack_benchmark(
    name = "max_pooling_bench",
    srcs = [
        "max-pooling.cc",
    ],
    deps = OPERATOR_BENCHMARK_DEPS,
)
# Operator benchmark built from prelu.cc, with optional TFLite copts/deps;
# tagged "nowin32".
xnnpack_benchmark(
    name = "prelu_bench",
    srcs = [
        "prelu.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
# Operator benchmark built from scaled-dot-product-attention.cc, with
# optional TFLite copts/deps; tagged with the slow-benchmark tags plus
# "nowin32".
xnnpack_benchmark(
    name = "scaled_dot_product_attention_bench",
    srcs = [
        "scaled-dot-product-attention.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)