# Copyright 2023 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
|
|
|
|
load(
|
|
"//:build_defs.bzl",
|
|
"xnnpack_benchmark",
|
|
"xnnpack_cxx_library",
|
|
"xnnpack_if_kleidiai_enabled",
|
|
"xnnpack_kleidiai_defines",
|
|
"xnnpack_optional_dnnl_copts",
|
|
"xnnpack_optional_dnnl_deps",
|
|
"xnnpack_optional_gemmlowp_copts",
|
|
"xnnpack_optional_gemmlowp_deps",
|
|
"xnnpack_optional_ruy_copts",
|
|
"xnnpack_optional_ruy_deps",
|
|
"xnnpack_optional_tflite_copts",
|
|
"xnnpack_optional_tflite_deps",
|
|
"xnnpack_slow_benchmark_tags",
|
|
"xnnpack_visibility",
|
|
)
|
|
load(
|
|
"//:build_params.bzl",
|
|
"xnnpack_select_if",
|
|
)
|
|
|
|
# Base dependency list shared by the micro-kernel benchmark targets and helper
# libraries in this package; individual targets append their own extra deps.
MICROKERNEL_BENCHMARK_DEPS = [
    ":bench_utils",
    "//:aligned_allocator",
    "//:all_microkernels",
    "//:buffer",
    "//:common",
    "//:datatype",
    "//:hardware_config",
    "//:math",
    "//:microkernels_h",
    "//:microparams_init",
    "//:microparams",
    "//:packing",
    "//:params",
    "//:xnnpack_h",
]
|
|
|
|
# Base dependency list shared by the operator-level benchmark targets in this
# package; individual targets append their own extra deps.
OPERATOR_BENCHMARK_DEPS = [
    ":bench_utils",
    "//:XNNPACK",
    "//:aligned_allocator",
    "//:buffer",
    "//:cache",
    "//:common",
    "//:datatype",
    "//:math",
]
|
|
|
|
############################### Helper libraries ###############################
|
|
|
|
# Shared helper library for the benchmarks in this package (utils.cc/utils.h).
# XNN_ENABLE_CPUINFO is defined to 1 or 0 according to //:cpuinfo_enabled, and
# the @cpuinfo dependency is attached via xnnpack_select_if on the same setting.
xnnpack_cxx_library(
    name = "bench_utils",
    srcs = ["utils.cc"],
    hdrs = ["utils.h"],
    copts = select({
        "//:cpuinfo_enabled": ["-DXNN_ENABLE_CPUINFO=1"],
        "//conditions:default": ["-DXNN_ENABLE_CPUINFO=0"],
    }),
    visibility = xnnpack_visibility(),
    deps = [
        "//:allocator",
        "//:common",
        "//:hardware_config",
        "//:memory",
        "//:params",
        "//:xnnpack_h",
        "@com_google_benchmark//:benchmark",
    ] + xnnpack_select_if("//:cpuinfo_enabled", ["@cpuinfo"]),
)
|
|
|
|
# Header-only helper libraries: each target `<name>` exposes `<name>.h` and
# depends only on Google Benchmark.
[cc_library(
    name = header_lib,
    hdrs = ["%s.h" % header_lib],
    deps = ["@com_google_benchmark//:benchmark"],
) for header_lib in [
    "conv",
    "dwconv",
    "spmm",
]]
|
|
|
|
# Common GEMM benchmark library (gemm-benchmark.cc plus the gemm.h and
# gemm-benchmark.h headers) used by the *_gemm_bench targets in this package.
xnnpack_cxx_library(
    name = "gemm_benchmark",
    srcs = ["gemm-benchmark.cc"],
    hdrs = [
        "gemm.h",
        "gemm-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        "//:config_hdrs",
        "@com_google_benchmark//:benchmark",
    ],
)
|
|
|
|
# Header-only library exposing packw-benchmark.h; used by the *_packw_bench
# targets in this package.
xnnpack_cxx_library(
    name = "packw_benchmark",
    hdrs = ["packw-benchmark.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "@com_google_benchmark//:benchmark",
    ],
)
|
|
|
|
# Header-only library exposing bgemm.h; a dependency of the pack and bgemm
# benchmark targets in this package.
xnnpack_cxx_library(
    name = "bgemm",
    hdrs = ["bgemm.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + ["@com_google_benchmark//:benchmark"],
)
|
|
|
|
######################### Benchmarks for micro-kernels #########################
|
|
|
|
# GEMM micro-kernel benchmarks that need no optional third-party backends.
# Each entry `<ukernel>` produces target `<ukernel>_bench`, compiled from the
# source file whose name is `<ukernel>` with underscores replaced by dashes.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = ["%s.cc" % ukernel.replace("_", "-")],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:allocator",
        "//:isa_checks",
    ],
) for ukernel in [
    "bf16_gemm",
    "qd8_f16_qb4w_gemm",
    "qd8_f32_qb4w_gemm",
    "qd8_f16_qc8w_gemm",
    "qd8_f32_qc8w_gemm",
    "qd8_f16_qc4w_gemm",
    "qd8_f32_qc4w_gemm",
    "qs8_qc8w_gemm_fp32",
    "qu8_gemm_fp32",
    "qu8_gemm_rndnu",
    "f16_f32acc_gemm",
    "f16_gemm",
    "f32_qc4w_gemm",
    "f32_qc8w_gemm",
]]
|
|
|
|
# GEMM micro-kernel benchmarks that additionally pick up the optional ruy and
# gemmlowp copts/deps. Each entry `<ukernel>` produces target
# `<ukernel>_bench` from the dash-separated source file of the same name.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = ["%s.cc" % ukernel.replace("_", "-")],
    copts = xnnpack_optional_ruy_copts() + xnnpack_optional_gemmlowp_copts(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:allocator",
        "//:isa_checks",
    ] + xnnpack_optional_ruy_deps() + xnnpack_optional_gemmlowp_deps(),
) for ukernel in [
    "qs8_gemm",
    "qu8_gemm",
    "f16_gemm_minmax",
    "f32_gemm",
    "f32_gemm_minmax",
    "f32_gemm_goi_minmax",
]]
|
|
|
|
# f32 bgemm benchmark (f32-bgemm.cc); picks up the optional ruy copts/deps.
xnnpack_benchmark(
    name = "f32_bgemm_bench",
    srcs = ["f32-bgemm.cc"],
    copts = xnnpack_optional_ruy_copts(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "//:allocator",
    ] + xnnpack_optional_ruy_deps(),
)
|
|
|
|
# qp8-f32-qc4w GEMM benchmark. The KleidiAI defines are always applied through
# xnnpack_kleidiai_defines(); the KleidiAI matmul dependency is added through
# xnnpack_if_kleidiai_enabled().
xnnpack_benchmark(
    name = "qp8_f32_qc4w_gemm_bench",
    srcs = ["qp8-f32-qc4w-gemm.cc"],
    defines = xnnpack_kleidiai_defines(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:isa_checks",
    ] + xnnpack_if_kleidiai_enabled(["@KleidiAI//kai/ukernels/matmul"]),
)
|
|
|
|
# qp8-f32-qb4w GEMM benchmark. The KleidiAI defines are always applied through
# xnnpack_kleidiai_defines(); the KleidiAI matmul dependency is added through
# xnnpack_if_kleidiai_enabled().
#
# The target carries the `_bench` suffix used by every other benchmark in this
# package (e.g. qp8_f32_qc4w_gemm_bench), so suffix-based queries and scripts
# pick it up as well.
xnnpack_benchmark(
    name = "qp8_f32_qb4w_gemm_bench",
    srcs = ["qp8-f32-qb4w-gemm.cc"],
    defines = xnnpack_kleidiai_defines(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:isa_checks",
    ] + xnnpack_if_kleidiai_enabled(["@KleidiAI//kai/ukernels/matmul"]),
)

# Backward-compatible label: this benchmark was previously declared without
# the `_bench` suffix, so keep the old name resolvable for existing callers.
# NOTE(review): the alias uses the package's default visibility — confirm that
# matches what xnnpack_benchmark assigns to the underlying target.
alias(
    name = "qp8_f32_qb4w_gemm",
    actual = ":qp8_f32_qb4w_gemm_bench",
)
|
|
|
|
# Reduce-sum (rsum / rdsum) micro-kernel benchmarks. Every target compiles its
# own dash-separated source file together with the shared rsum-benchmark.h.
[xnnpack_benchmark(
    name = "%s_bench" % reduction,
    srcs = [
        "%s.cc" % reduction.replace("_", "-"),
        "rsum-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for reduction in [
    "qs8_rdsum",
    "qu8_rdsum",
    "qs8_rsum",
    "qu8_rsum",
    "f16_rsum",
    "f16_f32acc_rsum",
    "f32_rsum",
    "f16_f32acc_rdsum",
    "f32_rdsum",
]]
|
|
|
|
# Single-source micro-kernel benchmarks that need nothing beyond the common
# micro-kernel deps. Entry `<ukernel>` produces target `<ukernel>_bench` from
# the source file named `<ukernel>` with underscores replaced by dashes.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = ["%s.cc" % ukernel.replace("_", "-")],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for ukernel in [
    "f16_raddstoreexpminusmax",
    "f16_rmax",
    "f16_rminmax",
    "f16_rmin",
    "f32_raddexpminusmax",
    "f32_raddextexp",
    "f32_raddstoreexpminusmax",
    "f32_rmax",
    "f32_rminmax",
    "f32_rmin",
    "f32_vscaleexpminusmax",
    "f32_vscaleextexp",
    "f16_vcmul",
    "f32_vcmul",
    "x8_lut",
]]
|
|
|
|
# Transpose micro-kernel benchmarks; each is a single dash-separated source
# file built against the common micro-kernel deps.
[xnnpack_benchmark(
    name = "%s_bench" % transpose,
    srcs = ["%s.cc" % transpose.replace("_", "-")],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for transpose in [
    "xx_transposev",
    "xN_transposec",
]]
|
|
|
|
# qs8 depthwise-convolution micro-kernel benchmark (qs8-dwconv.cc).
xnnpack_benchmark(
    name = "qs8_dwconv_bench",
    srcs = ["qs8-dwconv.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
        "//:microkernel_configs",
        "//:microkernel_utils",
    ],
)
|
|
|
|
# f16 IGEMM micro-kernel benchmarks. Both targets share the same tags and
# deps and differ only in the kernel name / source file.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = ["%s.cc" % ukernel.replace("_", "-")],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:indirection",
    ],
) for ukernel in [
    "f16_f32acc_igemm",
    "f16_igemm",
]]
|
|
|
|
# Element-wise unary and binary micro-kernel benchmarks (vunary.cc and
# vbinary.cc), built against the common micro-kernel deps.
[xnnpack_benchmark(
    name = "%s_bench" % elementwise,
    srcs = ["%s.cc" % elementwise],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for elementwise in [
    "vunary",
    "vbinary",
]]
|
|
|
|
# f32 IGEMM micro-kernel benchmark (f32-igemm.cc).
xnnpack_benchmark(
    name = "f32_igemm_bench",
    srcs = ["f32-igemm.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:indirection",
    ],
)
|
|
|
|
# Direct-convolution (HWC / HWC2CHW) micro-kernel benchmarks. Each target
# compiles the shared dconv.h header together with its own source file.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = [
        "dconv.h",
        "%s.cc" % ukernel.replace("_", "-"),
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for ukernel in [
    "f32_conv_hwc",
    "f16_conv_hwc2chw",
    "f32_conv_hwc2chw",
]]
|
|
|
|
# f16 / f32 depthwise-convolution micro-kernel benchmarks. Both targets share
# the same tags and deps and differ only in the kernel name / source file.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = ["%s.cc" % ukernel.replace("_", "-")],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
        "//:microkernel_utils",
    ],
) for ukernel in [
    "f16_dwconv",
    "f32_dwconv",
]]
|
|
|
|
# f32 / f16 CHW-layout 2D depthwise-convolution micro-kernel benchmarks. Both
# targets share the same tags and deps and differ only in the kernel name.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = ["%s.cc" % ukernel.replace("_", "-")],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
    ],
) for ukernel in [
    "f32_dwconv2d_chw",
    "f16_dwconv2d_chw",
]]
|
|
|
|
# f32 SpMM micro-kernel benchmark; compiles f32-spmm.cc together with the
# spmm-benchmark.h header.
xnnpack_benchmark(
    name = "f32_spmm_bench",
    srcs = [
        "f32-spmm.cc",
        "spmm-benchmark.h",
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":spmm",
        "//:isa_checks",
    ],
)
|
|
|
|
# f16 SpMM micro-kernel benchmark (f16-spmm.cc).
xnnpack_benchmark(
    name = "f16_spmm_bench",
    srcs = ["f16-spmm.cc"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":spmm",
    ],
)
|
|
|
|
# f32 softmax benchmark (f32-softmax.cc); picks up the optional DNNL
# copts/deps.
xnnpack_benchmark(
    name = "f32_softmax_bench",
    srcs = ["f32-softmax.cc"],
    copts = xnnpack_optional_dnnl_copts(),
    deps = MICROKERNEL_BENCHMARK_DEPS + xnnpack_optional_dnnl_deps(),
)
|
|
|
|
# f32 im2col + GEMM benchmark (f32-im2col-gemm.cc).
xnnpack_benchmark(
    name = "f32_im2col_gemm_bench",
    srcs = ["f32-im2col-gemm.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:im2col",
    ],
)
|
|
|
|
# Shared packq benchmark library (packq-benchmark.cc / packq-benchmark.h),
# used by x8_packq_bench.
xnnpack_cxx_library(
    name = "packq_benchmark",
    srcs = ["packq-benchmark.cc"],
    hdrs = ["packq-benchmark.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "@com_google_benchmark//:benchmark",
    ],
)
|
|
|
|
# x8 packq micro-kernel benchmark (x8-packq.cc).
xnnpack_benchmark(
    name = "x8_packq_bench",
    srcs = ["x8-packq.cc"],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        ":packq_benchmark",
        "//:allocator",
    ],
)
|
|
|
|
# Weight-packing (packw) micro-kernel benchmarks. All targets share the same
# tags and deps; entry `<ukernel>` produces target `<ukernel>_bench` from the
# source file named `<ukernel>` with underscores replaced by dashes.
[xnnpack_benchmark(
    name = "%s_bench" % ukernel,
    srcs = ["%s.cc" % ukernel.replace("_", "-")],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        ":packw_benchmark",
        "//:allocator",
    ],
) for ukernel in [
    "x8_packw",
    "qs8_packw",
    "qs8_qc4w_packw",
    "x16_packw",
    "x32_packw",
]]
|
|
|
|
########################### Benchmarks for operators ###########################
|
|
|
|
# Operator benchmarks that pick up the optional TFLite copts/deps and are
# tagged "nowin32". Entry `<op>` produces target `<op>_bench` from the source
# file named `<op>` with underscores replaced by dashes.
[xnnpack_benchmark(
    name = "%s_bench" % op,
    srcs = ["%s.cc" % op.replace("_", "-")],
    copts = xnnpack_optional_tflite_copts(),
    tags = ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
) for op in [
    "unary",
    "binary",
    "average_pooling",
]]
|
|
|
|
# Batch matrix-multiply operator benchmark (batch-matrix-multiply.cc), with
# the optional TFLite copts/deps.
# NOTE(review): @cpuinfo is an unconditional dependency here, whereas
# :bench_utils only adds it under //:cpuinfo_enabled — confirm this is
# intended.
xnnpack_benchmark(
    name = "batch_matrix_multiply_bench",
    srcs = ["batch-matrix-multiply.cc"],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps() + [
        "//test:next_prime",
        "@cpuinfo",
        "@pthreadpool",
    ],
)
|
|
|
|
# Channel-shuffle operator benchmark (channel-shuffle.cc).
xnnpack_benchmark(
    name = "channel_shuffle_bench",
    deps = OPERATOR_BENCHMARK_DEPS,
    srcs = ["channel-shuffle.cc"],
)
|
|
|
|
# Convolution and deconvolution operator benchmarks. Both pick up the optional
# TFLite copts/deps and carry the slow-benchmark tags plus "nowin32".
[xnnpack_benchmark(
    name = "%s_bench" % op,
    srcs = ["%s.cc" % op],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
) for op in [
    "convolution",
    "deconvolution",
]]
|
|
|
|
# Fully-connected operator benchmark (fully-connected.cc), with the optional
# TFLite copts/deps; tagged "nowin32".
xnnpack_benchmark(
    name = "fully_connected_bench",
    srcs = [
        "fully-connected.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = [
        "nowin32",
    ],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
|
|
|
|
# Max-pooling operator benchmark (max-pooling.cc).
xnnpack_benchmark(
    name = "max_pooling_bench",
    deps = OPERATOR_BENCHMARK_DEPS,
    srcs = ["max-pooling.cc"],
)
|
|
|
|
# PReLU operator benchmark (prelu.cc), with the optional TFLite copts/deps;
# tagged "nowin32".
xnnpack_benchmark(
    name = "prelu_bench",
    srcs = [
        "prelu.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = [
        "nowin32",
    ],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
|
|
|
|
# Scaled dot-product attention operator benchmark
# (scaled-dot-product-attention.cc), with the optional TFLite copts/deps;
# carries the slow-benchmark tags plus "nowin32".
xnnpack_benchmark(
    name = "scaled_dot_product_attention_bench",
    srcs = [
        "scaled-dot-product-attention.cc",
    ],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
|