# Copyright 2023 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

load(
    "//:build_defs.bzl",
    "xnnpack_benchmark",
    "xnnpack_cxx_library",
    "xnnpack_if_kleidiai_enabled",
    "xnnpack_kleidiai_defines",
    "xnnpack_optional_dnnl_copts",
    "xnnpack_optional_dnnl_deps",
    "xnnpack_optional_gemmlowp_copts",
    "xnnpack_optional_gemmlowp_deps",
    "xnnpack_optional_ruy_copts",
    "xnnpack_optional_ruy_deps",
    "xnnpack_optional_tflite_copts",
    "xnnpack_optional_tflite_deps",
    "xnnpack_slow_benchmark_tags",
    "xnnpack_visibility",
)
load(
    "//:build_params.bzl",
    "xnnpack_select_if",
)

# Dependencies shared by every micro-kernel benchmark target in this package.
MICROKERNEL_BENCHMARK_DEPS = [
    ":bench_utils",
    "//:aligned_allocator",
    "//:all_microkernels",
    "//:buffer",
    "//:common",
    "//:datatype",
    "//:hardware_config",
    "//:math",
    "//:microkernels_h",
    "//:microparams_init",
    "//:microparams",
    "//:packing",
    "//:params",
    "//:xnnpack_h",
]

# Dependencies shared by every operator benchmark target in this package.
OPERATOR_BENCHMARK_DEPS = [
    ":bench_utils",
    "//:XNNPACK",
    "//:aligned_allocator",
    "//:buffer",
    "//:cache",
    "//:common",
    "//:datatype",
    "//:math",
]

############################### Helper libraries ###############################

xnnpack_cxx_library(
    name = "bench_utils",
    srcs = ["utils.cc"],
    hdrs = ["utils.h"],
    # XNN_ENABLE_CPUINFO is always defined; its value follows //:cpuinfo_enabled.
    copts = select({
        "//:cpuinfo_enabled": ["-DXNN_ENABLE_CPUINFO=1"],
        "//conditions:default": ["-DXNN_ENABLE_CPUINFO=0"],
    }),
    visibility = xnnpack_visibility(),
    deps = [
        "//:allocator",
        "//:common",
        "//:hardware_config",
        "//:memory",
        "//:params",
        "//:xnnpack_h",
        "@com_google_benchmark//:benchmark",
    ] + xnnpack_select_if(
        "//:cpuinfo_enabled",
        ["@cpuinfo"],
    ),
)

# Header-only helpers: each target exports "<name>.h" and depends only on
# Google Benchmark.
[cc_library(
    name = header_lib,
    hdrs = ["%s.h" % header_lib],
    deps = [
        "@com_google_benchmark//:benchmark",
    ],
) for header_lib in [
    "conv",
    "dwconv",
    "spmm",
]]

xnnpack_cxx_library(
    name = "gemm_benchmark",
    srcs = [
        "gemm-benchmark.cc",
    ],
    hdrs = [
        "gemm.h",
        "gemm-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        "//:config_hdrs",
        "@com_google_benchmark//:benchmark",
    ],
)

xnnpack_cxx_library(
    name = "packw_benchmark",
    hdrs = [
        "packw-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "@com_google_benchmark//:benchmark",
    ],
)

xnnpack_cxx_library(
    name = "bgemm",
    hdrs = [
        "bgemm.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        "@com_google_benchmark//:benchmark",
    ],
)

xnnpack_cxx_library(
    name = "packq_benchmark",
    srcs = [
        "packq-benchmark.cc",
    ],
    hdrs = ["packq-benchmark.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "@com_google_benchmark//:benchmark",
    ],
)

######################### Benchmarks for micro-kernels #########################

# In the comprehensions below the target is "<kernel>_bench" and the source
# file is the kernel name with "_" replaced by "-", plus ".cc".

# GEMM benchmarks built on :gemm_benchmark.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:allocator",
        "//:isa_checks",
    ],
) for kernel in [
    "bf16_gemm",
    "qd8_f16_qb4w_gemm",
    "qd8_f32_qb4w_gemm",
    "qd8_f16_qc8w_gemm",
    "qd8_f32_qc8w_gemm",
    "qd8_f16_qc4w_gemm",
    "qd8_f32_qc4w_gemm",
    "qs8_qc8w_gemm_fp32",
    "qu8_gemm_fp32",
    "qu8_gemm_rndnu",
    "f16_f32acc_gemm",
    "f16_gemm",
    "f32_qc4w_gemm",
    "f32_qc8w_gemm",
]]

# GEMM benchmarks that additionally take the optional ruy and gemmlowp
# copts/deps.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    copts = xnnpack_optional_ruy_copts() + xnnpack_optional_gemmlowp_copts(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:allocator",
        "//:isa_checks",
    ] + xnnpack_optional_ruy_deps() + xnnpack_optional_gemmlowp_deps(),
) for kernel in [
    "qs8_gemm",
    "qu8_gemm",
    "f16_gemm_minmax",
    "f32_gemm",
    "f32_gemm_minmax",
    "f32_gemm_goi_minmax",
]]

xnnpack_benchmark(
    name = "f32_bgemm_bench",
    srcs = [
        "f32-bgemm.cc",
    ],
    copts = xnnpack_optional_ruy_copts(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        "//:allocator",
    ] + xnnpack_optional_ruy_deps(),
)

# qp8 GEMM benchmarks; these add the KleidiAI defines and, when KleidiAI is
# enabled, its matmul micro-kernels.
# NOTE(review): "qp8_f32_qb4w_gemm" has no "_bench" suffix, unlike every other
# benchmark target here. The name is kept as-is so existing references to the
# target keep working; confirm whether a rename is wanted.
[xnnpack_benchmark(
    name = target,
    srcs = [source],
    defines = xnnpack_kleidiai_defines(),
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":gemm_benchmark",
        "//:isa_checks",
    ] + xnnpack_if_kleidiai_enabled([
        "@KleidiAI//kai/ukernels/matmul",
    ]),
) for target, source in [
    ("qp8_f32_qc4w_gemm_bench", "qp8-f32-qc4w-gemm.cc"),
    ("qp8_f32_qb4w_gemm", "qp8-f32-qb4w-gemm.cc"),
]]

# Reduce-sum benchmarks; each also lists the shared "rsum-benchmark.h" in srcs.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
        "rsum-benchmark.h",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for kernel in [
    "qs8_rdsum",
    "qu8_rdsum",
    "qs8_rsum",
    "qu8_rsum",
    "f16_rsum",
    "f16_f32acc_rsum",
    "f32_rsum",
    "f16_f32acc_rdsum",
    "f32_rdsum",
]]

# Single-source benchmarks that need nothing beyond the common micro-kernel
# deps.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for kernel in [
    "f16_raddstoreexpminusmax",
    "f16_rmax",
    "f16_rminmax",
    "f16_rmin",
    "f32_raddexpminusmax",
    "f32_raddextexp",
    "f32_raddstoreexpminusmax",
    "f32_rmax",
    "f32_rminmax",
    "f32_rmin",
    "f32_vscaleexpminusmax",
    "f32_vscaleextexp",
    "f16_vcmul",
    "f32_vcmul",
    "x8_lut",
    "xx_transposev",
    "xN_transposec",
    "vunary",
    "vbinary",
]]

xnnpack_benchmark(
    name = "qs8_dwconv_bench",
    srcs = [
        "qs8-dwconv.cc",
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
        "//:microkernel_configs",
        "//:microkernel_utils",
    ],
)

# Indirect GEMM benchmarks.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:indirection",
    ],
) for kernel in [
    "f16_f32acc_igemm",
    "f16_igemm",
    "f32_igemm",
]]

# Direct convolution benchmarks; each also lists the shared "dconv.h" in srcs.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "dconv.h",
        "%s.cc" % kernel.replace("_", "-"),
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS,
) for kernel in [
    "f32_conv_hwc",
    "f16_conv_hwc2chw",
    "f32_conv_hwc2chw",
]]

# Floating-point depthwise convolution benchmarks.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
        "//:microkernel_utils",
    ],
) for kernel in [
    "f16_dwconv",
    "f32_dwconv",
]]

# CHW-layout 2D depthwise convolution benchmarks.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":dwconv",
        "//:indirection",
    ],
) for kernel in [
    "f32_dwconv2d_chw",
    "f16_dwconv2d_chw",
]]

xnnpack_benchmark(
    name = "f32_spmm_bench",
    srcs = [
        "f32-spmm.cc",
        "spmm-benchmark.h",
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":spmm",
        "//:isa_checks",
    ],
)

xnnpack_benchmark(
    name = "f16_spmm_bench",
    srcs = [
        "f16-spmm.cc",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [":spmm"],
)

xnnpack_benchmark(
    name = "f32_softmax_bench",
    srcs = [
        "f32-softmax.cc",
    ],
    copts = xnnpack_optional_dnnl_copts(),
    deps = MICROKERNEL_BENCHMARK_DEPS + xnnpack_optional_dnnl_deps(),
)

xnnpack_benchmark(
    name = "f32_im2col_gemm_bench",
    srcs = [
        "f32-im2col-gemm.cc",
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":conv",
        "//:im2col",
    ],
)

xnnpack_benchmark(
    name = "x8_packq_bench",
    srcs = [
        "x8-packq.cc",
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        ":packq_benchmark",
        "//:allocator",
    ],
)

# Weight-packing benchmarks built on :packw_benchmark.
[xnnpack_benchmark(
    name = "%s_bench" % kernel,
    srcs = [
        "%s.cc" % kernel.replace("_", "-"),
    ],
    tags = xnnpack_slow_benchmark_tags(),
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":bgemm",
        ":packw_benchmark",
        "//:allocator",
    ],
) for kernel in [
    "x8_packw",
    "qs8_packw",
    "qs8_qc4w_packw",
    "x16_packw",
    "x32_packw",
]]

########################### Benchmarks for operators ###########################

# In the comprehensions below the target is "<operator>_bench" and the source
# file is the operator name with "_" replaced by "-", plus ".cc".

# Operator benchmarks with optional TFLite copts/deps, tagged "nowin32".
[xnnpack_benchmark(
    name = "%s_bench" % operator,
    srcs = ["%s.cc" % operator.replace("_", "-")],
    copts = xnnpack_optional_tflite_copts(),
    tags = ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
) for operator in [
    "unary",
    "binary",
    "average_pooling",
    "fully_connected",
    "prelu",
]]

# Same as above, but additionally carrying the slow-benchmark tags.
[xnnpack_benchmark(
    name = "%s_bench" % operator,
    srcs = ["%s.cc" % operator.replace("_", "-")],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
) for operator in [
    "convolution",
    "deconvolution",
    "scaled_dot_product_attention",
]]

# NOTE(review): "@cpuinfo" is an unconditional dependency here, whereas
# :bench_utils only adds it when //:cpuinfo_enabled is set. Confirm this is
# intended for builds with cpuinfo disabled.
xnnpack_benchmark(
    name = "batch_matrix_multiply_bench",
    srcs = ["batch-matrix-multiply.cc"],
    copts = xnnpack_optional_tflite_copts(),
    tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
    deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps() + [
        "//test:next_prime",
        "@cpuinfo",
        "@pthreadpool",
    ],
)

# Operator benchmarks with only the common operator deps (no TFLite options,
# no tags).
[xnnpack_benchmark(
    name = "%s_bench" % operator,
    srcs = ["%s.cc" % operator.replace("_", "-")],
    deps = OPERATOR_BENCHMARK_DEPS,
) for operator in [
    "channel_shuffle",
    "max_pooling",
]]