#include #include #include #include static void ConvolutionSetup(benchmark::internal::Benchmark* benchmark) { benchmark->Unit(benchmark::kMicrosecond)->ArgNames({"Cin", "Cout", "ImageSize"}); } class NNPACK : public benchmark::Fixture { virtual void SetUp(const benchmark::State&) override { const auto status = nnp_initialize(); assert(status == nnp_status_success); } virtual void TearDown(const benchmark::State&) override { const auto status = nnp_deinitialize(); assert(status == nnp_status_success); } }; BENCHMARK_DEFINE_F(NNPACK, conv1x1)(benchmark::State& state) { const size_t inputChannels = static_cast(state.range(0)); const size_t outputChannels = static_cast(state.range(1)); const size_t imageSize = static_cast(state.range(2)); std::vector input, kernel, output, bias; std::vector> transformedKernel, workspaceBuffer; input.resize(inputChannels * imageSize * imageSize); kernel.resize(outputChannels * inputChannels); bias.resize(outputChannels); output.resize(outputChannels * imageSize * imageSize); nnp_convolution_transform_strategy strategy = nnp_convolution_transform_strategy_precompute; const nnp_convolution_algorithm algorithm = nnp_convolution_algorithm_implicit_gemm; const nnp_size imageSize2D = { imageSize, imageSize }; const nnp_size kernelSize2D = { 1, 1 }; const nnp_size outputStride2D = { 1, 1 }; const nnp_padding imagePadding = { 0, 0, 0, 0 }; if (strategy == nnp_convolution_transform_strategy_precompute) { size_t transformedKernelSize = 0; nnp_status status = nnp_convolution_inference( algorithm, nnp_convolution_transform_strategy_precompute, inputChannels, outputChannels, imageSize2D, imagePadding, kernelSize2D, outputStride2D, NULL, NULL, NULL, NULL, NULL, &transformedKernelSize, nnp_activation_identity, NULL, NULL, NULL); if (status == nnp_status_success) { transformedKernel.resize(transformedKernelSize); status = nnp_convolution_inference( algorithm, nnp_convolution_transform_strategy_precompute, inputChannels, outputChannels, imageSize2D, imagePadding, kernelSize2D, outputStride2D, NULL, kernel.data(), NULL, NULL, transformedKernel.data(), &transformedKernelSize, nnp_activation_identity, NULL, NULL, NULL); assert(status == nnp_status_success); strategy = nnp_convolution_transform_strategy_reuse; } else { assert(status == nnp_status_unsupported_transform_strategy); strategy = nnp_convolution_transform_strategy_compute; } } size_t workspaceSize = 0; nnp_status status = nnp_convolution_inference( algorithm, strategy, inputChannels, outputChannels, imageSize2D, imagePadding, kernelSize2D, outputStride2D, NULL, NULL, NULL, NULL, NULL, &workspaceSize, nnp_activation_identity, NULL, NULL, NULL); assert(status == nnp_status_success); workspaceBuffer.resize(workspaceSize); double input_transform_share = 0.0, kernel_transform_share = 0.0, output_transform_share = 0.0, matmul_share = 0.0; for (auto _ : state) { nnp_profile profile; status = nnp_convolution_inference( algorithm, strategy, inputChannels, outputChannels, imageSize2D, imagePadding, kernelSize2D, outputStride2D, input.data(), transformedKernel.empty() ? kernel.data() : static_cast(static_cast(transformedKernel.data())), bias.data(), output.data(), workspaceBuffer.data(), &workspaceSize, nnp_activation_identity, NULL, NULL, &profile); assert(status == nnp_status_success); input_transform_share += profile.input_transform; kernel_transform_share += profile.kernel_transform; output_transform_share += profile.output_transform; matmul_share += profile.block_multiplication; } state.counters["Ti"] = benchmark::Counter(input_transform_share, benchmark::Counter::kIsRate); state.counters["Tk"] = benchmark::Counter(kernel_transform_share, benchmark::Counter::kIsRate); state.counters["To"] = benchmark::Counter(output_transform_share, benchmark::Counter::kIsRate); state.counters["MM"] = benchmark::Counter(matmul_share, benchmark::Counter::kIsRate); state.SetItemsProcessed(state.iterations() * imageSize * imageSize * inputChannels * outputChannels); } BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 1024, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 512, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 256, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 1024, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 512, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 256, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 1024, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 512, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 256, 16}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 1024, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 512, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 256, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 1024, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 512, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 256, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 1024, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 512, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 256, 26}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 1024, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 512, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({1024, 256, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 1024, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 512, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 512, 256, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 1024, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 512, 52}); BENCHMARK_REGISTER_F(NNPACK, conv1x1)->Apply(ConvolutionSetup)->Args({ 256, 256, 52}); BENCHMARK_MAIN();