#pragma once

#include <utility>

#include "cuda_runtime.h"
#include "cutlass/cutlass.h"

/**
 * A wrapper for a kernel that is used to guard against compilation on
 * architectures that will never use the kernel. The purpose of this is to
 * reduce the size of the compiled binary.
 *
 * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
 * into code that will be executed on the device, where it is defined.
 *
 * The wrapper derives from Kernel, so everything Kernel declares remains
 * reachable through the wrapper; only operator() is replaced by the guarded
 * version below.
 *
 * @tparam Kernel  Functor type whose operator() is the device entry point.
 */
template <typename Kernel>
struct enable_sm90_or_later : Kernel {
  /**
   * Forwards every argument to Kernel::operator() when the current device
   * compilation pass targets __CUDA_ARCH__ >= 900. In every other pass the
   * body is empty, so the wrapped kernel body is not referenced from here.
   *
   * @param args  Arguments perfectly forwarded to Kernel::operator().
   */
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
    // Only reached in device passes for compute capability 9.0 or newer.
    Kernel::operator()(std::forward<Args>(args)...);
#endif
  }
};