#!/usr/bin/env python
import confu
parser = confu.standard_parser()
parser.add_argument("--backend", dest="backend", default="auto",
choices=["auto", "psimd", "scalar"])
parser.add_argument("--inference-only", dest="inference_only", default=False,
action="store_true",
help="Build only inference/forward pass functions to reduce library size")
parser.add_argument("--convolution-only", dest="convolution_only", default=False,
action="store_true",
help="Build only convolution functions to reduce library size")


def main(args):
    options = parser.parse_args(args)
    backend = options.backend
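
    # Resolve the "auto" backend from the target architecture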
if backend == "auto":
if options.target.is_x86_64:
backend = "x86_64"
elif options.target.is_arm or options.target.is_arm64:
backend = "arm"
elif options.target.is_emscripten:
backend = "scalar"
else:
backend = "psimd"
build = confu.Build.from_options(options)
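
    # Preprocessor macros recording the backend and the optional feature subsets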
    macros = dict()
    if backend == "psimd":
        macros["NNP_BACKEND_PSIMD"] = 1
    if backend == "scalar":
        macros["NNP_BACKEND_SCALAR"] = 1
    export_macros = dict()
    export_macros["NNP_CONVOLUTION_ONLY"] = int(options.convolution_only)
    export_macros["NNP_INFERENCE_ONLY"] = int(options.inference_only)
    macros.update(export_macros)

    build.export_cpath("include", ["nnpack.h"])
with build.options(source_dir="src", macros=macros,
deps={
(build.deps.pthreadpool, build.deps.cpuinfo, build.deps.fxdiv, build.deps.fp16): any,
build.deps.psimd: backend == "psimd" or backend == "arm",
},
extra_include_dirs={
("src", "src/ref"): any,
"src/x86_64-fma": options.target.is_x86_64
}):
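        # Objects compiled for every backend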
        nnpack_objects = [
            build.cc("init.c"),
            build.cc("convolution-inference.c"),
        ]
        if not options.convolution_only:
            # Fully-connected, pooling, Softmax, ReLU layers
            nnpack_objects += [
                build.cc("fully-connected-inference.c"),
                build.cc("pooling-output.c"),
                build.cc("softmax-output.c"),
                build.cc("relu-output.c"),
            ]
            if not options.inference_only:
                # Training functions for fully-connected and ReLU layers
                nnpack_objects += [
                    build.cc("fully-connected-output.c"),
                    build.cc("relu-input-gradient.c"),
                ]
        if not options.inference_only:
            # Training functions for convolutional layer
            nnpack_objects += [
                build.cc("convolution-output.c"),
                build.cc("convolution-input-gradient.c"),
                build.cc("convolution-kernel-gradient.c"),
            ]
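
        # Backend-specific transformations and micro-kernels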
if backend == "x86_64":
arch_nnpack_objects = [
# Transformations
build.peachpy("x86_64-fma/2d-fourier-8x8.py"),
build.peachpy("x86_64-fma/2d-fourier-16x16.py"),
build.peachpy("x86_64-fma/2d-winograd-8x8-3x3.py"),
# Tuple GEMM
build.peachpy("x86_64-fma/blas/s8gemm.py"),
build.peachpy("x86_64-fma/blas/c8gemm.py"),
build.peachpy("x86_64-fma/blas/s4c6gemm.py"),
# Direct convolution
build.peachpy("x86_64-fma/blas/conv1x1.py"),
# BLAS microkernels
build.peachpy("x86_64-fma/blas/sgemm.py"),
]
if not options.convolution_only:
arch_nnpack_objects += [
# Activations
build.peachpy("x86_64-fma/softmax.py"),
build.cc("x86_64-fma/softmax.c"),
build.peachpy("x86_64-fma/relu.py"),
# Pooling
build.peachpy("x86_64-fma/max-pooling.py"),
# BLAS microkernels
build.peachpy("x86_64-fma/blas/sdotxf.py"),
build.peachpy("x86_64-fma/blas/shdotxf.py"),
]
elif backend == "scalar":
arch_nnpack_objects = [
# Transformations
build.cc("scalar/2d-fourier-8x8.c"),
build.cc("scalar/2d-fourier-16x16.c"),
build.cc("scalar/2d-winograd-8x8-3x3.c"),
# Tuple GEMM
build.cc("scalar/blas/s2gemm.c"),
build.cc("scalar/blas/cgemm-conjb.c"),
# Direct convolution
build.cc("scalar/blas/conv1x1.c"),
# BLAS microkernels
build.cc("scalar/blas/sgemm.c"),
]
if not options.inference_only:
arch_nnpack_objects += [
# Tuple GEMM
build.cc("scalar/blas/s2gemm-transc.c"),
build.cc("scalar/blas/cgemm.c"),
build.cc("scalar/blas/cgemm-conjb-transc.c"),
]
if not options.convolution_only:
arch_nnpack_objects += [
# Activations
build.cc("scalar/relu.c"),
build.cc("scalar/softmax.c"),
# BLAS microkernels
build.cc("scalar/blas/sdotxf.c"),
build.cc("scalar/blas/shdotxf.c"),
]
elif backend == "arm":
from confu import arm
with build.options(isa=arm.neon+arm.fp16 if options.target.is_arm else None):
arch_nnpack_objects = [
# Transformations
build.cc("psimd/2d-fourier-8x8.c"),
build.cc("psimd/2d-fourier-16x16.c"),
build.cc("neon/2d-winograd-8x8-3x3.c"),
build.cc("neon/2d-winograd-8x8-3x3-fp16.c"),
# Tuple GEMM
build.cc("neon/blas/h4gemm.c"),
build.cc("neon/blas/s4gemm.c"),
build.cc("neon/blas/c4gemm-conjb.c"),
build.cc("neon/blas/s4c2gemm-conjb.c"),
# Direct convolution
build.cc("neon/blas/conv1x1.c"),
# BLAS microkernels
build.cc("neon/blas/sgemm.c"),
]
if not options.inference_only:
arch_nnpack_objects += [
# Transformations
build.cc("psimd/2d-winograd-8x8-3x3.c"),
# Tuple GEMM
build.cc("neon/blas/c4gemm.c"),
build.cc("neon/blas/s4c2gemm.c"),
build.cc("neon/blas/c4gemm-conjb-transc.c"),
build.cc("neon/blas/s4c2gemm-conjb-transc.c"),
]
if not options.convolution_only:
arch_nnpack_objects += [
# ReLU and Softmax
build.cc("neon/relu.c"),
build.cc("psimd/softmax.c"),
# BLAS microkernels
build.cc("neon/blas/sdotxf.c"),
build.cc("psimd/blas/shdotxf.c"),
]
if options.target.is_arm:
# Functions implemented in assembly
arch_nnpack_objects += [
build.cc("neon/blas/h4gemm-aarch32.S"),
build.cc("neon/blas/s4gemm-aarch32.S"),
build.cc("neon/blas/sgemm-aarch32.S"),
]
elif backend == "psimd":
arch_nnpack_objects = [
# Transformations
build.cc("psimd/2d-fourier-8x8.c"),
build.cc("psimd/2d-fourier-16x16.c"),
build.cc("psimd/2d-winograd-8x8-3x3.c"),
# Tuple GEMM
build.cc("psimd/blas/s4gemm.c"),
build.cc("psimd/blas/c4gemm-conjb.c"),
build.cc("psimd/blas/s4c2gemm-conjb.c"),
# Direct convolution
build.cc("psimd/blas/conv1x1.c"),
# BLAS microkernels
build.cc("psimd/blas/sgemm.c"),
]
if not options.inference_only:
arch_nnpack_objects += [
# Tuple GEMM
build.cc("psimd/blas/c4gemm.c"),
build.cc("psimd/blas/s4c2gemm.c"),
build.cc("psimd/blas/c4gemm-conjb-transc.c"),
build.cc("psimd/blas/s4c2gemm-conjb-transc.c"),
]
if not options.convolution_only:
arch_nnpack_objects += [
# Activations
build.cc("psimd/relu.c"),
build.cc("psimd/softmax.c"),
# BLAS microkernels
build.cc("psimd/blas/sdotxf.c"),
build.cc("psimd/blas/shdotxf.c"),
]
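
        # Unoptimized reference implementations of the layers (linked into the tests below)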
        reference_layer_objects = [
            build.cc("ref/convolution-output.c"),
            build.cc("ref/convolution-input-gradient.c"),
            build.cc("ref/convolution-kernel.c"),
            build.cc("ref/fully-connected-output.c"),
            build.cc("ref/max-pooling-output.c"),
            build.cc("ref/softmax-output.c"),
            build.cc("ref/relu-output.c"),
            build.cc("ref/relu-input-gradient.c"),
        ]
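
        # Reference FFT implementations (linked into the Fourier tests below)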
        reference_fft_objects = [
            build.cc("ref/fft/aos.c"),
            build.cc("ref/fft/soa.c"),
            build.cc("ref/fft/forward-real.c"),
            build.cc("ref/fft/forward-dualreal.c"),
            build.cc("ref/fft/inverse-real.c"),
            build.cc("ref/fft/inverse-dualreal.c"),
        ]
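
        # Backend-specific FFT, Winograd, and math stubs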
if backend == "x86_64":
arch_fft_stub_objects = [
build.peachpy("x86_64-fma/fft-soa.py"),
build.peachpy("x86_64-fma/fft-aos.py"),
build.peachpy("x86_64-fma/fft-dualreal.py"),
build.peachpy("x86_64-fma/ifft-dualreal.py"),
build.peachpy("x86_64-fma/fft-real.py"),
build.peachpy("x86_64-fma/ifft-real.py"),
]
arch_winograd_stub_objects = [
build.peachpy("x86_64-fma/winograd-f6k3.py"),
]
arch_math_stub_objects = [
]
elif backend == "scalar":
arch_fft_stub_objects = [
build.cc("scalar/fft-aos.c"),
build.cc("scalar/fft-soa.c"),
build.cc("scalar/fft-real.c"),
build.cc("scalar/fft-dualreal.c"),
]
arch_winograd_stub_objects = [
build.cc("scalar/winograd-f6k3.c"),
]
elif backend == "psimd" or backend == "arm":
arch_fft_stub_objects = [
build.cc("psimd/fft-aos.c"),
build.cc("psimd/fft-soa.c"),
build.cc("psimd/fft-real.c"),
build.cc("psimd/fft-dualreal.c"),
]
if backend == "psimd":
arch_winograd_stub_objects = [
build.cc("psimd/winograd-f6k3.c"),
]
else:
# ARM NEON Winograd transform optionally uses FP16 storage
with build.options(isa=arm.neon+arm.fp16 if options.target.is_arm else None):
arch_winograd_stub_objects = [
build.cc("neon/winograd-f6k3.c"),
]
arch_math_stub_objects = [
build.cc("psimd/exp.c"),
]

        fft_objects = reference_fft_objects + arch_fft_stub_objects
        nnpack_objects = nnpack_objects + arch_nnpack_objects

        build.static_library("nnpack", nnpack_objects)

    # Build tests for micro-kernels. Link against the micro-kernel implementations.
    with build.options(source_dir="test", extra_include_dirs="test",
            deps={
                (build.deps.googletest, build.deps.cpuinfo, build.deps.clog, build.deps.fp16): any,
                "log": build.target.is_android}):
build.unittest("fourier-reference-test",
reference_fft_objects + [build.cxx("fourier/reference.cc")])
if backend == "x86_64":
build.smoketest("fourier-test",
reference_fft_objects + arch_fft_stub_objects + [build.cxx("fourier/x86_64-avx2.cc")])
build.smoketest("winograd-test",
arch_winograd_stub_objects + arch_nnpack_objects + [build.cxx("winograd/x86_64-fma3.cc")])
build.smoketest("sgemm-test",
arch_nnpack_objects + [build.cxx("sgemm/x86_64-fma3.cc")])
elif backend == "psimd":
build.smoketest("fourier-test",
reference_fft_objects + arch_fft_stub_objects + [build.cxx("fourier/psimd.cc")])
build.smoketest("winograd-test",
arch_winograd_stub_objects + arch_nnpack_objects + [build.cxx("winograd/psimd.cc")])
build.smoketest("sgemm-test",
arch_nnpack_objects + [build.cxx("sgemm/psimd.cc")])
elif backend == "arm":
# No ARM-specific Fourier implementation; use PSIMD
build.smoketest("fourier-test",
reference_fft_objects + arch_fft_stub_objects + [build.cxx("fourier/psimd.cc")])
build.smoketest("winograd-test",
arch_winograd_stub_objects + arch_nnpack_objects + [build.cxx("winograd/neon.cc")])
build.smoketest("sgemm-test",
arch_nnpack_objects + [build.cxx("sgemm/neon.cc")])
build.smoketest("sxgemm-test",
arch_nnpack_objects + [build.cxx("sxgemm/neon.cc")])
build.smoketest("hxgemm-test",
arch_nnpack_objects + [build.cxx("hxgemm/neon.cc")])
elif backend == "scalar":
build.smoketest("fourier-test",
reference_fft_objects + arch_fft_stub_objects + [build.cxx("fourier/scalar.cc")])
build.smoketest("winograd-test",
arch_winograd_stub_objects + arch_nnpack_objects + [build.cxx("winograd/scalar.cc")])
build.smoketest("sgemm-test",
arch_nnpack_objects + [build.cxx("sgemm/scalar.cc")])

    # Build tests for layers. Link against the library.
    with build.options(source_dir="test", include_dirs="test", deps={
            (build, build.deps.pthreadpool, build.deps.cpuinfo, build.deps.clog, build.deps.googletest.core, build.deps.fp16): any,
            "rt": build.target.is_linux,
            "log": build.target.is_android,
        }):
        if not options.inference_only:
            build.smoketest("convolution-output-smoketest",
                reference_layer_objects + [build.cxx("convolution-output/smoke.cc")])
            build.unittest("convolution-output-alexnet-test",
                reference_layer_objects + [build.cxx("convolution-output/alexnet.cc")])
            build.unittest("convolution-output-vgg-a-test",
                reference_layer_objects + [build.cxx("convolution-output/vgg-a.cc")])
            build.unittest("convolution-output-overfeat-fast-test",
                reference_layer_objects + [build.cxx("convolution-output/overfeat-fast.cc")])

            build.smoketest("convolution-input-gradient-smoketest",
                reference_layer_objects + [build.cxx("convolution-input-gradient/smoke.cc")])
            build.unittest("convolution-input-gradient-alexnet-test",
                reference_layer_objects + [build.cxx("convolution-input-gradient/alexnet.cc")])
            build.unittest("convolution-input-gradient-vgg-a-test",
                reference_layer_objects + [build.cxx("convolution-input-gradient/vgg-a.cc")])
            build.unittest("convolution-input-gradient-overfeat-fast-test",
                reference_layer_objects + [build.cxx("convolution-input-gradient/overfeat-fast.cc")])

            build.smoketest("convolution-kernel-gradient-smoketest",
                reference_layer_objects + [build.cxx("convolution-kernel-gradient/smoke.cc")])
            build.unittest("convolution-kernel-gradient-alexnet-test",
                reference_layer_objects + [build.cxx("convolution-kernel-gradient/alexnet.cc")])
            build.unittest("convolution-kernel-gradient-vgg-a-test",
                reference_layer_objects + [build.cxx("convolution-kernel-gradient/vgg-a.cc")])
            build.unittest("convolution-kernel-gradient-overfeat-fast-test",
                reference_layer_objects + [build.cxx("convolution-kernel-gradient/overfeat-fast.cc")])
build.smoketest("convolution-inference-smoketest",
reference_layer_objects + [build.cxx("convolution-inference/smoke.cc")])
build.unittest("convolution-inference-alexnet-test",
reference_layer_objects + [build.cxx("convolution-inference/alexnet.cc")])
build.unittest("convolution-inference-vgg-a-test",
reference_layer_objects + [build.cxx("convolution-inference/vgg-a.cc")])
build.unittest("convolution-inference-overfeat-fast-test",
reference_layer_objects + [build.cxx("convolution-inference/overfeat-fast.cc")])
        if not options.convolution_only:
            build.unittest("fully-connected-inference-alexnet-test",
                reference_layer_objects + [build.cxx("fully-connected-inference/alexnet.cc")])
            build.unittest("fully-connected-inference-vgg-a-test",
                reference_layer_objects + [build.cxx("fully-connected-inference/vgg-a.cc")])
            build.unittest("fully-connected-inference-overfeat-fast-test",
                reference_layer_objects + [build.cxx("fully-connected-inference/overfeat-fast.cc")])
            if not options.inference_only:
                build.smoketest("fully-connected-output-smoketest",
                    reference_layer_objects + [build.cxx("fully-connected-output/smoke.cc")])
                build.unittest("fully-connected-output-alexnet-test",
                    reference_layer_objects + [build.cxx("fully-connected-output/alexnet.cc")])
                build.unittest("fully-connected-output-vgg-a-test",
                    reference_layer_objects + [build.cxx("fully-connected-output/vgg-a.cc")])
                build.unittest("fully-connected-output-overfeat-fast-test",
                    reference_layer_objects + [build.cxx("fully-connected-output/overfeat-fast.cc")])

            build.smoketest("max-pooling-output-smoketest",
                reference_layer_objects + [build.cxx("max-pooling-output/smoke.cc")])
            build.unittest("max-pooling-output-vgg-a-test",
                reference_layer_objects + [build.cxx("max-pooling-output/vgg-a.cc")])
            build.unittest("max-pooling-output-overfeat-fast",
                reference_layer_objects + [build.cxx("max-pooling-output/overfeat-fast.cc")])

            build.unittest("relu-output-alexnet-test",
                reference_layer_objects + [build.cxx("relu-output/alexnet.cc")])
            build.unittest("relu-output-vgg-a-test",
                reference_layer_objects + [build.cxx("relu-output/vgg-a.cc")])
            build.unittest("relu-output-overfeat-fast-test",
                reference_layer_objects + [build.cxx("relu-output/overfeat-fast.cc")])
            if not options.inference_only:
                build.unittest("relu-input-gradient-alexnet-test",
                    reference_layer_objects + [build.cxx("relu-input-gradient/alexnet.cc")])
                build.unittest("relu-input-gradient-vgg-a-test",
                    reference_layer_objects + [build.cxx("relu-input-gradient/vgg-a.cc")])
                build.unittest("relu-input-gradient-overfeat-fast-test",
                    reference_layer_objects + [build.cxx("relu-input-gradient/overfeat-fast.cc")])

            build.smoketest("softmax-output-smoketest",
                reference_layer_objects + [build.cxx("softmax-output/smoke.cc")])
            build.unittest("softmax-output-imagenet-test",
                reference_layer_objects + [build.cxx("softmax-output/imagenet.cc")])

    # Build automatic benchmarks
    with build.options(source_dir="bench", extra_include_dirs=["bench", "test"], macros=macros, deps={
            (build, build.deps.pthreadpool, build.deps.cpuinfo, build.deps.clog, build.deps.fp16, build.deps.googlebenchmark): all,
            "rt": build.target.is_linux,
            "log": build.target.is_android}):
        build.benchmark("convolution-inference-bench", build.cxx("convolution-inference.cc"))
        build.benchmark("sgemm-bench", build.cxx("sgemm.cc"))
        build.benchmark("sxgemm-bench", build.cxx("sxgemm.cc"))
        build.benchmark("hxgemm-bench", build.cxx("hxgemm.cc"))
        build.benchmark("conv1x1-bench", build.cxx("conv1x1.cc"))
        build.benchmark("winograd-bench", build.cxx("winograd.cc"))

    # Build benchmarking utilities
    if not options.inference_only and not build.target.is_android:
        with build.options(source_dir="bench", extra_include_dirs="bench", macros=macros, deps={
                (build, build.deps.pthreadpool, build.deps.cpuinfo, build.deps.clog): all,
                "rt": build.target.is_linux,
                "log": build.target.is_android}):
support_objects = [build.cc("median.c")]
if build.target.is_x86_64:
support_objects += [build.peachpy("memread.py")]
else:
support_objects += [build.cc("memread.c")]
if build.target.is_linux and build.target.is_x86_64:
support_objects += [build.cc("perf_counter.c")]
build.executable("transform-benchmark",
[build.cc("transform.c")] + support_objects)
build.executable("convolution-benchmark",
[build.cc("convolution.c")] + support_objects)
if not options.convolution_only:
build.executable("fully-connected-benchmark",
[build.cc("fully-connected.c")] + support_objects)
build.executable("pooling-benchmark",
[build.cc("pooling.c")] + support_objects)
build.executable("relu-benchmark",
[build.cc("relu.c")] + support_objects)
return build
if __name__ == "__main__":
import sys
main(sys.argv[1:]).generate()