sglang_v0.5.2/pytorch_2.8.0/third_party/NNPACK/benchmark.py

#!/usr/bin/env python
from __future__ import print_function
def extract_time(line, prefix):
    # Return the time value (in ms, as a string) from a benchmark output line
    # that starts with the given prefix; return None if the prefix does not match.
    if line.startswith(prefix):
        line = line[len(prefix):].lstrip()
        line = line[:line.index(" ms")].rstrip()
        return line
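# Illustrative example (not part of the original script): for a hypothetical
# benchmark output line "Time: 1.234 ms", extract_time(line, "Time:") returns
# the string "1.234"; for any line with a different prefix it returns None.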
def convolution(mode, batch_size, input_channels, output_channels, image_size, kernel_size, padding, algorithm, transform_strategy=None, threads=None, verbose=False, use_selldr=False):
    import subprocess
    if use_selldr:
        import os
        import sys
        nacl_sdk_dir = os.getenv("NACL_SDK_ROOT")
        if nacl_sdk_dir is None:
            print("Error: cannot find Native Client SDK: set NACL_SDK_ROOT environment variable and try again", file=sys.stderr)
            sys.exit(1)
        benchmark_args = [os.path.join(nacl_sdk_dir, "tools", "sel_ldr.py"), "--",
            "bin/convolution-benchmark"]
    else:
        benchmark_args = ["bin/convolution-benchmark"]
    benchmark_args += [
        "-m", mode,
        "-b", str(batch_size),
        "-ic", str(input_channels),
        "-oc", str(output_channels),
        "-is", str(image_size[0]), str(image_size[1]),
        "-ip", str(padding),
        "-ks", str(kernel_size[0]), str(kernel_size[1]),
        "-a", algorithm
    ]
    if mode == "inference" and transform_strategy is not None:
        benchmark_args += ["-ts", transform_strategy]
    if threads is not None:
        benchmark_args += ["-t", str(threads)]
    # universal_newlines=True decodes stdout to str, so the string parsing below
    # also works on Python 3.
    benchmark = subprocess.Popen(benchmark_args, stdout=subprocess.PIPE, universal_newlines=True)
    benchmark_stdout, _ = benchmark.communicate()
    if benchmark.returncode == 0:
        output_lines = [line for line in benchmark_stdout.splitlines() if len(line)]
        total, input_transform, kernel_transform, output_transform, block_multiplication, overhead = None, None, None, None, None, None
        for output_line in output_lines:
            total = total or extract_time(output_line, "Time:")
            input_transform = input_transform or extract_time(output_line, "Input transform:")
            kernel_transform = kernel_transform or extract_time(output_line, "Kernel transform:")
            output_transform = output_transform or extract_time(output_line, "Output transform:")
            block_multiplication = block_multiplication or extract_time(output_line, "Block multiplication:")
            overhead = overhead or extract_time(output_line, "Overhead:")
        if verbose:
            return (total, input_transform, kernel_transform, output_transform, block_multiplication, overhead)
        else:
            return (total,)
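# Illustrative usage sketch (not part of the original script; assumes
# bin/convolution-benchmark has been built): time a hypothetical 3x3 layer
# with the Winograd (wt8x8) kernel in inference mode.
#
#   total, = convolution("inference", 1, 64, 128, (56, 56), (3, 3), 1, "wt8x8")
#   print("wt8x8 total: {0} ms".format(total))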
def fully_connected(mode, batch_size, input_channels, output_channels, threads=None, verbose=False, use_selldr=False):
    import subprocess
    if use_selldr:
        import os
        import sys
        nacl_sdk_dir = os.getenv("NACL_SDK_ROOT")
        if nacl_sdk_dir is None:
            print("Error: cannot find Native Client SDK: set NACL_SDK_ROOT environment variable and try again", file=sys.stderr)
            sys.exit(1)
        benchmark_args = [os.path.join(nacl_sdk_dir, "tools", "sel_ldr.py"), "--",
            "bin/fully-connected-benchmark"]
    else:
        benchmark_args = ["bin/fully-connected-benchmark"]
    benchmark_args += [
        "-m", mode,
        "-b", str(batch_size),
        "-ic", str(input_channels),
        "-oc", str(output_channels)
    ]
    if threads is not None:
        benchmark_args += ["-t", str(threads)]
    # universal_newlines=True decodes stdout to str, so the string parsing below
    # also works on Python 3.
    benchmark = subprocess.Popen(benchmark_args, stdout=subprocess.PIPE, universal_newlines=True)
    benchmark_stdout, _ = benchmark.communicate()
    if benchmark.returncode == 0:
        output_lines = [line for line in benchmark_stdout.splitlines() if len(line)]
        total, input_transform, kernel_transform, block_multiplication, overhead = None, None, None, None, None
        for output_line in output_lines:
            total = total or extract_time(output_line, "Time:")
            input_transform = input_transform or extract_time(output_line, "Input packing:")
            kernel_transform = kernel_transform or extract_time(output_line, "Kernel packing:")
            block_multiplication = block_multiplication or extract_time(output_line, "Block multiplication:")
            overhead = overhead or extract_time(output_line, "Overhead:")
        if verbose:
            return (total, input_transform, kernel_transform, block_multiplication, overhead)
        else:
            return (total,)
overfeat_fast_layers = [
    ("conv2", 96, 256, (24, 24), (5, 5), 0),
    ("conv3", 256, 512, (12, 12), (3, 3), 1),
    ("conv4", 512, 1024, (12, 12), (3, 3), 1),
    ("conv5", 1024, 1024, (12, 12), (3, 3), 1),
    ("fc6", 36864, 3072),
    ("fc7", 3072, 4096),
    ("fc8", 4096, 1000),
]
alexnet_layers = [
    ("conv2", 64, 192, (27, 27), (5, 5), 2),
    ("conv3", 192, 384, (13, 13), (3, 3), 1),
    ("conv4", 384, 256, (13, 13), (3, 3), 1),
    ("conv5", 256, 256, (13, 13), (3, 3), 1),
    ("fc6", 12544, 4096),
    ("fc7", 4096, 4096),
    ("fc8", 4096, 1000),
]
vgg_a_layers = [
    ("conv1", 3, 64, (224, 224), (3, 3), 1),
    ("conv2", 64, 128, (112, 112), (3, 3), 1),
    ("conv3.1", 128, 256, (56, 56), (3, 3), 1),
    ("conv3.2", 256, 256, (56, 56), (3, 3), 1),
    ("conv4.1", 256, 512, (28, 28), (3, 3), 1),
    ("conv4.2", 512, 512, (28, 28), (3, 3), 1),
    ("conv5", 512, 512, (14, 14), (3, 3), 1),
    ("fc6", 25088, 4096),
    ("fc7", 4096, 4096),
    ("fc8", 4096, 1000),
]
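# Note on the tables above (added for clarity): convolutional entries are
# (name, input_channels, output_channels, image_size, kernel_size, padding),
# and fully-connected entries are (name, input_channels, output_channels),
# matching how they are unpacked in the per-layer loops below.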
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="NNPACK benchmarking script")
    parser.add_argument("--enable-selldr", dest="use_selldr", action="store_true")
    parser.add_argument("-l", "--layer", dest="layer", required=True, choices=["convolution", "fully-connected", "pooling"])
    parser.add_argument("-n", "--network", dest="network", required=True, choices=["vgg-a", "alexnet", "overfeat-fast"])
    parser.add_argument("-m", "--mode", dest="mode", required=True, choices=["inference", "output", "input-gradient", "kernel-gradient"])
    parser.add_argument("--transform-strategy", dest="transform_strategy", default="compute", choices=["compute", "precompute"])
    parser.add_argument("-b", "--batch", dest="batch", type=int)
    parser.add_argument("-t", "--threads", dest="threads")
    parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", default=False)
    options = parser.parse_args()

    network_layers, default_batch = {
        "vgg-a": (vgg_a_layers, 64),
        "alexnet": (alexnet_layers, 128),
        "overfeat-fast": (overfeat_fast_layers, 128)
    }[options.network]
    layer_prefix = {
        "convolution": "conv",
        "fully-connected": "fc",
        "pooling": "pool"
    }[options.layer]
    network_layers = [layer for layer in network_layers if layer[0].startswith(layer_prefix)]

    batch = default_batch
    if options.batch is not None:
        batch = options.batch
        if batch != 1 and options.mode == "inference":
            raise ValueError("Non-unit batch {batch} is not allowed in inference mode".format(batch=batch))
    elif options.mode == "inference":
        batch = 1

    if options.transform_strategy is not None:
        if options.layer != "convolution":
            raise ValueError("Transform strategy {transform_strategy} is meaningless for non-convolutional layers".format(transform_strategy=options.transform_strategy))
        elif options.mode != "inference":
            raise ValueError("Transform strategy {transform_strategy} is meaningless in non-inference mode".format(transform_strategy=options.transform_strategy))

    if options.layer == "convolution":
        for name, input_channels, output_channels, image_size, kernel_size, padding in network_layers:
            measurements = [name]
            for algorithm in ["implicit-gemm", "ft8x8", "ft16x16", "wt8x8"]:
                # Winograd (wt) kernels only support 3x3 convolutions.
                if algorithm.startswith("wt") and kernel_size != (3, 3):
                    continue
                measurements += list(convolution(options.mode, batch, input_channels, output_channels,
                    image_size, kernel_size, padding, algorithm,
                    transform_strategy=options.transform_strategy,
                    threads=options.threads, verbose=options.verbose, use_selldr=options.use_selldr))
            print("\t".join(map(str, measurements)))
    elif options.layer == "fully-connected":
        for name, input_channels, output_channels in network_layers:
            measurements = fully_connected(options.mode, batch, input_channels, output_channels,
                threads=options.threads, verbose=options.verbose, use_selldr=options.use_selldr)
            print("{name}\t{measurements}".format(name=name, measurements="\t".join(measurements)))