185 lines
7.7 KiB
Python
Executable File
185 lines
7.7 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
from __future__ import print_function
|
|
|
|
|
|
def extract_time(line, prefix):
|
|
if line.startswith(prefix):
|
|
line = line[len(prefix):].lstrip()
|
|
line = line[:line.index(" ms")].rstrip()
|
|
return line
|
|
|
|
|
|
def convolution(mode, batch_size, input_channels, output_channels, image_size, kernel_size, padding, algorithm, transform_strategy=None, threads=None, verbose=False, use_selldr=False):
|
|
import subprocess
|
|
if use_selldr:
|
|
import os
|
|
import sys
|
|
nacl_sdk_dir = os.getenv("NACL_SDK_ROOT")
|
|
if nacl_sdk_dir is None:
|
|
print("Error: can not find Native Client SDK: set NACL_SDK_ROOT envorinment variable and try again", file=sys.stderr)
|
|
sys.exit(1)
|
|
benchmark_args = [os.path.join(nacl_sdk_dir, "tools", "sel_ldr.py"), "--",
|
|
"bin/convolution-benchmark"]
|
|
else:
|
|
benchmark_args = ["bin/convolution-benchmark"]
|
|
benchmark_args += [
|
|
"-m", mode,
|
|
"-b", str(batch_size),
|
|
"-ic", str(input_channels),
|
|
"-oc", str(output_channels),
|
|
"-is", str(image_size[0]), str(image_size[1]),
|
|
"-ip", str(padding),
|
|
"-ks", str(kernel_size[0]), str(kernel_size[1]),
|
|
"-a", algorithm
|
|
]
|
|
if mode == "inference" and transform_strategy is not None:
|
|
benchmark_args += ["-ts", transform_strategy]
|
|
if threads is not None:
|
|
benchmark_args += ["-t", str(threads)]
|
|
benchmark = subprocess.Popen(benchmark_args, stdout=subprocess.PIPE)
|
|
benchmark_stdout, _ = benchmark.communicate()
|
|
if benchmark.returncode == 0:
|
|
output_lines = [line for line in benchmark_stdout.splitlines() if len(line)]
|
|
total, input_transform, kernel_transform, output_transform, block_multiplication, overhead = None, None, None, None, None, None
|
|
for output_line in output_lines:
|
|
total = total or extract_time(output_line, "Time:")
|
|
input_transform = input_transform or extract_time(output_line, "Input transform:")
|
|
kernel_transform = kernel_transform or extract_time(output_line, "Kernel transform:")
|
|
output_transform = output_transform or extract_time(output_line, "Output transform:")
|
|
block_multiplication = block_multiplication or extract_time(output_line, "Block multiplication:")
|
|
overhead = overhead or extract_time(output_line, "Overhead:")
|
|
if verbose:
|
|
return (total, input_transform, kernel_transform, output_transform, block_multiplication, overhead)
|
|
else:
|
|
return (total,)
|
|
|
|
def fully_connected(mode, batch_size, input_channels, output_channels, threads=None, verbose=False, use_selldr=False):
|
|
import subprocess
|
|
if use_selldr:
|
|
import os
|
|
import sys
|
|
nacl_sdk_dir = os.getenv("NACL_SDK_ROOT")
|
|
if nacl_sdk_dir is None:
|
|
print("Error: can not find Native Client SDK: set NACL_SDK_ROOT envorinment variable and try again", file=sys.stderr)
|
|
sys.exit(1)
|
|
benchmark_args = [os.path.join(nacl_sdk_dir, "tools", "sel_ldr.py"), "--",
|
|
"bin/fully-connected-benchmark"]
|
|
else:
|
|
benchmark_args = ["bin/fully-connected-benchmark"]
|
|
benchmark_args += [
|
|
"-m", mode,
|
|
"-b", str(batch_size),
|
|
"-ic", str(input_channels),
|
|
"-oc", str(output_channels)
|
|
]
|
|
if threads is not None:
|
|
benchmark_args += ["-t", str(threads)]
|
|
benchmark = subprocess.Popen(benchmark_args, stdout=subprocess.PIPE)
|
|
benchmark_stdout, _ = benchmark.communicate()
|
|
if benchmark.returncode == 0:
|
|
output_lines = [line for line in benchmark_stdout.splitlines() if len(line)]
|
|
total, input_transform, kernel_transform, block_multiplication, overhead = None, None, None, None, None
|
|
for output_line in output_lines:
|
|
total = total or extract_time(output_line, "Time:")
|
|
input_transform = input_transform or extract_time(output_line, "Input packing:")
|
|
kernel_transform = kernel_transform or extract_time(output_line, "Kernel packing:")
|
|
block_multiplication = block_multiplication or extract_time(output_line, "Block multiplication:")
|
|
overhead = overhead or extract_time(output_line, "Overhead:")
|
|
if verbose:
|
|
return (total, input_transform, kernel_transform, block_multiplication, overhead)
|
|
else:
|
|
return (total,)
|
|
|
|
overfeat_fast_layers = [
|
|
("conv2", 96, 256, (24, 24), (5, 5), 0),
|
|
("conv3", 256, 512, (12, 12), (3, 3), 1),
|
|
("conv4", 512, 1024, (12, 12), (3, 3), 1),
|
|
("conv5", 1024, 1024, (12, 12), (3, 3), 1),
|
|
("fc6", 36864, 3072),
|
|
("fc7", 3072, 4096),
|
|
("fc8", 4096, 1000),
|
|
]
|
|
|
|
alexnet_layers = [
|
|
("conv2", 64, 192, (27, 27), (5, 5), 2),
|
|
("conv3", 192, 384, (13, 13), (3, 3), 1),
|
|
("conv4", 384, 256, (13, 13), (3, 3), 1),
|
|
("conv5", 256, 256, (13, 13), (3, 3), 1),
|
|
("fc6", 12544, 4096),
|
|
("fc7", 4096, 4096),
|
|
("fc8", 4096, 1000),
|
|
]
|
|
|
|
vgg_a_layers = [
|
|
("conv1", 3, 64, (224, 224), (3, 3), 1),
|
|
("conv2", 64, 128, (112, 112), (3, 3), 1),
|
|
("conv3.1", 128, 256, (56, 56), (3, 3), 1),
|
|
("conv3.2", 256, 256, (56, 56), (3, 3), 1),
|
|
("conv4.1", 256, 512, (28, 28), (3, 3), 1),
|
|
("conv4.2", 512, 512, (28, 28), (3, 3), 1),
|
|
("conv5", 512, 512, (14, 14), (3, 3), 1),
|
|
("fc6", 25088, 4096),
|
|
("fc7", 4096, 4096),
|
|
("fc8", 4096, 1000),
|
|
]
|
|
|
|
if __name__ == "__main__":
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(
|
|
description="NNPACK benchmarking script")
|
|
parser.add_argument("--enable-selldr", dest="use_selldr", action="store_true")
|
|
parser.add_argument("-l", "--layer", dest="layer", required=True, choices=["convolution", "fully-connected", "pooling"])
|
|
parser.add_argument("-n", "--network", dest="network", required=True, choices=["vgg-a", "alexnet", "overfeat-fast"])
|
|
parser.add_argument("-m", "--mode", dest="mode", required=True, choices=["inference", "output", "input-gradient", "kernel-gradient"])
|
|
parser.add_argument("--transform-strategy", dest="transform_strategy", default="compute", choices=["compute", "precompute"])
|
|
parser.add_argument("-b", "--batch", dest="batch", type=int)
|
|
parser.add_argument("-t", "--threads", dest="threads")
|
|
parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", default=False)
|
|
|
|
options = parser.parse_args()
|
|
|
|
network_layers, default_batch = {
|
|
"vgg-a": (vgg_a_layers, 64),
|
|
"alexnet": (alexnet_layers, 128),
|
|
"overfeat-fast": (overfeat_fast_layers, 128)
|
|
}[options.network]
|
|
layer_prefix = {
|
|
"convolution": "conv",
|
|
"fully-connected": "fc",
|
|
"pooling": "pool"
|
|
}[options.layer]
|
|
network_layers = [layer for layer in network_layers if layer[0].startswith(layer_prefix)]
|
|
|
|
batch = default_batch
|
|
if options.batch is not None:
|
|
batch = options.batch
|
|
if batch != 1 and options.mode == "inference":
|
|
raise ValueError("Non-unit batch {batch} is not allowed in inference mode".format(batch=batch))
|
|
elif options.mode == "inference":
|
|
batch = 1
|
|
if options.transform_strategy is not None:
|
|
if options.layer != "convolution":
|
|
raise ValueError("Transform strategy {transform_strategy} is meaningless for non-convolutional layers".format(transform_strategy=transform_strategy))
|
|
elif options.mode != "inference":
|
|
raise ValueError("Transform strategy {transform_strategy} is meaningless in non-inference mode".format(transform_strategy=transform_strategy))
|
|
|
|
if options.layer == "convolution":
|
|
for name, input_channels, output_channels, image_size, kernel_size, padding in network_layers:
|
|
measurements = [name]
|
|
for algorithm in ["implicit-gemm", "ft8x8", "ft16x16", "wt8x8"]:
|
|
if algorithm.startswith("wt") and kernel_size != (3, 3):
|
|
continue
|
|
|
|
measurements += list(convolution(options.mode, batch, input_channels, output_channels,
|
|
image_size, kernel_size, padding, algorithm,
|
|
transform_strategy=options.transform_strategy,
|
|
threads=options.threads, verbose=options.verbose, use_selldr=options.use_selldr))
|
|
print("\t".join(map(str, measurements)))
|
|
elif options.layer == "fully-connected":
|
|
for name, input_channels, output_channels in network_layers:
|
|
measurements = fully_connected(options.mode, batch, input_channels, output_channels,
|
|
threads=options.threads, verbose=options.verbose, use_selldr=options.use_selldr)
|
|
print("{name}\t{measurements}".format(name=name, measurements="\t".join(measurements)))
|