import os
import platform
import statistics

import torch
import torch.utils.benchmark as benchmark
import torchvision


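# Print basic host and GPU specs (CPU, platform, logical core count, CUDA device
# and its total memory) so the benchmark results can be read in context.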
def print_machine_specs():
    print("Processor:", platform.processor())
    print("Platform:", platform.platform())
    print("Logical CPUs:", os.cpu_count())
    print(f"\nCUDA device: {torch.cuda.get_device_name()}")
    print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


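# Load one batch of 1,000 Places365 validation images as uint8 tensors.
# The dataset is downloaded into ./data on the first run only.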
def get_data():
    transform = torchvision.transforms.Compose(
        [
            torchvision.transforms.PILToTensor(),
        ]
    )
    path = os.path.join(os.getcwd(), "data")
    testset = torchvision.datasets.Places365(
        root="./data", download=not os.path.exists(path), transform=transform, split="val"
    )
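    # The collate_fn keeps the batch as a plain list of image tensors instead of
    # stacking them into one tensor; the benchmarks below operate on lists of images.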
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] for r in batch]
    )
    return next(iter(testloader))


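# Benchmark JPEG encoding on CPU and CUDA, comparing a per-image Python loop
# ("unfused") against a single batched encode_jpeg call ("fused") across several
# batch sizes and CPU thread counts.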
def run_encoding_benchmark(decoded_images):
    results = []
    for device in ["cpu", "cuda"]:
        decoded_images_device = [t.to(device=device) for t in decoded_images]
        for size in [1, 100, 1000]:
            for num_threads in [1, 12, 24]:
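                # "unfused" encodes the images one at a time in a Python loop;
                # "fused" passes the whole list to a single encode_jpeg call.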
                for stmt, strat in zip(
                    [
                        "[torchvision.io.encode_jpeg(img) for img in decoded_images_device_trunc]",
                        "torchvision.io.encode_jpeg(decoded_images_device_trunc)",
                    ],
                    ["unfused", "fused"],
                ):
                    decoded_images_device_trunc = decoded_images_device[:size]
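                    # Timer executes stmt with the globals below; num_threads sets
                    # the size of the PyTorch CPU thread pool during the measurement.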
                    t = benchmark.Timer(
                        stmt=stmt,
                        setup="import torchvision",
                        globals={"decoded_images_device_trunc": decoded_images_device_trunc},
                        label="Image Encoding",
                        sub_label=f"{device.upper()} ({strat}): {stmt}",
                        description=f"{size} images",
                        num_threads=num_threads,
                    )
                    results.append(t.blocked_autorange())
    compare = benchmark.Compare(results)
    compare.print()


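# Benchmark JPEG decoding of the same images, again comparing a per-image loop
# ("unfused") with a single batched decode_jpeg call ("fused") on CPU and CUDA.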
def run_decoding_benchmark(encoded_images):
    results = []
    for device in ["cpu", "cuda"]:
        for size in [1, 100, 1000]:
            for num_threads in [1, 12, 24]:
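                # The encoded inputs stay on the CPU; the device argument selects
                # where decode_jpeg runs and places its output.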
                for stmt, strat in zip(
                    [
                        f"[torchvision.io.decode_jpeg(img, device='{device}') for img in encoded_images_trunc]",
                        f"torchvision.io.decode_jpeg(encoded_images_trunc, device='{device}')",
                    ],
                    ["unfused", "fused"],
                ):
                    encoded_images_trunc = encoded_images[:size]
                    t = benchmark.Timer(
                        stmt=stmt,
                        setup="import torchvision",
                        globals={"encoded_images_trunc": encoded_images_trunc},
                        label="Image Decoding",
                        sub_label=f"{device.upper()} ({strat}): {stmt}",
                        description=f"{size} images",
                        num_threads=num_threads,
                    )
                    results.append(t.blocked_autorange())
    compare = benchmark.Compare(results)
    compare.print()


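# Driver: print machine specs, load the data, report the mean image size, run the
# encoding benchmark, then re-encode the images to JPEG to feed the decoding benchmark.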
if __name__ == "__main__":
    print_machine_specs()
    decoded_images = get_data()
    mean_h = statistics.mean(t.shape[-2] for t in decoded_images)
    mean_w = statistics.mean(t.shape[-1] for t in decoded_images)
    print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}")
    run_encoding_benchmark(decoded_images)
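    # Encode every image once on the GPU, then move the resulting JPEG byte tensors
    # to the CPU so the decoding benchmark controls placement via its device argument.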
    encoded_images_cuda = torchvision.io.encode_jpeg([img.cuda() for img in decoded_images])
    encoded_images_cpu = [img.cpu() for img in encoded_images_cuda]
    run_decoding_benchmark(encoded_images_cpu)