sglang_v0.5.2/vision_0.23.0/benchmarks/encoding_decoding.py

100 lines
3.8 KiB
Python

import os
import platform
import statistics
import torch
import torch.utils.benchmark as benchmark
import torchvision
def print_machine_specs():
print("Processor:", platform.processor())
print("Platform:", platform.platform())
print("Logical CPUs:", os.cpu_count())
print(f"\nCUDA device: {torch.cuda.get_device_name()}")
print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
def get_data():
transform = torchvision.transforms.Compose(
[
torchvision.transforms.PILToTensor(),
]
)
path = os.path.join(os.getcwd(), "data")
testset = torchvision.datasets.Places365(
root="./data", download=not os.path.exists(path), transform=transform, split="val"
)
testloader = torch.utils.data.DataLoader(
testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] for r in batch]
)
return next(iter(testloader))
def run_encoding_benchmark(decoded_images):
results = []
for device in ["cpu", "cuda"]:
decoded_images_device = [t.to(device=device) for t in decoded_images]
for size in [1, 100, 1000]:
for num_threads in [1, 12, 24]:
for stmt, strat in zip(
[
"[torchvision.io.encode_jpeg(img) for img in decoded_images_device_trunc]",
"torchvision.io.encode_jpeg(decoded_images_device_trunc)",
],
["unfused", "fused"],
):
decoded_images_device_trunc = decoded_images_device[:size]
t = benchmark.Timer(
stmt=stmt,
setup="import torchvision",
globals={"decoded_images_device_trunc": decoded_images_device_trunc},
label="Image Encoding",
sub_label=f"{device.upper()} ({strat}): {stmt}",
description=f"{size} images",
num_threads=num_threads,
)
results.append(t.blocked_autorange())
compare = benchmark.Compare(results)
compare.print()
def run_decoding_benchmark(encoded_images):
results = []
for device in ["cpu", "cuda"]:
for size in [1, 100, 1000]:
for num_threads in [1, 12, 24]:
for stmt, strat in zip(
[
f"[torchvision.io.decode_jpeg(img, device='{device}') for img in encoded_images_trunc]",
f"torchvision.io.decode_jpeg(encoded_images_trunc, device='{device}')",
],
["unfused", "fused"],
):
encoded_images_trunc = encoded_images[:size]
t = benchmark.Timer(
stmt=stmt,
setup="import torchvision",
globals={"encoded_images_trunc": encoded_images_trunc},
label="Image Decoding",
sub_label=f"{device.upper()} ({strat}): {stmt}",
description=f"{size} images",
num_threads=num_threads,
)
results.append(t.blocked_autorange())
compare = benchmark.Compare(results)
compare.print()
if __name__ == "__main__":
print_machine_specs()
decoded_images = get_data()
mean_h, mean_w = statistics.mean(t.shape[-2] for t in decoded_images), statistics.mean(
t.shape[-1] for t in decoded_images
)
print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}")
run_encoding_benchmark(decoded_images)
encoded_images_cuda = torchvision.io.encode_jpeg([img.cuda() for img in decoded_images])
encoded_images_cpu = [img.cpu() for img in encoded_images_cuda]
run_decoding_benchmark(encoded_images_cpu)