sglang_v0.5.2/vision_0.22.1/references/video_classification/datasets.py

16 lines
440 B
Python

from typing import Tuple
import torchvision
from torch import Tensor
class KineticsWithVideoId(torchvision.datasets.Kinetics):
    """Kinetics dataset whose items also carry the index of the source video.

    Each item is the same ``(video, audio, label)`` triple built from
    ``self.video_clips`` and ``self.samples``, extended with the
    ``video_idx`` reported by ``self.video_clips.get_clip``.
    """

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int, int]:
        """Return the clip at position ``idx`` together with its video index.

        Args:
            idx: Index of the clip, passed straight to
                ``self.video_clips.get_clip``.

        Returns:
            A 4-tuple ``(video, audio, label, video_idx)`` where ``video`` has
            been passed through ``self.transform`` when one is set, ``label``
            is ``self.samples[video_idx][1]``, and ``video_idx`` is the index
            returned by ``get_clip`` for the video the clip was taken from.
        """
        # The third element returned by get_clip is not needed here.
        video, audio, _info, video_idx = self.video_clips.get_clip(idx)

        # Labels are stored per video, so look them up by video index rather
        # than by clip index.
        label = self.samples[video_idx][1]

        # Only the video stream is transformed; audio is returned as-is.
        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label, video_idx