# sglang_v0.5.2/vision_0.22.1/test/test_video_reader.py
import collections
import math
import os
from fractions import Fraction
import numpy as np
import pytest
import torch
import torchvision.io as io
from common_utils import assert_equal
from numpy.random import randint
from pytest import approx
from torchvision import set_video_backend
from torchvision.io import _HAS_CPU_VIDEO_DECODER

try:
import av
# Do a version test too
io.video._check_av_available()
except ImportError:
av = None

VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
CheckerConfig = [
"duration",
"video_fps",
"audio_sample_rate",
    # We find that for some videos (e.g. HMDB51 videos), the decoded audio frames
    # and pts differ slightly between the TorchVision and PyAV decoders, so we
    # omit them during the check.
"check_aframes",
"check_aframe_pts",
]
GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig))
all_check_config = GroundTruth(
duration=0,
video_fps=0,
audio_sample_rate=0,
check_aframes=True,
check_aframe_pts=True,
)
test_videos = {
"RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(
duration=2.0,
video_fps=30.0,
audio_sample_rate=None,
check_aframes=True,
check_aframe_pts=True,
),
"SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth(
duration=2.0,
video_fps=30.0,
audio_sample_rate=None,
check_aframes=True,
check_aframe_pts=True,
),
"TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(
duration=2.0,
video_fps=30.0,
audio_sample_rate=None,
check_aframes=True,
check_aframe_pts=True,
),
"v_SoccerJuggling_g23_c01.avi": GroundTruth(
duration=8.0,
video_fps=29.97,
audio_sample_rate=None,
check_aframes=True,
check_aframe_pts=True,
),
"v_SoccerJuggling_g24_c01.avi": GroundTruth(
duration=8.0,
video_fps=29.97,
audio_sample_rate=None,
check_aframes=True,
check_aframe_pts=True,
),
"R6llTwEh07w.mp4": GroundTruth(
duration=10.0,
video_fps=30.0,
audio_sample_rate=44100,
        # PyAV misses one audio frame at the beginning (pts=0)
check_aframes=False,
check_aframe_pts=False,
),
"SOX5yA1l24A.mp4": GroundTruth(
duration=11.0,
video_fps=29.97,
audio_sample_rate=48000,
        # PyAV misses one audio frame at the beginning (pts=0)
check_aframes=False,
check_aframe_pts=False,
),
"WUzgd7C1pWA.mp4": GroundTruth(
duration=11.0,
video_fps=29.97,
audio_sample_rate=48000,
        # PyAV misses one audio frame at the beginning (pts=0)
check_aframes=False,
check_aframe_pts=False,
),
}
DecoderResult = collections.namedtuple("DecoderResult", "vframes vframe_pts vtimebase aframes aframe_pts atimebase")

# av_seek_frame is imprecise, so seek to a timestamp earlier by a margin.
# The margin is in seconds.
SEEK_FRAME_MARGIN = 0.25
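
# Illustration of the margin idea (hypothetical numbers): for a stream with a
# 1/30 s time base, a request starting at 3.0 s (pts 90) would seek to
# 3.0 - 0.25 = 2.75 s, i.e. floor(2.75 / (1/30)) = pts 82, and decoded frames
# with pts < 90 are then skipped.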


def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer_size=4):
    """
    Args:
        container: pyav container
        start_pts/end_pts: the starting/ending Presentation TimeStamps (PTS)
            between which frames are read
        stream: pyav stream
        stream_name: a dictionary of streams. For example, {"video": 0} means
            video stream at stream index 0
        buffer_size: the pts of frames decoded by PyAV are not guaranteed to be
            in ascending order, so keep decoding up to buffer_size extra frames
            after passing end_pts
    """
    # seeking in the stream is imprecise, so seek to an earlier PTS by a margin
    # (here the margin is one tick in the stream's time base)
margin = 1
seek_offset = max(start_pts - margin, 0)
container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
frames = {}
buffer_count = 0
for frame in container.decode(**stream_name):
if frame.pts < start_pts:
continue
if frame.pts <= end_pts:
frames[frame.pts] = frame
else:
buffer_count += 1
if buffer_count >= buffer_size:
break
result = [frames[pts] for pts in sorted(frames)]
return result
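

# A minimal usage sketch of _read_from_stream (assumes a local test asset is
# available); it returns decoded frames sorted by pts:
#
#   with av.open(os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4")) as container:
#       frames = _read_from_stream(
#           container, 0, float("inf"), container.streams.video[0], {"video": 0}
#       )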


def _get_timebase_by_av_module(full_path):
container = av.open(full_path)
video_time_base = container.streams.video[0].time_base
if container.streams.audio:
audio_time_base = container.streams.audio[0].time_base
else:
audio_time_base = None
return video_time_base, audio_time_base


def _fraction_to_tensor(fraction):
ret = torch.zeros([2], dtype=torch.int32)
ret[0] = fraction.numerator
ret[1] = fraction.denominator
return ret
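
# For example: _fraction_to_tensor(Fraction(1, 30)) -> tensor([1, 30], dtype=torch.int32)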


def _decode_frames_by_av_module(
full_path,
video_start_pts=0,
video_end_pts=None,
audio_start_pts=0,
audio_end_pts=None,
):
"""
Use PyAv to decode video frames. This provides a reference for our decoder
to compare the decoding results.
Input arguments:
full_path: video file path
video_start_pts/video_end_pts: the starting/ending Presentation TimeStamp where
frames are read
"""
if video_end_pts is None:
video_end_pts = float("inf")
if audio_end_pts is None:
audio_end_pts = float("inf")
container = av.open(full_path)
video_frames = []
vtimebase = torch.zeros([0], dtype=torch.int32)
if container.streams.video:
video_frames = _read_from_stream(
container,
video_start_pts,
video_end_pts,
container.streams.video[0],
{"video": 0},
)
        # container.streams.video[0].average_rate is not a reliable estimator of
        # the frame rate; it can be wrong for certain codecs, such as VP8 (VP80),
        # so we do not return the video fps here.
vtimebase = _fraction_to_tensor(container.streams.video[0].time_base)
audio_frames = []
atimebase = torch.zeros([0], dtype=torch.int32)
if container.streams.audio:
audio_frames = _read_from_stream(
container,
audio_start_pts,
audio_end_pts,
container.streams.audio[0],
{"audio": 0},
)
atimebase = _fraction_to_tensor(container.streams.audio[0].time_base)
container.close()
vframes = [frame.to_rgb().to_ndarray() for frame in video_frames]
vframes = torch.as_tensor(np.stack(vframes))
vframe_pts = torch.tensor([frame.pts for frame in video_frames], dtype=torch.int64)
aframes = [frame.to_ndarray() for frame in audio_frames]
if aframes:
aframes = np.transpose(np.concatenate(aframes, axis=1))
aframes = torch.as_tensor(aframes)
else:
aframes = torch.empty((1, 0), dtype=torch.float32)
aframe_pts = torch.tensor([audio_frame.pts for audio_frame in audio_frames], dtype=torch.int64)
return DecoderResult(
vframes=vframes,
vframe_pts=vframe_pts,
vtimebase=vtimebase,
aframes=aframes,
aframe_pts=aframe_pts,
atimebase=atimebase,
)
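
# Sketch of the expected shapes (using an asset from test_videos):
#   ref = _decode_frames_by_av_module(os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4"))
# ref.vframes has shape (num_frames, height, width, 3) in RGB order, and
# ref.aframes has shape (num_samples, num_channels).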


def _pts_convert(pts, timebase_from, timebase_to, round_func=math.floor):
    """Convert pts between different time bases.
    Args:
        pts: presentation timestamp, an integer count of ticks in timebase_from
        timebase_from: original time base, a Fraction
        timebase_to: new time base, a Fraction
        round_func: rounding function
    """
new_pts = Fraction(pts, 1) * timebase_from / timebase_to
return int(round_func(new_pts))
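
# Worked example: pts 90000 in a 1/90000 time base is 1.0 s, so converting to
# a 1/44100 time base gives
#   _pts_convert(90000, Fraction(1, 90000), Fraction(1, 44100)) == 44100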


def _get_video_tensor(video_dir, video_file):
    """Open a video file and return (full_path, raw bytes as a uint8 PyTorch tensor)."""
    full_path = os.path.join(video_dir, video_file)
    assert os.path.exists(full_path), f"File not found: {full_path}"
with open(full_path, "rb") as fp:
video_tensor = torch.frombuffer(fp.read(), dtype=torch.uint8)
return full_path, video_tensor
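
# Example (using an asset from test_videos): the returned tensor is the raw
# encoded bitstream, which is the form read_video_from_memory expects:
#   _, data = _get_video_tensor(VIDEO_DIR, "R6llTwEh07w.mp4")
#   assert data.dtype == torch.uint8 and data.dim() == 1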


@pytest.mark.skipif(av is None, reason="PyAV unavailable")
@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg")
class TestVideoReader:
def check_separate_decoding_result(self, tv_result, config):
"""check the decoding results from TorchVision decoder"""
(
vframes,
vframe_pts,
vtimebase,
vfps,
vduration,
aframes,
aframe_pts,
atimebase,
asample_rate,
aduration,
) = tv_result
video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item())
assert video_duration == approx(config.duration, abs=0.5)
assert vfps.item() == approx(config.video_fps, abs=0.5)
if asample_rate.numel() > 0:
assert asample_rate.item() == config.audio_sample_rate
audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item())
assert audio_duration == approx(config.duration, abs=0.5)
# check if pts of video frames are sorted in ascending order
for i in range(len(vframe_pts) - 1):
assert vframe_pts[i] < vframe_pts[i + 1]
if len(aframe_pts) > 1:
# check if pts of audio frames are sorted in ascending order
for i in range(len(aframe_pts) - 1):
assert aframe_pts[i] < aframe_pts[i + 1]

    def check_probe_result(self, result, config):
vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result
video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item())
assert video_duration == approx(config.duration, abs=0.5)
assert vfps.item() == approx(config.video_fps, abs=0.5)
if asample_rate.numel() > 0:
assert asample_rate.item() == config.audio_sample_rate
audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item())
assert audio_duration == approx(config.duration, abs=0.5)

    def check_meta_result(self, result, config):
assert result.video_duration == approx(config.duration, abs=0.5)
assert result.video_fps == approx(config.video_fps, abs=0.5)
if result.has_audio > 0:
assert result.audio_sample_rate == config.audio_sample_rate
assert result.audio_duration == approx(config.duration, abs=0.5)

    def compare_decoding_result(self, tv_result, ref_result, config=all_check_config):
        """
        Compare decoding results from two sources.
        Args:
            tv_result: decoding results from TorchVision decoder
            ref_result: reference decoding results, either from the PyAV decoder
                or from the TorchVision decoder with getPtsOnly = 1
            config: config of the decoding-results checker
        """
(
vframes,
vframe_pts,
vtimebase,
_vfps,
_vduration,
aframes,
aframe_pts,
atimebase,
_asample_rate,
_aduration,
) = tv_result
if isinstance(ref_result, list):
# the ref_result is from new video_reader decoder
ref_result = DecoderResult(
vframes=ref_result[0],
vframe_pts=ref_result[1],
vtimebase=ref_result[2],
aframes=ref_result[5],
aframe_pts=ref_result[6],
atimebase=ref_result[7],
)
if vframes.numel() > 0 and ref_result.vframes.numel() > 0:
mean_delta = torch.mean(torch.abs(vframes.float() - ref_result.vframes.float()))
assert mean_delta == approx(0.0, abs=8.0)
mean_delta = torch.mean(torch.abs(vframe_pts.float() - ref_result.vframe_pts.float()))
assert mean_delta == approx(0.0, abs=1.0)
assert_equal(vtimebase, ref_result.vtimebase)
        if config.check_aframes and aframes.numel() > 0 and ref_result.aframes.numel() > 0:
            # an audio stream is available and audio frames are requested from
            # the decoder
            assert_equal(aframes, ref_result.aframes)
        if config.check_aframe_pts and aframe_pts.numel() > 0 and ref_result.aframe_pts.numel() > 0:
            # an audio stream is available
            assert_equal(aframe_pts, ref_result.aframe_pts)
assert_equal(atimebase, ref_result.atimebase)
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_stress_test_read_video_from_file(self, test_video):
pytest.skip(
"This stress test will iteratively decode the same set of videos."
"It helps to detect memory leak but it takes lots of time to run."
"By default, it is disabled"
)
num_iter = 10000
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
for _i in range(num_iter):
full_path = os.path.join(VIDEO_DIR, test_video)
# pass 1: decode all frames using new decoder
torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_read_video_from_file(self, test_video, config):
"""
Test the case when decoder starts with a video file to decode frames.
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
# pass 1: decode all frames using new decoder
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
# pass 2: decode all frames using av
pyav_result = _decode_frames_by_av_module(full_path)
# check results from TorchVision decoder
self.check_separate_decoding_result(tv_result, config)
# compare decoding results
self.compare_decoding_result(tv_result, pyav_result, config)
@pytest.mark.parametrize("test_video,config", test_videos.items())
@pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)])
def test_read_video_from_file_read_single_stream_only(
self, test_video, config, read_video_stream, read_audio_stream
):
"""
Test the case when decoder starts with a video file to decode frames, and
only reads video stream and ignores audio stream
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
# decode all frames using new decoder
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
read_video_stream,
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
read_audio_stream,
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
(
vframes,
vframe_pts,
vtimebase,
vfps,
vduration,
aframes,
aframe_pts,
atimebase,
asample_rate,
aduration,
) = tv_result
assert (vframes.numel() > 0) is bool(read_video_stream)
assert (vframe_pts.numel() > 0) is bool(read_video_stream)
assert (vtimebase.numel() > 0) is bool(read_video_stream)
assert (vfps.numel() > 0) is bool(read_video_stream)
expect_audio_data = read_audio_stream == 1 and config.audio_sample_rate is not None
assert (aframes.numel() > 0) is bool(expect_audio_data)
assert (aframe_pts.numel() > 0) is bool(expect_audio_data)
assert (atimebase.numel() > 0) is bool(expect_audio_data)
assert (asample_rate.numel() > 0) is bool(expect_audio_data)
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_read_video_from_file_rescale_min_dimension(self, test_video):
"""
Test the case when decoder starts with a video file to decode frames, and
video min dimension between height and width is set.
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 128, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_read_video_from_file_rescale_max_dimension(self, test_video):
"""
Test the case when decoder starts with a video file to decode frames, and
video min dimension between height and width is set.
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 85
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_read_video_from_file_rescale_both_min_max_dimension(self, test_video):
"""
Test the case when decoder starts with a video file to decode frames, and
video min dimension between height and width is set.
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 64, 85
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_read_video_from_file_rescale_width(self, test_video):
"""
Test the case when decoder starts with a video file to decode frames, and
video width is set.
"""
# video related
width, height, min_dimension, max_dimension = 256, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert tv_result[0].size(2) == width
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_read_video_from_file_rescale_height(self, test_video):
"""
Test the case when decoder starts with a video file to decode frames, and
video height is set.
"""
# video related
width, height, min_dimension, max_dimension = 0, 224, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert tv_result[0].size(1) == height
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_read_video_from_file_rescale_width_and_height(self, test_video):
"""
Test the case when decoder starts with a video file to decode frames, and
both video height and width are set.
"""
# video related
width, height, min_dimension, max_dimension = 320, 240, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert tv_result[0].size(1) == height
assert tv_result[0].size(2) == width
@pytest.mark.parametrize("test_video", test_videos.keys())
@pytest.mark.parametrize("samples", [9600, 96000])
def test_read_video_from_file_audio_resampling(self, test_video, samples):
"""
Test the case when decoder starts with a video file to decode frames, and
audio waveform are resampled
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
channels = 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path = os.path.join(VIDEO_DIR, test_video)
tv_result = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
(
vframes,
vframe_pts,
vtimebase,
vfps,
vduration,
aframes,
aframe_pts,
atimebase,
asample_rate,
aduration,
) = tv_result
if aframes.numel() > 0:
assert samples == asample_rate.item()
assert 1 == aframes.size(1)
# when audio stream is found
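            # Worked instance (hypothetical numbers): with atimebase = 1/48000
            # and a last audio pts of 528000, duration = 11.0 s, so at a
            # resampled rate of 9600 Hz we expect ~105600 samples, within a
            # tolerance of 960 samples (0.1 * 9600).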
duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1])
assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item())
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_compare_read_video_from_memory_and_file(self, test_video, config):
"""
Test the case when video is already in memory, and decoder reads data in memory
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
# pass 1: decode all frames using cpp decoder
tv_result_memory = torch.ops.video_reader.read_video_from_memory(
video_tensor,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
self.check_separate_decoding_result(tv_result_memory, config)
# pass 2: decode all frames from file
tv_result_file = torch.ops.video_reader.read_video_from_file(
full_path,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
self.check_separate_decoding_result(tv_result_file, config)
# finally, compare results decoded from memory and file
self.compare_decoding_result(tv_result_memory, tv_result_file)
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_read_video_from_memory(self, test_video, config):
"""
Test the case when video is already in memory, and decoder reads data in memory
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
# pass 1: decode all frames using cpp decoder
tv_result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
# pass 2: decode all frames using av
pyav_result = _decode_frames_by_av_module(full_path)
self.check_separate_decoding_result(tv_result, config)
self.compare_decoding_result(tv_result, pyav_result, config)
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_read_video_from_memory_get_pts_only(self, test_video, config):
"""
Test the case when video is already in memory, and decoder reads data in memory.
Compare frame pts between decoding for pts only and full decoding
for both pts and frame data
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
_, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
# pass 1: decode all frames using cpp decoder
tv_result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert abs(config.video_fps - tv_result[3].item()) < 0.01
# pass 2: decode all frames to get PTS only using cpp decoder
tv_result_pts_only = torch.ops.video_reader.read_video_from_memory(
video_tensor,
SEEK_FRAME_MARGIN,
1, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
assert not tv_result_pts_only[0].numel()
assert not tv_result_pts_only[5].numel()
self.compare_decoding_result(tv_result, tv_result_pts_only)
@pytest.mark.parametrize("test_video,config", test_videos.items())
@pytest.mark.parametrize("num_frames", [4, 8, 16, 32, 64, 128])
def test_read_video_in_range_from_memory(self, test_video, config, num_frames):
"""
Test the case when video is already in memory, and decoder reads data in memory.
In addition, decoder takes meaningful start- and end PTS as input, and decode
frames within that interval
"""
full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
# pass 1: decode all frames using new decoder
tv_result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
(
vframes,
vframe_pts,
vtimebase,
vfps,
vduration,
aframes,
aframe_pts,
atimebase,
asample_rate,
aduration,
) = tv_result
assert abs(config.video_fps - vfps.item()) < 0.01
start_pts_ind_max = vframe_pts.size(0) - num_frames
if start_pts_ind_max <= 0:
return
# randomly pick start pts
start_pts_ind = randint(0, start_pts_ind_max)
end_pts_ind = start_pts_ind + num_frames - 1
video_start_pts = vframe_pts[start_pts_ind]
video_end_pts = vframe_pts[end_pts_ind]
video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1]
if len(atimebase) > 0:
# when audio stream is available
audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1]
audio_start_pts = _pts_convert(
video_start_pts.item(),
Fraction(video_timebase_num.item(), video_timebase_den.item()),
Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
math.floor,
)
audio_end_pts = _pts_convert(
video_end_pts.item(),
Fraction(video_timebase_num.item(), video_timebase_den.item()),
Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
math.ceil,
)
# pass 2: decode frames in the randomly generated range
tv_result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
SEEK_FRAME_MARGIN,
0, # getPtsOnly
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
video_start_pts,
video_end_pts,
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
audio_start_pts,
audio_end_pts,
audio_timebase_num,
audio_timebase_den,
)
        # pass 3: decode frames in range using PyAV
video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path)
video_start_pts_av = _pts_convert(
video_start_pts.item(),
Fraction(video_timebase_num.item(), video_timebase_den.item()),
Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
math.floor,
)
video_end_pts_av = _pts_convert(
video_end_pts.item(),
Fraction(video_timebase_num.item(), video_timebase_den.item()),
Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
math.ceil,
)
if audio_timebase_av:
audio_start_pts = _pts_convert(
video_start_pts.item(),
Fraction(video_timebase_num.item(), video_timebase_den.item()),
Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
math.floor,
)
audio_end_pts = _pts_convert(
video_end_pts.item(),
Fraction(video_timebase_num.item(), video_timebase_den.item()),
Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
math.ceil,
)
pyav_result = _decode_frames_by_av_module(
full_path,
video_start_pts_av,
video_end_pts_av,
audio_start_pts,
audio_end_pts,
)
        assert tv_result[0].size(0) == num_frames
        # if PyAV decodes a different number of video frames, skip comparing the
        # decoding results between the TorchVision video reader and PyAV
        if pyav_result.vframes.size(0) == num_frames:
            self.compare_decoding_result(tv_result, pyav_result, config)
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_probe_video_from_file(self, test_video, config):
"""
Test the case when decoder probes a video file
"""
full_path = os.path.join(VIDEO_DIR, test_video)
probe_result = torch.ops.video_reader.probe_video_from_file(full_path)
self.check_probe_result(probe_result, config)
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_probe_video_from_memory(self, test_video, config):
"""
Test the case when decoder probes a video in memory
"""
_, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor)
self.check_probe_result(probe_result, config)
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_probe_video_from_memory_script(self, test_video, config):
scripted_fun = torch.jit.script(io._probe_video_from_memory)
assert scripted_fun is not None
_, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
probe_result = scripted_fun(video_tensor)
self.check_meta_result(probe_result, config)
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_read_video_from_memory_scripted(self, test_video):
"""
Test the case when video is already in memory, and decoder reads data in memory
"""
# video related
width, height, min_dimension, max_dimension = 0, 0, 0, 0
video_start_pts, video_end_pts = 0, -1
video_timebase_num, video_timebase_den = 0, 1
# audio related
samples, channels = 0, 0
audio_start_pts, audio_end_pts = 0, -1
audio_timebase_num, audio_timebase_den = 0, 1
scripted_fun = torch.jit.script(io._read_video_from_memory)
assert scripted_fun is not None
_, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
# decode all frames using cpp decoder
scripted_fun(
video_tensor,
SEEK_FRAME_MARGIN,
1, # readVideoStream
width,
height,
min_dimension,
max_dimension,
[video_start_pts, video_end_pts],
video_timebase_num,
video_timebase_den,
1, # readAudioStream
samples,
channels,
[audio_start_pts, audio_end_pts],
audio_timebase_num,
audio_timebase_den,
)
# FUTURE: check value of video / audio frames

    def test_invalid_file(self):
set_video_backend("video_reader")
with pytest.raises(RuntimeError):
io.read_video("foo.mp4")
set_video_backend("pyav")
with pytest.raises(RuntimeError):
io.read_video("foo.mp4")
@pytest.mark.parametrize("test_video", test_videos.keys())
@pytest.mark.parametrize("backend", ["video_reader", "pyav"])
@pytest.mark.parametrize("start_offset", [0, 500])
@pytest.mark.parametrize("end_offset", [3000, None])
def test_audio_present_pts(self, test_video, backend, start_offset, end_offset):
"""Test if audio frames are returned with pts unit."""
full_path = os.path.join(VIDEO_DIR, test_video)
container = av.open(full_path)
if container.streams.audio:
set_video_backend(backend)
_, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts")
assert all([dimension > 0 for dimension in audio.shape[:2]])
@pytest.mark.parametrize("test_video", test_videos.keys())
@pytest.mark.parametrize("backend", ["video_reader", "pyav"])
@pytest.mark.parametrize("start_offset", [0, 0.1])
@pytest.mark.parametrize("end_offset", [0.3, None])
def test_audio_present_sec(self, test_video, backend, start_offset, end_offset):
"""Test if audio frames are returned with sec unit."""
full_path = os.path.join(VIDEO_DIR, test_video)
container = av.open(full_path)
if container.streams.audio:
set_video_backend(backend)
_, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec")
assert all([dimension > 0 for dimension in audio.shape[:2]])


if __name__ == "__main__":
    pytest.main([__file__])