feat: add opensora/datasets module and tools/datasets

- Add opensora/datasets (aspect, bucket, dataloader, datasets, parallel, pin_memory_cache, read_video, sampler, utils, video_transforms) - Add tools/datasets pipeline scripts - Fix .gitignore: scope /datasets to root-level only, whitelist opensora/datasets/ Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 02:00:19 -08:00 · 2026-03-06 02:00:19 -08:00 · bdeb2870d4
parent 916ee2126d
commit bdeb2870d4
24 changed files with 5467 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -195,4 +195,5 @@ package.json
 exps
 ckpts
 flash-attention
-datasets
+/datasets
 !opensora/datasets/
--- a/opensora/datasets/init.py
+++ b/opensora/datasets/init.py
@ -0,0 +1,2 @@
 from .datasets import TextDataset, VideoTextDataset
 from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample
--- a/opensora/datasets/aspect.py
+++ b/opensora/datasets/aspect.py
@ -0,0 +1,151 @@
 import math
 import os
 ASPECT_RATIO_LD_LIST = [  # width:height
    "2.39:1",  # cinemascope, 2.39
    "2:1",  # rare, 2
    "16:9",  # rare, 1.89
    "1.85:1",  # american widescreen, 1.85
    "9:16",  # popular, 1.78
    "5:8",  # rare, 1.6
    "3:2",  # rare, 1.5
    "4:3",  # classic, 1.33
    "1:1",  # square
 ]
 def get_ratio(name: str) -> float:
    width, height = map(float, name.split(":"))
    return height / width
 def get_aspect_ratios_dict(
    total_pixels: int = 256 * 256, training: bool = True
 ) -> dict[str, tuple[int, int]]:
    D = int(os.environ.get("AE_SPATIAL_COMPRESSION", 16))
    aspect_ratios_dict = {}
    aspect_ratios_vertical_dict = {}
    for ratio in ASPECT_RATIO_LD_LIST:
        width_ratio, height_ratio = map(float, ratio.split(":"))
        width = int(math.sqrt(total_pixels * (width_ratio / height_ratio)) // D) * D
        height = int((total_pixels / width) // D) * D
        if training:
            # adjust aspect ratio to match total pixels
            diff = abs(height * width - total_pixels)
            candidate = [
                (height - D, width),
                (height + D, width),
                (height, width - D),
                (height, width + D),
            ]
            for h, w in candidate:
                if abs(h * w - total_pixels) < diff:
                    height, width = h, w
                    diff = abs(h * w - total_pixels)
        # remove duplicated aspect ratio
        if (height, width) not in aspect_ratios_dict.values() or not training:
            aspect_ratios_dict[ratio] = (height, width)
            vertial_ratios = ":".join(ratio.split(":")[::-1])
            aspect_ratios_vertical_dict[vertial_ratios] = (width, height)
    aspect_ratios_dict.update(aspect_ratios_vertical_dict)
    return aspect_ratios_dict
 def get_num_pexels(aspect_ratios_dict: dict[str, tuple[int, int]]) -> dict[str, int]:
    return {ratio: h * w for ratio, (h, w) in aspect_ratios_dict.items()}
 def get_num_tokens(aspect_ratios_dict: dict[str, tuple[int, int]]) -> dict[str, int]:
    D = int(os.environ.get("AE_SPATIAL_COMPRESSION", 16))
    return {ratio: h * w // D // D for ratio, (h, w) in aspect_ratios_dict.items()}
 def get_num_pexels_from_name(resolution: str) -> int:
    resolution = resolution.split("_")[0]
    if resolution.endswith("px"):
        size = int(resolution[:-2])
        num_pexels = size * size
    elif resolution.endswith("p"):
        size = int(resolution[:-1])
        num_pexels = int(size * size / 9 * 16)
    else:
        raise ValueError(f"Invalid resolution {resolution}")
    return num_pexels
 def get_resolution_with_aspect_ratio(
    resolution: str,
 ) -> tuple[int, dict[str, tuple[int, int]]]:
    """Get resolution with aspect ratio
    Args:
        resolution (str): resolution name. The format is name only or "{name}_{setting}".
            name supports "256px" or "360p". setting supports "ar1:1" or "max".
    Returns:
        tuple[int, dict[str, tuple[int, int]]]: resolution with aspect ratio
    """
    keys = resolution.split("_")
    if len(keys) == 1:
        resolution = keys[0]
        setting = ""
    else:
        resolution, setting = keys
        assert setting == "max" or setting.startswith(
            "ar"
        ), f"Invalid setting {setting}"
    # get resolution
    num_pexels = get_num_pexels_from_name(resolution)
    # get aspect ratio
    aspect_ratio_dict = get_aspect_ratios_dict(num_pexels)
    # handle setting
    if setting == "max":
        aspect_ratio = max(
            aspect_ratio_dict,
            key=lambda x: aspect_ratio_dict[x][0] * aspect_ratio_dict[x][1],
        )
        aspect_ratio_dict = {aspect_ratio: aspect_ratio_dict[aspect_ratio]}
    elif setting.startswith("ar"):
        aspect_ratio = setting[2:]
        assert (
            aspect_ratio in aspect_ratio_dict
        ), f"Aspect ratio {aspect_ratio} not found"
        aspect_ratio_dict = {aspect_ratio: aspect_ratio_dict[aspect_ratio]}
    return num_pexels, aspect_ratio_dict
 def get_closest_ratio(height: float, width: float, ratios: dict) -> str:
    aspect_ratio = height / width
    closest_ratio = min(
        ratios.keys(), key=lambda ratio: abs(aspect_ratio - get_ratio(ratio))
    )
    return closest_ratio
 def get_image_size(
    resolution: str, ar_ratio: str, training: bool = True
 ) -> tuple[int, int]:
    num_pexels = get_num_pexels_from_name(resolution)
    ar_dict = get_aspect_ratios_dict(num_pexels, training)
    assert ar_ratio in ar_dict, f"Aspect ratio {ar_ratio} not found"
    return ar_dict[ar_ratio]
 def bucket_to_shapes(bucket_config, batch_size=None):
    shapes = []
    for resolution, infos in bucket_config.items():
        for num_frames, (_, bs) in infos.items():
            aspect_ratios = get_aspect_ratios_dict(get_num_pexels_from_name(resolution))
            for ar, (height, width) in aspect_ratios.items():
                if batch_size is not None:
                    bs = batch_size
                shapes.append((bs, 3, num_frames, height, width))
    return shapes
--- a/opensora/datasets/bucket.py
+++ b/opensora/datasets/bucket.py
@ -0,0 +1,139 @@
 from collections import OrderedDict
 import numpy as np
 from opensora.utils.logger import log_message
 from .aspect import get_closest_ratio, get_resolution_with_aspect_ratio
 from .utils import map_target_fps
 class Bucket:
    def __init__(self, bucket_config: dict[str, dict[int, tuple[float, int] | tuple[tuple[float, float], int]]]):
        """
        Args:
            bucket_config (dict): A dictionary containing the bucket configuration.
                The dictionary should be in the following format:
                {
                    "bucket_name": {
                        "time": (probability, batch_size),
                        "time": (probability, batch_size),
                        ...
                    },
                    ...
                }
                Or in the following format:
                {
                    "bucket_name": {
                        "time": ((probability, next_probability), batch_size),
                        "time": ((probability, next_probability), batch_size),
                        ...
                    },
                    ...
                }
                The bucket_name should be the name of the bucket, and the time should be the number of frames in the video.
                The probability should be a float between 0 and 1, and the batch_size should be an integer.
                If the probability is a tuple, the second value should be the probability to skip to the next time.
        """
        aspect_ratios = {key: get_resolution_with_aspect_ratio(key) for key in bucket_config.keys()}
        bucket_probs = OrderedDict()
        bucket_bs = OrderedDict()
        bucket_names = sorted(bucket_config.keys(), key=lambda x: aspect_ratios[x][0], reverse=True)
        for key in bucket_names:
            bucket_time_names = sorted(bucket_config[key].keys(), key=lambda x: x, reverse=True)
            bucket_probs[key] = OrderedDict({k: bucket_config[key][k][0] for k in bucket_time_names})
            bucket_bs[key] = OrderedDict({k: bucket_config[key][k][1] for k in bucket_time_names})
        self.hw_criteria = {k: aspect_ratios[k][0] for k in bucket_names}
        self.t_criteria = {k1: {k2: k2 for k2 in bucket_config[k1].keys()} for k1 in bucket_names}
        self.ar_criteria = {
            k1: {k2: {k3: v3 for k3, v3 in aspect_ratios[k1][1].items()} for k2 in bucket_config[k1].keys()}
            for k1 in bucket_names
        }
        bucket_id_cnt = num_bucket = 0
        bucket_id = dict()
        for k1, v1 in bucket_probs.items():
            bucket_id[k1] = dict()
            for k2, _ in v1.items():
                bucket_id[k1][k2] = bucket_id_cnt
                bucket_id_cnt += 1
                num_bucket += len(aspect_ratios[k1][1])
        self.bucket_probs = bucket_probs
        self.bucket_bs = bucket_bs
        self.bucket_id = bucket_id
        self.num_bucket = num_bucket
        log_message("Number of buckets: %s", num_bucket)
    def get_bucket_id(
        self,
        T: int,
        H: int,
        W: int,
        fps: float,
        path: str | None = None,
        seed: int | None = None,
        fps_max: int = 16,
    ) -> tuple[str, int, int] | None:
        approx = 0.8
        _, sampling_interval = map_target_fps(fps, fps_max)
        T = T // sampling_interval
        resolution = H * W
        rng = np.random.default_rng(seed)
        # Reference to probabilities and criteria for faster access
        bucket_probs = self.bucket_probs
        hw_criteria = self.hw_criteria
        ar_criteria = self.ar_criteria
        # Start searching for the appropriate bucket
        for hw_id, t_criteria in bucket_probs.items():
            # if resolution is too low, skip
            if resolution < hw_criteria[hw_id] * approx:
                continue
            # if sample is an image
            if T == 1:
                if 1 in t_criteria:
                    if rng.random() < t_criteria[1]:
                        return hw_id, 1, get_closest_ratio(H, W, ar_criteria[hw_id][1])
                continue
            # Look for suitable t_id for video
            for t_id, prob in t_criteria.items():
                if T >= t_id and t_id != 1:
                    # if prob is a tuple, use the second value as the threshold to skip
                    # to the next t_id
                    if isinstance(prob, tuple):
                        next_hw_prob, next_t_prob = prob
                        if next_t_prob >= 1 or rng.random() <= next_t_prob:
                            continue
                    else:
                        next_hw_prob = prob
                    if next_hw_prob >= 1 or rng.random() <= next_hw_prob:
                        ar_id = get_closest_ratio(H, W, ar_criteria[hw_id][t_id])
                        return hw_id, t_id, ar_id
                    else:
                        break
        return None
    def get_thw(self, bucket_idx: tuple[str, int, int]) -> tuple[int, int, int]:
        assert len(bucket_idx) == 3
        T = self.t_criteria[bucket_idx[0]][bucket_idx[1]]
        H, W = self.ar_criteria[bucket_idx[0]][bucket_idx[1]][bucket_idx[2]]
        return T, H, W
    def get_prob(self, bucket_idx: tuple[str, int]) -> float:
        return self.bucket_probs[bucket_idx[0]][bucket_idx[1]]
    def get_batch_size(self, bucket_idx: tuple[str, int]) -> int:
        return self.bucket_bs[bucket_idx[0]][bucket_idx[1]]
    def __len__(self) -> int:
        return self.num_bucket
--- a/opensora/datasets/dataloader.py
+++ b/opensora/datasets/dataloader.py
@ -0,0 +1,402 @@
 import collections
 import functools
 import os
 import queue
 import random
 import threading
 import numpy as np
 import torch
 import torch.multiprocessing as multiprocessing
 from torch._utils import ExceptionWrapper
 from torch.distributed import ProcessGroup
 from torch.utils.data import DataLoader, _utils
 from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
 from torch.utils.data.dataloader import (
    IterDataPipe,
    MapDataPipe,
    _BaseDataLoaderIter,
    _MultiProcessingDataLoaderIter,
    _sharding_worker_init_fn,
    _SingleProcessDataLoaderIter,
 )
 from opensora.acceleration.parallel_states import get_data_parallel_group
 from opensora.registry import DATASETS, build_module
 from opensora.utils.config import parse_configs
 from opensora.utils.logger import create_logger
 from opensora.utils.misc import format_duration
 from opensora.utils.train import setup_device
 from .datasets import TextDataset, VideoTextDataset
 from .pin_memory_cache import PinMemoryCache
 from .sampler import DistributedSampler, VariableVideoBatchSampler
 def _pin_memory_loop(
    in_queue, out_queue, device_id, done_event, device, pin_memory_cache: PinMemoryCache, pin_memory_key: str
 ):
    # This setting is thread local, and prevents the copy in pin_memory from
    # consuming all CPU cores.
    torch.set_num_threads(1)
    if device == "cuda":
        torch.cuda.set_device(device_id)
    elif device == "xpu":
        torch.xpu.set_device(device_id)  # type: ignore[attr-defined]
    elif device == torch._C._get_privateuse1_backend_name():
        custom_device_mod = getattr(torch, torch._C._get_privateuse1_backend_name())
        custom_device_mod.set_device(device_id)
    def do_one_step():
        try:
            r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
        except queue.Empty:
            return
        idx, data = r
        if not done_event.is_set() and not isinstance(data, ExceptionWrapper):
            try:
                assert isinstance(data, dict)
                if pin_memory_key in data:
                    val = data[pin_memory_key]
                    pin_memory_value = pin_memory_cache.get(val)
                    pin_memory_value.copy_(val)
                    data[pin_memory_key] = pin_memory_value
            except Exception:
                data = ExceptionWrapper(where=f"in pin memory thread for device {device_id}")
            r = (idx, data)
        while not done_event.is_set():
            try:
                out_queue.put(r, timeout=MP_STATUS_CHECK_INTERVAL)
                break
            except queue.Full:
                continue
    # See NOTE [ Data Loader Multiprocessing Shutdown Logic ] for details on the
    # logic of this function.
    while not done_event.is_set():
        # Make sure that we don't preserve any object from one iteration
        # to the next
        do_one_step()
 class _MultiProcessingDataLoaderIterForVideo(_MultiProcessingDataLoaderIter):
    pin_memory_key: str = "video"
    def __init__(self, loader):
        _BaseDataLoaderIter.__init__(self, loader)
        self.pin_memory_cache = PinMemoryCache()
        self._prefetch_factor = loader.prefetch_factor
        assert self._num_workers > 0
        assert self._prefetch_factor > 0
        if loader.multiprocessing_context is None:
            multiprocessing_context = multiprocessing
        else:
            multiprocessing_context = loader.multiprocessing_context
        self._worker_init_fn = loader.worker_init_fn
        # Adds forward compatibilities so classic DataLoader can work with DataPipes:
        #   Additional worker init function will take care of sharding in MP and Distributed
        if isinstance(self._dataset, (IterDataPipe, MapDataPipe)):
            self._worker_init_fn = functools.partial(
                _sharding_worker_init_fn, self._worker_init_fn, self._world_size, self._rank
            )
        # No certainty which module multiprocessing_context is
        self._worker_result_queue = multiprocessing_context.Queue()  # type: ignore[var-annotated]
        self._worker_pids_set = False
        self._shutdown = False
        self._workers_done_event = multiprocessing_context.Event()
        self._index_queues = []
        self._workers = []
        for i in range(self._num_workers):
            # No certainty which module multiprocessing_context is
            index_queue = multiprocessing_context.Queue()  # type: ignore[var-annotated]
            # Need to `cancel_join_thread` here!
            # See sections (2) and (3b) above.
            index_queue.cancel_join_thread()
            w = multiprocessing_context.Process(
                target=_utils.worker._worker_loop,
                args=(
                    self._dataset_kind,
                    self._dataset,
                    index_queue,
                    self._worker_result_queue,
                    self._workers_done_event,
                    self._auto_collation,
                    self._collate_fn,
                    self._drop_last,
                    self._base_seed,
                    self._worker_init_fn,
                    i,
                    self._num_workers,
                    self._persistent_workers,
                    self._shared_seed,
                ),
            )
            w.daemon = True
            # NB: Process.start() actually take some time as it needs to
            #     start a process and pass the arguments over via a pipe.
            #     Therefore, we only add a worker to self._workers list after
            #     it started, so that we do not call .join() if program dies
            #     before it starts, and __del__ tries to join but will get:
            #     AssertionError: can only join a started process.
            w.start()
            self._index_queues.append(index_queue)
            self._workers.append(w)
        if self._pin_memory:
            self._pin_memory_thread_done_event = threading.Event()
            # Queue is not type-annotated
            self._data_queue = queue.Queue()  # type: ignore[var-annotated]
            if self._pin_memory_device == "xpu":
                current_device = torch.xpu.current_device()  # type: ignore[attr-defined]
            elif self._pin_memory_device == torch._C._get_privateuse1_backend_name():
                custom_device_mod = getattr(torch, torch._C._get_privateuse1_backend_name())
                current_device = custom_device_mod.current_device()
            else:
                current_device = torch.cuda.current_device()  # choose cuda for default
            pin_memory_thread = threading.Thread(
                target=_pin_memory_loop,
                args=(
                    self._worker_result_queue,
                    self._data_queue,
                    current_device,
                    self._pin_memory_thread_done_event,
                    self._pin_memory_device,
                    self.pin_memory_cache,
                    self.pin_memory_key,
                ),
            )
            pin_memory_thread.daemon = True
            pin_memory_thread.start()
            # Similar to workers (see comment above), we only register
            # pin_memory_thread once it is started.
            self._pin_memory_thread = pin_memory_thread
        else:
            self._data_queue = self._worker_result_queue  # type: ignore[assignment]
        # In some rare cases, persistent workers (daemonic processes)
        # would be terminated before `__del__` of iterator is invoked
        # when main process exits
        # It would cause failure when pin_memory_thread tries to read
        # corrupted data from worker_result_queue
        # atexit is used to shutdown thread and child processes in the
        # right sequence before main process exits
        if self._persistent_workers and self._pin_memory:
            import atexit
            for w in self._workers:
                atexit.register(_MultiProcessingDataLoaderIter._clean_up_worker, w)
        # .pid can be None only before process is spawned (not the case, so ignore)
        _utils.signal_handling._set_worker_pids(id(self), tuple(w.pid for w in self._workers))  # type: ignore[misc]
        _utils.signal_handling._set_SIGCHLD_handler()
        self._worker_pids_set = True
        self._reset(loader, first_iter=True)
    def remove_cache(self, output_tensor: torch.Tensor):
        self.pin_memory_cache.remove(output_tensor)
    def get_cache_info(self) -> str:
        return str(self.pin_memory_cache)
 class DataloaderForVideo(DataLoader):
    def _get_iterator(self) -> "_BaseDataLoaderIter":
        if self.num_workers == 0:
            return _SingleProcessDataLoaderIter(self)
        else:
            self.check_worker_number_rationality()
            return _MultiProcessingDataLoaderIterForVideo(self)
 # Deterministic dataloader
 def get_seed_worker(seed):
    def seed_worker(worker_id):
        worker_seed = seed
        if seed is not None:
            np.random.seed(worker_seed)
            torch.manual_seed(worker_seed)
            random.seed(worker_seed)
    return seed_worker
 def prepare_dataloader(
    dataset,
    batch_size=None,
    shuffle=False,
    seed=1024,
    drop_last=False,
    pin_memory=False,
    num_workers=0,
    process_group: ProcessGroup | None = None,
    bucket_config=None,
    num_bucket_build_workers=1,
    prefetch_factor=None,
    cache_pin_memory=False,
    num_groups=1,
    **kwargs,
 ):
    _kwargs = kwargs.copy()
    if isinstance(dataset, VideoTextDataset):
        batch_sampler = VariableVideoBatchSampler(
            dataset,
            bucket_config,
            num_replicas=process_group.size(),
            rank=process_group.rank(),
            shuffle=shuffle,
            seed=seed,
            drop_last=drop_last,
            verbose=True,
            num_bucket_build_workers=num_bucket_build_workers,
            num_groups=num_groups,
        )
        dl_cls = DataloaderForVideo if cache_pin_memory else DataLoader
        return (
            dl_cls(
                dataset,
                batch_sampler=batch_sampler,
                worker_init_fn=get_seed_worker(seed),
                pin_memory=pin_memory,
                num_workers=num_workers,
                collate_fn=collate_fn_default,
                prefetch_factor=prefetch_factor,
                **_kwargs,
            ),
            batch_sampler,
        )
    elif isinstance(dataset, TextDataset):
        if process_group is None:
            return (
                DataLoader(
                    dataset,
                    batch_size=batch_size,
                    shuffle=shuffle,
                    worker_init_fn=get_seed_worker(seed),
                    drop_last=drop_last,
                    pin_memory=pin_memory,
                    num_workers=num_workers,
                    prefetch_factor=prefetch_factor,
                    **_kwargs,
                ),
                None,
            )
        else:
            sampler = DistributedSampler(
                dataset,
                num_replicas=process_group.size(),
                rank=process_group.rank(),
                shuffle=shuffle,
                seed=seed,
                drop_last=drop_last,
            )
            return (
                DataLoader(
                    dataset,
                    sampler=sampler,
                    worker_init_fn=get_seed_worker(seed),
                    pin_memory=pin_memory,
                    num_workers=num_workers,
                    collate_fn=collate_fn_default,
                    prefetch_factor=prefetch_factor,
                    **_kwargs,
                ),
                sampler,
            )
    else:
        raise ValueError(f"Unsupported dataset type: {type(dataset)}")
 def collate_fn_default(batch):
    # filter out None
    batch = [x for x in batch if x is not None]
    assert len(batch) > 0, "batch is empty"
    # HACK: for loading text features
    use_mask = False
    if "mask" in batch[0] and isinstance(batch[0]["mask"], int):
        masks = [x.pop("mask") for x in batch]
        texts = [x.pop("text") for x in batch]
        texts = torch.cat(texts, dim=1)
        use_mask = True
    ret = torch.utils.data.default_collate(batch)
    if use_mask:
        ret["mask"] = masks
        ret["text"] = texts
    return ret
 def collate_fn_batch(batch):
    """
    Used only with BatchDistributedSampler
    """
    # filter out None
    batch = [x for x in batch if x is not None]
    res = torch.utils.data.default_collate(batch)
    # squeeze the first dimension, which is due to torch.stack() in default_collate()
    if isinstance(res, collections.abc.Mapping):
        for k, v in res.items():
            if isinstance(v, torch.Tensor):
                res[k] = v.squeeze(0)
    elif isinstance(res, collections.abc.Sequence):
        res = [x.squeeze(0) if isinstance(x, torch.Tensor) else x for x in res]
    elif isinstance(res, torch.Tensor):
        res = res.squeeze(0)
    else:
        raise TypeError
    return res
 if __name__ == "__main__":
    # NUM_GPU: number of GPUs for training
    # TIME_PER_STEP: time per step in seconds
    # Example usage:
    # torchrun --nproc_per_node 1 -m opensora.datasets.dataloader configs/diffusion/train/video_cond.py
    cfg = parse_configs()
    setup_device()
    logger = create_logger()
    # == build dataset ==
    dataset = build_module(cfg.dataset, DATASETS)
    # == build dataloader ==
    dataloader_args = dict(
        dataset=dataset,
        batch_size=cfg.get("batch_size", None),
        num_workers=cfg.get("num_workers", 4),
        seed=cfg.get("seed", 1024),
        shuffle=True,
        drop_last=True,
        pin_memory=True,
        process_group=get_data_parallel_group(),
        prefetch_factor=cfg.get("prefetch_factor", None),
    )
    dataloader, sampler = prepare_dataloader(
        bucket_config=cfg.get("bucket_config", None),
        num_bucket_build_workers=cfg.get("num_bucket_build_workers", 1),
        **dataloader_args,
    )
    num_steps_per_epoch = len(dataloader)
    num_machines = int(os.environ.get("NUM_MACHINES", 28))
    num_gpu = num_machines * 8
    logger.info("Number of GPUs: %d", num_gpu)
    logger.info("Number of steps per epoch: %d", num_steps_per_epoch // num_gpu)
    time_per_step = int(os.environ.get("TIME_PER_STEP", 20))
    time_training = num_steps_per_epoch // num_gpu * time_per_step
    logger.info("Time per step: %s", format_duration(time_per_step))
    logger.info("Time for training: %s", format_duration(time_training))
--- a/opensora/datasets/datasets.py
+++ b/opensora/datasets/datasets.py
@ -0,0 +1,315 @@
 import os
 import random
 import numpy as np
 import pandas as pd
 import torch
 from PIL import ImageFile
 from torchvision.datasets.folder import pil_loader
 from opensora.registry import DATASETS
 from .read_video import read_video
 from .utils import get_transforms_image, get_transforms_video, is_img, map_target_fps, read_file, temporal_random_crop
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 VALID_KEYS = ("neg", "path")
 K = 10000
 class Iloc:
    def __init__(self, data, sharded_folder, sharded_folders, rows_per_shard):
        self.data = data
        self.sharded_folder = sharded_folder
        self.sharded_folders = sharded_folders
        self.rows_per_shard = rows_per_shard
    def __getitem__(self, index):
        return Item(
            index,
            self.data,
            self.sharded_folder,
            self.sharded_folders,
            self.rows_per_shard,
        )
 class Item:
    def __init__(self, index, data, sharded_folder, sharded_folders, rows_per_shard):
        self.index = index
        self.data = data
        self.sharded_folder = sharded_folder
        self.sharded_folders = sharded_folders
        self.rows_per_shard = rows_per_shard
    def __getitem__(self, key):
        index = self.index
        if key in self.data.columns:
            return self.data[key].iloc[index]
        else:
            shard_idx = index // self.rows_per_shard
            idx = index % self.rows_per_shard
            shard_parquet = os.path.join(self.sharded_folder, self.sharded_folders[shard_idx])
            try:
                text_parquet = pd.read_parquet(shard_parquet, engine="fastparquet")
                path = text_parquet["path"].iloc[idx]
                assert path == self.data["path"].iloc[index]
            except Exception as e:
                print(f"Error reading {shard_parquet}: {e}")
                raise
            return text_parquet[key].iloc[idx]
    def to_dict(self):
        index = self.index
        ret = {}
        ret.update(self.data.iloc[index].to_dict())
        shard_idx = index // self.rows_per_shard
        idx = index % self.rows_per_shard
        shard_parquet = os.path.join(self.sharded_folder, self.sharded_folders[shard_idx])
        try:
            text_parquet = pd.read_parquet(shard_parquet, engine="fastparquet")
            path = text_parquet["path"].iloc[idx]
            assert path == self.data["path"].iloc[index]
            ret.update(text_parquet.iloc[idx].to_dict())
        except Exception as e:
            print(f"Error reading {shard_parquet}: {e}")
            ret.update({"text": ""})
        return ret
 class EfficientParquet:
    def __init__(self, df, sharded_folder):
        self.data = df
        self.total_rows = len(df)
        self.rows_per_shard = (self.total_rows + K - 1) // K
        self.sharded_folder = sharded_folder
        assert os.path.exists(sharded_folder), f"Sharded folder {sharded_folder} does not exist."
        self.sharded_folders = os.listdir(sharded_folder)
        self.sharded_folders = sorted(self.sharded_folders)
    def __len__(self):
        return self.total_rows
    @property
    def iloc(self):
        return Iloc(self.data, self.sharded_folder, self.sharded_folders, self.rows_per_shard)
@DATASETS.register_module("text")
 class TextDataset(torch.utils.data.Dataset):
    """
    Dataset for text data
    """
    def __init__(
        self,
        data_path: str = None,
        tokenize_fn: callable = None,
        fps_max: int = 16,
        vmaf: bool = False,
        memory_efficient: bool = False,
        **kwargs,
    ):
        self.data_path = data_path
        self.data = read_file(data_path, memory_efficient=memory_efficient)
        self.memory_efficient = memory_efficient
        self.tokenize_fn = tokenize_fn
        self.vmaf = vmaf
        if fps_max is not None:
            self.fps_max = fps_max
        else:
            self.fps_max = 999999999
    def to_efficient(self):
        if self.memory_efficient:
            addition_data_path = self.data_path.split(".")[0]
            self._data = self.data
            self.data = EfficientParquet(self._data, addition_data_path)
    def getitem(self, index: int) -> dict:
        ret = dict()
        sample = self.data.iloc[index].to_dict()
        sample_fps = sample.get("fps", np.nan)
        new_fps, sampling_interval = map_target_fps(sample_fps, self.fps_max)
        ret.update({"sampling_interval": sampling_interval})
        if "text" in sample:
            ret["text"] = sample.pop("text")
            postfixs = []
            if new_fps != 0 and self.fps_max < 999:
                postfixs.append(f"{new_fps} FPS")
            if self.vmaf and "score_vmafmotion" in sample and not np.isnan(sample["score_vmafmotion"]):
                postfixs.append(f"{int(sample['score_vmafmotion'] + 0.5)} motion score")
            postfix = " " + ", ".join(postfixs) + "." if postfixs else ""
            ret["text"] = ret["text"] + postfix
            if self.tokenize_fn is not None:
                ret.update({k: v.squeeze(0) for k, v in self.tokenize_fn(ret["text"]).items()})
        if "ref" in sample:  # i2v & v2v reference
            ret["ref"] = sample.pop("ref")
        # name of the generated sample
        if "name" in sample:  # sample name (`dataset_idx`)
            ret["name"] = sample.pop("name")
        else:
            ret["index"] = index  # use index for name
        valid_sample = {k: v for k, v in sample.items() if k in VALID_KEYS}
        ret.update(valid_sample)
        return ret
    def __getitem__(self, index):
        return self.getitem(index)
    def __len__(self):
        return len(self.data)
@DATASETS.register_module("video_text")
 class VideoTextDataset(TextDataset):
    def __init__(
        self,
        transform_name: str = None,
        bucket_class: str = "Bucket",
        rand_sample_interval: int = None,  # random sample_interval value from [1, min(rand_sample_interval, video_allowed_max)]
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.transform_name = transform_name
        self.bucket_class = bucket_class
        self.rand_sample_interval = rand_sample_interval
    def get_image(self, index: int, height: int, width: int) -> dict:
        sample = self.data.iloc[index]
        path = sample["path"]
        # loading
        image = pil_loader(path)
        # transform
        transform = get_transforms_image(self.transform_name, (height, width))
        image = transform(image)
        # CHW -> CTHW
        video = image.unsqueeze(1)
        return {"video": video}
    def get_video(self, index: int, num_frames: int, height: int, width: int, sampling_interval: int) -> dict:
        sample = self.data.iloc[index]
        path = sample["path"]
        # loading
        vframes, vinfo = read_video(path, backend="av")
        if self.rand_sample_interval is not None:
            # randomly sample from 1 - self.rand_sample_interval
            video_allowed_max = min(len(vframes) // num_frames, self.rand_sample_interval)
            sampling_interval = random.randint(1, video_allowed_max)
        # Sampling video frames
        video = temporal_random_crop(vframes, num_frames, sampling_interval)
        video = video.clone()
        del vframes
        # transform
        transform = get_transforms_video(self.transform_name, (height, width))
        video = transform(video)  # T C H W
        video = video.permute(1, 0, 2, 3)
        ret = {"video": video}
        return ret
    def get_image_or_video(self, index: int, num_frames: int, height: int, width: int, sampling_interval: int) -> dict:
        sample = self.data.iloc[index]
        path = sample["path"]
        if is_img(path):
            return self.get_image(index, height, width)
        return self.get_video(index, num_frames, height, width, sampling_interval)
    def getitem(self, index: str) -> dict:
        # a hack to pass in the (time, height, width) info from sampler
        index, num_frames, height, width = [int(val) for val in index.split("-")]
        ret = dict()
        ret.update(super().getitem(index))
        try:
            ret.update(self.get_image_or_video(index, num_frames, height, width, ret["sampling_interval"]))
        except Exception as e:
            path = self.data.iloc[index]["path"]
            print(f"video {path}: {e}")
            return None
        return ret
    def __getitem__(self, index):
        return self.getitem(index)
@DATASETS.register_module("cached_video_text")
 class CachedVideoTextDataset(VideoTextDataset):
    def __init__(
        self,
        transform_name: str = None,
        bucket_class: str = "Bucket",
        rand_sample_interval: int = None,  # random sample_interval value from [1, min(rand_sample_interval, video_allowed_max)]
        cached_video: bool = False,
        cached_text: bool = False,
        return_latents_path: bool = False,
        load_original_video: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.transform_name = transform_name
        self.bucket_class = bucket_class
        self.rand_sample_interval = rand_sample_interval
        self.cached_video = cached_video
        self.cached_text = cached_text
        self.return_latents_path = return_latents_path
        self.load_original_video = load_original_video
    def get_latents(self, path):
        try:
            latents = torch.load(path, map_location=torch.device("cpu"))
        except Exception as e:
            print(f"Error loading latents from {path}: {e}")
            return torch.zeros_like(torch.randn(1, 1, 1, 1))
        return latents
    def get_conditioning_latents(self, index: int) -> dict:
        sample = self.data.iloc[index]
        latents_path = sample["latents_path"]
        text_t5_path = sample["text_t5_path"]
        text_clip_path = sample["text_clip_path"]
        res = dict()
        if self.cached_video:
            latents = self.get_latents(latents_path)
            res["video_latents"] = latents
        if self.cached_text:
            text_t5 = self.get_latents(text_t5_path)
            text_clip = self.get_latents(text_clip_path)
            res["text_t5"] = text_t5
            res["text_clip"] = text_clip
        if self.return_latents_path:
            res["latents_path"] = latents_path
            res["text_t5_path"] = text_t5_path
            res["text_clip_path"] = text_clip_path
        return res
    def getitem(self, index: str) -> dict:
        # a hack to pass in the (time, height, width) info from sampler
        real_index, num_frames, height, width = [int(val) for val in index.split("-")]
        ret = dict()
        if self.load_original_video:
            ret.update(super().getitem(index))
        try:
            ret.update(self.get_conditioning_latents(real_index))
        except Exception as e:
            path = self.data.iloc[real_index]["path"]
            print(f"video {path}: {e}")
            return None
        return ret
    def __getitem__(self, index):
        return self.getitem(index)
--- a/opensora/datasets/parallel.py
+++ b/opensora/datasets/parallel.py
@ -0,0 +1,176 @@
 import multiprocessing
 from itertools import count
 from multiprocessing.managers import SyncManager
 from typing import Any, Callable, Dict, Tuple, Type, cast
 import dill
 import pandarallel
 import pandas as pd
 from pandarallel.data_types import DataType
 from pandarallel.progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper
 from pandarallel.utils import WorkerStatus
 CONTEXT = multiprocessing.get_context("fork")
 TMP = []
 class WrapWorkFunctionForPipe:
    def __init__(
        self,
        work_function: Callable[
            [
                Any,
                Callable,
                tuple,
                Dict[str, Any],
                Dict[str, Any],
            ],
            Any,
        ],
    ) -> None:
        self.work_function = work_function
    def __call__(
        self,
        progress_bars_type: ProgressBarsType,
        worker_index: int,
        master_workers_queue: multiprocessing.Queue,
        dilled_user_defined_function: bytes,
        user_defined_function_args: tuple,
        user_defined_function_kwargs: Dict[str, Any],
        extra: Dict[str, Any],
    ) -> Any:
        try:
            data = TMP[worker_index]
            data_size = len(data)
            user_defined_function: Callable = dill.loads(dilled_user_defined_function)
            progress_wrapped_user_defined_function = progress_wrapper(
                user_defined_function, master_workers_queue, worker_index, data_size
            )
            used_user_defined_function = (
                progress_wrapped_user_defined_function
                if progress_bars_type
                in (
                    ProgressBarsType.InUserDefinedFunction,
                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,
                )
                else user_defined_function
            )
            results = self.work_function(
                data,
                used_user_defined_function,
                user_defined_function_args,
                user_defined_function_kwargs,
                extra,
            )
            master_workers_queue.put((worker_index, WorkerStatus.Success, None))
            return results
        except:
            master_workers_queue.put((worker_index, WorkerStatus.Error, None))
            raise
 def parallelize_with_pipe(
    nb_requested_workers: int,
    data_type: Type[DataType],
    progress_bars_type: ProgressBarsType,
 ):
    def closure(
        data: Any,
        user_defined_function: Callable,
        *user_defined_function_args: tuple,
        **user_defined_function_kwargs: Dict[str, Any],
    ):
        wrapped_work_function = WrapWorkFunctionForPipe(data_type.work)
        dilled_user_defined_function = dill.dumps(user_defined_function)
        manager: SyncManager = CONTEXT.Manager()
        master_workers_queue = manager.Queue()
        chunks = list(
            data_type.get_chunks(
                nb_requested_workers,
                data,
                user_defined_function_kwargs=user_defined_function_kwargs,
            )
        )
        TMP.extend(chunks)
        nb_workers = len(chunks)
        multiplicator_factor = (
            len(cast(pd.DataFrame, data).columns)
            if progress_bars_type == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns
            else 1
        )
        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]
        work_extra = data_type.get_work_extra(data)
        reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)
        show_progress_bars = progress_bars_type != ProgressBarsType.No
        progress_bars = get_progress_bars(progresses_length, show_progress_bars)
        progresses = [0] * nb_workers
        workers_status = [WorkerStatus.Running] * nb_workers
        work_args_list = [
            (
                progress_bars_type,
                worker_index,
                master_workers_queue,
                dilled_user_defined_function,
                user_defined_function_args,
                user_defined_function_kwargs,
                {
                    **work_extra,
                    **{
                        "master_workers_queue": master_workers_queue,
                        "show_progress_bars": show_progress_bars,
                        "worker_index": worker_index,
                    },
                },
            )
            for worker_index in range(nb_workers)
        ]
        pool = CONTEXT.Pool(nb_workers)
        results_promise = pool.starmap_async(wrapped_work_function, work_args_list)
        pool.close()
        generation = count()
        while any((worker_status == WorkerStatus.Running for worker_status in workers_status)):
            message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()
            worker_index, worker_status, payload = message
            workers_status[worker_index] = worker_status
            if worker_status == WorkerStatus.Success:
                progresses[worker_index] = progresses_length[worker_index]
                progress_bars.update(progresses)
            elif worker_status == WorkerStatus.Running:
                progress = cast(int, payload)
                progresses[worker_index] = progress
                if next(generation) % nb_workers == 0:
                    progress_bars.update(progresses)
            elif worker_status == WorkerStatus.Error:
                progress_bars.set_error(worker_index)
        results = results_promise.get()
        TMP.clear()
        return data_type.reduce(results, reduce_extra)
    return closure
 pandarallel.core.WrapWorkFunctionForPipe = WrapWorkFunctionForPipe
 pandarallel.core.parallelize_with_pipe = parallelize_with_pipe
 pandarallel = pandarallel.pandarallel
--- a/opensora/datasets/pin_memory_cache.py
+++ b/opensora/datasets/pin_memory_cache.py
@ -0,0 +1,76 @@
 import threading
 from typing import Dict, List, Optional
 import torch
 class PinMemoryCache:
    force_dtype: Optional[torch.dtype] = None
    min_cache_numel: int = 0
    pre_alloc_numels: List[int] = []
    def __init__(self):
        self.cache: Dict[int, torch.Tensor] = {}
        self.output_to_cache: Dict[int, int] = {}
        self.cache_to_output: Dict[int, int] = {}
        self.lock = threading.Lock()
        self.total_cnt = 0
        self.hit_cnt = 0
        if len(self.pre_alloc_numels) > 0 and self.force_dtype is not None:
            for n in self.pre_alloc_numels:
                cache_tensor = torch.empty(n, dtype=self.force_dtype, device="cpu", pin_memory=True)
                with self.lock:
                    self.cache[id(cache_tensor)] = cache_tensor
    def get(self, tensor: torch.Tensor) -> torch.Tensor:
        """Receive a cpu tensor and return the corresponding pinned tensor. Note that this only manage memory allocation, doesn't copy content.
        Args:
            tensor (torch.Tensor): The tensor to be pinned.
        Returns:
            torch.Tensor: The pinned tensor.
        """
        self.total_cnt += 1
        with self.lock:
            # find free cache
            for cache_id, cache_tensor in self.cache.items():
                if cache_id not in self.cache_to_output and cache_tensor.numel() >= tensor.numel():
                    target_cache_tensor = cache_tensor[: tensor.numel()].view(tensor.shape)
                    out_id = id(target_cache_tensor)
                    self.output_to_cache[out_id] = cache_id
                    self.cache_to_output[cache_id] = out_id
                    self.hit_cnt += 1
                    return target_cache_tensor
        # no free cache, create a new one
        dtype = self.force_dtype if self.force_dtype is not None else tensor.dtype
        cache_numel = max(tensor.numel(), self.min_cache_numel)
        cache_tensor = torch.empty(cache_numel, dtype=dtype, device="cpu", pin_memory=True)
        target_cache_tensor = cache_tensor[: tensor.numel()].view(tensor.shape)
        out_id = id(target_cache_tensor)
        with self.lock:
            self.cache[id(cache_tensor)] = cache_tensor
            self.output_to_cache[out_id] = id(cache_tensor)
            self.cache_to_output[id(cache_tensor)] = out_id
        return target_cache_tensor
    def remove(self, output_tensor: torch.Tensor) -> None:
        """Release corresponding cache tensor.
        Args:
            output_tensor (torch.Tensor): The tensor to be released.
        """
        out_id = id(output_tensor)
        with self.lock:
            if out_id not in self.output_to_cache:
                raise ValueError("Tensor not found in cache.")
            cache_id = self.output_to_cache.pop(out_id)
            del self.cache_to_output[cache_id]
    def __str__(self):
        with self.lock:
            num_cached = len(self.cache)
            num_used = len(self.output_to_cache)
            total_cache_size = sum([v.numel() * v.element_size() for v in self.cache.values()])
        return f"PinMemoryCache(num_cached={num_cached}, num_used={num_used}, total_cache_size={total_cache_size / 1024**3:.2f} GB, hit rate={self.hit_cnt / self.total_cnt:.2f})"
--- a/opensora/datasets/read_video.py
+++ b/opensora/datasets/read_video.py
@ -0,0 +1,257 @@
 import gc
 import math
 import os
 import re
 import warnings
 from fractions import Fraction
 import av
 import cv2
 import numpy as np
 import torch
 from torchvision import get_video_backend
 from torchvision.io.video import _check_av_available
 MAX_NUM_FRAMES = 2500
 def read_video_av(
    filename: str,
    start_pts: float | Fraction = 0,
    end_pts: float | Fraction | None = None,
    pts_unit: str = "pts",
    output_format: str = "THWC",
 ) -> tuple[torch.Tensor, torch.Tensor, dict]:
    """
    Reads a video from a file, returning both the video frames and the audio frames
    This method is modified from torchvision.io.video.read_video, with the following changes:
    1. will not extract audio frames and return empty for aframes
    2. remove checks and only support pyav
    3. add container.close() and gc.collect() to avoid thread leakage
    4. try our best to avoid memory leak
    Args:
        filename (str): path to the video file
        start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The start presentation time of the video
        end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The end presentation time
        pts_unit (str, optional): unit in which start_pts and end_pts values will be interpreted,
            either 'pts' or 'sec'. Defaults to 'pts'.
        output_format (str, optional): The format of the output video tensors. Can be either "THWC" (default) or "TCHW".
    Returns:
        vframes (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames
        aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points
        info (dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int)
    """
    # format
    output_format = output_format.upper()
    if output_format not in ("THWC", "TCHW"):
        raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.")
    # file existence
    if not os.path.exists(filename):
        raise RuntimeError(f"File not found: {filename}")
    # backend check
    assert get_video_backend() == "pyav", "pyav backend is required for read_video_av"
    _check_av_available()
    # end_pts check
    if end_pts is None:
        end_pts = float("inf")
    if end_pts < start_pts:
        raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}")
    # == get video info ==
    info = {}
    # TODO: creating an container leads to memory leak (1G for 8 workers 1 GPU)
    container = av.open(filename, metadata_errors="ignore")
    # fps
    video_fps = container.streams.video[0].average_rate
    # guard against potentially corrupted files
    if video_fps is not None:
        info["video_fps"] = float(video_fps)
    iter_video = container.decode(**{"video": 0})
    frame = next(iter_video).to_rgb().to_ndarray()
    height, width = frame.shape[:2]
    total_frames = container.streams.video[0].frames
    if total_frames == 0:
        total_frames = MAX_NUM_FRAMES
        warnings.warn(f"total_frames is 0, using {MAX_NUM_FRAMES} as a fallback")
    container.close()
    del container
    # HACK: must create before iterating stream
    # use np.zeros will not actually allocate memory
    # use np.ones will lead to a little memory leak
    video_frames = np.zeros((total_frames, height, width, 3), dtype=np.uint8)
    # == read ==
    try:
        # TODO: The reading has memory leak (4G for 8 workers 1 GPU)
        container = av.open(filename, metadata_errors="ignore")
        assert container.streams.video is not None
        video_frames = _read_from_stream(
            video_frames,
            container,
            start_pts,
            end_pts,
            pts_unit,
            container.streams.video[0],
            {"video": 0},
            filename=filename,
        )
    except av.AVError as e:
        print(f"[Warning] Error while reading video {filename}: {e}")
    vframes = torch.from_numpy(video_frames).clone()
    del video_frames
    if output_format == "TCHW":
        # [T,H,W,C] --> [T,C,H,W]
        vframes = vframes.permute(0, 3, 1, 2)
    aframes = torch.empty((1, 0), dtype=torch.float32)
    return vframes, aframes, info
 def _read_from_stream(
    video_frames,
    container: "av.container.Container",
    start_offset: float,
    end_offset: float,
    pts_unit: str,
    stream: "av.stream.Stream",
    stream_name: dict[str, int | tuple[int, ...] | list[int] | None],
    filename: str | None = None,
 ) -> list["av.frame.Frame"]:
    if pts_unit == "sec":
        # TODO: we should change all of this from ground up to simply take
        # sec and convert to MS in C++
        start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
        if end_offset != float("inf"):
            end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
    else:
        warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
    should_buffer = True
    max_buffer_size = 5
    if stream.type == "video":
        # DivX-style packed B-frames can have out-of-order pts (2 frames in a single pkt)
        # so need to buffer some extra frames to sort everything
        # properly
        extradata = stream.codec_context.extradata
        # overly complicated way of finding if `divx_packed` is set, following
        # https://github.com/FFmpeg/FFmpeg/commit/d5a21172283572af587b3d939eba0091484d3263
        if extradata and b"DivX" in extradata:
            # can't use regex directly because of some weird characters sometimes...
            pos = extradata.find(b"DivX")
            d = extradata[pos:]
            o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
            if o is None:
                o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
            if o is not None:
                should_buffer = o.group(3) == b"p"
    seek_offset = start_offset
    # some files don't seek to the right location, so better be safe here
    seek_offset = max(seek_offset - 1, 0)
    if should_buffer:
        # FIXME this is kind of a hack, but we will jump to the previous keyframe
        # so this will be safe
        seek_offset = max(seek_offset - max_buffer_size, 0)
    try:
        # TODO check if stream needs to always be the video stream here or not
        container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
    except av.AVError as e:
        print(f"[Warning] Error while seeking video {filename}: {e}")
        return []
    # == main ==
    buffer_count = 0
    frames_pts = []
    cnt = 0
    try:
        for _idx, frame in enumerate(container.decode(**stream_name)):
            frames_pts.append(frame.pts)
            video_frames[cnt] = frame.to_rgb().to_ndarray()
            cnt += 1
            if cnt >= len(video_frames):
                break
            if frame.pts >= end_offset:
                if should_buffer and buffer_count < max_buffer_size:
                    buffer_count += 1
                    continue
                break
    except av.AVError as e:
        print(f"[Warning] Error while reading video {filename}: {e}")
    # garbage collection for thread leakage
    container.close()
    del container
    # NOTE: manually garbage collect to close pyav threads
    gc.collect()
    # ensure that the results are sorted wrt the pts
    # NOTE: here we assert frames_pts is sorted
    start_ptr = 0
    end_ptr = cnt
    while start_ptr < end_ptr and frames_pts[start_ptr] < start_offset:
        start_ptr += 1
    while start_ptr < end_ptr and frames_pts[end_ptr - 1] > end_offset:
        end_ptr -= 1
    if start_offset > 0 and start_offset not in frames_pts[start_ptr:end_ptr]:
        # if there is no frame that exactly matches the pts of start_offset
        # add the last frame smaller than start_offset, to guarantee that
        # we will have all the necessary data. This is most useful for audio
        if start_ptr > 0:
            start_ptr -= 1
    result = video_frames[start_ptr:end_ptr].copy()
    return result
 def read_video_cv2(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # print("Error: Unable to open video")
        raise ValueError
    else:
        fps = cap.get(cv2.CAP_PROP_FPS)
        vinfo = {
            "video_fps": fps,
        }
        frames = []
        while True:
            # Read a frame from the video
            ret, frame = cap.read()
            # If frame is not read correctly, break the loop
            if not ret:
                break
            frames.append(frame[:, :, ::-1])  # BGR to RGB
            # Exit if 'q' is pressed
            if cv2.waitKey(25) & 0xFF == ord("q"):
                break
        # Release the video capture object and close all windows
        cap.release()
        cv2.destroyAllWindows()
        frames = np.stack(frames)
        frames = torch.from_numpy(frames)  # [T, H, W, C=3]
        frames = frames.permute(0, 3, 1, 2)
        return frames, vinfo
 def read_video(video_path, backend="av"):
    if backend == "cv2":
        vframes, vinfo = read_video_cv2(video_path)
    elif backend == "av":
        vframes, _, vinfo = read_video_av(filename=video_path, pts_unit="sec", output_format="TCHW")
    else:
        raise ValueError
    return vframes, vinfo
--- a/opensora/datasets/sampler.py
+++ b/opensora/datasets/sampler.py
@ -0,0 +1,393 @@
 from collections import OrderedDict, defaultdict
 from typing import Iterator
 import numpy as np
 import torch
 import torch.distributed as dist
 from torch.utils.data import Dataset, DistributedSampler
 from opensora.utils.logger import log_message
 from opensora.utils.misc import format_numel_str
 from .aspect import get_num_pexels_from_name
 from .bucket import Bucket
 from .datasets import VideoTextDataset
 from .parallel import pandarallel
 from .utils import sync_object_across_devices
 # use pandarallel to accelerate bucket processing
 # NOTE: pandarallel should only access local variables
 def apply(data, method=None, seed=None, num_bucket=None, fps_max=16):
    return method(
        data["num_frames"],
        data["height"],
        data["width"],
        data["fps"],
        data["path"],
        seed + data["id"] * num_bucket,
        fps_max,
    )
 class StatefulDistributedSampler(DistributedSampler):
    def __init__(
        self,
        dataset: Dataset,
        num_replicas: int | None = None,
        rank: int | None = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ) -> None:
        super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
        self.start_index: int = 0
    def __iter__(self) -> Iterator:
        iterator = super().__iter__()
        indices = list(iterator)
        indices = indices[self.start_index :]
        return iter(indices)
    def __len__(self) -> int:
        return self.num_samples - self.start_index
    def reset(self) -> None:
        self.start_index = 0
    def state_dict(self, step) -> dict:
        return {"start_index": step}
    def load_state_dict(self, state_dict: dict) -> None:
        self.__dict__.update(state_dict)
 class VariableVideoBatchSampler(DistributedSampler):
    def __init__(
        self,
        dataset: VideoTextDataset,
        bucket_config: dict,
        num_replicas: int | None = None,
        rank: int | None = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
        verbose: bool = False,
        num_bucket_build_workers: int = 1,
        num_groups: int = 1,
    ) -> None:
        super().__init__(
            dataset=dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle, seed=seed, drop_last=drop_last
        )
        self.dataset = dataset
        assert dataset.bucket_class == "Bucket", "Only support Bucket class for now"
        self.bucket = Bucket(bucket_config)
        self.verbose = verbose
        self.last_micro_batch_access_index = 0
        self.num_bucket_build_workers = num_bucket_build_workers
        self._cached_bucket_sample_dict = None
        self._cached_num_total_batch = None
        self.num_groups = num_groups
        if dist.get_rank() == 0:
            pandarallel.initialize(
                nb_workers=self.num_bucket_build_workers,
                progress_bar=False,
                verbose=0,
                use_memory_fs=False,
            )
    def __iter__(self) -> Iterator[list[int]]:
        bucket_sample_dict, _ = self.group_by_bucket()
        self.clear_cache()
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        bucket_micro_batch_count = OrderedDict()
        bucket_last_consumed = OrderedDict()
        # process the samples
        for bucket_id, data_list in bucket_sample_dict.items():
            # handle droplast
            bs_per_gpu = self.bucket.get_batch_size(bucket_id)
            remainder = len(data_list) % bs_per_gpu
            if remainder > 0:
                if not self.drop_last:
                    # if there is remainder, we pad to make it divisible
                    data_list += data_list[: bs_per_gpu - remainder]
                else:
                    # we just drop the remainder to make it divisible
                    data_list = data_list[:-remainder]
            bucket_sample_dict[bucket_id] = data_list
            # handle shuffle
            if self.shuffle:
                data_indices = torch.randperm(len(data_list), generator=g).tolist()
                data_list = [data_list[i] for i in data_indices]
                bucket_sample_dict[bucket_id] = data_list
            # compute how many micro-batches each bucket has
            num_micro_batches = len(data_list) // bs_per_gpu
            bucket_micro_batch_count[bucket_id] = num_micro_batches
        # compute the bucket access order
        # each bucket may have more than one batch of data
        # thus bucket_id may appear more than 1 time
        bucket_id_access_order = []
        for bucket_id, num_micro_batch in bucket_micro_batch_count.items():
            bucket_id_access_order.extend([bucket_id] * num_micro_batch)
        # randomize the access order
        if self.shuffle:
            bucket_id_access_order_indices = torch.randperm(len(bucket_id_access_order), generator=g).tolist()
            bucket_id_access_order = [bucket_id_access_order[i] for i in bucket_id_access_order_indices]
        # make the number of bucket accesses divisible by dp size
        remainder = len(bucket_id_access_order) % self.num_replicas
        if remainder > 0:
            if self.drop_last:
                bucket_id_access_order = bucket_id_access_order[: len(bucket_id_access_order) - remainder]
            else:
                bucket_id_access_order += bucket_id_access_order[: self.num_replicas - remainder]
        # prepare each batch from its bucket
        # according to the predefined bucket access order
        num_iters = len(bucket_id_access_order) // self.num_replicas
        start_iter_idx = self.last_micro_batch_access_index // self.num_replicas
        # re-compute the micro-batch consumption
        # this is useful when resuming from a state dict with a different number of GPUs
        self.last_micro_batch_access_index = start_iter_idx * self.num_replicas
        for i in range(self.last_micro_batch_access_index):
            bucket_id = bucket_id_access_order[i]
            bucket_bs = self.bucket.get_batch_size(bucket_id)
            if bucket_id in bucket_last_consumed:
                bucket_last_consumed[bucket_id] += bucket_bs
            else:
                bucket_last_consumed[bucket_id] = bucket_bs
        for i in range(start_iter_idx, num_iters):
            bucket_access_list = bucket_id_access_order[i * self.num_replicas : (i + 1) * self.num_replicas]
            self.last_micro_batch_access_index += self.num_replicas
            # compute the data samples consumed by each access
            bucket_access_boundaries = []
            for bucket_id in bucket_access_list:
                bucket_bs = self.bucket.get_batch_size(bucket_id)
                last_consumed_index = bucket_last_consumed.get(bucket_id, 0)
                bucket_access_boundaries.append([last_consumed_index, last_consumed_index + bucket_bs])
                # update consumption
                if bucket_id in bucket_last_consumed:
                    bucket_last_consumed[bucket_id] += bucket_bs
                else:
                    bucket_last_consumed[bucket_id] = bucket_bs
            # compute the range of data accessed by each GPU
            bucket_id = bucket_access_list[self.rank]
            boundary = bucket_access_boundaries[self.rank]
            cur_micro_batch = bucket_sample_dict[bucket_id][boundary[0] : boundary[1]]
            # encode t, h, w into the sample index
            real_t, real_h, real_w = self.bucket.get_thw(bucket_id)
            cur_micro_batch = [f"{idx}-{real_t}-{real_h}-{real_w}" for idx in cur_micro_batch]
            yield cur_micro_batch
        self.reset()
    def __len__(self) -> int:
        return self.get_num_batch() // self.num_groups
    def get_num_batch(self) -> int:
        _, num_total_batch = self.group_by_bucket()
        return num_total_batch
    def clear_cache(self):
        self._cached_bucket_sample_dict = None
        self._cached_num_total_batch = 0
    def group_by_bucket(self) -> dict:
        """
        Group the dataset samples into buckets.
        This method will set `self._cached_bucket_sample_dict` to the bucket sample dict.
        Returns:
            dict: a dictionary with bucket id as key and a list of sample indices as value
        """
        if self._cached_bucket_sample_dict is not None:
            return self._cached_bucket_sample_dict, self._cached_num_total_batch
        # use pandarallel to accelerate bucket processing
        log_message("Building buckets using %d workers...", self.num_bucket_build_workers)
        bucket_ids = None
        if dist.get_rank() == 0:
            data = self.dataset.data.copy(deep=True)
            data["id"] = data.index
            bucket_ids = data.parallel_apply(
                apply,
                axis=1,
                method=self.bucket.get_bucket_id,
                seed=self.seed + self.epoch,
                num_bucket=self.bucket.num_bucket,
                fps_max=self.dataset.fps_max,
            )
        dist.barrier()
        bucket_ids = sync_object_across_devices(bucket_ids)
        dist.barrier()
        # group by bucket
        # each data sample is put into a bucket with a similar image/video size
        bucket_sample_dict = defaultdict(list)
        bucket_ids_np = np.array(bucket_ids)
        valid_indices = np.where(bucket_ids_np != None)[0]
        for i in valid_indices:
            bucket_sample_dict[bucket_ids_np[i]].append(i)
        # cache the bucket sample dict
        self._cached_bucket_sample_dict = bucket_sample_dict
        # num total batch
        num_total_batch = self.print_bucket_info(bucket_sample_dict)
        self._cached_num_total_batch = num_total_batch
        return bucket_sample_dict, num_total_batch
    def print_bucket_info(self, bucket_sample_dict: dict) -> int:
        # collect statistics
        num_total_samples = num_total_batch = 0
        num_total_img_samples = num_total_vid_samples = 0
        num_total_img_batch = num_total_vid_batch = 0
        num_total_vid_batch_256 = num_total_vid_batch_768 = 0
        num_aspect_dict = defaultdict(lambda: [0, 0])
        num_hwt_dict = defaultdict(lambda: [0, 0])
        for k, v in bucket_sample_dict.items():
            size = len(v)
            num_batch = size // self.bucket.get_batch_size(k[:-1])
            num_total_samples += size
            num_total_batch += num_batch
            if k[1] == 1:
                num_total_img_samples += size
                num_total_img_batch += num_batch
            else:
                if k[0] == "256px":
                    num_total_vid_batch_256 += num_batch
                elif k[0] == "768px":
                    num_total_vid_batch_768 += num_batch
                num_total_vid_samples += size
                num_total_vid_batch += num_batch
            num_aspect_dict[k[-1]][0] += size
            num_aspect_dict[k[-1]][1] += num_batch
            num_hwt_dict[k[:-1]][0] += size
            num_hwt_dict[k[:-1]][1] += num_batch
        # sort
        num_aspect_dict = dict(sorted(num_aspect_dict.items(), key=lambda x: x[0]))
        num_hwt_dict = dict(
            sorted(num_hwt_dict.items(), key=lambda x: (get_num_pexels_from_name(x[0][0]), x[0][1]), reverse=True)
        )
        num_hwt_img_dict = {k: v for k, v in num_hwt_dict.items() if k[1] == 1}
        num_hwt_vid_dict = {k: v for k, v in num_hwt_dict.items() if k[1] > 1}
        # log
        if dist.get_rank() == 0 and self.verbose:
            log_message("Bucket Info:")
            log_message("Bucket [#sample, #batch] by aspect ratio:")
            for k, v in num_aspect_dict.items():
                log_message("(%s): #sample: %s, #batch: %s", k, format_numel_str(v[0]), format_numel_str(v[1]))
            log_message("===== Image Info =====")
            log_message("Image Bucket by HxWxT:")
            for k, v in num_hwt_img_dict.items():
                log_message("%s: #sample: %s, #batch: %s", k, format_numel_str(v[0]), format_numel_str(v[1]))
            log_message("--------------------------------")
            log_message(
                "#image sample: %s, #image batch: %s",
                format_numel_str(num_total_img_samples),
                format_numel_str(num_total_img_batch),
            )
            log_message("===== Video Info =====")
            log_message("Video Bucket by HxWxT:")
            for k, v in num_hwt_vid_dict.items():
                log_message("%s: #sample: %s, #batch: %s", k, format_numel_str(v[0]), format_numel_str(v[1]))
            log_message("--------------------------------")
            log_message(
                "#video sample: %s, #video batch: %s",
                format_numel_str(num_total_vid_samples),
                format_numel_str(num_total_vid_batch),
            )
            log_message("===== Summary =====")
            log_message("#non-empty buckets: %s", len(bucket_sample_dict))
            log_message(
                "Img/Vid sample ratio: %.2f",
                num_total_img_samples / num_total_vid_samples if num_total_vid_samples > 0 else 0,
            )
            log_message(
                "Img/Vid batch ratio: %.2f", num_total_img_batch / num_total_vid_batch if num_total_vid_batch > 0 else 0
            )
            log_message(
                "vid batch 256: %s, vid batch 768: %s", format_numel_str(num_total_vid_batch_256), format_numel_str(num_total_vid_batch_768)
            )
            log_message(
                "Vid batch ratio (256px/768px): %.2f", num_total_vid_batch_256 / num_total_vid_batch_768 if num_total_vid_batch_768 > 0 else 0
            )
            log_message(
                "#training sample: %s, #training batch: %s",
                format_numel_str(num_total_samples),
                format_numel_str(num_total_batch),
            )
        return num_total_batch
    def reset(self):
        self.last_micro_batch_access_index = 0
    def set_step(self, start_step: int):
        self.last_micro_batch_access_index = start_step * self.num_replicas
    def state_dict(self, num_steps: int) -> dict:
        # the last_micro_batch_access_index in the __iter__ is often
        # not accurate during multi-workers and data prefetching
        # thus, we need the user to pass the actual steps which have been executed
        # to calculate the correct last_micro_batch_access_index
        return {"seed": self.seed, "epoch": self.epoch, "last_micro_batch_access_index": num_steps * self.num_replicas}
    def load_state_dict(self, state_dict: dict) -> None:
        self.__dict__.update(state_dict)
 class BatchDistributedSampler(DistributedSampler):
    """
    Used with BatchDataset;
    Suppose len_buffer == 5, num_buffers == 6, #GPUs == 3, then
           | buffer {i}          | buffer {i+1}
    ------ | ------------------- | -------------------
    rank 0 |  0,  1,  2,  3,  4, |  5,  6,  7,  8,  9
    rank 1 | 10, 11, 12, 13, 14, | 15, 16, 17, 18, 19
    rank 2 | 20, 21, 22, 23, 24, | 25, 26, 27, 28, 29
    """
    def __init__(self, dataset: Dataset, **kwargs):
        super().__init__(dataset, **kwargs)
        self.start_index = 0
    def __iter__(self):
        num_buffers = self.dataset.num_buffers
        len_buffer = self.dataset.len_buffer
        num_buffers_i = num_buffers // self.num_replicas
        num_samples_i = len_buffer * num_buffers_i
        indices_i = np.arange(self.start_index, num_samples_i) + self.rank * num_samples_i
        indices_i = indices_i.tolist()
        return iter(indices_i)
    def reset(self):
        self.start_index = 0
    def state_dict(self, step) -> dict:
        return {"start_index": step}
    def load_state_dict(self, state_dict: dict):
        self.start_index = state_dict["start_index"] + 1
--- a/opensora/datasets/utils.py
+++ b/opensora/datasets/utils.py
@ -0,0 +1,419 @@
 import math
 import os
 import random
 import re
 from typing import Any
 import numpy as np
 import pandas as pd
 import requests
 import torch
 import torch.distributed as dist
 import torchvision
 import torchvision.transforms as transforms
 from PIL import Image
 from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader
 from torchvision.io import write_video
 from torchvision.utils import save_image
 from . import video_transforms
 from .read_video import read_video
 try:
    import dask.dataframe as dd
    SUPPORT_DASK = True
 except:
    SUPPORT_DASK = False
 VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
 regex = re.compile(
    r"^(?:http|ftp)s?://"  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
 )
 def is_img(path):
    ext = os.path.splitext(path)[-1].lower()
    return ext in IMG_EXTENSIONS
 def is_vid(path):
    ext = os.path.splitext(path)[-1].lower()
    return ext in VID_EXTENSIONS
 def is_url(url):
    return re.match(regex, url) is not None
 def read_file(input_path, memory_efficient=False):
    if input_path.endswith(".csv"):
        assert not memory_efficient, "Memory efficient mode is not supported for CSV files"
        return pd.read_csv(input_path)
    elif input_path.endswith(".parquet"):
        columns = None
        if memory_efficient:
            columns = ["path", "num_frames", "height", "width", "aspect_ratio", "fps", "resolution"]
        if SUPPORT_DASK:
            ret = dd.read_parquet(input_path, columns=columns).compute()
        else:
            ret = pd.read_parquet(input_path, columns=columns)
        return ret
    else:
        raise NotImplementedError(f"Unsupported file format: {input_path}")
 def download_url(input_path):
    output_dir = "cache"
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.basename(input_path)
    output_path = os.path.join(output_dir, base_name)
    img_data = requests.get(input_path).content
    with open(output_path, "wb", encoding="utf-8") as handler:
        handler.write(img_data)
    print(f"URL {input_path} downloaded to {output_path}")
    return output_path
 def temporal_random_crop(
    vframes: torch.Tensor, num_frames: int, frame_interval: int, return_frame_indices: bool = False
 ) -> torch.Tensor | tuple[torch.Tensor, np.ndarray]:
    temporal_sample = video_transforms.TemporalRandomCrop(num_frames * frame_interval)
    total_frames = len(vframes)
    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
    assert (
        end_frame_ind - start_frame_ind >= num_frames
    ), f"Not enough frames to sample, {end_frame_ind} - {start_frame_ind} < {num_frames}"
    frame_indices = np.linspace(start_frame_ind, end_frame_ind - 1, num_frames, dtype=int)
    video = vframes[frame_indices]
    if return_frame_indices:
        return video, frame_indices
    else:
        return video
 def get_transforms_video(name="center", image_size=(256, 256)):
    if name is None:
        return None
    elif name == "center":
        assert image_size[0] == image_size[1], "image_size must be square for center crop"
        transform_video = transforms.Compose(
            [
                video_transforms.ToTensorVideo(),  # TCHW
                # video_transforms.RandomHorizontalFlipVideo(),
                video_transforms.UCFCenterCropVideo(image_size[0]),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            ]
        )
    elif name == "resize_crop":
        transform_video = transforms.Compose(
            [
                video_transforms.ToTensorVideo(),  # TCHW
                video_transforms.ResizeCrop(image_size),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            ]
        )
    elif name == "rand_size_crop":
        transform_video = transforms.Compose(
            [
                video_transforms.ToTensorVideo(),  # TCHW
                video_transforms.RandomSizedCrop(image_size),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            ]
        )
    else:
        raise NotImplementedError(f"Transform {name} not implemented")
    return transform_video
 def get_transforms_image(name="center", image_size=(256, 256)):
    if name is None:
        return None
    elif name == "center":
        assert image_size[0] == image_size[1], "Image size must be square for center crop"
        transform = transforms.Compose(
            [
                transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, image_size[0])),
                # transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            ]
        )
    elif name == "resize_crop":
        transform = transforms.Compose(
            [
                transforms.Lambda(lambda pil_image: resize_crop_to_fill(pil_image, image_size)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            ]
        )
    elif name == "rand_size_crop":
        transform = transforms.Compose(
            [
                transforms.Lambda(lambda pil_image: rand_size_crop_arr(pil_image, image_size)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            ]
        )
    else:
        raise NotImplementedError(f"Transform {name} not implemented")
    return transform
 def read_image_from_path(path, transform=None, transform_name="center", num_frames=1, image_size=(256, 256)):
    image = pil_loader(path)
    if transform is None:
        transform = get_transforms_image(image_size=image_size, name=transform_name)
    image = transform(image)
    video = image.unsqueeze(0).repeat(num_frames, 1, 1, 1)
    video = video.permute(1, 0, 2, 3)
    return video
 def read_video_from_path(path, transform=None, transform_name="center", image_size=(256, 256)):
    vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit="sec", output_format="TCHW")
    if transform is None:
        transform = get_transforms_video(image_size=image_size, name=transform_name)
    video = transform(vframes)  # T C H W
    video = video.permute(1, 0, 2, 3)
    return video
 def read_from_path(path, image_size, transform_name="center"):
    if is_url(path):
        path = download_url(path)
    ext = os.path.splitext(path)[-1].lower()
    if ext.lower() in VID_EXTENSIONS:
        return read_video_from_path(path, image_size=image_size, transform_name=transform_name)
    else:
        assert ext.lower() in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
        return read_image_from_path(path, image_size=image_size, transform_name=transform_name)
 def save_sample(
    x,
    save_path=None,
    fps=8,
    normalize=True,
    value_range=(-1, 1),
    force_video=False,
    verbose=True,
    crf=23,
 ):
    """
    Args:
        x (Tensor): shape [C, T, H, W]
    """
    assert x.ndim == 4
    if not force_video and x.shape[1] == 1:  # T = 1: save as image
        save_path += ".png"
        x = x.squeeze(1)
        save_image([x], save_path, normalize=normalize, value_range=value_range)
    else:
        save_path += ".mp4"
        if normalize:
            low, high = value_range
            x.clamp_(min=low, max=high)
            x.sub_(low).div_(max(high - low, 1e-5))
        x = x.mul_(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
        write_video(save_path, x, fps=fps, video_codec="h264", options={"crf": str(crf)})
    if verbose:
        print(f"Saved to {save_path}")
    return save_path
 def center_crop_arr(pil_image, image_size):
    """
    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
    """
    while min(*pil_image.size) >= 2 * image_size:
        pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size), resample=Image.BOX)
    scale = image_size / min(*pil_image.size)
    pil_image = pil_image.resize(tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC)
    arr = np.array(pil_image)
    crop_y = (arr.shape[0] - image_size) // 2
    crop_x = (arr.shape[1] - image_size) // 2
    return Image.fromarray(arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size])
 def rand_size_crop_arr(pil_image, image_size):
    """
    Randomly crop image for height and width, ranging from image_size[0] to image_size[1]
    """
    arr = np.array(pil_image)
    # get random target h w
    height = random.randint(image_size[0], image_size[1])
    width = random.randint(image_size[0], image_size[1])
    # ensure that h w are factors of 8
    height = height - height % 8
    width = width - width % 8
    # get random start pos
    h_start = random.randint(0, max(len(arr) - height, 0))
    w_start = random.randint(0, max(len(arr[0]) - height, 0))
    # crop
    return Image.fromarray(arr[h_start : h_start + height, w_start : w_start + width])
 def resize_crop_to_fill(pil_image, image_size):
    w, h = pil_image.size  # PIL is (W, H)
    th, tw = image_size
    rh, rw = th / h, tw / w
    if rh > rw:
        sh, sw = th, round(w * rh)
        image = pil_image.resize((sw, sh), Image.BICUBIC)
        i = 0
        j = int(round((sw - tw) / 2.0))
    else:
        sh, sw = round(h * rw), tw
        image = pil_image.resize((sw, sh), Image.BICUBIC)
        i = int(round((sh - th) / 2.0))
        j = 0
    arr = np.array(image)
    assert i + th <= arr.shape[0] and j + tw <= arr.shape[1]
    return Image.fromarray(arr[i : i + th, j : j + tw])
 def map_target_fps(
    fps: float,
    max_fps: float,
 ) -> tuple[float, int]:
    """
    Map fps to a new fps that is less than max_fps.
    Args:
        fps (float): Original fps.
        max_fps (float): Maximum fps.
    Returns:
        tuple[float, int]: New fps and sampling interval.
    """
    if math.isnan(fps):
        return 0, 1
    if fps < max_fps:
        return fps, 1
    sampling_interval = math.ceil(fps / max_fps)
    new_fps = math.floor(fps / sampling_interval)
    return new_fps, sampling_interval
 def sync_object_across_devices(obj: Any, rank: int = 0):
    """
    Synchronizes any picklable object across devices in a PyTorch distributed setting
    using `broadcast_object_list` with CUDA support.
    Parameters:
    obj (Any): The object to synchronize. Can be any picklable object (e.g., list, dict, custom class).
    rank (int): The rank of the device from which to broadcast the object state. Default is 0.
    Note: Ensure torch.distributed is initialized before using this function and CUDA is available.
    """
    # Move the object to a list for broadcasting
    object_list = [obj]
    # Broadcast the object list from the source rank to all other ranks
    dist.broadcast_object_list(object_list, src=rank, device="cuda")
    # Retrieve the synchronized object
    obj = object_list[0]
    return obj
 def rescale_image_by_path(path: str, height: int, width: int):
    """
    Rescales an image to the specified height and width and saves it back to the original path.
    Args:
        path (str): The file path of the image.
        height (int): The target height of the image.
        width (int): The target width of the image.
    """
    try:
        # read image
        image = Image.open(path)
        # check if image is valid
        if image is None:
            raise ValueError("The image is invalid or empty.")
        # resize image
        resize_transform = transforms.Resize((width, height))
        resized_image = resize_transform(image)
        # save resized image back to the original path
        resized_image.save(path)
    except Exception as e:
        print(f"Error rescaling image: {e}")
 def rescale_video_by_path(path: str, height: int, width: int):
    """
    Rescales an MP4 video (without audio) to the specified height and width.
    Args:
        path (str): The file path of the video.
        height (int): The target height of the video.
        width (int): The target width of the video.
    """
    try:
        # Read video and metadata
        video, info = read_video(path, backend="av")
        # Check if video is valid
        if video is None or video.size(0) == 0:
            raise ValueError("The video is invalid or empty.")
        # Resize video frames using a performant method
        resize_transform = transforms.Compose([transforms.Resize((width, height))])
        resized_video = torch.stack([resize_transform(frame) for frame in video])
        # Save resized video back to the original path
        resized_video = resized_video.permute(0, 2, 3, 1)
        write_video(path, resized_video, fps=int(info["video_fps"]), video_codec="h264")
    except Exception as e:
        print(f"Error rescaling video: {e}")
 def save_tensor_to_disk(tensor, path, exist_handling="overwrite"):
    if os.path.exists(path):
        if exist_handling == "ignore":
            return
        elif exist_handling == "raise":
            raise UserWarning(f"File {path} already exists, rewriting!")
    torch.save(tensor, path)
 def save_tensor_to_internet(tensor, path):
    raise NotImplementedError("save_tensor_to_internet is not implemented yet!")
 def save_latent(latent, path, exist_handling="overwrite"):
    if path.startswith(("http://", "https://", "ftp://", "sftp://")):
        save_tensor_to_internet(latent, path)
    else:
        save_tensor_to_disk(latent, path, exist_handling=exist_handling)
 def cache_latents(latents, path, exist_handling="overwrite"):
    for i in range(latents.shape[0]):
        save_latent(latents[i], path[i], exist_handling=exist_handling)
--- a/opensora/datasets/video_transforms.py
+++ b/opensora/datasets/video_transforms.py
@ -0,0 +1,595 @@
 # Copyright 2024 Vchitect/Latte
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #     http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.# Modified from Latte
 import numbers
 # - This file is adapted from https://github.com/Vchitect/Latte/blob/main/datasets/video_transforms.py
 import random
 import numpy as np
 import torch
 def _is_tensor_video_clip(clip):
    if not torch.is_tensor(clip):
        raise TypeError("clip should be Tensor. Got %s" % type(clip))
    if not clip.ndimension() == 4:
        raise ValueError("clip should be 4D. Got %dD" % clip.dim())
    return True
 def crop(clip, i, j, h, w):
    """
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
    """
    if len(clip.size()) != 4:
        raise ValueError("clip should be a 4D tensor")
    return clip[..., i : i + h, j : j + w]
 def resize(clip, target_size, interpolation_mode):
    if len(target_size) != 2:
        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
    return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)
 def resize_scale(clip, target_size, interpolation_mode):
    if len(target_size) != 2:
        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
    H, W = clip.size(-2), clip.size(-1)
    scale_ = target_size[0] / min(H, W)
    th, tw = int(round(H * scale_)), int(round(W * scale_))
    return torch.nn.functional.interpolate(clip, size=(th, tw), mode=interpolation_mode, align_corners=False)
 def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Do spatial cropping and resizing to the video clip
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        i (int): i in (i,j) i.e coordinates of the upper left corner.
        j (int): j in (i,j) i.e coordinates of the upper left corner.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): height and width of resized clip
    Returns:
        clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    clip = crop(clip, i, j, h, w)
    clip = resize(clip, size, interpolation_mode)
    return clip
 def center_crop(clip, crop_size):
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = crop_size
    if h < th or w < tw:
        raise ValueError("height and width must be no smaller than crop_size")
    i = int(round((h - th) / 2.0))
    j = int(round((w - tw) / 2.0))
    return crop(clip, i, j, th, tw)
 def center_crop_using_short_edge(clip):
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    if h < w:
        th, tw = h, h
        i = 0
        j = int(round((w - tw) / 2.0))
    else:
        th, tw = w, w
        i = int(round((h - th) / 2.0))
        j = 0
    return crop(clip, i, j, th, tw)
 def resize_crop_to_fill(clip, target_size):
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = target_size[0], target_size[1]
    rh, rw = th / h, tw / w
    if rh > rw:
        sh, sw = th, round(w * rh)
        clip = resize(clip, (sh, sw), "bilinear")
        i = 0
        j = int(round(sw - tw) / 2.0)
    else:
        sh, sw = round(h * rw), tw
        clip = resize(clip, (sh, sw), "bilinear")
        i = int(round(sh - th) / 2.0)
        j = 0
    assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
    return crop(clip, i, j, th, tw)
 # def rand_crop_h_w(clip, target_size_range, multiples_of: int = 8):
 #     # NOTE: for some reason, if don't re-import, gives same randint results
 #     import sys
 #     del sys.modules["random"]
 #     import random
 #     if not _is_tensor_video_clip(clip):
 #         raise ValueError("clip should be a 4D torch.tensor")
 #     h, w = clip.size(-2), clip.size(-1)
 #     # get random target h w
 #     th = random.randint(target_size_range[0], target_size_range[1])
 #     tw = random.randint(target_size_range[0], target_size_range[1])
 #     # ensure that h w are factors of 8
 #     th = th - th % multiples_of
 #     tw = tw - tw % multiples_of
 #     # get random start pos
 #     i = random.randint(0, h-th) if h > th else 0
 #     j = random.randint(0, w-tw) if w > tw else 0
 #     th = th if th < h else h
 #     tw = tw if tw < w else w
 #     # print("target size range:",target_size_range)
 #     # print("original size:", h, w)
 #     # print("crop size:", th, tw)
 #     # print(f"crop:{i}-{i+th}, {j}-{j+tw}")
 #     return (crop(clip, i, j, th, tw), th, tw)
 def random_shift_crop(clip):
    """
    Slide along the long edge, with the short edge as crop size
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    if h <= w:
        short_edge = h
    else:
        short_edge = w
    th, tw = short_edge, short_edge
    i = torch.randint(0, h - th + 1, size=(1,)).item()
    j = torch.randint(0, w - tw + 1, size=(1,)).item()
    return crop(clip, i, j, th, tw)
 def to_tensor(clip):
    """
    Convert tensor data type from uint8 to float, divide value by 255.0 and
    permute the dimensions of clip tensor
    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
    """
    _is_tensor_video_clip(clip)
    if not clip.dtype == torch.uint8:
        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
    # return clip.float().permute(3, 0, 1, 2) / 255.0
    return clip.float() / 255.0
 def normalize(clip, mean, std, inplace=False):
    """
    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
        mean (tuple): pixel RGB mean. Size is (3)
        std (tuple): pixel standard deviation. Size is (3)
    Returns:
        normalized clip (torch.tensor): Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    if not inplace:
        clip = clip.clone()
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    # print(mean)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
    return clip
 def hflip(clip):
    """
    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
    Returns:
        flipped clip (torch.tensor): Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    return clip.flip(-1)
 class ResizeCrop:
    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
    def __call__(self, clip):
        clip = resize_crop_to_fill(clip, self.size)
        return clip
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"
 class RandomSizedCrop:
    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
    def __call__(self, clip):
        i, j, h, w = self.get_params(clip)
        # self.size = (h, w)
        return crop(clip, i, j, h, w)
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"
    def get_params(self, clip, multiples_of=8):
        h, w = clip.shape[-2:]
        # get random target h w
        th = random.randint(self.size[0], self.size[1])
        tw = random.randint(self.size[0], self.size[1])
        # ensure that h w are factors of 8
        th = th - th % multiples_of
        tw = tw - tw % multiples_of
        if h < th:
            th = h - h % multiples_of
        if w < tw:
            tw = w - w % multiples_of
        if w == tw and h == th:
            return 0, 0, h, w
        else:
            # get random start pos
            i = random.randint(0, h - th)
            j = random.randint(0, w - tw)
        return i, j, th, tw
 class RandomCropVideo:
    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: randomly cropped video clip.
                size is (T, C, OH, OW)
        """
        i, j, h, w = self.get_params(clip)
        return crop(clip, i, j, h, w)
    def get_params(self, clip):
        h, w = clip.shape[-2:]
        th, tw = self.size
        if h < th or w < tw:
            raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
        if w == tw and h == th:
            return 0, 0, h, w
        i = torch.randint(0, h - th + 1, size=(1,)).item()
        j = torch.randint(0, w - tw + 1, size=(1,)).item()
        return i, j, th, tw
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"
 class CenterCropResizeVideo:
    """
    First use the short side for cropping length,
    center crop video, then resize to the specified size
    """
    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)
        self.interpolation_mode = interpolation_mode
    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop_using_short_edge(clip)
        clip_center_crop_resize = resize(
            clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
        )
        return clip_center_crop_resize
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
 class UCFCenterCropVideo:
    """
    First scale to the specified size in equal proportion to the short edge,
    then center cropping
    """
    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)
        self.interpolation_mode = interpolation_mode
    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
        clip_center_crop = center_crop(clip_resize, self.size)
        return clip_center_crop
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
 class KineticsRandomCropResizeVideo:
    """
    Slide along the long edge, with the short edge as crop size. And resie to the desired size.
    """
    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)
        self.interpolation_mode = interpolation_mode
    def __call__(self, clip):
        clip_random_crop = random_shift_crop(clip)
        clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
        return clip_resize
 class CenterCropVideo:
    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)
        self.interpolation_mode = interpolation_mode
    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop(clip, self.size)
        return clip_center_crop
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
 class NormalizeVideo:
    """
    Normalize the video clip by mean subtraction and division by standard deviation
    Args:
        mean (3-tuple): pixel RGB mean
        std (3-tuple): pixel RGB standard deviation
        inplace (boolean): whether do in-place normalization
    """
    def __init__(self, mean, std, inplace=False):
        self.mean = mean
        self.std = std
        self.inplace = inplace
    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): video clip must be normalized. Size is (C, T, H, W)
        """
        return normalize(clip, self.mean, self.std, self.inplace)
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
 class ToTensorVideo:
    """
    Convert tensor data type from uint8 to float, divide value by 255.0 and
    permute the dimensions of clip tensor
    """
    def __init__(self):
        pass
    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
        Return:
            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
        """
        return to_tensor(clip)
    def __repr__(self) -> str:
        return self.__class__.__name__
 class RandomHorizontalFlipVideo:
    """
    Flip the video clip along the horizontal direction with a given probability
    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """
    def __init__(self, p=0.5):
        self.p = p
    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Size is (T, C, H, W)
        Return:
            clip (torch.tensor): Size is (T, C, H, W)
        """
        if random.random() < self.p:
            clip = hflip(clip)
        return clip
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(p={self.p})"
 #  ------------------------------------------------------------
 #  ---------------------  Sampling  ---------------------------
 #  ------------------------------------------------------------
 class TemporalRandomCrop(object):
    """Temporally crop the given frame indices at a random location.
    Args:
            size (int): Desired length of frames will be seen in the model.
    """
    def __init__(self, size):
        self.size = size
    def __call__(self, total_frames):
        rand_end = max(0, total_frames - self.size - 1)
        begin_index = random.randint(0, rand_end)
        end_index = min(begin_index + self.size, total_frames)
        return begin_index, end_index
 if __name__ == "__main__":
    import os
    import numpy as np
    import torchvision.io as io
    from torchvision import transforms
    from torchvision.utils import save_image
    vframes, aframes, info = io.read_video(filename="./v_Archery_g01_c03.avi", pts_unit="sec", output_format="TCHW")
    trans = transforms.Compose(
        [
            ToTensorVideo(),
            RandomHorizontalFlipVideo(),
            UCFCenterCropVideo(512),
            # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ]
    )
    target_video_len = 32
    frame_interval = 1
    total_frames = len(vframes)
    print(total_frames)
    temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)
    # Sampling video frames
    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
    # print(start_frame_ind)
    # print(end_frame_ind)
    assert end_frame_ind - start_frame_ind >= target_video_len
    frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
    print(frame_indice)
    select_vframes = vframes[frame_indice]
    print(select_vframes.shape)
    print(select_vframes.dtype)
    select_vframes_trans = trans(select_vframes)
    print(select_vframes_trans.shape)
    print(select_vframes_trans.dtype)
    select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
    print(select_vframes_trans_int.dtype)
    print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)
    io.write_video("./test.avi", select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)
    for i in range(target_video_len):
        save_image(
            select_vframes_trans[i], os.path.join("./test000", "%04d.png" % i), normalize=True, value_range=(-1, 1)
        )
--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@ -0,0 +1,282 @@
 # Dataset Management
 - [Dataset Management](#dataset-management)
  - [Dataset Format](#dataset-format)
  - [Dataset to CSV](#dataset-to-csv)
  - [Manage datasets](#manage-datasets)
    - [Requirement](#requirement)
    - [Basic Usage](#basic-usage)
    - [Score filtering](#score-filtering)
    - [Documentation](#documentation)
  - [Transform datasets](#transform-datasets)
    - [Resize](#resize)
    - [Frame extraction](#frame-extraction)
    - [Crop Midjourney 4 grid](#crop-midjourney-4-grid)
  - [Analyze datasets](#analyze-datasets)
  - [Data Process Pipeline](#data-process-pipeline)
 After preparing the raw dataset according to the [instructions](/docs/datasets.md), you can use the following commands to manage the dataset.
 ## Dataset Format
 All dataset should be provided in a `.csv` file (or `parquet.gzip` to save space), which is used for both training and data preprocessing. The columns should follow the words below:
 - `path`: the relative/absolute path or url to the image or video file. Required.
 - `text`: the caption or description of the image or video. Required for training.
 - `num_frames`: the number of frames in the video. Required for training.
 - `width`: the width of the video frame. Required for dynamic bucket.
 - `height`: the height of the video frame. Required for dynamic bucket.
 - `aspect_ratio`: the aspect ratio of the video frame (height / width). Required for dynamic bucket.
 - `resolution`: height x width. For analysis.
 - `text_len`: the number of tokens in the text. For analysis.
 - `aes`: aesthetic score calculated by [asethetic scorer](/tools/aesthetic/README.md). For filtering.
 - `flow`: optical flow score calculated by [UniMatch](/tools/scoring/README.md). For filtering.
 - `match`: matching score of a image-text/video-text pair calculated by [CLIP](/tools/scoring/README.md). For filtering.
 - `fps`: the frame rate of the video. Optional.
 - `cmotion`: the camera motion.
 An example ready for training:
 ```csv
 path, text, num_frames, width, height, aspect_ratio
 /absolute/path/to/image1.jpg, caption, 1, 720, 1280, 0.5625
 /absolute/path/to/video1.mp4, caption, 120, 720, 1280, 0.5625
 /absolute/path/to/video2.mp4, caption, 20, 256, 256, 1
 ```
 We use pandas to manage the `.csv` or `.parquet` files. The following code is for reading and writing files:
 ```python
 df = pd.read_csv(input_path)
 df = df.to_csv(output_path, index=False)
 # or use parquet, which is smaller
 df = pd.read_parquet(input_path)
 df = df.to_parquet(output_path, index=False)
 ```
 ## Dataset to CSV
 As a start point, `convert.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file:
 ```bash
 python -m tools.datasets.convert DATASET-TYPE DATA_FOLDER
 # general video folder
 python -m tools.datasets.convert video VIDEO_FOLDER --output video.csv
 # general image folder
 python -m tools.datasets.convert image IMAGE_FOLDER --output image.csv
 # imagenet
 python -m tools.datasets.convert imagenet IMAGENET_FOLDER --split train
 # ucf101
 python -m tools.datasets.convert ucf101 UCF101_FOLDER --split videos
 # vidprom
 python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv
 ```
 ## Manage datasets
 Use `datautil` to manage the dataset.
 ### Requirement
 Follow our [installation guide](../../docs/installation.md)'s "Data Dependencies" and "Datasets" section to install the required packages.
 <!-- To accelerate processing speed, you can install [pandarallel](https://github.com/nalepae/pandarallel):
 ```bash
 pip install pandarallel
 ``` -->
 <!-- To get image and video information, you need to install [opencv-python](https://github.com/opencv/opencv-python): -->
 <!-- ```bash
 pip install opencv-python
 # If your videos are in av1 codec instead of h264, you need to
 # - install ffmpeg first
 # - install via conda to support av1 codec
 conda install -c conda-forge opencv
 ``` -->
 <!-- Or to get video information, you can install ffmpeg and ffmpeg-python:
 ```bash
 pip install ffmpeg-python
 ``` -->
 <!-- To filter a specific language, you need to install [lingua](https://github.com/pemistahl/lingua-py):
 ```bash
 pip install lingua-language-detector
 ``` -->
 ### Basic Usage
 You can use the following commands to process the `csv` or `parquet` files. The output file will be saved in the same directory as the input, with different suffixes indicating the processed method.
 ```bash
 # datautil takes multiple CSV files as input and merge them into one CSV file
 # output: DATA1+DATA2.csv
 python -m tools.datasets.datautil DATA1.csv DATA2.csv
 # shard CSV files into multiple CSV files
 # output: DATA1_0.csv, DATA1_1.csv, ...
 python -m tools.datasets.datautil DATA1.csv --shard 10
 # filter frames between 128 and 256, with captions
 # output: DATA1_fmin_128_fmax_256.csv
 python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256
 # Disable parallel processing
 python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256 --disable-parallel
 # Compute num_frames, height, width, fps, aspect_ratio for videos or images
 # output: IMG_DATA+VID_DATA_vinfo.csv
 python -m tools.datasets.datautil IMG_DATA.csv VID_DATA.csv --video-info
 # You can run multiple operations at the same time.
 python -m tools.datasets.datautil DATA.csv --video-info --remove-empty-caption --remove-url --lang en
 ```
 ### Score filtering
 To examine and filter the quality of the dataset by aesthetic score and clip score, you can use the following commands:
 ```bash
 # sort the dataset by aesthetic score
 # output: DATA_sort.csv
 python -m tools.datasets.datautil DATA.csv --sort aesthetic_score
 # View examples of high aesthetic score
 head -n 10 DATA_sort.csv
 # View examples of low aesthetic score
 tail -n 10 DATA_sort.csv
 # sort the dataset by clip score
 # output: DATA_sort.csv
 python -m tools.datasets.datautil DATA.csv --sort clip_score
 # filter the dataset by aesthetic score
 # output: DATA_aesmin_0.5.csv
 python -m tools.datasets.datautil DATA.csv --aesmin 0.5
 # filter the dataset by clip score
 # output: DATA_matchmin_0.5.csv
 python -m tools.datasets.datautil DATA.csv --matchmin 0.5
 ```
 ### Documentation
 You can also use `python -m tools.datasets.datautil --help` to see usage.
 | Args                        | File suffix    | Description                                                   |
 | --------------------------- | -------------- | ------------------------------------------------------------- |
 | `--output OUTPUT`           |                | Output path                                                   |
 | `--format FORMAT`           |                | Output format (csv, parquet, parquet.gzip)                    |
 | `--disable-parallel`        |                | Disable `pandarallel`                                         |
 | `--seed SEED`               |                | Random seed                                                   |
 | `--shard SHARD`             | `_0`,`_1`, ... | Shard the dataset                                             |
 | `--sort KEY`                | `_sort`        | Sort the dataset by KEY                                       |
 | `--sort-descending KEY`     | `_sort`        | Sort the dataset by KEY in descending order                   |
 | `--difference DATA.csv`     |                | Remove the paths in DATA.csv from the dataset                 |
 | `--intersection DATA.csv`   |                | Keep the paths in DATA.csv from the dataset and merge columns |
 | `--info`                    | `_info`        | Get the basic information of each video and image (cv2)       |
 | `--ext`                     | `_ext`         | Remove rows if the file does not exist                        |
 | `--relpath`                 | `_relpath`     | Modify the path to relative path by root given                |
 | `--abspath`                 | `_abspath`     | Modify the path to absolute path by root given                |
 | `--remove-empty-caption`    | `_noempty`     | Remove rows with empty caption                                |
 | `--remove-url`              | `_nourl`       | Remove rows with url in caption                               |
 | `--lang LANG`               | `_lang`        | Remove rows with other language                               |
 | `--remove-path-duplication` | `_noduppath`   | Remove rows with duplicated path                              |
 | `--remove-text-duplication` | `_noduptext`   | Remove rows with duplicated caption                           |
 | `--refine-llm-caption`      | `_llm`         | Modify the caption generated by LLM                           |
 | `--clean-caption MODEL`     | `_clean`       | Modify the caption according to T5 pipeline to suit training  |
 | `--unescape`                | `_unescape`    | Unescape the caption                                          |
 | `--merge-cmotion`           | `_cmotion`     | Merge the camera motion to the caption                        |
 | `--count-num-token`         | `_ntoken`      | Count the number of tokens in the caption                     |
 | `--load-caption EXT`        | `_load`        | Load the caption from the file                                |
 | `--fmin FMIN`               | `_fmin`        | Filter the dataset by minimum number of frames                |
 | `--fmax FMAX`               | `_fmax`        | Filter the dataset by maximum number of frames                |
 | `--hwmax HWMAX`             | `_hwmax`       | Filter the dataset by maximum height x width                  |
 | `--aesmin AESMIN`           | `_aesmin`      | Filter the dataset by minimum aesthetic score                 |
 | `--matchmin MATCHMIN`       | `_matchmin`    | Filter the dataset by minimum clip score                      |
 | `--flowmin FLOWMIN`         | `_flowmin`     | Filter the dataset by minimum optical flow score              |
 ## Transform datasets
 The `tools.datasets.transform` module provides a set of tools to transform the dataset. The general usage is as follows:
 ```bash
 python -m tools.datasets.transform TRANSFORM_TYPE META.csv ORIGINAL_DATA_FOLDER DATA_FOLDER_TO_SAVE_RESULTS --additional-args
 ```
 ### Resize
 Sometimes you may need to resize the images or videos to a specific resolution. You can use the following commands to resize the dataset:
 ```bash
 python -m tools.datasets.transform meta.csv /path/to/raw/data /path/to/new/data --length 2160
 ```
 ### Frame extraction
 To extract frames from videos, you can use the following commands:
 ```bash
 python -m tools.datasets.transform vid_frame_extract meta.csv /path/to/raw/data /path/to/new/data --points 0.1 0.5 0.9
 ```
 ### Crop Midjourney 4 grid
 Randomly select one of the 4 images in the 4 grid generated by Midjourney.
 ```bash
 python -m tools.datasets.transform img_rand_crop meta.csv /path/to/raw/data /path/to/new/data
 ```
 ## Analyze datasets
 You can easily get basic information about a `.csv` dataset by using the following commands:
 ```bash
 # examine the first 10 rows of the CSV file
 head -n 10 DATA1.csv
 # count the number of data in the CSV file (approximately)
 wc -l DATA1.csv
 ```
 For the dataset provided in a `.csv` or `.parquet` file, you can easily analyze the dataset using the following commands. Plots will be automatically saved.
 ```python
 pyhton -m tools.datasets.analyze DATA_info.csv
 ```
 ## Data Process Pipeline
 ```bash
 # Suppose videos and images under ~/dataset/
 # 1. Convert dataset to CSV
 python -m tools.datasets.convert video ~/dataset --output meta.csv
 # 2. Get video information
 python -m tools.datasets.datautil meta.csv --info --fmin 1
 # 3. Get caption
 # 3.1. generate caption
 torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava meta_info_fmin1.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
 # merge generated results
 python -m tools.datasets.datautil meta_info_fmin1_caption_part*.csv --output meta_caption.csv
 # merge caption and info
 python -m tools.datasets.datautil meta_info_fmin1.csv --intersection meta_caption.csv --output meta_caption_info.csv
 # clean caption
 python -m tools.datasets.datautil meta_caption_info.csv --clean-caption --refine-llm-caption --remove-empty-caption --output meta_caption_processed.csv
 # 3.2. extract caption
 python -m tools.datasets.datautil meta_info_fmin1.csv --load-caption json --remove-empty-caption --clean-caption
 # 4. Scoring
 # aesthetic scoring
 torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta_caption_processed.csv
 python -m tools.datasets.datautil meta_caption_processed_part*.csv --output meta_caption_processed_aes.csv
 # optical flow scoring
 torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference meta_caption_processed.csv
 # matching scoring
 torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference meta_caption_processed.csv
 # camera motion
 python -m tools.caption.camera_motion_detect meta_caption_processed.csv
 ```
--- a/tools/datasets/init.py
+++ b/tools/datasets/init.py
--- a/tools/datasets/analyze.py
+++ b/tools/datasets/analyze.py
@ -0,0 +1,96 @@
 import argparse
 import os
 import matplotlib.pyplot as plt
 import pandas as pd
 def read_file(input_path):
    if input_path.endswith(".csv"):
        return pd.read_csv(input_path)
    elif input_path.endswith(".parquet"):
        return pd.read_parquet(input_path)
    else:
        raise NotImplementedError(f"Unsupported file format: {input_path}")
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="Path to the input dataset")
    parser.add_argument("--save-img", type=str, default="samples/infos/", help="Path to save the image")
    return parser.parse_args()
 def plot_data(data, column, bins, name):
    plt.clf()
    data.hist(column=column, bins=bins)
    os.makedirs(os.path.dirname(name), exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")
 def plot_categorical_data(data, column, name):
    plt.clf()
    data[column].value_counts().plot(kind="bar")
    os.makedirs(os.path.dirname(name), exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")
 COLUMNS = {
    "num_frames": 100,
    "resolution": 100,
    "text_len": 100,
    "aes": 100,
    "match": 100,
    "flow": 100,
    "cmotion": None,
 }
 def main(args):
    data = read_file(args.input)
    # === Image Data Info ===
    image_index = data["num_frames"] == 1
    if image_index.sum() > 0:
        print("=== Image Data Info ===")
        img_data = data[image_index]
        print(f"Number of images: {len(img_data)}")
        print(img_data.head())
        print(img_data.describe())
        if args.save_img:
            for column in COLUMNS:
                if column in img_data.columns and column not in ["num_frames", "cmotion"]:
                    if COLUMNS[column] is None:
                        plot_categorical_data(img_data, column, os.path.join(args.save_img, f"image_{column}.png"))
                    else:
                        plot_data(img_data, column, COLUMNS[column], os.path.join(args.save_img, f"image_{column}.png"))
    # === Video Data Info ===
    if not image_index.all():
        print("=== Video Data Info ===")
        video_data = data[~image_index]
        print(f"Number of videos: {len(video_data)}")
        if "num_frames" in video_data.columns:
            total_num_frames = video_data["num_frames"].sum()
            print(f"Number of frames: {total_num_frames}")
            DEFAULT_FPS = 30
            total_hours = total_num_frames / DEFAULT_FPS / 3600
            print(f"Total hours (30 FPS): {int(total_hours)}")
        print(video_data.head())
        print(video_data.describe())
        if args.save_img:
            for column in COLUMNS:
                if column in video_data.columns:
                    if COLUMNS[column] is None:
                        plot_categorical_data(video_data, column, os.path.join(args.save_img, f"video_{column}.png"))
                    else:
                        plot_data(
                            video_data, column, COLUMNS[column], os.path.join(args.save_img, f"video_{column}.png")
                        )
 if __name__ == "__main__":
    args = parse_args()
    main(args)
--- a/tools/datasets/check_integrity.py
+++ b/tools/datasets/check_integrity.py
@ -0,0 +1,79 @@
 import argparse
 import subprocess
 import pandas as pd
 from tqdm import tqdm
 tqdm.pandas()
 try:
    from pandarallel import pandarallel
    PANDA_USE_PARALLEL = True
 except ImportError:
    PANDA_USE_PARALLEL = False
 import shutil
 if not shutil.which("ffmpeg"):
    raise ImportError("FFmpeg is not installed")
 def apply(df, func, **kwargs):
    if PANDA_USE_PARALLEL:
        return df.parallel_apply(func, **kwargs)
    return df.progress_apply(func, **kwargs)
 def check_video_integrity(video_path):
    # try:
    can_open_result = subprocess.run(
        ["ffmpeg", "-v", "error", "-i", video_path, "-t", "0", "-f", "null", "-"],  # open video and capture 0 seconds
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    fast_scan_result = subprocess.run(
        ["ffmpeg", "-v", "error", "-analyzeduration", "10M", "-probesize", "10M", "-i", video_path, "-f", "null", "-"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if can_open_result.stderr == "" and fast_scan_result.stderr == "":
        return True
    else:
        return False
    # except Exception as e:
    #     return False
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="path to the input dataset")
    parser.add_argument("--disable-parallel", action="store_true", help="disable parallel processing")
    parser.add_argument("--num-workers", type=int, default=None, help="number of workers")
    args = parser.parse_args()
    if args.disable_parallel:
        PANDA_USE_PARALLEL = False
    if PANDA_USE_PARALLEL:
        if args.num_workers is not None:
            pandarallel.initialize(nb_workers=args.num_workers, progress_bar=True)
        else:
            pandarallel.initialize(progress_bar=True)
    data = pd.read_csv(args.input)
    assert "path" in data.columns
    data["integrity"] = apply(data["path"], check_video_integrity)
    integrity_file_path = args.input.replace(".csv", "_intact.csv")
    broken_file_path = args.input.replace(".csv", "_broken.csv")
    intact_data = data[data["integrity"] == True].drop(columns=["integrity"])
    intact_data.to_csv(integrity_file_path, index=False)
    broken_data = data[data["integrity"] == False].drop(columns=["integrity"])
    broken_data.to_csv(broken_file_path, index=False)
    print(
        f"Integrity check completed. Intact videos saved to: {integrity_file_path}, broken videos saved to {broken_file_path}."
    )
--- a/tools/datasets/convert.py
+++ b/tools/datasets/convert.py
@ -0,0 +1,144 @@
 import argparse
 import os
 import time
 import pandas as pd
 from torchvision.datasets import ImageNet
 IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
 VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".m2ts")
 def scan_recursively(root):
    num = 0
    for entry in os.scandir(root):
        if entry.is_file():
            yield entry
        elif entry.is_dir():
            num += 1
            if num % 100 == 0:
                print(f"Scanned {num} directories.")
            yield from scan_recursively(entry.path)
 def get_filelist(file_path, exts=None):
    filelist = []
    time_start = time.time()
    # == OS Walk ==
    # for home, dirs, files in os.walk(file_path):
    #     for filename in files:
    #         ext = os.path.splitext(filename)[-1].lower()
    #         if exts is None or ext in exts:
    #             filelist.append(os.path.join(home, filename))
    # == Scandir ==
    obj = scan_recursively(file_path)
    for entry in obj:
        if entry.is_file():
            ext = os.path.splitext(entry.name)[-1].lower()
            if exts is None or ext in exts:
                filelist.append(entry.path)
    time_end = time.time()
    print(f"Scanned {len(filelist)} files in {time_end - time_start:.2f} seconds.")
    return filelist
 def split_by_capital(name):
    # BoxingPunchingBag -> Boxing Punching Bag
    new_name = ""
    for i in range(len(name)):
        if name[i].isupper() and i != 0:
            new_name += " "
        new_name += name[i]
    return new_name
 def process_imagenet(root, split):
    root = os.path.expanduser(root)
    data = ImageNet(root, split=split)
    samples = [(path, data.classes[label][0]) for path, label in data.samples]
    output = f"imagenet_{split}.csv"
    df = pd.DataFrame(samples, columns=["path", "text"])
    df.to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")
 def process_ucf101(root, split):
    root = os.path.expanduser(root)
    video_lists = get_filelist(os.path.join(root, split))
    classes = [x.split("/")[-2] for x in video_lists]
    classes = [split_by_capital(x) for x in classes]
    samples = list(zip(video_lists, classes))
    output = f"ucf101_{split}.csv"
    df = pd.DataFrame(samples, columns=["path", "text"])
    df.to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")
 def process_vidprom(root, info):
    root = os.path.expanduser(root)
    video_lists = get_filelist(root)
    video_set = set(video_lists)
    # read info csv
    infos = pd.read_csv(info)
    abs_path = infos["uuid"].apply(lambda x: os.path.join(root, f"pika-{x}.mp4"))
    is_exist = abs_path.apply(lambda x: x in video_set)
    df = pd.DataFrame(dict(path=abs_path[is_exist], text=infos["prompt"][is_exist]))
    df.to_csv("vidprom.csv", index=False)
    print(f"Saved {len(df)} samples to vidprom.csv.")
 def process_general_images(root, output):
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        return
    path_list = get_filelist(root, IMG_EXTENSIONS)
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    relpath_list = [os.path.relpath(x, root) for x in path_list]
    df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))
    os.makedirs(os.path.dirname(output), exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")
 def process_general_videos(root, output):
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        return
    path_list = get_filelist(root, VID_EXTENSIONS)
    path_list = list(set(path_list))  # remove duplicates
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    relpath_list = [os.path.relpath(x, root) for x in path_list]
    df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))
    os.makedirs(os.path.dirname(output), exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101", "vidprom", "image", "video"])
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    parser.add_argument("--info", type=str, default=None)
    parser.add_argument("--output", type=str, default=None, required=True, help="Output path")
    args = parser.parse_args()
    if args.dataset == "imagenet":
        process_imagenet(args.root, args.split)
    elif args.dataset == "ucf101":
        process_ucf101(args.root, args.split)
    elif args.dataset == "vidprom":
        process_vidprom(args.root, args.info)
    elif args.dataset == "image":
        process_general_images(args.root, args.output)
    elif args.dataset == "video":
        process_general_videos(args.root, args.output)
    else:
        raise ValueError("Invalid dataset")
--- a/tools/datasets/csv2txt.py
+++ b/tools/datasets/csv2txt.py
@ -0,0 +1,14 @@
 import argparse
 import pandas as pd
 parser = argparse.ArgumentParser(description="Convert CSV file to txt file")
 parser.add_argument("csv_file", type=str, help="CSV file to convert")
 parser.add_argument("txt_file", type=str, help="TXT file to save")
 args = parser.parse_args()
 data = pd.read_csv(args.csv_file)
 text = data["text"].to_list()
 text = "\n".join(text)
 with open(args.txt_file, "w") as f:
    f.write(text)
--- a/tools/datasets/datautil.py
+++ b/tools/datasets/datautil.py
--- a/tools/datasets/filter_panda10m.py
+++ b/tools/datasets/filter_panda10m.py
@ -0,0 +1,262 @@
 # TODO: remove this file before releasing
 import argparse
 import html
 import os
 import re
 import pandas as pd
 from tqdm import tqdm
 tqdm.pandas()
 try:
    from pandarallel import pandarallel
    pandarallel.initialize(progress_bar=True)
    pandas_has_parallel = True
 except ImportError:
    pandas_has_parallel = False
 def apply(df, func, **kwargs):
    if pandas_has_parallel:
        return df.parallel_apply(func, **kwargs)
    return df.progress_apply(func, **kwargs)
 def basic_clean(text):
    import ftfy
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()
 BAD_PUNCT_REGEX = re.compile(
    r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
 )  # noqa
 def clean_caption(caption):
    import urllib.parse as ul
    from bs4 import BeautifulSoup
    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text
    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)
    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################
    # все виды тире / all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )
    # кавычки к одному стандарту
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)
    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)
    # ip adresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)
    # \n
    caption = re.sub(r"\\n", " ", caption)
    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
    #
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
    # this-is-my-cute-cat / this_is_my_cute_cat
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)
    caption = basic_clean(caption)
    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)
    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)
    caption.strip()
    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)
    return caption.strip()
 def get_10m_set():
    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)
    def process_single_caption(row):
        text_list = eval(row["caption"])
        clean_list = [clean_caption(x) for x in text_list]
        return str(clean_list)
    ret = apply(meta_10m, process_single_caption, axis=1)
    # ret = meta_10m.progress_apply(process_single_caption, axis=1)
    print("==> text processed.")
    text_list = []
    for x in ret:
        text_list += eval(x)
        # text_set = text_set.union(set(eval(x)))
    text_set = set(text_list)
    # meta_10m['caption_new'] = ret
    # meta_10m.to_csv('/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m_new-cap.csv')
    # video_id_set = set(meta_10m['videoID'])
    # id2t = {}
    # for idx, row in tqdm(meta_10m.iterrows(), total=len(meta_10m)):
    #     video_id = row['videoID']
    #     text_list = eval(row['caption'])
    #     id2t[video_id] = set(text_list)
    print(f"==> Loaded meta_10m from '{meta_path_10m}'")
    return text_set
 def filter_panda10m_text(meta_path, text_set):
    def process_single_row(row):
        # path = row['path']
        t = row["text"]
        # fname = os.path.basename(path)
        # video_id = fname[:fname.rindex('_')]
        if t not in text_set:
            return False
        return True
    meta = pd.read_csv(meta_path)
    ret = apply(meta, process_single_row, axis=1)
    # ret = meta.progress_apply(process_single_row, axis=1)
    meta = meta[ret]
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_filter-10m{ext}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")
 def filter_panda10m_timestamp(meta_path):
    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)
    id2t = {}
    for idx, row in tqdm(meta_10m.iterrows(), total=len(meta_10m)):
        video_id = row["videoID"]
        timestamp = eval(row["timestamp"])
        timestamp = [str(tuple(x)) for x in timestamp]
        id2t[video_id] = timestamp
    # video_id_set_10m = set(meta_10m['videoID'])
    print(f"==> Loaded meta_10m from '{meta_path_10m}'")
    def process_single_row(row):
        path = row["path"]
        t = row["timestamp"]
        fname = os.path.basename(path)
        video_id = fname[: fname.rindex("_")]
        if video_id not in id2t:
            return False
        if t not in id2t[video_id]:
            return False
        return True
        # return video_id in video_id_set_10m
    meta = pd.read_csv(meta_path)
    ret = apply(meta, process_single_row, axis=1)
    meta = meta[ret]
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_filter-10m{ext}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--meta_path", type=str, nargs="+")
    parser.add_argument("--num_workers", default=5, type=int)
    args = parser.parse_args()
    return args
 if __name__ == "__main__":
    args = parse_args()
    text_set = get_10m_set()
    for x in args.meta_path:
        filter_panda10m_text(x, text_set)
--- a/tools/datasets/save_first_frame.py
+++ b/tools/datasets/save_first_frame.py
@ -0,0 +1,66 @@
 import argparse
 import os
 import cv2
 import pandas as pd
 from tqdm import tqdm
 tqdm.pandas()
 try:
    from pandarallel import pandarallel
    PANDA_USE_PARALLEL = True
 except ImportError:
    PANDA_USE_PARALLEL = False
 def save_first_frame(video_path, img_dir):
    if not os.path.exists(video_path):
        print(f"Video not found: {video_path}")
        return ""
    try:
        cap = cv2.VideoCapture(video_path)
        success, frame = cap.read()
        if success:
            video_name = os.path.basename(video_path)
            image_name = os.path.splitext(video_name)[0] + "_first_frame.jpg"
            image_path = os.path.join(img_dir, image_name)
            cv2.imwrite(image_path, frame)
        else:
            raise ValueError("Video broken.")
        cap.release()
        return image_path
    except Exception as e:
        print(f"Save first frame of `{video_path}` failed. {e}")
        return ""
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="path to the input csv dataset")
    parser.add_argument("--img-dir", type=str, help="path to save first frame image")
    parser.add_argument("--disable-parallel", action="store_true", help="disable parallel processing")
    parser.add_argument("--num-workers", type=int, default=None, help="number of workers")
    args = parser.parse_args()
    if args.disable_parallel:
        PANDA_USE_PARALLEL = False
    if PANDA_USE_PARALLEL:
        if args.num_workers is not None:
            pandarallel.initialize(nb_workers=args.num_workers, progress_bar=True)
        else:
            pandarallel.initialize(progress_bar=True)
    if not os.path.exists(args.img_dir):
        os.makedirs(args.img_dir)
    data = pd.read_csv(args.input)
    data["first_frame_path"] = data["path"].parallel_apply(save_first_frame, img_dir=args.img_dir)
    data_filtered = data.loc[data["first_frame_path"] != ""]
    output_csv_path = args.input.replace(".csv", "_first-frame.csv")
    data_filtered.to_csv(output_csv_path, index=False)
    print(f"First frame csv saved to: {output_csv_path}, first frame images saved to {args.img_dir}.")
--- a/tools/datasets/split.py
+++ b/tools/datasets/split.py
@ -0,0 +1,72 @@
 import argparse
 from typing import List
 import pandas as pd
 from mmengine.config import Config
 from opensora.datasets.bucket import Bucket
 def split_by_bucket(
    bucket: Bucket,
    input_files: List[str],
    output_path: str,
    limit: int,
    frame_interval: int,
 ):
    print(f"Split {len(input_files)} files into {len(bucket)} buckets")
    total_limit = len(bucket) * limit
    bucket_cnt = {}
    # get all bucket id
    for hw_id, d in bucket.ar_criteria.items():
        for t_id, v in d.items():
            for ar_id in v.keys():
                bucket_id = (hw_id, t_id, ar_id)
                bucket_cnt[bucket_id] = 0
    output_df = None
    # split files
    for path in input_files:
        df = pd.read_csv(path)
        if output_df is None:
            output_df = pd.DataFrame(columns=df.columns)
        for i in range(len(df)):
            row = df.iloc[i]
            t, h, w = row["num_frames"], row["height"], row["width"]
            bucket_id = bucket.get_bucket_id(t, h, w, frame_interval)
            if bucket_id is None:
                continue
            if bucket_cnt[bucket_id] < limit:
                bucket_cnt[bucket_id] += 1
                output_df = pd.concat([output_df, pd.DataFrame([row])], ignore_index=True)
                if len(output_df) >= total_limit:
                    break
        if len(output_df) >= total_limit:
            break
    assert len(output_df) <= total_limit
    if len(output_df) == total_limit:
        print(f"All buckets are full ({total_limit} samples)")
    else:
        print(f"Only {len(output_df)} files are used")
    output_df.to_csv(output_path, index=False)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, nargs="+")
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-c", "--config", required=True)
    parser.add_argument("-l", "--limit", default=200, type=int)
    args = parser.parse_args()
    assert args.limit > 0
    cfg = Config.fromfile(args.config)
    bucket_config = cfg.bucket_config
    # rewrite bucket_config
    for ar, d in bucket_config.items():
        for frames, t in d.items():
            p, bs = t
            if p > 0.0:
                p = 1.0
            d[frames] = (p, bs)
    bucket = Bucket(bucket_config)
    split_by_bucket(bucket, args.input, args.output, args.limit, cfg.dataset.frame_interval)
--- a/tools/datasets/transform.py
+++ b/tools/datasets/transform.py
@ -0,0 +1,306 @@
 import argparse
 import os
 import random
 import shutil
 import subprocess
 import cv2
 import ffmpeg
 import numpy as np
 import pandas as pd
 from pandarallel import pandarallel
 from tqdm import tqdm
 from .utils import IMG_EXTENSIONS, extract_frames
 tqdm.pandas()
 USE_PANDARALLEL = True
 def apply(df, func, **kwargs):
    if USE_PANDARALLEL:
        return df.parallel_apply(func, **kwargs)
    return df.progress_apply(func, **kwargs)
 def get_new_path(path, input_dir, output):
    path_new = os.path.join(output, os.path.relpath(path, input_dir))
    os.makedirs(os.path.dirname(path_new), exist_ok=True)
    return path_new
 def resize_longer(path, length, input_dir, output_dir):
    path_new = get_new_path(path, input_dir, output_dir)
    ext = os.path.splitext(path)[1].lower()
    assert ext in IMG_EXTENSIONS
    img = cv2.imread(path)
    if img is not None:
        h, w = img.shape[:2]
        if min(h, w) > length:
            if h > w:
                new_h = length
                new_w = int(w / h * length)
            else:
                new_w = length
                new_h = int(h / w * length)
            img = cv2.resize(img, (new_w, new_h))
        cv2.imwrite(path_new, img)
    else:
        path_new = ""
    return path_new
 def resize_shorter(path, length, input_dir, output_dir):
    path_new = get_new_path(path, input_dir, output_dir)
    if os.path.exists(path_new):
        return path_new
    ext = os.path.splitext(path)[1].lower()
    assert ext in IMG_EXTENSIONS
    img = cv2.imread(path)
    if img is not None:
        h, w = img.shape[:2]
        if min(h, w) > length:
            if h > w:
                new_w = length
                new_h = int(h / w * length)
            else:
                new_h = length
                new_w = int(w / h * length)
            img = cv2.resize(img, (new_w, new_h))
        cv2.imwrite(path_new, img)
    else:
        path_new = ""
    return path_new
 def rand_crop(path, input_dir, output):
    ext = os.path.splitext(path)[1].lower()
    path_new = get_new_path(path, input_dir, output)
    assert ext in IMG_EXTENSIONS
    img = cv2.imread(path)
    if img is not None:
        h, w = img.shape[:2]
        width, height, _ = img.shape
        pos = random.randint(0, 3)
        if pos == 0:
            img_cropped = img[: width // 2, : height // 2]
        elif pos == 1:
            img_cropped = img[width // 2 :, : height // 2]
        elif pos == 2:
            img_cropped = img[: width // 2, height // 2 :]
        else:
            img_cropped = img[width // 2 :, height // 2 :]
        cv2.imwrite(path_new, img_cropped)
    else:
        path_new = ""
    return path_new
 def m2ts_to_mp4(row, output_dir):
    input_path = row["path"]
    output_name = os.path.basename(input_path).replace(".m2ts", ".mp4")
    output_path = os.path.join(output_dir, output_name)
    # create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    try:
        ffmpeg.input(input_path).output(output_path).overwrite_output().global_args("-loglevel", "quiet").run(
            capture_stdout=True
        )
        row["path"] = output_path
        row["relpath"] = os.path.splitext(row["relpath"])[0] + ".mp4"
    except Exception as e:
        print(f"Error converting {input_path} to mp4: {e}")
        row["path"] = ""
        row["relpath"] = ""
        return row
    return row
 def mkv_to_mp4(row, output_dir):
    # str_to_replace and str_to_replace_with account for the different directory structure
    input_path = row["path"]
    output_name = os.path.basename(input_path).replace(".mkv", ".mp4")
    output_path = os.path.join(output_dir, output_name)
    # create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    try:
        ffmpeg.input(input_path).output(output_path).overwrite_output().global_args("-loglevel", "quiet").run(
            capture_stdout=True
        )
        row["path"] = output_path
        row["relpath"] = os.path.splitext(row["relpath"])[0] + ".mp4"
    except Exception as e:
        print(f"Error converting {input_path} to mp4: {e}")
        row["path"] = ""
        row["relpath"] = ""
        return row
    return row
 def mp4_to_mp4(row, output_dir):
    # str_to_replace and str_to_replace_with account for the different directory structure
    input_path = row["path"]
    # 检查输入文件是否为.mp4文件
    if not input_path.lower().endswith(".mp4"):
        print(f"Error: {input_path} is not an .mp4 file.")
        row["path"] = ""
        row["relpath"] = ""
        return row
    output_name = os.path.basename(input_path)
    output_path = os.path.join(output_dir, output_name)
    # create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    try:
        shutil.copy2(input_path, output_path)  # 使用shutil复制文件
        row["path"] = output_path
        row["relpath"] = os.path.splitext(row["relpath"])[0] + ".mp4"
    except Exception as e:
        print(f"Error coy {input_path} to mp4: {e}")
        row["path"] = ""
        row["relpath"] = ""
        return row
    return row
 def crop_to_square(input_path, output_path):
    cmd = (
        f"ffmpeg -i {input_path} "
        f"-vf \"crop='min(in_w,in_h)':'min(in_w,in_h)':'(in_w-min(in_w,in_h))/2':'(in_h-min(in_w,in_h))/2'\" "
        f"-c:v libx264 -an "
        f"-map 0:v {output_path}"
    )
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    stdout, stderr = proc.communicate()
 def vid_crop_center(row, input_dir, output_dir):
    input_path = row["path"]
    relpath = os.path.relpath(input_path, input_dir)
    assert not relpath.startswith("..")
    output_path = os.path.join(output_dir, relpath)
    # create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    try:
        crop_to_square(input_path, output_path)
        size = min(row["height"], row["width"])
        row["path"] = output_path
        row["height"] = size
        row["width"] = size
        row["aspect_ratio"] = 1.0
        row["resolution"] = size**2
    except Exception as e:
        print(f"Error cropping {input_path} to center: {e}")
        row["path"] = ""
    return row
 def main():
    args = parse_args()
    global USE_PANDARALLEL
    assert args.num_workers is None or not args.disable_parallel
    if args.disable_parallel:
        USE_PANDARALLEL = False
    if args.num_workers is not None:
        pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
    else:
        pandarallel.initialize(progress_bar=True)
    random.seed(args.seed)
    data = pd.read_csv(args.meta_path)
    if args.task == "img_rand_crop":
        data["path"] = apply(data["path"], lambda x: rand_crop(x, args.input_dir, args.output_dir))
        output_csv = args.meta_path.replace(".csv", "_rand_crop.csv")
    elif args.task == "img_resize_longer":
        data["path"] = apply(data["path"], lambda x: resize_longer(x, args.length, args.input_dir, args.output_dir))
        output_csv = args.meta_path.replace(".csv", f"_resize-longer-{args.length}.csv")
    elif args.task == "img_resize_shorter":
        data["path"] = apply(data["path"], lambda x: resize_shorter(x, args.length, args.input_dir, args.output_dir))
        output_csv = args.meta_path.replace(".csv", f"_resize-shorter-{args.length}.csv")
    elif args.task == "vid_frame_extract":
        points = args.points if args.points is not None else args.points_index
        data = pd.DataFrame(np.repeat(data.values, 3, axis=0), columns=data.columns)
        num_points = len(points)
        data["point"] = np.nan
        for i, point in enumerate(points):
            if isinstance(point, int):
                data.loc[i::num_points, "point"] = point
            else:
                data.loc[i::num_points, "point"] = data.loc[i::num_points, "num_frames"] * point
        data["path"] = apply(
            data, lambda x: extract_frames(x["path"], args.input_dir, args.output_dir, x["point"]), axis=1
        )
        output_csv = args.meta_path.replace(".csv", "_vid_frame_extract.csv")
    elif args.task == "m2ts_to_mp4":
        print(f"m2ts_to_mp4作业开始：{args.output_dir}")
        assert args.meta_path.endswith("_m2ts.csv"), "Input file must end with '_m2ts.csv'"
        m2ts_to_mp4_partial = lambda x: m2ts_to_mp4(x, args.output_dir)
        data = apply(data, m2ts_to_mp4_partial, axis=1)
        data = data[data["path"] != ""]
        output_csv = args.meta_path.replace("_m2ts.csv", ".csv")
    elif args.task == "mkv_to_mp4":
        print(f"mkv_to_mp4作业开始：{args.output_dir}")
        assert args.meta_path.endswith("_mkv.csv"), "Input file must end with '_mkv.csv'"
        mkv_to_mp4_partial = lambda x: mkv_to_mp4(x, args.output_dir)
        data = apply(data, mkv_to_mp4_partial, axis=1)
        data = data[data["path"] != ""]
        output_csv = args.meta_path.replace("_mkv.csv", ".csv")
    elif args.task == "mp4_to_mp4":
        # assert args.meta_path.endswith("meta.csv"), "Input file must end with '_mkv.csv'"
        print(f"MP4复制作业开始：{args.output_dir}")
        mkv_to_mp4_partial = lambda x: mp4_to_mp4(x, args.output_dir)
        data = apply(data, mkv_to_mp4_partial, axis=1)
        data = data[data["path"] != ""]
        output_csv = args.meta_path
    elif args.task == "vid_crop_center":
        vid_crop_center_partial = lambda x: vid_crop_center(x, args.input_dir, args.output_dir)
        data = apply(data, vid_crop_center_partial, axis=1)
        data = data[data["path"] != ""]
        output_csv = args.meta_path.replace(".csv", "_center-crop.csv")
    else:
        raise ValueError
    data.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")
    raise SystemExit(0)
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--task",
        type=str,
        required=True,
        choices=[
            "img_resize_longer",
            "img_resize_shorter",
            "img_rand_crop",
            "vid_frame_extract",
            "m2ts_to_mp4",
            "mkv_to_mp4",
            "mp4_to_mp4",
            "vid_crop_center",
        ],
    )
    parser.add_argument("--meta_path", type=str, required=True)
    parser.add_argument("--input_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--length", type=int, default=1080)
    parser.add_argument("--disable-parallel", action="store_true")
    parser.add_argument("--num_workers", type=int, default=None)
    parser.add_argument("--seed", type=int, default=42, help="seed for random")
    parser.add_argument("--points", nargs="+", type=float, default=None)
    parser.add_argument("--points_index", nargs="+", type=int, default=None)
    args = parser.parse_args()
    return args
 if __name__ == "__main__":
    main()
--- a/tools/datasets/utils.py
+++ b/tools/datasets/utils.py
@ -0,0 +1,130 @@
 import os
 import cv2
 import numpy as np
 from PIL import Image
 IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
 VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
 def is_video(filename):
    ext = os.path.splitext(filename)[-1].lower()
    return ext in VID_EXTENSIONS
 def extract_frames(
    video_path,
    frame_inds=None,
    points=None,
    backend="opencv",
    return_length=False,
    num_frames=None,
 ):
    """
    Args:
        video_path (str): path to video
        frame_inds (List[int]): indices of frames to extract
        points (List[float]): values within [0, 1); multiply #frames to get frame indices
    Return:
        List[PIL.Image]
    """
    assert backend in ["av", "opencv", "decord"]
    assert (frame_inds is None) or (points is None)
    if backend == "av":
        import av
        container = av.open(video_path)
        if num_frames is not None:
            total_frames = num_frames
        else:
            total_frames = container.streams.video[0].frames
        if points is not None:
            frame_inds = [int(p * total_frames) for p in points]
        frames = []
        for idx in frame_inds:
            if idx >= total_frames:
                idx = total_frames - 1
            target_timestamp = int(idx * av.time_base / container.streams.video[0].average_rate)
            container.seek(target_timestamp)  # return the nearest key frame, not the precise timestamp!!!
            frame = next(container.decode(video=0)).to_image()
            frames.append(frame)
        if return_length:
            return frames, total_frames
        return frames
    elif backend == "decord":
        import decord
        container = decord.VideoReader(video_path, num_threads=1)
        if num_frames is not None:
            total_frames = num_frames
        else:
            total_frames = len(container)
        if points is not None:
            frame_inds = [int(p * total_frames) for p in points]
        frame_inds = np.array(frame_inds).astype(np.int32)
        frame_inds[frame_inds >= total_frames] = total_frames - 1
        frames = container.get_batch(frame_inds).asnumpy()  # [N, H, W, C]
        frames = [Image.fromarray(x) for x in frames]
        if return_length:
            return frames, total_frames
        return frames
    elif backend == "opencv":
        cap = cv2.VideoCapture(video_path)
        if num_frames is not None:
            total_frames = num_frames
        else:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if points is not None:
            frame_inds = [int(p * total_frames) for p in points]
        frames = []
        for idx in frame_inds:
            if idx >= total_frames:
                idx = total_frames - 1
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            # HACK: sometimes OpenCV fails to read frames, return a black frame instead
            try:
                ret, frame = cap.read()
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)
            except Exception as e:
                print(f"[Warning] Error reading frame {idx} from {video_path}: {e}")
                # First, try to read the first frame
                try:
                    print(f"[Warning] Try reading first frame.")
                    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                    ret, frame = cap.read()
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = Image.fromarray(frame)
                # If that fails, return a black frame
                except Exception as e:
                    print(f"[Warning] Error in reading first frame from {video_path}: {e}")
                    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                    frame = Image.new("RGB", (width, height), (0, 0, 0))
            # HACK: if height or width is 0, return a black frame instead
            if frame.height == 0 or frame.width == 0:
                height = width = 256
                frame = Image.new("RGB", (width, height), (0, 0, 0))
            frames.append(frame)
        if return_length:
            return frames, total_frames
        return frames
    else:
        raise ValueError
		`@ -0,0 +1,2 @@`
							`from .datasets import TextDataset, VideoTextDataset`
							`from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample`