import os

import cv2
import numpy as np
import torch
from mmengine.runner import set_random_seed
from tqdm import tqdm

from opensora.datasets import video_transforms
from opensora.datasets.utils import read_file
from opensora.utils.misc import to_torch_dtype

# Debug script: read each video listed in `data_path` with OpenCV, sample a
# short clip from it, and (optionally) write it back out for visual inspection.

# data_path = "~/data/issue.csv"
data_path = "~/data/test.csv"
save_dir = "samples/debug_original_video_read_write"
num_frames = 17
frame_interval = 1
image_size = 1024

set_random_seed(1024)
os.makedirs(save_dir, exist_ok=True)
data = read_file(data_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = to_torch_dtype("bf16")


def temporal_random_crop(vframes, num_frames, frame_interval):
    """Sample `num_frames` frames from a window picked by TemporalRandomCrop.

    Args:
        vframes: Frame container indexed along its first axis; must support
            `len()` and indexing with an integer index array.
        num_frames (int): Number of frames to return.
        frame_interval (int): The crop window is requested with length
            `num_frames * frame_interval`.

    Returns:
        `vframes` indexed at `num_frames` positions evenly spaced (via
        `np.linspace`) between the window start and its last frame.

    Raises:
        AssertionError: If the window returned by the sampler is shorter than
            `num_frames`.
    """
    temporal_sample = video_transforms.TemporalRandomCrop(num_frames * frame_interval)
    total_frames = len(vframes)
    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
    assert (
        end_frame_ind - start_frame_ind >= num_frames
    ), f"Not enough frames to sample, {end_frame_ind} - {start_frame_ind} < {num_frames}"
    # `end_frame_ind` is treated as exclusive, hence the `- 1`.
    frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, num_frames, dtype=int)
    return vframes[frame_indice]


def to_tensor(clip):
    """Convert a uint8 clip to float and scale its values by 1/255.

    The layout of `clip` is left untouched (no permutation is performed).

    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
    Raises:
        TypeError: If `clip` is not of dtype `torch.uint8`.
    """
    if clip.dtype != torch.uint8:
        raise TypeError(f"clip tensor should have data type uint8. Got {clip.dtype}")
    # breakpoint()
    return clip.float() / 255.0


def read_video_cv2(video_path):
    """Decode every frame of a video file with OpenCV.

    Args:
        video_path (str): Path handed to `cv2.VideoCapture`.

    Returns:
        tuple: `(frames, vinfo)` where `frames` is a tensor laid out as
        (T, C=3, H, W) in RGB channel order and `vinfo` is a dict holding the
        container-reported frame rate under the key `"video_fps"`.

    Raises:
        ValueError: If the video cannot be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Cannot open video: {video_path}")

    frames = []
    try:
        vinfo = {
            "video_fps": cap.get(cv2.CAP_PROP_FPS),
        }
        while True:
            # Read a frame from the video; `ret` is False once the stream ends
            # or a frame cannot be decoded.
            ret, frame = cap.read()
            if not ret:
                break
            # frames.append(frame[:, :, ::-1])  # BGR to RGB
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # NOTE: no cv2.waitKey()/destroyAllWindows() here. This function
            # never opens a window, so key polling could never trigger, and
            # HighGUI calls may be unavailable in GUI-less OpenCV builds.
    finally:
        # Always free the capture handle, even if decoding raised.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from video: {video_path}")

    frames = torch.from_numpy(np.stack(frames))  # [T, H, W, C=3]
    frames = frames.permute(0, 3, 1, 2)  # [T, C=3, H, W]
    return frames, vinfo


def write_video_cv2(path, video, fps=24, image_size=None):
    """Write a clip to `path` with OpenCV using the `mp4v` codec.

    Args:
        path (str): Output file path.
        video (torch.Tensor): Clip laid out as (T, C, H, W) in RGB order.
            Presumably uint8 on CPU, as produced by `read_video_cv2` -- verify
            before passing anything else.
        fps (float): Frame rate stored in the output file.
        image_size (tuple[int, int] | None): `(width, height)` given to the
            writer. Defaults to the frame size of `video`; OpenCV expects every
            written frame to match this size, so a fixed default silently
            breaks the output for clips of any other resolution.

    Raises:
        OSError: If the video writer cannot be opened for `path`.
    """
    if image_size is None:
        # (T, C, H, W) -> (width, height)
        image_size = (int(video.size(3)), int(video.size(2)))

    # Set the video codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    output = cv2.VideoWriter(path, fourcc, fps, image_size)
    try:
        if not output.isOpened():
            raise OSError(f"Cannot open video writer for: {path}")
        # (T, C, H, W) -> (T, H, W, C) once, then walk the frames.
        for frame in video.permute(0, 2, 3, 1):
            # permute() only changes strides; hand OpenCV a contiguous buffer.
            rgb = frame.contiguous().numpy()
            output.write(cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))
    finally:
        # Release even on failure so the file handle is not leaked.
        output.release()


# NOTE(review): every writer call below is currently commented out, so as it
# stands the loop only reads each video, prints its fps and samples a clip.
for video_path in tqdm(data["path"]):
    name = os.path.basename(video_path)

    # # DEBUG: read image and save as if video: no issue
    # image = cv2.imread('/home/shenchenhui/data/ship-in-coffee-image.png')
    # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # image = torch.Tensor(image)
    # fake_vid = image.repeat(48, 1, 1, 1)
    # write_video(f"{save_dir}/fake.mp4", fake_vid, fps=24, video_codec="h264")

    # # ===== data loading ======
    # # # loading
    # vframes, vinfo = read_video(video_path, backend="cv2")
    vframes, vinfo = read_video_cv2(video_path)
    video_fps = vinfo.get("video_fps", 24)
    print("fps:", video_fps)

    # # Sampling video frames
    video = temporal_random_crop(vframes, num_frames, frame_interval)  # not this issue

    # # breakpoint()
    # video = to_tensor(video)

    # # # transform
    # # transform_video = transforms.Compose(
    # #     [
    # #         # video_transforms.ToTensorVideo(), # moved up
    # #         # video_transforms.UCFCenterCropVideo(image_size), # not this issue
    # #         transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), # not this issues
    # #     ]
    # # )
    # # video = transform_video(video)  # T C H W

    ### write each frame as image
    # for frame_idx in range(video.size(0)):
    #     frame = np.array(video[frame_idx].permute(1,2,0))
    #     frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    #     cv2.imwrite(f'{save_dir}/{frame_idx}.jpg', frame)

    # # # TCHW -> CTHW
    # video = video.permute(1, 0, 2, 3)

    # # # # ===== model training ======
    # # # # video = video.to(device, dtype)

    # # # ===== data saving ======
    # # # # # Normalize
    # # # low, high = -1,1
    # # # video.clamp_(min=low, max=high)
    # # # video.sub_(low).div_(max(high - low, 1e-5))
    # # # video = video.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)

    # # # breakpoint()
    # # # video = video.permute(1, 2, 3, 0).to("cpu", torch.uint8)
    # video = video.permute(1, 2, 3, 0)

    # write_video_cv2(f"{save_dir}/{name}", video, fps=video_fps)

    # # prep to [T, H, W, C] in order to write
    # write_video(f"{save_dir}/{name}", video, fps=video_fps, video_codec="h264")