# File listing metadata: 132 lines, 4.1 KiB, Python
import argparse
|
|
import json
|
|
import os
|
|
from functools import partial
|
|
|
|
import cv2
|
|
import numpy as np
|
|
import pandas as pd
|
|
from mmengine.logging import print_log
|
|
from moviepy.editor import VideoFileClip
|
|
from pandarallel import pandarallel
|
|
from tqdm import tqdm
|
|
|
|
# Module-level side effect: hook tqdm into pandas at import time.
# NOTE(review): the code visible in this file uses pandarallel's
# `parallel_apply`, not tqdm's pandas helpers — confirm this call is still needed.
tqdm.pandas()
|
|
|
|
|
|
def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
    """Check whether a video file exists and can be opened by the chosen backend.

    Args:
        video_path: Path of the video file to probe.
        mode: Backend used to open the file: ``"moviepy"`` (``VideoFileClip``)
            or ``"cv2"`` (``cv2.VideoCapture``).
        verbose: When true, report the outcome through ``print_log``.
        logger: Forwarded unchanged to ``print_log``.

    Returns:
        bool: ``True`` if the file exists and the backend opened it,
        ``False`` otherwise (missing file, backend error, or a capture that
        did not open).

    Raises:
        ValueError: If the file exists but ``mode`` is not a known backend.
    """
    if not os.path.exists(video_path):
        if verbose:
            print_log(f"Could not find '{video_path}'", logger=logger)
        return False

    if mode not in ("moviepy", "cv2"):
        raise ValueError(f"Unsupported mode {mode!r}; expected 'moviepy' or 'cv2'")

    try:
        if mode == "moviepy":
            # Close the clip right away: this function is meant to be called
            # for many files and must not leave a reader open per file.
            VideoFileClip(video_path).close()
            intact = True
        else:
            cap = cv2.VideoCapture(video_path)
            try:
                # A capture that fails to open does not raise; it only
                # reports False here, which must map to "not intact".
                intact = bool(cap.isOpened())
            finally:
                cap.release()
    except Exception as e:
        # Best-effort probe: any backend failure means "not intact".
        if verbose:
            print_log(f"Error: {e}", logger=logger)
        intact = False

    if verbose:
        state = "intact" if intact else "not intact"
        print_log(f"The video file '{video_path}' is {state}.", logger=logger)
    return intact
|
|
|
|
|
|
def has_downloaded_success(json_path):
    """Tell whether a JSON file records ``"success": true``.

    Args:
        json_path: Path of the JSON file to inspect.

    Returns:
        bool: ``True`` only when the file exists, parses as JSON, and holds a
        ``"success"`` entry whose value is the boolean ``True``. A missing or
        unreadable file, invalid JSON, a missing key, or a non-boolean value
        all yield ``False``.
    """
    if os.path.exists(json_path):
        try:
            with open(json_path, "r") as fp:
                report = json.load(fp)
            # Only a genuine JSON `true` counts; truthy stand-ins such as 1
            # or "true" are rejected.
            return isinstance(report, dict) and report.get("success") is True
        except Exception:
            return False
    return False
|
|
|
|
|
|
def parse_args():
    """Parse the command-line arguments of this script.

    Returns:
        argparse.Namespace: With attributes ``meta_path`` (str),
        ``folder_path`` (str), ``mode`` (``".mp4"``, ``".json"`` or ``None``)
        and ``num_workers`` (int or ``None``).
    """
    parser = argparse.ArgumentParser(
        description="Add video paths and an intact flag to a metadata CSV."
    )
    parser.add_argument("meta_path", type=str, help="metadata CSV to read")
    parser.add_argument(
        "--folder_path",
        type=str,
        required=True,
        help="folder holding the <id>.mp4 / <id>.json files",
    )
    # Restrict to the values main() understands so a typo is rejected here
    # instead of surfacing as a ValueError inside a worker process.
    parser.add_argument(
        "--mode",
        type=str,
        default=None,
        choices=[".mp4", ".json"],
        help="what to check per row: '.mp4' opens the video, '.json' reads "
        "the success flag; omit to mark every row as intact",
    )
    parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")

    return parser.parse_args()
|
|
|
|
|
|
def main():
    """Annotate a metadata CSV with video paths and an intact flag.

    Reads the CSV at ``args.meta_path`` (each row must have an ``id`` column),
    derives ``<folder_path>/<id>.mp4`` as the row's ``path``, and computes an
    ``intact`` flag according to ``args.mode``:

    * ``".mp4"``  - the video file must open via ``is_intact_video``;
    * ``".json"`` - ``<folder_path>/<id>.json`` must pass
      ``has_downloaded_success``;
    * ``None``    - every row is marked intact.

    Two files are written next to the input CSV:
    ``<stem>_path_intact.csv`` (all rows, with ``intact`` and ``path``) and
    ``<stem>_path-filtered.csv`` (intact rows only, without ``intact``).

    Raises:
        ValueError: If ``args.mode`` is not one of the values listed above.
    """
    args = parse_args()

    meta_path = args.meta_path
    folder_path = args.folder_path
    mode = args.mode

    # Fail before spawning workers instead of once per row inside them.
    if mode not in (".mp4", ".json", None):
        raise ValueError(f"Unsupported mode {mode!r}; expected '.mp4', '.json' or None")

    def is_intact(row, mode=None):
        """Return ``(intact, video_path)`` for one metadata row."""
        video_id = row["id"]
        video_path = os.path.join(folder_path, f"{video_id}.mp4")

        if mode == ".mp4":
            return bool(is_intact_video(video_path)), video_path
        if mode == ".json":
            json_path = os.path.join(folder_path, f"{video_id}.json")
            return has_downloaded_success(json_path), video_path
        if mode is None:
            return True, video_path
        raise ValueError(f"Unsupported mode {mode!r}")

    meta_dirpath = os.path.dirname(meta_path)
    wo_ext = os.path.splitext(os.path.basename(meta_path))[0]

    # Only pass nb_workers when given so pandarallel keeps its own default.
    init_kwargs = {"progress_bar": True}
    if args.num_workers is not None:
        init_kwargs["nb_workers"] = args.num_workers
    pandarallel.initialize(**init_kwargs)

    meta = pd.read_csv(meta_path)
    if len(meta) == 0:
        # zip(*ret) yields nothing for an empty frame, so the two-way
        # unpacking below would fail; short-circuit instead.
        intact, paths = [], []
    else:
        ret = meta.parallel_apply(partial(is_intact, mode=mode), axis=1)
        intact, paths = (list(column) for column in zip(*ret))

    meta["intact"] = intact
    meta["path"] = paths
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path_intact.csv")
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with intact info saved to '{out_path}'")

    # Explicit bool dtype keeps the mask valid even when it is empty; drop()
    # returns a new frame, avoiding an in-place edit of a slice of `meta`.
    mask = np.asarray(intact, dtype=bool)
    meta_format = meta[mask].drop(columns="intact")
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path-filtered.csv")
    meta_format.to_csv(out_path, index=False)
    print(f"New meta (shape={meta_format.shape}) with format info saved to '{out_path}'")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|