# mysora/scripts/cnv/shard.py
import os

import pandas as pd
from tqdm import tqdm

# Dask is optional: prefer it for reading large Parquet files, but fall
# back to plain pandas when it is not installed.
try:
    import dask.dataframe as dd

    SUPPORT_DASK = True
except ImportError:
    # Narrowed from a bare `except:`, which would also have swallowed
    # KeyboardInterrupt/SystemExit and hidden unrelated import-time errors.
    SUPPORT_DASK = False
def shard_parquet(input_path, k):
    """Split a Parquet file into up to *k* roughly equal row shards.

    Shards are written as ``00001.parquet``, ``00002.parquet``, ... inside a
    directory that sits next to ``input_path`` and shares its base name.
    Video-metadata columns (num_frames, height, width, aspect_ratio, fps,
    resolution) are dropped before sharding, when present.

    Args:
        input_path: Path to the input Parquet file.
        k: Number of shards to create (must be >= 1). Empty shards are
            skipped, so fewer than *k* files may be produced.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``k`` is less than 1 (previously k == 0 crashed with
            ZeroDivisionError and k < 0 silently wrote nothing).
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file {input_path} does not exist.")
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}.")

    # Load the whole file into a pandas DataFrame (via dask when available).
    if SUPPORT_DASK:
        df = dd.read_parquet(input_path).compute()
    else:
        df = pd.read_parquet(input_path)

    # Drop metadata columns that should not appear in the shards.
    columns_to_remove = [
        "num_frames",
        "height",
        "width",
        "aspect_ratio",
        "fps",
        "resolution",
    ]
    df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])

    # Rows per shard, rounded up so k shards always cover every row.
    total_rows = len(df)
    rows_per_shard = (total_rows + k - 1) // k

    # Output directory: same parent as the input, named after the input file.
    base_dir = os.path.dirname(input_path)
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    output_dir = os.path.join(base_dir, base_name)
    os.makedirs(output_dir, exist_ok=True)

    # Write each non-empty shard with a zero-padded, 1-based file name.
    for i in tqdm(range(k)):
        start_idx = i * rows_per_shard
        end_idx = min(start_idx + rows_per_shard, total_rows)
        shard_df = df.iloc[start_idx:end_idx]
        if shard_df.empty:
            continue
        shard_path = os.path.join(output_dir, f"{i + 1:05d}.parquet")
        shard_df.to_parquet(shard_path, index=False)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Shard a Parquet file.")
    parser.add_argument("input_path", type=str, help="Path to the input Parquet file.")
    # `nargs="?"` is required for the default to take effect: argparse
    # ignores `default` on a plain positional argument and would have
    # demanded k on the command line despite default=10000.
    parser.add_argument(
        "k",
        type=int,
        nargs="?",
        default=10000,
        help="Number of shards to create (default: 10000).",
    )
    args = parser.parse_args()
    shard_parquet(args.input_path, args.k)