feat: add smoke test config and synthetic data generator

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 02:21:58 -08:00 · 2026-03-06 02:21:58 -08:00 · ae3e448c8a
parent bdeb2870d4
commit ae3e448c8a
2 changed files with 152 additions and 0 deletions
--- a/configs/diffusion/train/smoke_test.py
+++ b/configs/diffusion/train/smoke_test.py
@ -0,0 +1,77 @@
 _base_ = ["image.py"]
 # ===== Smoke Test Config =====
 # cached_video + cached_text: 跳过 VAE / T5 / CLIP，直接用预算好的 latent
 # 仅用 1 GPU，3 个训练步，验证训练 loop 通路
 # --- Dataset ---
 dataset = dict(
    type="cached_video_text",
    data_path="/data/train-input/smoke_test/data.csv",
    cached_video=True,
    cached_text=True,
    load_original_video=False,
    memory_efficient=False,
    vmaf=False,
 )
 # --- Bucket: 256px 单帧，batch_size=1 ---
 bucket_config = {
    "_delete_": True,
    "256px": {
        1: (1.0, 1),
    },
 }
 # --- 跳过 VAE / T5 / CLIP 加载 ---
 cached_video = True
 cached_text = True
 # --- Null 向量路径（空提示的 embedding，用于 dropout） ---
 # train.py 里 cached_text 模式下 hardcode 读这两个路径
 # 我们在 /mnt/ddn/sora/tmp_load/ 放假的 null 向量
 # --- 模型：随机初始化，不加载预训练权重 ---
 model = dict(
    from_pretrained=None,
    strict_load=False,
    grad_ckpt_settings=(1, 100),
 )
 # --- 优化器 ---
 lr = 1e-5
 optim = dict(
    cls="HybridAdam",
    lr=lr,
    eps=1e-15,
    weight_decay=0.0,
    adamw_mode=True,
 )
 # --- 训练参数 ---
 epochs = 1
 ckpt_every = 0      # 不保存 checkpoint
 log_every = 1
 warmup_steps = 0
 grad_clip = 1.0
 accumulation_steps = 1
 ema_decay = None    # 不用 EMA
 # --- 加速 ---
 dtype = "bf16"
 plugin = "zero2"
 plugin_config = dict(
    reduce_bucket_size_in_m=128,
    overlap_allgather=False,
 )
 pin_memory_cache_pre_alloc_numels = None  # 关掉 pin memory 预分配
 # --- 其他 ---
 num_workers = 0
 prefetch_factor = None
 num_bucket_build_workers = 1
 seed = 42
 outputs = "/data/train-output/smoke_test_outputs"
 grad_checkpoint = False  # 关掉 activation checkpoint，减少复杂度
 # dropout ratio 保持原样（0.31...），null vector 在 /mnt/ddn/sora/tmp_load/ 下
--- a/scripts/diffusion/gen_smoke_data.py
+++ b/scripts/diffusion/gen_smoke_data.py
@ -0,0 +1,75 @@
 """
 生成 smoke test 所需的合成数据（fake latents）。
 数据布局：
  /data/train-input/smoke_test/
    data.csv
    latents/sample_{i}.pt     # video latent [16, 1, 32, 32]
    t5/sample_{i}.pt          # T5 embedding  [256, 4096]
    clip/sample_{i}.pt        # CLIP embedding [768]
  /mnt/ddn/sora/tmp_load/
    null_t5.pt                # null T5  [1, 256, 4096]
    null_clip.pt              # null CLIP [1, 768]
 Shape 说明（256px 单帧，AE_SPATIAL_COMPRESSION=16，patch_size=2）：
  VAE latent: [C=16, T=1, H_lat=32, W_lat=32]
    32 = patch_size(2) * ceil(256 / AE_SPATIAL_COMPRESSION(16)) = 2 * 16
  T5: [seq=256, dim=4096]
  CLIP pooled: [dim=768]
 """
 import os
 import csv
 import torch
 N_SAMPLES = 4           # 生成 4 条假样本
 BASE_DIR  = "/data/train-input/smoke_test"
 NULL_DIR  = "/mnt/ddn/sora/tmp_load"
 LAT_C, LAT_T, LAT_H, LAT_W = 16, 1, 32, 32   # video latent shape
 T5_SEQ,  T5_DIM  = 256, 4096                   # T5 embedding
 CLIP_DIM          = 768                         # CLIP pooled
 def main():
    os.makedirs(f"{BASE_DIR}/latents", exist_ok=True)
    os.makedirs(f"{BASE_DIR}/t5",      exist_ok=True)
    os.makedirs(f"{BASE_DIR}/clip",    exist_ok=True)
    os.makedirs(NULL_DIR, exist_ok=True)
    rows = []
    for i in range(N_SAMPLES):
        lat_path  = f"{BASE_DIR}/latents/sample_{i}.pt"
        t5_path   = f"{BASE_DIR}/t5/sample_{i}.pt"
        clip_path = f"{BASE_DIR}/clip/sample_{i}.pt"
        torch.save(torch.randn(LAT_C, LAT_T, LAT_H, LAT_W), lat_path)
        torch.save(torch.randn(T5_SEQ, T5_DIM),              t5_path)
        torch.save(torch.randn(CLIP_DIM),                     clip_path)
        rows.append({
            "path":           lat_path,   # 占位（cached 模式不读原始视频）
            "text":           f"smoke test sample {i}",
            "latents_path":   lat_path,
            "text_t5_path":   t5_path,
            "text_clip_path": clip_path,
        })
    csv_path = f"{BASE_DIR}/data.csv"
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)
    print(f"Wrote {N_SAMPLES} samples to {csv_path}")
    # null vectors（空提示的 embedding，用于 cfg dropout）
    null_t5   = torch.zeros(1, T5_SEQ, T5_DIM)
    null_clip = torch.zeros(1, CLIP_DIM)
    torch.save(null_t5,   f"{NULL_DIR}/null_t5.pt")
    torch.save(null_clip, f"{NULL_DIR}/null_clip.pt")
    print(f"Wrote null vectors to {NULL_DIR}/")
    print("Done.")
 if __name__ == "__main__":
    main()