# mysora/configs/diffusion/inference/256px.py

# Directory that samples are saved into.
save_dir = "samples"
# Random seed for everything except z (z has its own seed setting).
seed = 42
batch_size = 1
dtype = "bf16"

# Conditional inference mode. Available options:
#   t2v            text-to-video
#   i2v_head       image-to-video (head)
#   i2v_tail       image-to-video (tail)
#   i2v_loop       connect images
#   v2v_head_half  video extension with first half
#   v2v_tail_half  video extension with second half
cond_type = "t2v"

# Dataset configuration; only the "type" key is set here.
dataset = {"type": "text"}
# Sampling options, written as a dict literal.
sampling_option = {
    # Output resolution: "256px" or "768px".
    "resolution": "256px",
    # Aspect ratio: "9:16", "16:9" or "1:1".
    "aspect_ratio": "16:9",
    # Number of frames.
    "num_frames": 129,
    # Number of steps.
    "num_steps": 50,
    "shift": True,
    "temporal_reduction": 4,
    "is_causal_vae": True,
    # Guidance for text-to-video.
    "guidance": 7.5,
    # Guidance for image-to-video.
    "guidance_img": 3.0,
    # Text guidance oscillation on/off.
    "text_osci": True,
    # Image guidance oscillation on/off.
    "image_osci": True,
    "scale_temporal_osci": True,
    # Hard-coded for now.
    "method": "i2v",
    # Random seed for z.
    "seed": None,
}
# Motion score for video generation (stored as a string).
motion_score = "4"
# Frames per second for video generation and saving.
fps_save = 24

# Define model components
# Main model configuration (type "flux"); `from_pretrained` is a local
# checkpoint path.
model = {
    "type": "flux",
    "from_pretrained": "./ckpts/Open_Sora_v2.safetensors",
    "guidance_embed": False,
    "fused_qkv": False,
    "use_liger_rope": True,
    # --- model architecture ---
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10_000,
    "qkv_bias": True,
    "cond_embed": True,
}
# Autoencoder (`ae`) configuration, type "hunyuan_vae".
# Spatial tiling is switched on; temporal tiling is switched off.
ae = {
    "type": "hunyuan_vae",
    "from_pretrained": "./ckpts/hunyuan_vae.safetensors",
    "in_channels": 3,
    "out_channels": 3,
    "layers_per_block": 2,
    "latent_channels": 16,
    "use_spatial_tiling": True,
    "use_temporal_tiling": False,
}
# T5 text embedder configuration: maximum length 512, `shardformer` enabled.
t5 = {
    "type": "text_embedder",
    "from_pretrained": "./ckpts/google/t5-v1_1-xxl",
    "max_length": 512,
    "shardformer": True,
}
# CLIP text embedder configuration: maximum length 77.
clip = {
    "type": "text_embedder",
    "from_pretrained": "./ckpts/openai/clip-vit-large-patch14",
    "max_length": 77,
}