save_dir = "samples"  # directory to save the generated samples
seed = 42  # global random seed (the latent z is seeded separately; see sampling_option)
batch_size = 1
dtype = "bf16"

cond_type = "t2v"
# conditional inference options:
#   t2v: text-to-video
#   i2v_head: image-to-video conditioned on the first frame
#   i2v_tail: image-to-video conditioned on the last frame
#   i2v_loop: connect two images (condition on both the first and last frame)
#   v2v_head_half: video extension conditioned on the first half of a video
#   v2v_tail_half: video extension conditioned on the second half of a video
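# For example, image-to-video from a single starting frame would change only
# the flag above (the reference image itself is presumably supplied at
# inference time, e.g. via the CLI, since this file does not configure it):
# cond_type = "i2v_head"
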
dataset = dict(type="text")

sampling_option = dict(
    resolution="256px",  # 256px or 768px
    aspect_ratio="16:9",  # 9:16, 16:9, or 1:1
    num_frames=129,  # number of frames to sample
    num_steps=50,  # number of denoising steps
    shift=True,  # use a shifted timestep schedule
    temporal_reduction=4,  # temporal compression factor of the VAE
    is_causal_vae=True,  # the VAE encodes the first frame independently
    guidance=7.5,  # guidance scale for the text condition
    guidance_img=3.0,  # guidance scale for the image condition
    text_osci=True,  # enable text guidance oscillation
    image_osci=True,  # enable image guidance oscillation
    scale_temporal_osci=True,
    method="i2v",  # hard-coded for now
    seed=None,  # random seed for the latent z
)
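# Sanity check on the frame arithmetic above: with a causal VAE and
# temporal_reduction=4, the 129 sampled frames should correspond to
# (129 - 1) // 4 + 1 = 33 latent frames (the first frame is encoded alone,
# the rest in groups of 4). This is the usual causal-VAE layout and an
# assumption here, not something this file states.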
motion_score = "4"  # motion score condition for video generation
fps_save = 24  # fps at which generated videos are saved

# Define model components
model = dict(
    type="flux",
    from_pretrained="./ckpts/Open_Sora_v2.safetensors",
    guidance_embed=False,
    fused_qkv=False,
    use_liger_rope=True,  # use the fused Liger RoPE kernel
    # model architecture
    in_channels=64,  # patchified latent channels (see note after the ae dict)
    vec_in_dim=768,  # pooled CLIP embedding dimension
    context_in_dim=4096,  # T5-XXL token embedding dimension
    hidden_size=3072,
    mlp_ratio=4.0,
    num_heads=24,
    depth=19,  # double-stream blocks
    depth_single_blocks=38,  # single-stream blocks
    axes_dim=[16, 56, 56],  # per-axis RoPE dims; sums to the 128-dim head size
    theta=10_000,  # RoPE base frequency
    qkv_bias=True,
    cond_embed=True,
)
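# Shape check for the transformer above (kept commented so the config stays
# inert): each attention head spans 3072 / 24 = 128 dims, and the per-axis
# RoPE split must cover exactly that head dimension.
# assert model["hidden_size"] // model["num_heads"] == sum(model["axes_dim"])  # 128 == 16 + 56 + 56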
ae = dict(
    type="hunyuan_vae",
    from_pretrained="./ckpts/hunyuan_vae.safetensors",
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    latent_channels=16,
    use_spatial_tiling=True,  # decode in spatial tiles to bound peak memory
    use_temporal_tiling=False,
)
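# How the model and VAE line up: the VAE emits 16 latent channels, and the
# transformer's in_channels=64 likely comes from 2x2 spatial patchification of
# those latents (16 * 2 * 2 = 64), following the FLUX convention. This mapping
# is inferred from the numbers, not stated in this file.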
t5 = dict(
    type="text_embedder",
    from_pretrained="./ckpts/google/t5-v1_1-xxl",
    max_length=512,  # maximum number of text tokens
    shardformer=True,  # shard the T5 encoder with ShardFormer
)
clip = dict(
    type="text_embedder",
    from_pretrained="./ckpts/openai/clip-vit-large-patch14",
    max_length=77,  # CLIP's fixed context length
)
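# A minimal sketch of how a plain-Python config like this is typically
# consumed (commented out so the file stays a pure config). Config.fromfile is
# the standard mmengine API for such files; whether this repo uses mmengine,
# and the config path below, are assumptions.
#
# from mmengine.config import Config
#
# cfg = Config.fromfile("configs/.../t2v_256px.py")  # hypothetical path
# assert cfg.sampling_option["num_frames"] == 129
# assert cfg.model["hidden_size"] == 3072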