# mysora/configs/diffusion/inference/256px.py
# 77 lines, 2.0 KiB, Python

# Save directory.
save_dir = "samples"
# Random seed (does not cover the seed used for z; see sampling_option).
seed = 42
batch_size = 1
dtype = "bf16"

# Conditional inference mode. Available options:
#   t2v           - text-to-video
#   i2v_head      - image-to-video (head)
#   i2v_tail      - image-to-video (tail)
#   i2v_loop      - connect images
#   v2v_head_half - video extension with first half
#   v2v_tail_half - video extension with second half
cond_type = "t2v"

dataset = {"type": "text"}
# Sampling options for inference.
sampling_option = {
    # Output geometry and length.
    "resolution": "256px",  # either 256px or 768px
    "aspect_ratio": "16:9",  # one of 9:16, 16:9, 1:1
    "num_frames": 129,  # frame count
    "num_steps": 50,  # step count
    "shift": True,
    "temporal_reduction": 4,
    "is_causal_vae": True,
    # Guidance settings.
    "guidance": 7.5,  # text-to-video guidance
    "guidance_img": 3.0,  # image-to-video guidance
    "text_osci": True,  # turn on text guidance oscillation
    "image_osci": True,  # turn on image guidance oscillation
    "scale_temporal_osci": True,
    "method": "i2v",  # currently hard-coded
    "seed": None,  # random seed used for z
}
# Motion score for video generation (given as a string).
motion_score = "4"
# Frames per second for video generation and saving.
fps_save = 24
# Model components are defined from here on.
model = {
    "type": "flux",
    "from_pretrained": "./ckpts/Open_Sora_v2.safetensors",  # relative path
    "guidance_embed": False,
    "fused_qkv": False,
    "use_liger_rope": True,
    # Architecture hyper-parameters.
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10_000,
    "qkv_bias": True,
    "cond_embed": True,
}
# Autoencoder component settings.
ae = {
    "type": "hunyuan_vae",
    "from_pretrained": "./ckpts/hunyuan_vae.safetensors",  # relative path
    "in_channels": 3,
    "out_channels": 3,
    "layers_per_block": 2,
    "latent_channels": 16,
    # Tiling switches: spatial on, temporal off.
    "use_spatial_tiling": True,
    "use_temporal_tiling": False,
}
# T5 text embedder settings.
t5 = {
    "type": "text_embedder",
    "from_pretrained": "./ckpts/google/t5-v1_1-xxl",  # relative path
    "max_length": 512,
    "shardformer": True,
}
# CLIP text embedder settings.
clip = {
    "type": "text_embedder",
    "from_pretrained": "./ckpts/openai/clip-vit-large-patch14",  # relative path
    "max_length": 77,
}